In [14]:
import pandas as pd

# 1. LOAD DATA
# The CSV is semicolon-delimited, hence sep=';'.
# NOTE: on_bad_lines='skip' drops malformed rows without reporting them.
try:
    df = pd.read_csv('Credit_Card_Dataset.csv', sep=';', on_bad_lines='skip')
except Exception as e:
    print(f"❌ Error: {e}")
    # Stop the cell here. Continuing would either fail with a NameError on
    # `df` (fresh kernel) or silently reprocess a stale `df` left over from
    # an earlier run.
    raise
else:
    print("✅ Data berhasil dimuat.")

# 2. CLEAN THE 'Amount' COLUMN
# Raw values look like '4189.27.00'; the trailing '.00' must be removed
# before the remainder can be parsed as a number.
# Steps: cast to string -> strip the trailing '.00' -> convert to float.
amount_was_missing = df['Amount'].isna()
df['Amount'] = df['Amount'].astype(str).str.replace(r'\.00$', '', regex=True)
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')

# errors='coerce' turns unparseable values into NaN silently; report how many
# values were lost that way (values already missing in the file are excluded).
n_amount_unparsed = int((df['Amount'].isna() & ~amount_was_missing).sum())
if n_amount_unparsed:
    print(f"⚠️ {n_amount_unparsed} nilai 'Amount' gagal dikonversi dan menjadi NaN.")

# 3. CONVERT TIMESTAMPS
date_was_missing = df['TransactionDate'].isna()
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], errors='coerce')

# Same reporting as for 'Amount': surface dates that could not be parsed.
n_date_unparsed = int((df['TransactionDate'].isna() & ~date_was_missing).sum())
if n_date_unparsed:
    print(f"⚠️ {n_date_unparsed} nilai 'TransactionDate' gagal dikonversi dan menjadi NaT.")

# 4. DROP JUNK COLUMNS
# Remove the entirely empty (Unnamed) columns produced by the trailing ';;;;;'
# at the end of each CSV row.
df = df.dropna(axis=1, how='all')

# Show a summary of the cleaned data
print("\n--- Info Data Bersih ---")
df.info()
df.head()

✅ Data berhasil dimuat.

--- Info Data Bersih ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   TransactionID    99999 non-null   float64       
 1   TransactionDate  100000 non-null  datetime64[ns]
 2   Amount           99825 non-null   float64       
 3   MerchantID       100000 non-null  int64         
 4   TransactionType  100000 non-null  object        
 5   Location         100000 non-null  object        
 6   IsFraud          100000 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 5.3+ MB


Unnamed: 0,TransactionID,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud
0,1.0,2024-04-03 14:15:35.462794,4189.27,688,refund,San Antonio,0
1,2.0,2024-03-19 13:20:35.462824,2659.71,109,refund,Dallas,0
2,3.0,2024-01-08 10:08:35.462834,784.0,394,purchase,New York,0
3,4.0,2024-04-13 23:50:35.462850,3514.04,944,purchase,Philadelphia,0
4,5.0,2024-07-12 18:51:35.462858,369.07,475,purchase,Phoenix,0


In [15]:
# Split the data into fraud and normal transactions
fraud_data = df[df['IsFraud'] == 1]
normal_data = df[df['IsFraud'] == 0]

print("=== RINGKASAN DATA ===")
print(f"Total Transaksi: {len(df)}")
print(f"Jumlah Fraud   : {len(fraud_data)} ({len(fraud_data)/len(df)*100:.2f}%)")
print(f"Jumlah Normal  : {len(normal_data)} ({len(normal_data)/len(df)*100:.2f}%)")

print("\n=== PERBANDINGAN NOMINAL UANG (AMOUNT) ===")
# Build a side-by-side table of Amount statistics per class.
# Columns are relabelled by their IsFraud key rather than by position, so a
# missing class cannot cause a length-mismatch error and a label can never be
# attached to the wrong group.
stats_comparison = (
    df.groupby('IsFraud')['Amount']
    .describe()
    .transpose()
    .rename(columns={0: 'Normal (0)', 1: 'Fraud (1)'})
    # Drop the 'IsFraud' axis name so the printed header has no extra label.
    .rename_axis(columns=None)
)
print(stats_comparison)

=== RINGKASAN DATA ===
Total Transaksi: 100000
Jumlah Fraud   : 1000 (1.00%)
Jumlah Normal  : 99000 (99.00%)

=== PERBANDINGAN NOMINAL UANG (AMOUNT) ===
         Normal (0)    Fraud (1)
count  98826.000000   999.000000
mean    2501.223630  2519.635906
std     1439.750017  1457.210227
min        1.020000     5.560000
25%     1253.672500  1250.155000
50%     2500.335000  2543.360000
75%     3745.257500  3815.310000
max     4999.770000  4985.880000


In [16]:
# 1. Fraud breakdown by transaction type (purchase vs refund)
print("=== PERSENTASE FRAUD PER TIPE TRANSAKSI ===")
fraud_by_type = df.groupby('TransactionType')['IsFraud']
# One column per statistic: row count, number of fraud rows, and the fraud
# rate expressed as a percentage.
type_analysis = pd.DataFrame({
    'Total Transaksi': fraud_by_type.count(),
    'Jumlah Fraud': fraud_by_type.sum(),
    'Rate Fraud (%)': fraud_by_type.mean() * 100,
})
print(type_analysis)

print(f"\n{'=' * 40}\n")

# 2. Fraud breakdown by location (the 5 cities with the most fraud cases)
print("=== TOP 5 KOTA DENGAN KASUS FRAUD TERTINGGI ===")
fraud_locations = df.loc[df['IsFraud'] == 1, 'Location']
loc_analysis = fraud_locations.value_counts().head(5)
print(loc_analysis)

=== PERSENTASE FRAUD PER TIPE TRANSAKSI ===
                 Total Transaksi  Jumlah Fraud  Rate Fraud (%)
TransactionType                                               
purchase                   49869           493         0.98859
refund                     50131           507         1.01135


=== TOP 5 KOTA DENGAN KASUS FRAUD TERTINGGI ===
Location
New York     116
San Diego    115
Houston      105
Phoenix       99
Dallas        99
Name: count, dtype: int64


In [17]:
# Compute the "reasonable" Amount range using the 1.5 * IQR rule
Q1 = df['Amount'].quantile(0.25)
Q3 = df['Amount'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Rows whose Amount falls strictly outside [lower_bound, upper_bound];
# rows with a missing Amount compare False on both sides and are not flagged.
below_range = df['Amount'].lt(lower_bound)
above_range = df['Amount'].gt(upper_bound)
outliers = df[below_range | above_range]

outlier_pct = len(outliers) / len(df) * 100

print("=== DETEKSI OUTLIER (ANOMALI NOMINAL) ===")
print(f"Batas Bawah Wajar : {lower_bound:.2f}")
print(f"Batas Atas Wajar  : {upper_bound:.2f}")
print(f"Jumlah Transaksi Outlier: {len(outliers)}")
print(f"Persentase Outlier: {outlier_pct:.2f}%")

# Show the five largest transactions for manual inspection
print("\nContoh 5 Transaksi dengan Nominal Terbesar (Mencurigakan):")
display_columns = ['TransactionID', 'Amount', 'IsFraud', 'Location']
largest_first = df.sort_values('Amount', ascending=False)
print(largest_first[display_columns].head(5))

=== DETEKSI OUTLIER (ANOMALI NOMINAL) ===
Batas Bawah Wajar : -2484.77
Batas Atas Wajar  : 7484.39
Jumlah Transaksi Outlier: 0
Persentase Outlier: 0.00%

Contoh 5 Transaksi dengan Nominal Terbesar (Mencurigakan):
       TransactionID   Amount  IsFraud     Location
37148        37149.0  4999.77        0  San Antonio
86214        86215.0  4999.73        0       Dallas
43484        43485.0  4999.73        0      Chicago
86525        86526.0  4999.52        0      Phoenix
31354        31355.0  4999.51        0     San Jose


In [18]:
print("=== MATRIKS KORELASI ===")
# Restrict to the float64 / int64 columns before computing correlations
numeric_df = df.select_dtypes(include=['float64', 'int64'])
correlation = numeric_df.corr()

# Correlation of every numeric column with 'IsFraud', strongest positive first
fraud_correlation = correlation.loc[:, 'IsFraud']
print(fraud_correlation.sort_values(ascending=False))

=== MATRIKS KORELASI ===
IsFraud          1.000000
Amount           0.001273
MerchantID       0.001157
TransactionID   -0.000207
Name: IsFraud, dtype: float64
