In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Detect outliers using IQR
def detect_outliers(df, iqr_multiplier=1.5):
    """Count IQR-rule outliers in every numeric column of ``df``.

    A value is counted as an outlier when it lies strictly below
    ``Q1 - iqr_multiplier * IQR`` or strictly above
    ``Q3 + iqr_multiplier * IQR``, where Q1/Q3 are the column's 25th/75th
    percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only columns selected by
        ``select_dtypes(include=np.number)`` are inspected.
    iqr_multiplier : float, default 1.5
        Width of the fences, in units of IQR.

    Returns
    -------
    pandas.Series
        Outlier count per numeric column (int64), indexed by column name and
        sorted in descending order. Empty when ``df`` has no numeric columns.
    """
    outliers = {}
    for col in df.select_dtypes(include=np.number).columns:
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        fence = iqr_multiplier * (Q3 - Q1)
        # Count the boolean mask directly instead of materialising a filtered
        # copy of the whole frame for each column. NaN compares False on both
        # sides, so missing values are never counted.
        is_outlier = (values < (Q1 - fence)) | (values > (Q3 + fence))
        outliers[col] = int(is_outlier.sum())
    # Explicit dtype keeps the result integer-typed even when it is empty.
    return pd.Series(outliers, dtype="int64").sort_values(ascending=False)

In [3]:
# Set visualization style
sns.set_style("whitegrid")

# Load the dataset
file_path = "dataset/creditcard.csv"

try:
    df = pd.read_csv(file_path)
except Exception as e:
    # Report the problem, then stop the cell. Continuing would either hit a
    # NameError on `df` or silently reuse a stale `df` left in the kernel.
    print(f"Error loading dataset: {e}")
    raise
print(f"Loaded dataset with shape: {df.shape}")

# Display first few rows
display(df.head())

# Check for missing values
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)

if missing_values.empty:
    print("No missing values found.")
else:
    print("Missing values found:\n", missing_values)
    # Fill missing values with the per-column median. numeric_only=True keeps
    # this from raising if the frame contains any non-numeric column, and
    # plain assignment avoids mutating the frame in place.
    df = df.fillna(df.median(numeric_only=True))

# Remove duplicate rows
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f"Removed {before - after} duplicate rows.")

# NOTE(review): the report below covers every numeric column, including
# 'Class'. If 'Class' is the target label (it is excluded from scaling below),
# its count reflects label frequency rather than anomalous measurements.
outliers = detect_outliers(df)
print("Outliers detected:\n", outliers)

# Standardize numerical features (except 'Class')
# NOTE(review): the scaler is fit on the full dataset. If a train/test split
# happens later, fit on the training portion only to avoid leaking test
# statistics into the features.
scaler = StandardScaler()
num_features = [
    col for col in df.select_dtypes(include=np.number).columns
    if col not in ['Class']
]
df[num_features] = scaler.fit_transform(df[num_features])

# Save cleaned dataset
df.to_csv("dataset/cleaned_creditcard.csv", index=False)
print("Saved cleaned dataset: cleaned_creditcard.csv")


Loaded dataset with shape: (284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


No missing values found.
Removed 1081 duplicate rows.
Outliers detected:
 V27       38799
Amount    31685
V28       30094
V20       27553
V8        23904
V6        22886
V23       18467
V12       15282
V21       14401
V14       14060
V2        13390
V5        12221
V4        11094
V19       10150
V10        9345
V7         8839
V9         8199
V16        8180
V18        7468
V17        7353
V1         6948
V26        5665
V25        5333
V24        4758
V13        3362
V3         3306
V15        2884
V22        1298
V11         735
Class       473
Time          0
dtype: int64
Saved cleaned dataset: cleaned_creditcard.csv
