In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import seaborn as sns
import os
import duckdb


In [3]:
# Relative path to the cleaned fraud dataset (resolved against the kernel's
# working directory).
cleaned_parquet = "../data/cleaned_data/cleaned_fraud.parquet"

# Report the on-disk size: bytes divided by 1024**3, shown with two decimals.
bytes_per_gb = 1024**3
size_gb = os.path.getsize(cleaned_parquet) / bytes_per_gb
print(f"ðŸ“Š Original size: {size_gb:.2f} GB")

# DuckDB connection reused by the following cells to query the parquet file.
con = duckdb.connect()

ðŸ“Š Original size: 0.14 GB


In [None]:
# Preview the first five rows of the cleaned parquet file.
# The query runs once and the frame is kept in `result`; leaving `result` as
# the cell's last expression lets the notebook render it with its rich
# DataFrame display instead of the line-wrapped text that print() produces.
result = con.execute(f"""
            SELECT * FROM '{cleaned_parquet}'
            LIMIT 5
            """).fetch_df()

result

  sender_account receiver_account  amount transaction_type merchant_category  \
0      ACC420214        ACC222629  318.12       withdrawal        restaurant   
1      ACC759858        ACC433871   25.03         transfer            online   
2      ACC702235        ACC658588    5.33         transfer            online   
3      ACC818001        ACC846452  261.11          payment     entertainment   
4      ACC293626        ACC440136   28.61         transfer            retail   

  location device_used  is_fraud  time_since_last_transaction  \
0    Tokyo         pos     False                 -4797.552868   
1    Dubai         pos     False                  3705.738348   
2  Toronto         pos     False                  2158.906433   
3    Tokyo         atm     False                   -71.393848   
4   London         pos     False                  1400.413482   

   spending_deviation_score  velocity_score  geo_anomaly_score  \
0                     -0.94              16               0.64

In [4]:
# Peek at the first five rows. The bare call is the cell's last expression,
# so the notebook renders the returned DataFrame.
preview_query = f"""
            SELECT * FROM '{cleaned_parquet}'
            LIMIT 5
            """
con.execute(preview_query).fetch_df()

Unnamed: 0,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash,year,month,day_of_month,hour,day_of_week
0,ACC420214,ACC222629,318.12,withdrawal,restaurant,Tokyo,pos,False,-4797.552868,-0.94,16,0.64,UPI,88.85.250.147,D3353785,2023,4,25,14,2
1,ACC759858,ACC433871,25.03,transfer,online,Dubai,pos,False,3705.738348,-0.56,1,0.48,ACH,89.235.76.67,D4950912,2023,8,17,1,4
2,ACC702235,ACC658588,5.33,transfer,online,Toronto,pos,False,2158.906433,0.77,7,0.18,ACH,132.247.155.53,D9285320,2023,12,28,23,4
3,ACC818001,ACC846452,261.11,payment,entertainment,Tokyo,atm,False,-71.393848,0.43,12,0.41,wire_transfer,186.251.230.65,D4842173,2023,8,18,9,5
4,ACC293626,ACC440136,28.61,transfer,retail,London,pos,False,1400.413482,-1.48,18,0.53,UPI,233.115.221.14,D7106200,2023,10,30,9,1


In [None]:
%pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4


* **Analysis:** Checks the class distribution of `is_fraud` and scans for suspicious patterns (such as negative values in `time_since_last_transaction`, which might indicate data issues or special flags).
* **Preprocessing:**
    * Drops high-cardinality ID columns (`sender_account`, `ip_address`, etc.) that don't help with generalization, along with the `year`, `month`, and `day_of_month` date parts.
    * Encodes categorical variables (`transaction_type`, `location`, etc.) using `OneHotEncoder`.
    * Scales numerical variables (`amount`, `velocity_score`, etc.) using `StandardScaler`.
* **Preparation:** Splits the data into training and testing sets *before* applying SMOTE (best practice to avoid data leakage), and then applies SMOTE to the training set only, to balance the classes.

In [5]:
# Pull every row and column of the cleaned parquet file into a pandas DataFrame.
full_query = f"SELECT * FROM '{cleaned_parquet}'"
df = con.execute(full_query).fetch_df()

# 1. EDA & pattern detection
# Relative frequency of each value of the target column.
print("Original Class Distribution:")
class_share = df['is_fraud'].value_counts(normalize=True)
print(class_share)

# Pattern detection: isolate the rows whose time delta is below zero.
is_negative_gap = df['time_since_last_transaction'] < 0
neg_time = df[is_negative_gap]
neg_count = len(neg_time)
print(f"\nRows with negative time_since_last_transaction: {neg_count}")
if neg_count > 0:
    # The values are only reported here; no rows are modified or removed.
    print("Negative values might indicate data errors or specific flags. Treating as valid numeric for now.")

Original Class Distribution:
is_fraud
False    0.956244
True     0.043756
Name: proportion, dtype: float64

Rows with negative time_since_last_transaction: 2051331
Negative values might indicate data errors or specific flags. Treating as valid numeric for now.


In [6]:


# 2. Data preparation ahead of SMOTE
#    - identifier and calendar columns listed in drop_cols leave the feature matrix
#    - categorical columns are one-hot encoded
#    - numerical columns are standardized

categorical_cols = [
    'transaction_type',
    'merchant_category',
    'location',
    'device_used',
    'payment_channel',
]
numerical_cols = [
    'amount',
    'time_since_last_transaction',
    'spending_deviation_score',
    'velocity_score',
    'geo_anomaly_score',
    'hour',
    'day_of_week',
]
drop_cols = [
    'sender_account',
    'receiver_account',
    'ip_address',
    'device_hash',
    'year',
    'month',
    'day_of_month',
]

# Target vector, and the feature matrix with the target and drop_cols removed.
# errors='ignore' means a listed column that is absent from df is skipped silently.
y = df['is_fraud']
non_feature_cols = ['is_fraud'] + drop_cols
X = df.drop(columns=non_feature_cols, errors='ignore')

# Stratified 80/20 hold-out split, performed BEFORE any resampling so that
# SMOTE only ever sees training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Column-wise preprocessing: scale the numerical columns, one-hot encode the
# categorical ones (categories unseen at fit time are ignored at transform time).
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training split only, then reuse the fitted transformer on the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)




In [None]:
# Reconstruct DataFrames for SMOTE
# Column order follows the ColumnTransformer definition: the numerical columns
# first, then the one-hot columns produced by the 'cat' encoder.
cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
feature_names = numerical_cols + list(cat_feature_names)

# Carry over the row labels of the split frames so the processed features stay
# index-aligned with y_train / y_test, which still carry the index of `df`.
X_train_df = pd.DataFrame(X_train_processed, columns=feature_names, index=X_train.index)
X_test_df = pd.DataFrame(X_test_processed, columns=feature_names, index=X_test.index)

print(f"\nProcessed Training Shape: {X_train_df.shape}")

# 3. Apply SMOTE
# Only the import is guarded. A missing package stops the cell with an
# actionable message instead of printing a hint and leaving
# X_train_resampled / y_train_resampled undefined for later cells.
try:
    from imblearn.over_sampling import SMOTE
except ImportError as exc:
    raise ImportError(
        "'imbalanced-learn' library is not installed. To install run: "
        "%pip install imbalanced-learn"
    ) from exc

print("\nApplying SMOTE...")
smote = SMOTE(random_state=42)  # fixed seed so the synthetic samples are reproducible
# Resample the TRAINING split only; the test split is left untouched.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_df, y_train)

print("Class distribution after SMOTE:")
print(y_train_resampled.value_counts(normalize=True))
print(f"New Training Shape: {X_train_resampled.shape}")


Processed Training Shape: (3282789, 35)

Applying SMOTE...




Class distribution after SMOTE:
is_fraud
False    0.5
True     0.5
Name: proportion, dtype: float64
New Training Shape: (6278294, 35)


In [None]:
# Next Steps