In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#for preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


#models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, roc_auc_score


In [None]:
# Read the raw synthetic fraud dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(r'../data/raw/synthetic_fraud_dataset.csv')

# Preview the first rows as the cell output.
df.head()

In [None]:
# Quick overview: dimensions, column names, and a rendered preview of the frame.
print("Shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nFirst 5 rows:")
display(df.head(5))

In [None]:
# Data types and non-null counts
print("\nData types and missing values:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

In [None]:
# Number of missing (NaN) entries in every column.
print("\nMissing values per column:")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Descriptive statistics, reported separately for numeric and object (string) columns.
print("\nNumeric summary:")
numeric_summary = df.describe()
display(numeric_summary)

print("\nCategorical summary:")
categorical_summary = df.describe(include='object')
display(categorical_summary)

In [None]:
# Absolute number of rows per value of the target column.
print("\nFraud label distribution:")
label_counts = df['Fraud_Label'].value_counts()
print(label_counts)


In [None]:

# Optional: bar chart of the target counts, drawn through the Axes returned by pandas.
ax = df['Fraud_Label'].value_counts().plot.bar(title='Fraud vs Non-Fraud')
ax.set_xlabel('Fraud_Label')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Class imbalance: absolute counts and relative frequencies of the target.
# A notebook cell only renders its *last* bare expression, so both tables are
# passed to display() explicitly; otherwise the absolute counts are computed
# and then silently discarded.
display(df['Fraud_Label'].value_counts())
display(df['Fraud_Label'].value_counts(normalize=True))

In [None]:
# Pairwise correlation of the numeric columns, rendered as a heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
numeric_corr = df.corr(numeric_only=True)
sns.heatmap(numeric_corr, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Numeric Feature Correlation")
plt.show()

In [None]:
# Column dtypes / non-null counts (printed by info()), followed by the number of
# distinct values per column (rendered as the cell output).
df.info()
unique_counts = df.nunique()
unique_counts

In [None]:
# Define features and target.
# Besides the label itself, the two ID columns and the timestamp are left out of the feature matrix.
non_feature_cols = ['Fraud_Label', 'Transaction_ID', 'User_ID', 'Timestamp']
y = df['Fraud_Label']
X = df.drop(columns=non_feature_cols)

In [None]:
# Hold out 20% of the rows for testing; stratify on the label so both parts keep
# the same class proportions, and fix the seed so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Separate out the numerical columns used by this model.
numeric_cols = ['Risk_Score', 'Failed_Transaction_Count_7d']

# Preprocessor that standardizes only these two numeric features. No `remainder`
# is given, so ColumnTransformer's default ('drop') discards every other column.
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_cols)],
)

In [None]:
# Build the pipeline: preprocessing -> SMOTE oversampling -> random forest.
# Pipeline comes from imblearn (not scikit-learn) so that the SMOTE sampler step is accepted.

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', forest),
])

In [None]:
# Fit the SMOTE + random-forest pipeline and evaluate it on the held-out split.
# NOTE: this rebuilds `rf_pipeline` with default forest settings, replacing the
# n_estimators=200 configuration assigned in the previous cell.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

rf_pipeline.fit(X_train, y_train)

# Hard class predictions feed the per-class precision/recall report.
y_pred = rf_pipeline.predict(X_test)
# ROC AUC is a ranking metric: it must be computed from continuous scores
# (probability of the second class, same `[:, 1]` convention as the other
# evaluation cells), not from thresholded 0/1 predictions, which collapse the
# ROC curve to a single operating point.
y_prob = rf_pipeline.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

In [None]:
# Below is the old code.

In [None]:
# df['Failed_to_Total_Ratio'] = df['Failed_Transaction_Count_7d'] / (df['Daily_Transaction_Count'] + 1)


In [None]:
# 1. Correlation of each numeric column with the target, showing the ten largest values.
target_corr = df.corr(numeric_only=True)['Fraud_Label']
target_corr.sort_values(ascending=False).head(10)


In [None]:
# Supervised baseline: random forest on scaled numeric + one-hot encoded
# categorical features, with the suspected leakage features removed.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline  # NOTE: rebinds `Pipeline` from imblearn's class to scikit-learn's
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# --- Drop leakage features ---
leak_features = ['Failed_Transaction_Count_7d', 'Risk_Score', 'Failed_to_Total_Ratio']
# Identifier columns and the raw timestamp are removed as well, matching the
# first feature definition in this notebook. Left in, they would be picked up
# as object columns below and one-hot encoded into per-row indicator columns.
id_features = ['Transaction_ID', 'User_ID', 'Timestamp']
# errors='ignore' because 'Failed_to_Total_Ratio' only exists if the
# (commented-out) engineered-feature cell has been run.
X = df.drop(columns=leak_features + id_features + ['Fraud_Label'], errors='ignore')
y = df['Fraud_Label']

# --- Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Detect column types ---
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# --- Build preprocessor ---
# Scale numerics, one-hot encode categoricals; categories unseen during fit are
# encoded as all-zeros at predict time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# --- Build pipeline ---
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

# --- Train and evaluate ---
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)[:, 1]  # score of the second class, used for ROC-AUC

print(classification_report(y_test, preds))
print("ROC-AUC:", roc_auc_score(y_test, probs))


In [None]:
# Basic cleaning: remove exact duplicate rows, then rows whose target is missing
# (the label must be present for supervised training).
df = df.drop_duplicates().dropna(subset=['Fraud_Label'])

# Parse the timestamp column into datetime values.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])


In [None]:
# Fill missing values: the column median for the balance, and an explicit
# 'Unknown' category for the device type.
balance_median = df['Account_Balance'].median()
df['Account_Balance'] = df['Account_Balance'].fillna(balance_median)
df['Device_Type'] = df['Device_Type'].fillna('Unknown')

In [None]:
# Derive calendar features from the parsed timestamp via the .dt accessor.
timestamp_parts = df['Timestamp'].dt
df['Hour'] = timestamp_parts.hour
df['DayOfWeek'] = timestamp_parts.dayofweek

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode categoricals as dummy columns; drop_first removes one level per feature.
# This replaces the listed columns in `df`, so the cell cannot be re-run on its own output.
dummy_source_cols = [
    'Transaction_Type',
    'Device_Type',
    'Merchant_Category',
    'Card_Type',
    'Authentication_Method',
]
df = pd.get_dummies(df, columns=dummy_source_cols, drop_first=True)

# Scale numeric columns.
# NOTE(review): the scaler is fit on the full frame, before the train/test split
# performed in the following cells, so test rows contribute to the mean/std.
numeric_cols = ['Transaction_Amount', 'Account_Balance', 'Card_Age', 'Transaction_Distance', 'Risk_Score']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [None]:
from sklearn.model_selection import train_test_split

# Everything except the label is treated as a feature here.
y = df['Fraud_Label']
X = df.drop('Fraud_Label', axis=1)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Columns of X that are still object-typed.
categorical_cols = X.select_dtypes(include=['object']).columns
print("Categorical columns:", list(categorical_cols))


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Separate target; the ID columns and the timestamp are excluded from the features.
y = df['Fraud_Label']
X = df.drop(['Fraud_Label', 'Transaction_ID', 'User_ID', 'Timestamp'], axis=1)

# Identify categorical (object) and remaining columns
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessor: one-hot encode the object columns, pass everything else through unchanged.
preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('numeric', 'passthrough', numeric_cols)
])

# Pipeline (preprocessing + model).
# The forest is seeded like every other model in this notebook; without
# random_state each run grows different trees and the reported metrics change
# from run to run.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train
model.fit(X_train, y_train)


In [None]:
# Feature matrix: drop the target together with the identifier/timestamp columns.
# Leaving 'Fraud_Label' inside X would hand the label to any model trained on it.
X = df.drop(['Fraud_Label', 'Transaction_ID', 'User_ID', 'Timestamp'], axis=1)

In [None]:
# Object-typed columns remaining in the current X.
object_columns = X.select_dtypes(include=['object'])
categorical_cols = object_columns.columns
print("Categorical columns:", list(categorical_cols))

In [None]:
# Simplified RandomForest (no scaling)

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Separate target; the ID columns and the timestamp are excluded from the features.
y = df['Fraud_Label']
X = df.drop(['Fraud_Label', 'Transaction_ID', 'User_ID', 'Timestamp'], axis=1)

# Identify categorical (object) and remaining columns
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessor: one-hot encode the object columns, pass everything else through unchanged.
preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('numeric', 'passthrough', numeric_cols),
])

# Pipeline (preprocessing + model).
# random_state pins the forest's bootstrap/feature sampling so the metrics
# printed by the evaluation cell are reproducible, consistent with the other
# seeded models in this notebook.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Hard class predictions on the test set, used for the precision/recall report.
y_pred = model.predict(X_test)

# Per-class probabilities; the second column is the score fed to ROC-AUC.
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

# Report performance metrics.
report = classification_report(y_test, y_pred)
print(report)
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


In [None]:
# Multi-Model Benchmark (LR, RF, GB)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score

# --- Split data ---
# Besides the label, the identifier columns and the raw timestamp are excluded,
# as in the other modelling cells. Left in, the object-typed IDs would be
# one-hot encoded into per-row indicator columns.
non_feature_cols = ['Fraud_Label', 'Transaction_ID', 'User_ID', 'Timestamp']
X = df.drop(columns=non_feature_cols)
y = df['Fraud_Label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Detect column types ---
# Select *all* numeric dtypes plus bool rather than only int64/float64: the
# dummy columns produced by pd.get_dummies are bool or uint8 depending on the
# pandas version, and derived integer columns need not be int64. Any column not
# matched here is silently discarded by ColumnTransformer's default
# remainder='drop'.
num_cols = X.select_dtypes(include=['number', 'bool']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

# --- Shared preprocessor ---
# Scale numeric/boolean columns, one-hot encode categoricals; categories unseen
# during fit are encoded as all-zeros at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

# --- Define models inside pipelines ---
pipelines = {
    "LogisticRegression": Pipeline([
        ('preprocess', preprocessor),
        ('model', LogisticRegression(max_iter=1000))
    ]),
    "RandomForest": Pipeline([
        ('preprocess', preprocessor),
        ('model', RandomForestClassifier(random_state=42))
    ]),
    "GradientBoosting": Pipeline([
        ('preprocess', preprocessor),
        ('model', GradientBoostingClassifier(random_state=42))
    ])
}

# --- Train and evaluate each ---
for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    probs = pipe.predict_proba(X_test)[:, 1]  # score of the second class, used for ROC-AUC
    print(f"\n{name}")
    print(classification_report(y_test, preds))
    print("ROC-AUC:", roc_auc_score(y_test, probs))
    print("-"*60)


In [None]:
# Inspect the trained random forest from the benchmark.
# NOTE: this rebinds `rf_pipeline`, which earlier referred to the SMOTE pipeline.
rf_pipeline = pipelines["RandomForest"]
fitted_preprocessor = rf_pipeline.named_steps['preprocess']
fitted_forest = rf_pipeline.named_steps['model']

# Rebuild the post-preprocessing feature names in ColumnTransformer output
# order: numeric columns first, then the one-hot encoded categorical columns.
ohe = fitted_preprocessor.named_transformers_['cat']
encoded_cat_cols = ohe.get_feature_names_out(cat_cols).tolist()
all_features = [*num_cols, *encoded_cat_cols]

# Pair each importance with its feature name, most important first.
importances = fitted_forest.feature_importances_
feat_imp = pd.Series(importances, index=all_features).sort_values(ascending=False)

print(feat_imp.head(20))

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the 15 largest importances, titled through the returned Axes.
top_features = feat_imp.head(15)
ax = top_features.plot.barh(figsize=(8, 6))
ax.set_title("Top 15 Feature Importances")
plt.show()

In [None]:
# Optimize model hyperparameters
#
# Randomized search over the random-forest settings of `rf_pipeline`; the
# `model__` prefix routes each parameter to the pipeline step named 'model'.

from sklearn.model_selection import RandomizedSearchCV

param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[5, 10, 15, None],
    model__min_samples_split=[2, 5, 10],
)

# 10 sampled candidates, 3-fold CV each, scored by ROC-AUC, using all available cores.
search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)

print("Best ROC-AUC:", search.best_score_)
print("Best Params:", search.best_params_)

In [None]:
# from fraud_pipeline import FraudPipeline

# pipeline = FraudPipeline(model='random_forest')
# pipeline.fit("paypal_data.csv")
# # pipeline.predict("new_stripe_data.csv", output="fraud_scores.csv")