In [1]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#for preprocessing
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, roc_auc_score

#models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Read the raw fraud CSV into `df` and preview its first rows
df = pd.read_csv(filepath_or_buffer=r'../data/raw/synthetic_fraud_dataset.csv')
df.head(5)

In [None]:
# At-a-glance summary: frame dimensions, column names, and a row preview
for heading, value in (("Shape:", df.shape), ("\nColumns:", list(df.columns))):
    print(heading, value)
print("\nFirst 5 rows:")
display(df.head(5))

In [None]:
# Data types and non-null counts
print("\nData types and missing values:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

In [None]:
# Per-column count of null entries
print("\nMissing values per column:")
null_counts = df.isnull().sum()
print(null_counts)

In [None]:
# Descriptive statistics: default (numeric) columns first, then the
# object-typed columns.
print("\nNumeric summary:")
display(df.describe())

print("\nCategorical summary:")
display(df.describe(include=['object']))

In [None]:
# Tally the rows for each value of the target column
print("\nFraud label distribution:")
print(df.loc[:, 'Fraud_Label'].value_counts())


In [None]:

# Bar chart of the target class counts; labels are set on the Axes that
# pandas returns instead of through the pyplot state machine.
(
    df['Fraud_Label']
    .value_counts()
    .plot.bar(title='Fraud vs Non-Fraud')
    .set(xlabel='Fraud_Label', ylabel='Count')
)
plt.show()

In [None]:
# Heatmap of pairwise correlations between the numeric columns
plt.figure(figsize=(10, 8))
sns.heatmap(
    df.corr(numeric_only=True),
    cmap='coolwarm',
    annot=False,
).set_title("Numeric Feature Correlation")
plt.show()

In [None]:
# Remove exact duplicate rows, discard rows whose target is missing, and
# convert the Timestamp column to datetime, all in one chained expression.
df = (
    df.drop_duplicates()
      .dropna(subset=['Fraud_Label'])  # target must not be missing
      .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
)


In [None]:
# Fill gaps: Account_Balance gets the column median, Device_Type gets the
# placeholder category 'Unknown'.
# NOTE(review): the median is taken over every row currently in `df`, i.e.
# before any train/test split is made - confirm that is acceptable.
df = df.assign(
    Account_Balance=lambda frame: frame['Account_Balance'].fillna(frame['Account_Balance'].median()),
    Device_Type=lambda frame: frame['Device_Type'].fillna('Unknown'),
)

In [None]:
# Derive calendar features from the Timestamp column:
# hour of day and day of week (pandas numbers Monday as 0).
df = df.assign(
    Hour=lambda frame: frame['Timestamp'].dt.hour,
    DayOfWeek=lambda frame: frame['Timestamp'].dt.dayofweek,
)

In [None]:
# Encode categoricals (one-hot; the first level of each column is dropped)
df = pd.get_dummies(df, columns=['Transaction_Type', 'Device_Type', 'Merchant_Category', 'Card_Type', 'Authentication_Method'], drop_first=True)

# Scale numeric columns.
# The scaler's mean/std are learned from the training rows only, so that
# test-set statistics do not leak into the features. The row positions are
# drawn with the same test_size / stratify / random_state as the
# train_test_split calls further down; the split depends only on the number
# of rows, the labels and the seed, so the same rows end up in training.
numeric_cols = ['Transaction_Amount', 'Account_Balance', 'Card_Age', 'Transaction_Distance', 'Risk_Score']
train_pos, holdout_pos = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    stratify=df['Fraud_Label'],
    random_state=42,
)
scaler = StandardScaler()
scaler.fit(df.iloc[train_pos][numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [None]:
# Build the feature matrix and target.
# Row identifiers and the raw datetime column are excluded from the features,
# matching the column set used by the pipeline cell below: they are not
# numeric inputs an estimator can consume, and per-row IDs carry no signal
# that generalises.
X = df.drop(columns=['Fraud_Label', 'Transaction_ID', 'User_ID', 'Timestamp'])
y = df['Fraud_Label']

# Stratified 80/20 split with a fixed seed so the class ratio is preserved
# and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [None]:
# Names of the feature columns that still have object dtype
categorical_cols = X.select_dtypes(include='object').columns
print("Categorical columns:", list(categorical_cols))


In [None]:
# Separate target; identifiers and the raw timestamp are not used as features
y = df['Fraud_Label']
X = df.drop(columns=['Fraud_Label', 'Transaction_ID', 'User_ID', 'Timestamp'])

# Identify categorical (object dtype) and remaining (non-object) columns
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessor: one-hot encode object columns, pass everything else through.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('numeric', 'passthrough', numeric_cols)
])

# Pipeline (preprocessing + model).
# random_state is fixed so the forest, and therefore every reported metric,
# is reproducible across kernel restarts.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split data (stratified so both splits keep the fraud / non-fraud ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train
model.fit(X_train, y_train)


In [None]:
# Rebuild the feature frame. The target column is dropped together with the
# identifiers and timestamp: leaving 'Fraud_Label' inside X would hand the
# label to any estimator later fitted on X.
X = df.drop(columns=['Fraud_Label', 'Transaction_ID', 'User_ID', 'Timestamp'])

In [None]:
# Re-check which columns of X are still object-typed
categorical_cols = X.select_dtypes(include='object').columns
print("Categorical columns:", list(categorical_cols))

In [None]:
# Candidate classifiers. The tree ensembles are seeded so the comparison is
# reproducible from run to run.
candidate_estimators = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
}

# Each candidate gets its own unfitted copy of `preprocessor`, so any
# remaining object-typed columns are one-hot encoded before the estimator
# sees them and no fitted state is shared between pipelines.
models = {
    name: Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', estimator),
    ])
    for name, estimator in candidate_estimators.items()
}

# The loop variable is `clf`, not `model`, so the pipeline fitted earlier
# under the name `model` is not overwritten.
for name, clf in models.items():
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, preds))
    # Column 1 of predict_proba is the probability of the second class label.
    print("AUC:", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    print("-"*50)

In [None]:
# `best_model` is not assigned in any earlier cell, so dumping it raised a
# NameError. Select it here from the fitted candidates in `models` by ROC AUC.
# NOTE(review): the selection uses the test split because no separate
# validation split exists in this notebook - confirm this is acceptable.
auc_by_model = {
    name: roc_auc_score(y_test, fitted.predict_proba(X_test)[:, 1])
    for name, fitted in models.items()
}
best_name = max(auc_by_model, key=auc_by_model.get)
best_model = models[best_name]
print("Best model:", best_name, "AUC:", auc_by_model[best_name])

# Persist the selected estimator to disk
joblib.dump(best_model, 'fraud_model.pkl')

In [None]:
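# Sketch of the intended `FraudPipeline` wrapper usage. Kept commented out:
# `fraud_pipeline` is not imported anywhere in this notebook.
# from fraud_pipeline import FraudPipeline

# pipeline = FraudPipeline(model='random_forest')
# pipeline.fit("paypal_data.csv")
# pipeline.predict("new_stripe_data.csv", output="fraud_scores.csv")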