In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc


In [44]:

# Load your dataset
df = pd.read_csv('../data/processed/df_fraud_processed.csv')

# Quick look
df.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,time_to_purchase,purchase_hour,purchase_dayofweek,is_weekend,ip_int
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,75111.366667,2,5,True,
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,299.066667,1,0,False,
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,0.016667,18,3,False,
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,8201.416667,13,0,False,
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,72691.016667,18,2,False,


In [45]:
# Replace 'class' with your actual target column name
target_col = 'class'

# Drop irrelevant columns (adjust as needed)
drop_cols = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address']  # example

X = df.drop(columns=drop_cols + [target_col])
y = df[target_col]



In [46]:
drop_cols = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address']
df_cleaned = df.drop(columns=drop_cols)

# 2. One-hot encode
cat_cols = ['source', 'browser', 'sex']
df_encoded = pd.get_dummies(df_cleaned, columns=cat_cols, drop_first=True)

# 3. Split features and target
target_col = 'class'  # or whatever your target is
X = df_encoded.drop(columns=[target_col])
y = df_encoded[target_col]

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [51]:
# print nan values in X_train
print("NaN values in X_train:", X_train.isnull().sum().sum())

print(X_train.shape)
print(y_train.shape)


NaN values in X_train: 0
(120889, 14)
(120889,)


In [48]:
X_train = X_train.fillna(0)  # or use mean/median if that makes more sense


 Fix  class imbalance properly appling SMOTE/undersampling

In [49]:
from imblearn.over_sampling import SMOTE


# 1. Initialize SMOTE
smote = SMOTE(random_state=42)

# 2. Resample the training data only
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f'Before SMOTE, counts of label 1: {sum(y_train==1)}')
print(f'After SMOTE, counts of label 1: {sum(y_train_resampled==1)}')

# 3. Use X_train_resampled, y_train_resampled to train your models instead of X_train, y_train


Before SMOTE, counts of label 1: 11321
After SMOTE, counts of label 1: 109568


In [52]:
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr.fit(X_train_resampled, y_train_resampled)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [64]:
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=10,
    random_state=42
)
rf.fit(X_train, y_train)


In [63]:
def evaluate_model(model, X, y):
    preds_proba = model.predict_proba(X)[:, 1]
    preds = model.predict(X)

    print("Confusion Matrix:")
    print(confusion_matrix(y, preds))
    print("\nClassification Report:")
    print(classification_report(y, preds))

    precision, recall, _ = precision_recall_curve(y, preds_proba)
    auc_pr = auc(recall, precision)
    print(f"AUC-PR: {auc_pr:.4f}")
print("Logistic Regression Performance:")
evaluate_model(lr, X_train_resampled, y_train_resampled)

print("\nRandom Forest Performance:")
evaluate_model(rf, X_train_resampled, y_train_resampled)


Logistic Regression Performance:
Confusion Matrix:
[[78145 31423]
 [27535 82033]]

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.71      0.73    109568
           1       0.72      0.75      0.74    109568

    accuracy                           0.73    219136
   macro avg       0.73      0.73      0.73    219136
weighted avg       0.73      0.73      0.73    219136

AUC-PR: 0.8055

Random Forest Performance:
Confusion Matrix:
[[109020    548]
 [ 25862  83706]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.99      0.89    109568
           1       0.99      0.76      0.86    109568

    accuracy                           0.88    219136
   macro avg       0.90      0.88      0.88    219136
weighted avg       0.90      0.88      0.88    219136

AUC-PR: 0.9500


In [65]:
# save the models
import joblib
joblib.dump(lr, 'logistic_regression_model.pkl')
joblib.dump(rf, 'random_forest_model.pkl')

['random_forest_model.pkl']