In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from geopy.distance import geodesic
from sklearn.cluster import KMeans

train_df = pd.read_csv("./data/processed_train.csv")
test_df = pd.read_csv("./data/processed_test.csv")

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Assuming 'train_df' includes both features and the target ('is_fraud')
X = train_df.drop(['is_fraud', 'Id', 'city_pop_cluster'], axis=1)
y = train_df['is_fraud']

num_cols = ['amt', 'Time_Delta', 'distance_to_prev', 'location_consistency', 'dist_to_home', 'amt_anomaly_score_cat', 'amt_anomaly_score_merch', 'amt_relative_avg', 'fraud_rate_cat', 'fraud_rate_merch', 'fraud_similarity', 'normal_similarity']

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [3]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
# Initialize the XGBClassifier
xgb_clf = XGBClassifier(colsample_bytree=0.9, learning_rate=0.2, max_depth=3, n_estimators=900, use_label_encoder=False, eval_metric='logloss', random_state=42)

xgb_clf.fit(X_train, y_train)

y_pred_xgb = xgb_clf.predict(X_val)
print(f1_score(y_val, y_pred_xgb))
print(classification_report(y_val, y_pred_xgb))

0.8789625360230547
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.96      0.81      0.88       375

    accuracy                           1.00     97251
   macro avg       0.98      0.91      0.94     97251
weighted avg       1.00      1.00      1.00     97251



In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBClassifier
import numpy as np
from sklearn.ensemble import BaggingClassifier
# Create a VotingClassifier
from sklearn.ensemble import VotingClassifier

rf1 = RandomForestClassifier(class_weight='balanced', n_estimators=300, max_depth=None, min_samples_split=10, min_samples_leaf=4, criterion='entropy', n_jobs=-1, random_state=42)

rf1.fit(X_train, y_train)

y_pred_rf1 = rf1.predict(X_val)
print(f1_score(y_val, y_pred_rf1))
print(classification_report(y_val, y_pred_rf1))

0.8132183908045977
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.88      0.75      0.81       375

    accuracy                           1.00     97251
   macro avg       0.94      0.88      0.91     97251
weighted avg       1.00      1.00      1.00     97251



In [11]:
rf2 = RandomForestClassifier(class_weight='balanced', n_estimators=900, max_depth=None, min_samples_split=10, min_samples_leaf=1, criterion='entropy', n_jobs=-1, verbose=3)

rf2.fit(X_train, y_train)

y_pred_rf2 = rf2.predict(X_val)
print(f1_score(y_val, y_pred_rf2))
print(classification_report(y_val, y_pred_rf2))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


building tree 1 of 900building tree 2 of 900

building tree 3 of 900
building tree 4 of 900
building tree 5 of 900
building tree 6 of 900
building tree 7 of 900
building tree 8 of 900
building tree 9 of 900
building tree 10 of 900
building tree 11 of 900
building tree 12 of 900
building tree 13 of 900
building tree 14 of 900
building tree 15 of 900
building tree 16 of 900
building tree 17 of 900building tree 18 of 900

building tree 19 of 900
building tree 20 of 900
building tree 21 of 900
building tree 22 of 900
building tree 23 of 900
building tree 24 of 900
building tree 25 of 900
building tree 26 of 900
building tree 27 of 900
building tree 28 of 900
building tree 29 of 900
building tree 30 of 900
building tree 31 of 900
building tree 32 of 900
building tree 33 of 900
building tree 34 of 900
building tree 35 of 900
building tree 36 of 900
building tree 37 of 900
building tree 38 of 900
building tree 39 of 900
building tree 40 of 900
building tree 41 of 900
building tree 42 of 900
b

[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    3.7s


building tree 113 of 900
building tree 114 of 900
building tree 115 of 900
building tree 116 of 900
building tree 117 of 900
building tree 118 of 900
building tree 119 of 900
building tree 120 of 900
building tree 121 of 900
building tree 122 of 900
building tree 123 of 900
building tree 124 of 900
building tree 125 of 900
building tree 126 of 900
building tree 127 of 900
building tree 128 of 900
building tree 129 of 900
building tree 130 of 900
building tree 131 of 900
building tree 132 of 900
building tree 133 of 900
building tree 134 of 900
building tree 135 of 900
building tree 136 of 900
building tree 137 of 900
building tree 138 of 900
building tree 139 of 900
building tree 140 of 900
building tree 141 of 900
building tree 142 of 900
building tree 143 of 900
building tree 144 of 900
building tree 145 of 900
building tree 146 of 900
building tree 147 of 900
building tree 148 of 900
building tree 149 of 900
building tree 150 of 900
building tree 151 of 900
building tree 152 of 900


[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:    9.6s


building tree 277 of 900
building tree 278 of 900
building tree 279 of 900
building tree 280 of 900
building tree 281 of 900
building tree 282 of 900
building tree 283 of 900
building tree 284 of 900
building tree 285 of 900
building tree 286 of 900
building tree 287 of 900
building tree 288 of 900
building tree 289 of 900
building tree 290 of 900
building tree 291 of 900
building tree 292 of 900
building tree 293 of 900
building tree 294 of 900
building tree 295 of 900
building tree 296 of 900
building tree 297 of 900
building tree 298 of 900
building tree 299 of 900
building tree 300 of 900
building tree 301 of 900
building tree 302 of 900
building tree 303 of 900
building tree 304 of 900
building tree 305 of 900
building tree 306 of 900
building tree 307 of 900
building tree 308 of 900
building tree 309 of 900
building tree 310 of 900
building tree 311 of 900
building tree 312 of 900
building tree 313 of 900
building tree 314 of 900
building tree 315 of 900
building tree 316 of 900


[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:   17.9s


building tree 498 of 900
building tree 499 of 900
building tree 500 of 900
building tree 501 of 900
building tree 502 of 900
building tree 503 of 900
building tree 504 of 900
building tree 505 of 900
building tree 506 of 900
building tree 507 of 900
building tree 508 of 900
building tree 509 of 900
building tree 510 of 900
building tree 511 of 900
building tree 512 of 900
building tree 513 of 900
building tree 514 of 900
building tree 515 of 900
building tree 516 of 900
building tree 517 of 900
building tree 518 of 900
building tree 519 of 900
building tree 520 of 900
building tree 521 of 900
building tree 522 of 900
building tree 523 of 900
building tree 524 of 900
building tree 525 of 900
building tree 526 of 900
building tree 527 of 900
building tree 528 of 900
building tree 529 of 900
building tree 530 of 900
building tree 531 of 900
building tree 532 of 900
building tree 533 of 900
building tree 534 of 900
building tree 535 of 900
building tree 536 of 900
building tree 537 of 900


[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   29.1s


building tree 790 of 900
building tree 791 of 900
building tree 792 of 900
building tree 793 of 900
building tree 794 of 900
building tree 795 of 900
building tree 796 of 900
building tree 797 of 900
building tree 798 of 900
building tree 799 of 900
building tree 800 of 900
building tree 801 of 900
building tree 802 of 900
building tree 803 of 900
building tree 804 of 900
building tree 805 of 900
building tree 806 of 900
building tree 807 of 900
building tree 808 of 900
building tree 809 of 900
building tree 810 of 900
building tree 811 of 900
building tree 812 of 900
building tree 813 of 900
building tree 814 of 900
building tree 815 of 900
building tree 816 of 900
building tree 817 of 900
building tree 818 of 900
building tree 819 of 900
building tree 820 of 900
building tree 821 of 900
building tree 822 of 900
building tree 823 of 900
building tree 824 of 900
building tree 825 of 900
building tree 826 of 900
building tree 827 of 900
building tree 828 of 900
building tree 829 of 900


[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   34.4s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  96 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 256 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 480 tasks      | elapsed:    0.2s


0.8091603053435115
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     96876
         1.0       0.95      0.71      0.81       375

    accuracy                           1.00     97251
   macro avg       0.97      0.85      0.90     97251
weighted avg       1.00      1.00      1.00     97251



[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 900 out of 900 | elapsed:    0.3s finished


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBClassifier
import numpy as np
from sklearn.ensemble import BaggingClassifier
# Create a VotingClassifier
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[('rf1', rf1), ('rf2', rf2), ('xgb', xgb_clf)],
    voting='soft',
    weights=[1, 1, 2],
    n_jobs=-1
)

# Train the ensemble model
vot_clf = voting_clf.fit(X_train, y_train)

# # Evaluate the model
# scores = cross_val_score(voting_clf, X, y, cv=5, scoring='f1_micro', n_jobs=-1, verbose=3)
# print("F1 Score: ", scores.mean())
y_pred_vote = vot_clf.predict(X_val)
f1_score_vote = f1_score(y_val, y_pred_vote)
print(f1_score_vote)

# with open('best_vote_model.obj', 'wb') as f:
#     pickle.dump(vot_clf, f)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


building tree 1 of 900building tree 2 of 900
building tree 3 of 900

building tree 4 of 900building tree 5 of 900

building tree 6 of 900
building tree 7 of 900
building tree 8 of 900
building tree 9 of 900
building tree 10 of 900
building tree 11 of 900
building tree 12 of 900
building tree 13 of 900building tree 14 of 900

building tree 15 of 900
building tree 16 of 900
building tree 17 of 900
building tree 18 of 900building tree 19 of 900

building tree 20 of 900
building tree 21 of 900
building tree 22 of 900
building tree 23 of 900
building tree 24 of 900
building tree 25 of 900building tree 26 of 900

building tree 27 of 900
building tree 28 of 900
building tree 29 of 900building tree 30 of 900

building tree 31 of 900
building tree 32 of 900
building tree 33 of 900building tree 34 of 900

building tree 35 of 900
building tree 36 of 900
building tree 37 of 900
building tree 38 of 900
building tree 39 of 900
building tree 40 of 900
building tree 41 of 900
building tree 42 of 900
b

[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    8.0s


building tree 116 of 900
building tree 117 of 900
building tree 118 of 900
building tree 119 of 900
building tree 120 of 900
building tree 121 of 900building tree 122 of 900

building tree 123 of 900
building tree 124 of 900
building tree 125 of 900
building tree 126 of 900
building tree 127 of 900
building tree 128 of 900
building tree 129 of 900
building tree 130 of 900
building tree 131 of 900
building tree 132 of 900
building tree 133 of 900
building tree 134 of 900
building tree 135 of 900
building tree 136 of 900
building tree 137 of 900building tree 138 of 900
building tree 139 of 900

building tree 140 of 900
building tree 141 of 900
building tree 142 of 900
building tree 143 of 900
building tree 144 of 900
building tree 145 of 900
building tree 146 of 900
building tree 147 of 900
building tree 148 of 900
building tree 149 of 900
building tree 150 of 900
building tree 151 of 900
building tree 152 of 900
building tree 153 of 900
building tree 154 of 900
building tree 155 of 900


[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:   20.1s


building tree 276 of 900
building tree 277 of 900
building tree 278 of 900
building tree 279 of 900
building tree 280 of 900
building tree 281 of 900
building tree 282 of 900
building tree 283 of 900
building tree 284 of 900
building tree 285 of 900
building tree 286 of 900
building tree 287 of 900
building tree 288 of 900
building tree 289 of 900
building tree 290 of 900
building tree 291 of 900
building tree 292 of 900
building tree 293 of 900
building tree 294 of 900
building tree 295 of 900
building tree 296 of 900
building tree 297 of 900
building tree 298 of 900
building tree 299 of 900
building tree 300 of 900
building tree 301 of 900
building tree 302 of 900
building tree 303 of 900
building tree 304 of 900
building tree 305 of 900
building tree 306 of 900
building tree 307 of 900
building tree 308 of 900
building tree 309 of 900
building tree 310 of 900
building tree 311 of 900
building tree 312 of 900
building tree 313 of 900
building tree 314 of 900
building tree 315 of 900


[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:   29.6s


building tree 499 of 900
building tree 500 of 900
building tree 501 of 900
building tree 502 of 900
building tree 503 of 900
building tree 504 of 900
building tree 505 of 900
building tree 506 of 900
building tree 507 of 900
building tree 508 of 900
building tree 509 of 900
building tree 510 of 900
building tree 511 of 900
building tree 512 of 900
building tree 513 of 900
building tree 514 of 900
building tree 515 of 900
building tree 516 of 900
building tree 517 of 900
building tree 518 of 900
building tree 519 of 900
building tree 520 of 900
building tree 521 of 900
building tree 522 of 900
building tree 523 of 900building tree 524 of 900

building tree 525 of 900
building tree 526 of 900
building tree 527 of 900
building tree 528 of 900
building tree 529 of 900
building tree 530 of 900
building tree 531 of 900
building tree 532 of 900
building tree 533 of 900
building tree 534 of 900
building tree 535 of 900
building tree 536 of 900
building tree 537 of 900
building tree 538 of 900


[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   41.4s


building tree 788 of 900
building tree 789 of 900
building tree 790 of 900
building tree 791 of 900
building tree 792 of 900
building tree 793 of 900
building tree 794 of 900
building tree 795 of 900
building tree 796 of 900
building tree 797 of 900
building tree 798 of 900
building tree 799 of 900
building tree 800 of 900
building tree 801 of 900
building tree 802 of 900
building tree 803 of 900
building tree 804 of 900
building tree 805 of 900
building tree 806 of 900
building tree 807 of 900
building tree 808 of 900
building tree 809 of 900
building tree 810 of 900
building tree 811 of 900
building tree 812 of 900
building tree 813 of 900
building tree 814 of 900
building tree 815 of 900
building tree 816 of 900
building tree 817 of 900
building tree 818 of 900
building tree 819 of 900
building tree 820 of 900
building tree 821 of 900
building tree 822 of 900
building tree 823 of 900
building tree 824 of 900
building tree 825 of 900
building tree 826 of 900
building tree 827 of 900


[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   46.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  96 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 256 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 480 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 900 out of 900 | elapsed:    0.3s finished


0.8340943683409436


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBClassifier
import numpy as np
from sklearn.ensemble import BaggingClassifier
# Create a VotingClassifier
from sklearn.ensemble import VotingClassifier

# Initialize the Bagging Classifier
bagging_clf = BaggingClassifier(estimator=rf1, n_estimators=50, random_state=42, n_jobs=-1, verbose=3)

bagging_clf.fit(X_train, y_train)
y_pred_bag = bagging_clf.predict(X_val)
print(f1_score(y_val, y_pred_bag))


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.


Building estimator 1 of 4 for this parallel run (total 50)...
Building estimator 1 of 4 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building estimator 1 of 3 for this parallel run (total 50)...
Building

[Parallel(n_jobs=16)]: Done   3 out of  16 | elapsed:  6.4min remaining: 27.8min


Building estimator 4 of 4 for this parallel run (total 50)...
Building estimator 4 of 4 for this parallel run (total 50)...


[Parallel(n_jobs=16)]: Done   9 out of  16 | elapsed:  6.5min remaining:  5.0min
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:  6.7min finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   3 out of  16 | elapsed:    1.4s remaining:    6.2s
[Parallel(n_jobs=16)]: Done   9 out of  16 | elapsed:    3.1s remaining:    2.4s


0.7855072463768116


[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    4.9s finished
