In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,Dense, Dropout
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import joblib
from imblearn.over_sampling import SMOTE,ADASYN
import numpy as np
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import learning_curve
from sklearn.metrics import make_scorer, f1_score

In [51]:
df  = pd.read_csv("./fraud_detection.csv")
df.head(10)

Unnamed: 0,card_no,user_location_country,user_location_city,amount,transaction_hour,transaction_day_of_week,email_domain,avg_transaction_amount,city_consistency,country_consistency,upi_id,average_transaction_frequency,transaction_type,transaction_day,transaction_month,transaction_year,potential_fraud,receiver_location_city,receiver_city_consistency,sender_city_consistency
0,8647040000000000.0,3,0,2741.48298,20,2,4,759.609914,0.501313,1,29101,0.491414,3,11,1,2024,0.392756,1,0.763547,0.501313
1,9945390000000000.0,0,0,561.427521,20,1,4,547.612552,0.501313,1,2369,0.952263,1,25,5,2024,0.002815,2,0.842365,0.501313
2,7711180000000000.0,5,4,623.903488,15,6,1,629.771852,0.698687,1,24433,1.731335,1,1,3,2024,0.0014,5,0.0,0.698687
3,1554070000000000.0,0,0,3245.993981,4,1,1,344.785819,0.501313,1,4299,0.158086,1,16,6,2024,0.575304,2,0.842365,0.501313
4,4450350000000000.0,1,4,1860.500562,17,0,0,718.786598,0.698687,1,5479,1.175585,1,28,6,2024,0.22645,2,0.842365,0.698687
5,2565400000000000.0,1,5,4696.870648,20,2,0,466.147972,0.498687,1,12532,1.648844,0,16,9,2024,0.838331,4,1.0,0.498687
6,1712070000000000.0,4,4,1203.154741,9,4,4,100.41858,0.698687,1,1408,1.528569,1,15,5,2024,0.218529,5,0.0,0.698687
7,3085300000000000.0,2,0,2803.599135,17,2,1,410.091641,0.501313,1,29644,0.499567,0,26,4,2024,0.474315,5,0.0,0.501313
8,1326940000000000.0,0,4,203.290449,16,2,3,173.907865,0.698687,1,22675,1.284772,1,11,2,2024,0.005861,5,0.0,0.698687
9,3175120000000000.0,2,0,1948.414681,4,1,0,681.087849,0.501313,1,8989,0.770734,3,8,8,2024,0.251378,2,0.842365,0.501313


In [52]:
X = df.drop(columns=['card_no', 'potential_fraud','user_location_country','user_location_city','upi_id','email_domain','city_consistency','receiver_location_city'])
y = df['potential_fraud']
print(df.shape)
X.shape
df.head(4)

(30000, 20)


Unnamed: 0,card_no,user_location_country,user_location_city,amount,transaction_hour,transaction_day_of_week,email_domain,avg_transaction_amount,city_consistency,country_consistency,upi_id,average_transaction_frequency,transaction_type,transaction_day,transaction_month,transaction_year,potential_fraud,receiver_location_city,receiver_city_consistency,sender_city_consistency
0,8647040000000000.0,3,0,2741.48298,20,2,4,759.609914,0.501313,1,29101,0.491414,3,11,1,2024,0.392756,1,0.763547,0.501313
1,9945390000000000.0,0,0,561.427521,20,1,4,547.612552,0.501313,1,2369,0.952263,1,25,5,2024,0.002815,2,0.842365,0.501313
2,7711180000000000.0,5,4,623.903488,15,6,1,629.771852,0.698687,1,24433,1.731335,1,1,3,2024,0.0014,5,0.0,0.698687
3,1554070000000000.0,0,0,3245.993981,4,1,1,344.785819,0.501313,1,4299,0.158086,1,16,6,2024,0.575304,2,0.842365,0.501313


In [53]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [54]:
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [55]:
y = (y > 0.55).astype(int)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [57]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [58]:
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

In [59]:
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_adasyn, y_train_adasyn)

In [60]:
y_pred_prob = xgb.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
best_threshold = thresholds[ix]

In [61]:
y_pred_new = (y_pred_prob > best_threshold).astype(int)

In [62]:
print(f"Best Threshold: {best_threshold}, F-Score: {fscore[ix]}")
print("Classification Report:")
print(classification_report(y_test, y_pred_new))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_new))

Best Threshold: 0.5701147317886353, F-Score: 0.9978845572680567
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4346
           1       1.00      1.00      1.00      1654

    accuracy                           1.00      6000
   macro avg       1.00      1.00      1.00      6000
weighted avg       1.00      1.00      1.00      6000

Confusion Matrix:
[[4342    4]
 [   4 1650]]


In [63]:
import joblib
joblib.dump(xgb, 'xgb_model.pkl')

['xgb_model.pkl']

In [64]:
df.columns

Index(['card_no', 'user_location_country', 'user_location_city', 'amount',
       'transaction_hour', 'transaction_day_of_week', 'email_domain',
       'avg_transaction_amount', 'city_consistency', 'country_consistency',
       'upi_id', 'average_transaction_frequency', 'transaction_type',
       'transaction_day', 'transaction_month', 'transaction_year',
       'potential_fraud', 'receiver_location_city',
       'receiver_city_consistency', 'sender_city_consistency'],
      dtype='object')

In [65]:
# models = {
#     "RandomForest": RandomForestClassifier(random_state=42),
#     "GradientBoosting": GradientBoostingClassifier(random_state=42),
#     "XGBoost": XGBClassifier(random_state=42)
# }

# def evaluate_model(model, X_train, y_train, X_test, y_test):
#     model.fit(X_train, y_train)
#     y_pred_prob = model.predict_proba(X_test)[:, 1]
#     precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
#     fscore = (2 * precision * recall) / (precision + recall)
#     ix = np.argmax(fscore)
#     best_threshold = thresholds[ix]
    
#     y_pred_new = (y_pred_prob > best_threshold).astype(int)
#     print(f"Model: {model.__class__.__name__}")
#     print(f"Best Threshold: {best_threshold}, F-Score: {fscore[ix]}")
#     print("Classification Report:")
#     print(classification_report(y_test, y_pred_new))
#     print("Confusion Matrix:")
#     print(confusion_matrix(y_test, y_pred_new))
#     print("-" * 60)

In [66]:
# print("Evaluation with SMOTE:")
# for name, model in models.items():
#     evaluate_model(model, X_train_res, y_train_res, X_test, y_test)

# print("Evaluation with ADASYN:")
# for name, model in models.items():
#     evaluate_model(model, X_train_adasyn, y_train_adasyn, X_test, y_test)

In [67]:
# def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, scoring=None):
#     plt.figure()
#     plt.title(title)
#     plt.xlabel("Training examples")
#     plt.ylabel("Score")
#     train_sizes, train_scores, test_scores = learning_curve(
#         estimator, X, y, cv=cv, n_jobs=n_jobs, scoring=scoring, train_sizes=np.linspace(0.1, 1.0, 5)
#     )
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)
#     plt.grid()

#     plt.fill_between(
#         train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r"
#     )
#     plt.fill_between(
#         train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g"
#     )
#     plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
#     plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

#     plt.legend(loc="best")
#     return plt

# plot_learning_curve(RandomForestClassifier(random_state=42), "Learning Curve (Random Forest with SMOTE)", X_train_res, y_train_res, cv=5, n_jobs=-1, scoring=make_scorer(f1_score))
# plt.show()

# plot_learning_curve(XGBClassifier(random_state=42), "Learning Curve (XGBoost with SMOTE)", X_train_res, y_train_res, cv=5, n_jobs=-1, scoring=make_scorer(f1_score))
# plt.show()

# plot_learning_curve(RandomForestClassifier(random_state=42), "Learning Curve (Random Forest with ADASYN)", X_train_adasyn, y_train_adasyn, cv=5, n_jobs=-1, scoring=make_scorer(f1_score))
# plt.show()

# plot_learning_curve(XGBClassifier(random_state=42), "Learning Curve (XGBoost with ADASYN)", X_train_adasyn, y_train_adasyn, cv=5, n_jobs=-1, scoring=make_scorer(f1_score))
# plt.show()

In [68]:
# print("Classification Report with Threshold 0.5")
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [69]:
# precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
# fscore = (2 * precision * recall) / (precision + recall)
# ix = np.argmax(fscore)
# best_threshold = thresholds[ix]
# print(f"Best Threshold: {best_threshold}, F-Score: {fscore[ix]}")

In [70]:
# y_pred_new = (y_pred_prob > best_threshold).astype(int)

In [71]:
# print("Classification Report with Best Threshold")
# print(classification_report(y_test, y_pred_new))
# print(confusion_matrix(y_test, y_pred_new))

In [72]:
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()

In [73]:
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()