In [180]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,Dense, Dropout
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import joblib
from imblearn.over_sampling import SMOTE,ADASYN
import numpy as np
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import learning_curve
from sklearn.metrics import make_scorer, f1_score

In [181]:
df  = pd.read_csv("./probablity.csv")
df.head(4)

Unnamed: 0,card_no,user_location_country,user_location_city,amount,transaction_hour,transaction_day_of_week,email_domain,avg_transaction_amount,city_consistency,country_consistency,upi_id,average_transaction_frequency,transaction_type,transaction_day,transaction_month,transaction_year,potential_fraud
0,8647040000000000.0,3,0,1.498991,20,2,4,1.107757,0,1,29101,1.029788,3,11,1,2024,0.747672
1,9945390000000000.0,0,0,0.345564,20,1,4,0.051889,0,1,2369,-1.293402,1,25,5,2024,0.471034
2,7711180000000000.0,5,4,0.025707,15,6,1,0.382548,0,1,24433,-0.784766,1,1,3,2024,0.378364
3,1554070000000000.0,0,0,1.622265,4,1,1,0.006644,0,1,4299,-1.219332,1,16,6,2024,0.526459


In [182]:
X = df.drop(columns=['card_no', 'potential_fraud'])
y = df['potential_fraud']
df.shape

(30000, 17)

In [183]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [184]:
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [185]:
y = (y > 0.755).astype(int)

In [186]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [187]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [188]:
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

In [189]:
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_adasyn, y_train_adasyn)

In [190]:
y_pred_prob = xgb.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
best_threshold = thresholds[ix]

In [191]:
y_pred_new = (y_pred_prob > best_threshold).astype(int)

In [192]:
print(f"Best Threshold: {best_threshold}, F-Score: {fscore[ix]}")
print("Classification Report:")
print(classification_report(y_test, y_pred_new))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_new))

Best Threshold: 0.3972204625606537, F-Score: 0.6456692913385826
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5935
           1       0.66      0.62      0.63        65

    accuracy                           0.99      6000
   macro avg       0.83      0.81      0.82      6000
weighted avg       0.99      0.99      0.99      6000

Confusion Matrix:
[[5914   21]
 [  25   40]]


In [193]:
import joblib
joblib.dump(xgb, 'xgb_model.pkl')

['xgb_model.pkl']

In [203]:
df.columns

Index(['card_no', 'user_location_country', 'user_location_city', 'amount',
       'transaction_hour', 'transaction_day_of_week', 'email_domain',
       'avg_transaction_amount', 'city_consistency', 'country_consistency',
       'upi_id', 'average_transaction_frequency', 'transaction_type',
       'transaction_day', 'transaction_month', 'transaction_year',
       'potential_fraud'],
      dtype='object')

In [194]:
# models = {
#     "RandomForest": RandomForestClassifier(random_state=42),
#     "GradientBoosting": GradientBoostingClassifier(random_state=42),
#     "XGBoost": XGBClassifier(random_state=42)
# }

# def evaluate_model(model, X_train, y_train, X_test, y_test):
#     model.fit(X_train, y_train)
#     y_pred_prob = model.predict_proba(X_test)[:, 1]
#     precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
#     fscore = (2 * precision * recall) / (precision + recall)
#     ix = np.argmax(fscore)
#     best_threshold = thresholds[ix]
    
#     y_pred_new = (y_pred_prob > best_threshold).astype(int)
#     print(f"Model: {model.__class__.__name__}")
#     print(f"Best Threshold: {best_threshold}, F-Score: {fscore[ix]}")
#     print("Classification Report:")
#     print(classification_report(y_test, y_pred_new))
#     print("Confusion Matrix:")
#     print(confusion_matrix(y_test, y_pred_new))
#     print("-" * 60)

In [195]:
# print("Evaluation with SMOTE:")
# for name, model in models.items():
#     evaluate_model(model, X_train_res, y_train_res, X_test, y_test)

# print("Evaluation with ADASYN:")
# for name, model in models.items():
#     evaluate_model(model, X_train_adasyn, y_train_adasyn, X_test, y_test)

In [196]:
# def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, scoring=None):
#     plt.figure()
#     plt.title(title)
#     plt.xlabel("Training examples")
#     plt.ylabel("Score")
#     train_sizes, train_scores, test_scores = learning_curve(
#         estimator, X, y, cv=cv, n_jobs=n_jobs, scoring=scoring, train_sizes=np.linspace(0.1, 1.0, 5)
#     )
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)
#     plt.grid()

#     plt.fill_between(
#         train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r"
#     )
#     plt.fill_between(
#         train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g"
#     )
#     plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
#     plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

#     plt.legend(loc="best")
#     return plt

# plot_learning_curve(RandomForestClassifier(random_state=42), "Learning Curve (Random Forest with SMOTE)", X_train_res, y_train_res, cv=5, n_jobs=-1, scoring=make_scorer(f1_score))
# plt.show()

# plot_learning_curve(XGBClassifier(random_state=42), "Learning Curve (XGBoost with SMOTE)", X_train_res, y_train_res, cv=5, n_jobs=-1, scoring=make_scorer(f1_score))
# plt.show()

# plot_learning_curve(RandomForestClassifier(random_state=42), "Learning Curve (Random Forest with ADASYN)", X_train_adasyn, y_train_adasyn, cv=5, n_jobs=-1, scoring=make_scorer(f1_score))
# plt.show()

# plot_learning_curve(XGBClassifier(random_state=42), "Learning Curve (XGBoost with ADASYN)", X_train_adasyn, y_train_adasyn, cv=5, n_jobs=-1, scoring=make_scorer(f1_score))
# plt.show()

In [197]:
# print("Classification Report with Threshold 0.5")
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [198]:
# precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
# fscore = (2 * precision * recall) / (precision + recall)
# ix = np.argmax(fscore)
# best_threshold = thresholds[ix]
# print(f"Best Threshold: {best_threshold}, F-Score: {fscore[ix]}")

In [199]:
# y_pred_new = (y_pred_prob > best_threshold).astype(int)

In [200]:
# print("Classification Report with Best Threshold")
# print(classification_report(y_test, y_pred_new))
# print(confusion_matrix(y_test, y_pred_new))

In [201]:
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()

In [202]:
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()