In [154]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTENC,BorderlineSMOTE,SMOTE
import openpyxl


pd.options.mode.copy_on_write = True

In [None]:
df = pd.read_csv('fraudTrain.csv')

In [None]:
df_test = pd.read_excel('fraudTest.xlsx')

### Target Encoding

In [None]:
df.dropna(axis=0, how='any',inplace=True)

In [None]:
cat_targ_enc = df.groupby('category')['is_fraud'].mean()
df['category_target_enc'] = df['category'].map(cat_targ_enc)

In [None]:
merchant_targ_enc = df.groupby('merchant')['is_fraud'].mean()
df['merchant_target_enc'] = df['merchant'].map(merchant_targ_enc)

In [None]:
sns.boxplot(x= df['is_fraud'],y=df['category_target_enc'])

In [None]:
sns.boxplot(x= df['is_fraud'],y=df['merchant_target_enc'])

#### Testing Set

In [None]:
df_test['category_target_enc'] = df_test['category'].map(cat_targ_enc)
df_test['merchant_target_enc'] = df_test['merchant'].map(merchant_targ_enc)

# Pre-Processing Features

### *Time* Feature

In [None]:
df['Time'] = pd.to_datetime(df['Time'])

In [None]:
df_test['Time'] = pd.to_datetime(df_test['Time'])

In [None]:
df['Time:year'] = df['Time'].dt.year
df['Time:month'] = df['Time'].dt.month
df['Time:day'] = df['Time'].dt.day
df['Time:hour'] = df['Time'].dt.hour

In [None]:
df_test['Time:year'] = df_test['Time'].dt.year
df_test['Time:month'] = df_test['Time'].dt.month
df_test['Time:day'] = df_test['Time'].dt.day
df_test['Time:hour'] = df_test['Time'].dt.hour

In [None]:
df.drop(columns=['ID','trans_num','Time','firstName','lastName'], inplace=True)

In [None]:
df_test.drop(columns=['ID','trans_num','Time','firstName','lastName'], inplace=True)

In [176]:
df.dtypes

Card Number            float64
is_fraud               float64
category_target_enc    float64
merchant_target_enc    float64
Time:year                int32
Time:month               int32
Time:day                 int32
Time:hour                int32
log_amount             float64
dtype: object

In [None]:
df_test.dtypes

In [None]:
df.drop(columns=['merchant','category'],inplace =True)
df_test.drop(columns=['merchant','category'],inplace=True)

In [None]:
corr_matrix = df.corr()

plt.figure(figsize=(10,6))
sns.heatmap(corr_matrix,annot=True,cmap='YlGnBu',fmt='.5f')
plt.title('Correlation Matrix')
plt.show()

### *Amount* Feature

In [None]:
fig, (ax1,ax2) = plt.subplots(nrows=1,ncols=2,figsize=(10,5))
sns.histplot(ax=ax1,data=df['Amount'], kde=True)
ax1.set_title('Distribution of Amount')
ax1.set_xlabel('Amount')
ax1.set_ylabel('Distribution of ' + 'Amount')
sns.boxplot(data=df['Amount'])
ax2.set_title('Distribution of Amount')
ax2.set_xlabel('Amount')
ax2.set_ylabel('Distribution of ' + 'Amount')
plt.show()

In [None]:
#z_scores = (df['Amount'] - np.mean(df['Amount'])) / np.std(df['Amount'])
#df = df[np.abs(z_scores) <= 3]

In [None]:
#fig1, (ax1,ax2) = plt.subplots(nrows=1,ncols=2,figsize=(10,5))
#sns.histplot(ax=ax1,data=df['Amount'], kde=True)
#ax1.set_title('Distribution of Amount')
#ax1.set_xlabel('Amount')
#ax1.set_ylabel('Distribution of ' + 'Amount')
#sns.boxplot(data=df['Amount'])
#ax2.set_title('Distribution of Amount')
#ax2.set_xlabel('Amount')
#ax2.set_ylabel('Distribution of ' + 'Amount')
#plt.show()

In [None]:
df['log_amount'] = np.log(df['Amount'])

In [None]:
df_test['log_amount'] = np.log(df_test['Amount'])

In [None]:
fig2, (ax1,ax2) = plt.subplots(nrows=1,ncols=2,figsize=(10,5))
sns.histplot(ax=ax1,data=df['log_amount'], kde=True)
ax1.set_title('Distribution of Amount')
ax1.set_xlabel('Amount')
ax1.set_ylabel('Distribution of ' + 'Amount')
sns.boxplot(data=df['log_amount'])
ax2.set_title('Distribution of Amount')
ax2.set_xlabel('Amount')
ax2.set_ylabel('Distribution of ' + 'Amount')
plt.show()

In [None]:
df.drop(columns='Amount',inplace=True)

In [None]:
df_test.drop(columns='Amount',inplace=True)

In [174]:
df.dtypes

Card Number            float64
is_fraud               float64
category_target_enc    float64
merchant_target_enc    float64
Time:year                int32
Time:month               int32
Time:day                 int32
Time:hour                int32
log_amount             float64
dtype: object

In [175]:
df_test.dtypes

Card Number              int64
is_fraud                 int64
category_target_enc    float64
merchant_target_enc    float64
Time:year                int32
Time:month               int32
Time:day                 int32
Time:hour                int32
log_amount             float64
dtype: object

### Applying *OverSampling & UnderSampling*

In [123]:
X = df.drop('is_fraud',axis = 1)
y = df['is_fraud']

In [124]:
X_test = df_test.drop('is_fraud',axis = 1)
y_test = df_test['is_fraud']

In [147]:
border_smote = BorderlineSMOTE(sampling_strategy=1,random_state=42)
smote = SMOTE(sampling_strategy=1,random_state=42)
X_resampled, y_resampled = smote.fit_resample(X,y)

In [None]:
X_resampled.dtypes

In [None]:
y_resampled.value_counts()

# Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=50,max_depth=6,random_state=13)

In [151]:
rf.fit(X_resampled,y_resampled)

In [152]:
rf_predictions = rf.predict(X_test)

In [162]:
rf_Accuracy_Score = accuracy_score(y_test, rf_predictions)
rf_F1_Score = f1_score(y_test, rf_predictions)
rf_Confusion_Matrix = confusion_matrix(y_test,rf_predictions)
rf_Recall = recall_score(y_test,rf_predictions)
rf_Precision = precision_score(y_test,rf_predictions)
rf_roc_auc_score = roc_auc_score(y_test,rf_predictions)

rf_scores = [(rf_Recall,rf_Precision,rf_Accuracy_Score,rf_F1_Score,rf_roc_auc_score)]
rf_df = pd.DataFrame(data = rf_scores,columns=['Recall','Precision','Accuracy','F1','ROC-AUC'])
rf_df.insert(0,'Model','Random Forest')
rf_df

Unnamed: 0,Model,Recall,Precision,Accuracy,F1,ROC-AUC
0,Random Forest,0.774359,0.116431,0.976447,0.202425,0.875794


# XGBoost

In [137]:
xgb_model = xgb.XGBClassifier(objective='binary:logistic',random_state=42)

In [148]:
xgb_model.fit(X_resampled,y_resampled)

In [149]:
xgb_predictions = xgb_model.predict(X_test)

In [171]:
xgb_Accuracy_Score = accuracy_score(y_test, xgb_predictions)
xgb_F1_Score = f1_score(y_test, xgb_predictions)
xgb_Confusion_Matrix = confusion_matrix(y_test,xgb_predictions)
xgb_Recall = recall_score(y_test,xgb_predictions)
xgb_Precision = precision_score(y_test,xgb_predictions)
xgb_roc_auc_score = roc_auc_score(y_test,xgb_predictions)

xgb_scores = [(xgb_Recall,xgb_Precision,xgb_Accuracy_Score,xgb_F1_Score,xgb_roc_auc_score)]
xgb_df = pd.DataFrame(data = xgb_scores,columns=['Recall','Precision','Accuracy','F1','ROC-AUC'])
xgb_df.insert(0,'Model','XGB')
xgb_df

Unnamed: 0,Model,Recall,Precision,Accuracy,F1,ROC-AUC
0,XGB,0.770163,0.511455,0.996273,0.614698,0.883656


# Logistic Regression

In [143]:
LR = LogisticRegression()
LR.fit(X,y)

In [144]:
LR_predictions = LR.predict(X_test)

In [145]:
LR_Accuracy_Score = accuracy_score(y_test, LR_predictions)
LR_F1_Score = f1_score(y_test, LR_predictions)
LR_Confusion_Matrix = confusion_matrix(y_test,LR_predictions)
LR_Recall = recall_score(y_test,LR_predictions)
LR_Precision = precision_score(y_test,LR_predictions)

print(LR_Recall)
print(LR_Precision)
print(LR_Accuracy_Score)
print(LR_F1_Score)
print(LR_Confusion_Matrix)

0.0
0.0
0.9961401355721147
0.0
[[553574      0]
 [  2145      0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Tree

In [None]:
Tree = DecisionTreeClassifier()
Tree.fit(X_resampled,y_resampled)

In [None]:
tree_predictions = Tree.predict(X_test)

In [None]:
Tree_Accuracy_Score = accuracy_score(y_test, tree_predictions)
Tree_F1_Score = f1_score(y_test, tree_predictions)
Tree_Confusion_Matrix = confusion_matrix(y_test,tree_predictions)
Tree_Recall = recall_score(y_test,tree_predictions)
Tree_Precision = precision_score(y_test,tree_predictions)

In [None]:
fig, ax = plt.subplots()
ax.grid(False)
disp = ConfusionMatrixDisplay(confusion_matrix=Tree_Confusion_Matrix,display_labels=Tree.classes_)
disp.plot(cmap='Greens',ax=ax)
plt.show()