<a href="https://www.kaggle.com/code/omarmostafataha/fraud-detection-using-tree-based-models?scriptVersionId=143007901" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Data Overview

In [None]:
# Load the card transaction CSV from the Kaggle input directory into a
# DataFrame and preview its first rows.
df = pd.read_csv('/kaggle/input/credit-card-fraud/card_transdata.csv')
df.head()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

The dataset has no missing values and no duplicate rows.

In [None]:
df.describe()

There are extreme outliers in the numerical features.

# Features Distribution

In [None]:
# Continuous features; each one gets its own box plot, side by side.
num_features = [
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
]
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))
for axis, feature in zip(ax, num_features):
    sns.boxplot(data=df, x=feature, ax=axis)
    axis.set_title(f'{feature} Distribution')

**Target distribution before deleting outliers**

In [None]:
df['fraud'].value_counts(normalize=True) * 100

In [None]:
# Work on a copy so that `df` itself keeps every row.
df_no_outilers = df.copy()

# Apply the 1.5 * IQR fence one column at a time. The filters are sequential:
# each column's quartiles are computed on the rows that survived the
# previous columns' filters.
for col in num_features:
    q1 = df_no_outilers[col].quantile(0.25)
    q3 = df_no_outilers[col].quantile(0.75)
    iqr = q3 - q1
    # `between` is inclusive on both ends, i.e. lower <= value <= upper.
    inside_fence = df_no_outilers[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    df_no_outilers = df_no_outilers[inside_fence]

**Target distribution after deleting outliers**

In [None]:
df_no_outilers['fraud'].value_counts(normalize=True)*100

**We can see that most of the outliers are fraudulent transactions, so dropping them would mean losing important information. We therefore keep them.**

In [None]:
# Binary features; each one is shown as a pie chart of its value counts.
cat_features = [
    'repeat_retailer',
    'used_chip',
    'used_pin_number',
    'online_order',
]
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(20, 4))
for axis, feature in zip(ax, cat_features):
    counts = df[feature].value_counts()
    counts.plot(kind='pie', ax=axis, title=feature)

# Target Distribution

In [None]:
sns.countplot(data=df,x='fraud');

# Features Correlation with Target

In [None]:
# One boxen plot per numerical feature, grouped by the fraud label.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 15))
for axis, feature in zip(ax, num_features):
    sns.boxenplot(x=df['fraud'], y=df[feature], ax=axis)
    axis.set_title(f'{feature} vs Fraud')

In [None]:
# One bar plot per binary feature; the bar height is the mean of `fraud`
# within each feature value.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(20, 4))
for axis, feature in zip(ax, cat_features):
    sns.barplot(data=df, x=feature, y='fraud', ax=axis)
    axis.set_title(f'{feature} vs Fraud')

Except for repeat_retailer, the features have a moderate to strong correlation with the target.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `df`.
corr_matrix = df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='Blues');

# Model Building

In [None]:
def validate(model,X_train,y_train,score,n):

    '''Cross-validate a model on stratified folds, then fit it on all the training data.

    model            : estimator to validate and fit
    X_train, y_train : training features and labels
    score            : value forwarded as `scoring` to cross_val_score
    n                : number of StratifiedKFold splits

    Prints the per-fold scores with their mean and standard deviation,
    and returns the same model object after fitting it on (X_train, y_train).
    '''

    folds = StratifiedKFold(n_splits=n)
    fold_scores = cross_val_score(model, X_train, y_train, scoring=score, cv=folds)

    summary = (
        ('Cross Validation Scores: ', fold_scores),
        ('Scores Mean: ', fold_scores.mean()),
        ('Scores Standard Deviation: ', fold_scores.std()),
    )
    for label, value in summary:
        print(label, value)

    # The cross-validation above works on clones; fit the passed-in model itself.
    model.fit(X_train, y_train)
    return model

In [None]:
def model_tunning(model,X_train,y_train,param_grid,scoring='recall',cv=5):

    '''Tune a model with GridSearchCV and return the best estimator.

    model            : estimator to tune
    X_train, y_train : training features and labels
    param_grid       : parameter grid forwarded to GridSearchCV
    scoring          : metric name forwarded to GridSearchCV (default 'recall');
                       it is also interpolated into the printed summary, so a
                       string name is expected
    cv               : cross-validation setting forwarded to GridSearchCV (default 5)

    Prints the best parameters and the best mean cross-validated score,
    then returns `best_estimator_`.
    '''

    grid_search = GridSearchCV(estimator=model,param_grid=param_grid,cv=cv,scoring=scoring)
    grid_search.fit(X_train,y_train)
    print("Best parameters are: ",grid_search.best_params_)
    # With the default scoring this prints exactly the original message.
    print(f'Mean cross-validated {scoring} of the best_estimator is: ',grid_search.best_score_)
    return grid_search.best_estimator_

In [None]:
def model_evaluation(model,X_test,y_test,color='Blues',threshold=0.5):

    '''Evaluate the model at a given probability threshold.

    1--> print the classification report     2--> display the confusion matrix

    model     : fitted classifier exposing predict_proba
    X_test    : features to score
    y_test    : true labels for X_test
    color     : colormap passed to the confusion-matrix heatmap
    threshold : a sample is predicted positive when column 1 of
                predict_proba is >= threshold
    '''

    #classification report
    y_proba_test = model.predict_proba(X_test)
    y_pred_test  = (y_proba_test[:,1]>=threshold)
    print(classification_report(y_test,y_pred_test,zero_division=0))
    #confusion matrix
    plt.figure(figsize=(5,4))
    # fmt='d' prints the cells as plain integers; the default annotation
    # format abbreviates large counts to scientific notation.
    sns.heatmap(confusion_matrix(y_test,y_pred_test),cmap=color,annot=True,fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')

In [None]:
def precision_recall_trade_off(model,X_test,y_test):

    '''Plot the precision-recall curve and print thresholds that reach a recall of 1.

    The curve is built from column 1 of model.predict_proba(X_test) against
    y_test. After showing the interactive plot, the last ten rows of the
    threshold/precision/recall table whose recall equals 1 are printed.
    '''

    positive_scores = model.predict_proba(X_test)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_test, positive_scores)

    # precision/recall carry one more entry than thresholds; drop the last
    # one so that the three columns line up.
    curve = pd.DataFrame({
        'Threshold': thresholds,
        'Precision': precisions[:-1],
        'Recall': recalls[:-1],
    })

    px.line(
        curve,
        x='Recall',
        y='Precision',
        title='Precision-Recall Curve',
        width=700,
        height=500,
        hover_data=['Threshold'],
    ).show()

    full_recall_rows = curve[curve['Recall'] == 1]
    print(full_recall_rows.tail(10))

In [None]:
def roc_auc(model,X_test,y_test):

    '''Plot the ROC curve and print the ROC-AUC score.

    Both are computed from column 1 of model.predict_proba(X_test)
    against y_test.
    '''

    positive_scores = model.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, thresholds = roc_curve(y_test, positive_scores)

    curve = pd.DataFrame({
        'Threshold': thresholds,
        'FPR': false_pos_rate,
        'TPR': true_pos_rate,
    })

    px.line(
        curve,
        x='FPR',
        y='TPR',
        title='ROC Curve',
        width=700,
        height=500,
        hover_data=['Threshold'],
    ).show()

    auc_value = roc_auc_score(y_test, positive_scores)
    print('Testing ROC-AUC Score: ', auc_value)

In [None]:
from sklearn.model_selection import train_test_split , GridSearchCV ,StratifiedKFold , cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix , precision_recall_curve , roc_auc_score , roc_curve , classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier ,plot_tree
from sklearn.ensemble import VotingClassifier ,RandomForestClassifier ,AdaBoostClassifier ,GradientBoostingClassifier
from xgboost import XGBClassifier

# Data Splitting and Preprocessing

In [None]:
# `fraud` is the label; every other column is used as a model input.
target = 'fraud'
features = df.columns.drop([target])

X, y = df[features], df[target]

In [None]:
X_train , X_test ,y_train , y_test = train_test_split(X,y,test_size = 0.2 ,random_state = 42 ,stratify=y)

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# scaler to the test split so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 1. Decision Tree Classifier

In [None]:
dec_tree = validate(DecisionTreeClassifier(random_state=42),X_train,y_train,'recall',5)

In [None]:
precision_recall_trade_off(dec_tree,X_test,y_test)

In [None]:
model_evaluation(dec_tree,X_train,y_train,'Blues')

In [None]:
model_evaluation(dec_tree,X_test,y_test,'Blues')

In [None]:
roc_auc(dec_tree,X_test,y_test)

In [None]:
# Draw the fitted decision tree on a very wide canvas.
plt.figure(figsize=(80,25))
plot_tree(
          dec_tree,
          # Pass a plain list: some scikit-learn releases reject a pandas
          # Index for `feature_names` during parameter validation.
          feature_names=list(X.columns),
          class_names=['No Fraud', "Fraud"],
          filled=True,
          rounded=True,
          fontsize=16
);

In [None]:
# Decision-tree feature importances, sorted from largest to smallest,
# shown as a horizontal bar chart.
feat_imp_dt = pd.Series(data=dec_tree.feature_importances_, index=X.columns)
feat_imp_dt = feat_imp_dt.sort_values(ascending=False)
sns.barplot(x=feat_imp_dt, y=feat_imp_dt.index)
plt.title('Feature Importances');

# 2. XGBoost Classifier

In [None]:
xgb = validate(XGBClassifier(n_estimators=500 ,random_state=42 ,n_jobs=-1),X_train,y_train,'recall',5)

In [None]:
precision_recall_trade_off(xgb,X_test,y_test)

In [None]:
model_evaluation(xgb,X_train,y_train,'Greens',threshold=0.001976)

In [None]:
# NOTE(review): this threshold appears to have been read off the
# precision-recall table printed for this same test set - confirm. If so,
# consider choosing it on a separate validation split so the test metrics
# stay unbiased.
model_evaluation(xgb,X_test,y_test,'Greens',threshold=0.001976)

In [None]:
roc_auc(xgb,X_test,y_test)

# 3. Random Forest Classifier

In [None]:
rfc = validate(RandomForestClassifier(n_estimators=500 ,n_jobs=-1 ,random_state=42),X_train,y_train,'recall',5)

In [None]:
precision_recall_trade_off(rfc,X_test,y_test)

In [None]:
model_evaluation(rfc,X_train,y_train,'Blues',threshold=0.212)

In [None]:
# NOTE(review): this threshold appears to have been read off the
# precision-recall table printed for this same test set - confirm. If so,
# consider choosing it on a separate validation split so the test metrics
# stay unbiased.
model_evaluation(rfc,X_test,y_test,'Blues',threshold=0.212)

In [None]:
roc_auc(rfc,X_test,y_test)

In [None]:
# Random-forest feature importances, sorted from largest to smallest,
# shown as a horizontal bar chart.
feat_imp_rf = pd.Series(data=rfc.feature_importances_, index=X.columns)
feat_imp_rf = feat_imp_rf.sort_values(ascending=False)
sns.barplot(x=feat_imp_rf, y=feat_imp_rf.index)
plt.title('Feature Importances');

# Conclusion

<ul>
<li>Dropping outliers would lead to losing important information.</li>
<li>All the features except for repeat_retailer have a moderate to strong correlation with the target.</li>
<li>Decision Tree, XGBoost and Random Forest achieved great results on both the training and test sets (nearly 100% F1 score, 100% Recall, 100% ROC-AUC score).</li>
</ul>
