<h1 align="center"> Credit Fraud Detector </h1>

In [None]:
# Imported Libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Other Libraries
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_predict,
    cross_val_score,
    learning_curve,
    train_test_split,
)
from sklearn.pipeline import make_pipeline

warnings.filterwarnings("ignore")



In [None]:

# Read the raw transactions from the input directory and preview the first rows.
csv_path = '../input/creditcard.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print column dtypes and non-null counts for the raw frame.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

Except for the transaction time and amount, we don't know what the other columns are (**due to privacy reasons**). The only thing we know is that those unknown columns have already been scaled.

In [None]:
# No Null Values!
# Largest per-column count of missing entries; 0 means no column has any null.
df.isnull().sum().max()

In [None]:
# Show the column labels of the frame.
df.columns

In [None]:
# The classes are heavily skewed (imbalanced): report each class as a share of all rows.
class_counts = df['Class'].value_counts()
n_rows = len(df)
for label, class_value in (('No Frauds', 0), ('Frauds', 1)):
    print(label, round(class_counts[class_value] / n_rows * 100, 2), '% of the dataset')

In [None]:
# Absolute number of rows per value of the Class column.
df.Class.value_counts()

Most of the transactions are non-fraud. Our original dataset is **imbalanced**! If we use this dataframe as the base for our predictive models, our algorithms will probably overfit since they will "assume" that most transactions are not fraud.

In [None]:
# Bar colours reused by the class plots further down.
colors = ["#FFA500", "#00FF00"]

# Pass the column as x=...: recent seaborn releases no longer accept it positionally.
sns.countplot(x='Class', data=df, palette=colors)
plt.title('Fraud distribution', fontsize=14)
plt.show()

In [None]:
# Side-by-side distributions of the two columns that are still in raw units.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

amount_val = df['Amount'].values
time_val = df['Time'].values

panels = (
    (amount_val, 'r', 'Distribution of Transaction Amount'),
    (time_val, 'b', 'Distribution of Transaction Time'),
)
for axis, (values, color, title) in zip(ax, panels):
    sns.distplot(values, ax=axis, color=color)
    axis.set_title(title, fontsize=14)

plt.show()

# Scaling Time and amount 

In [None]:
# Amount and Time are the only columns still to be scaled, so bring them onto a
# comparable scale with the rest of the features.
from sklearn.preprocessing import RobustScaler

# RobustScaler is less prone to outliers.
rob_scaler = RobustScaler()

# fit_transform re-fits the scaler for each column in turn (Amount first, then Time).
for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    df[scaled_col] = rob_scaler.fit_transform(df[raw_col].values.reshape(-1,1))

# The raw columns are replaced by their scaled versions.
df.drop(columns=['Time', 'Amount'], inplace=True)

In [None]:
# Move the two scaled columns to the front of the frame.
# pop() removes the column from df and returns it, so it can be re-inserted by position.
scaled_amount = df.pop('scaled_amount')
scaled_time = df.pop('scaled_time')

df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

# Amount and Time are Scaled!
df.head()

In [None]:
# Same two panels as before, now drawn from the scaled columns.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

amount_val = df['scaled_amount'].values
time_val = df['scaled_time'].values

panels = (
    (amount_val, 'r', 'Distribution of Transaction Amount'),
    (time_val, 'b', 'Distribution of Transaction Time'),
)
for axis, (values, color, title) in zip(ax, panels):
    sns.distplot(values, ax=axis, color=color)
    axis.set_title(title, fontsize=14)

plt.show()

# Splitting the Data 

In [None]:
# Features (every column except Class) and target taken from the full frame.
# NOTE: X and y are reassigned from the undersampled frame before any model is fit.
X = df.drop('Class', axis=1)
y = df['Class']

## Random Under-Sampling

In [None]:
# Shuffle first so the non-fraud rows kept below are a random draw; the seed makes
# the subsample (and everything computed from it) reproducible across runs.
df = df.sample(frac=1, random_state=42)

fraud_df = df.loc[df['Class'] == 1]
# Keep exactly as many non-fraud rows as there are fraud rows instead of
# hard-coding the count, so the subsample stays balanced if the data changes.
non_fraud_df = df.loc[df['Class'] == 0][:len(fraud_df)]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle again so the two classes are interleaved.
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()

In [None]:
# Share of each class in the undersampled frame.
print('Distribution of the Classes in the subsample dataset')
print(new_df['Class'].value_counts()/len(new_df))

# Pass the column as x=...: recent seaborn releases no longer accept it positionally.
sns.countplot(x='Class', data=new_df, palette=colors)
plt.title('Equally Distributed Classes', fontsize=14)
plt.show()

In [None]:
# Compare correlations on the full frame with those on the balanced subsample;
# make sure the subsample is the one used when reading off correlations.
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(24,20))

corr = df.corr()                  # entire DataFrame
sub_sample_corr = new_df.corr()   # balanced subsample

heatmap_panels = (
    (ax1, corr, "Imbalanced Correlation Matrix", {}),
    (ax2, sub_sample_corr, 'SubSample Correlation Matrix', {'annot_kws': {'size': 20}}),
)
for axis, matrix, title, extra_kwargs in heatmap_panels:
    sns.heatmap(matrix, cmap='coolwarm_r', ax=axis, **extra_kwargs)
    axis.set_title(title, fontsize=14)
plt.show()



### Explaining Correlation: 
<ul>
<li><b>Negative Correlations: </b>V17, V14, V12 and V10 are negatively correlated. The lower these values are, the more likely the end result will be a fraud transaction.</li>
<li><b>Positive Correlations: </b>V2, V4, V11, and V19 are positively correlated. The higher these values are, the more likely the end result will be a fraud transaction.</li>
</ul>


In [None]:
# Negative correlations with our Class (the lower the feature value, the more
# likely it will be a fraud transaction). One boxplot per feature.
f, axes = plt.subplots(ncols=4, figsize=(20,4))

for axis, feature in zip(axes, ("V17", "V14", "V12", "V10")):
    sns.boxplot(x="Class", y=feature, data=new_df, palette=colors, ax=axis)
    axis.set_title(f'{feature} vs Class Negative Correlation')

plt.show()

In [None]:
# Positive correlations (the higher the feature value, the higher the probability
# that it will be a fraud transaction). One boxplot per feature.
f, axes = plt.subplots(ncols=4, figsize=(20,4))

for axis, feature in zip(axes, ("V11", "V4", "V2", "V19")):
    sns.boxplot(x="Class", y=feature, data=new_df, palette=colors, ax=axis)
    axis.set_title(f'{feature} vs Class Positive Correlation')

plt.show()

* After plotting the columns correlated with the target using boxplots,
=> outliers were found in features V10, V12 and V14.


In [None]:
import numpy as np

def remove_outliers(df, columns, whisker=1.5):
    """Drop rows whose values fall outside the fraud-class IQR fences.

    For each column in turn, the 25th/75th percentiles are computed on the rows
    where ``Class == 1``; every row of the frame (either class) whose value lies
    outside ``[q25 - whisker * IQR, q75 + whisker * IQR]`` is removed. Columns are
    processed sequentially, so each column's fences are computed on the frame
    already filtered by the previous columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Class`` column and every column in ``columns``.
    columns : iterable of str
        Columns to filter on.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range to build the fences.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; the input frame is not modified.
    """
    for col in columns:
        fraud_values = df.loc[df['Class'] == 1, col]
        if fraud_values.empty:
            # No fraud rows means no reference distribution, so leave the column alone.
            continue

        q25, q75 = np.percentile(fraud_values, [25, 75])
        cut_off = (q75 - q25) * whisker
        lower, upper = q25 - cut_off, q75 + cut_off

        # Filter with a boolean mask rather than dropping by index label, so that
        # rows sharing a label with an outlier are not removed by accident.
        outlier_mask = (df[col] > upper) | (df[col] < lower)
        df = df.loc[~outlier_mask]

    return df

In [None]:
# Filter the undersampled frame on the three features flagged above.
# NOTE: new_df is overwritten with its own filtered version, so re-running this
# cell filters the already-filtered frame again.
new_df = remove_outliers(new_df, ['V10', 'V12', 'V14'])

In [None]:
# Boxplots of the three filtered features, with outliers removed.
f,(ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20,6))

# New palette; this rebinds the notebook-level `colors` from here on.
colors = ['#B3F9C5', '#f9c5b3']

for axis, feature in zip((ax1, ax2, ax3), ("V14", "V12", "V10")):
    sns.boxplot(x="Class", y=feature, data=new_df, ax=axis, palette=colors)
    axis.set_title(f"{feature} Feature \n Reduction of outliers", fontsize=14)

plt.show()

In [None]:
# New_df is from the random undersample data (fewer instances)
# X and y are rebound here: features are every column except Class, target is Class.
X = new_df.drop('Class', axis=1)
y = new_df['Class']

<h2> Classifiers </h2>

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def evaluate_classifiers(X_train, y_train, X_test, y_test):
    """Tune four classifiers with a grid search and score each tuned model.

    For every classifier the function:
      1. runs ``GridSearchCV`` on the training data and keeps ``best_estimator_``;
      2. records the mean 5-fold ``cross_val_score`` of that estimator on the
         training data;
      3. records the ROC AUC of its predicted fraud probabilities on the test data;
      4. builds a ``classification_report`` from its test-set predictions.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.

    Returns
    -------
    tuple of dict
        ``(best_estimators, reports, cross_val_scores, roc_auc_scores)``, each
        keyed by classifier name.
    """
    # classifier name -> (estimator, hyperparameter grid)
    search_spaces = {
        "LogisiticRegression": (
            # liblinear supports both penalties in the grid; the default solver
            # rejects 'l1', which would make half of the candidate fits fail.
            LogisticRegression(solver='liblinear'),
            {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
        ),
        "KNearest": (
            KNeighborsClassifier(),
            {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
        ),
        "Support Vector Classifier": (
            # probability=True is required for predict_proba below.
            SVC(probability=True),
            {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
        ),
        "DecisionTreeClassifier": (
            DecisionTreeClassifier(),
            {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)), "min_samples_leaf": list(range(5,7,1))},
        ),
    }

    best_estimators = {}
    cross_val_scores = {}
    roc_auc_scores = {}
    reports = {}

    for name, (clf, params) in search_spaces.items():
        # Perform grid search to find the best estimator (refit on all of X_train).
        grid_clf = GridSearchCV(clf, params)
        grid_clf.fit(X_train, y_train)
        best = grid_clf.best_estimator_
        best_estimators[name] = best

        # Evaluate the tuned classifier using cross-validation on the training data.
        cross_val_scores[name] = cross_val_score(best, X_train, y_train, cv=5).mean()

        # ROC AUC on the test data, scored with the tuned model itself rather than
        # with models re-fitted on folds of the test set.
        y_pred_proba = best.predict_proba(X_test)[:, 1]
        roc_auc_scores[name] = roc_auc_score(y_test, y_pred_proba)

        # Classification report from the tuned model's test-set predictions.
        y_pred = best.predict(X_test)
        reports[name] = classification_report(y_test, y_pred, target_names=['nonFraud', 'Fraud'])

    return best_estimators, reports, cross_val_scores, roc_auc_scores

In [None]:
best_estimators, reports, cross_val_scores, roc_auc_scores = evaluate_classifiers(X_train, y_train, X_test, y_test)

separator = "---------------------------------------------\n"

print(separator)
# Print the classification reports
for name, report in reports.items():
    print(f"Classification Report for {name}:")
    print(report)

print(separator)

# Print the cross-validation scores.
# Format with an f-string spec so this works for both NumPy scalars and plain
# Python floats (the latter have neither .mean() nor .astype()).
for name, score in cross_val_scores.items():
    print(f"Cross-validation score for {name}: {score * 100:.2f}%")
print(separator)

# Print the ROC AUC scores
for name, score in roc_auc_scores.items():
    print(f"ROC AUC score for {name}: {score * 100:.2f}%")


* A high ROC AUC indicates that the classifier is good at distinguishing between the two classes, such as **Logistic Regression**.
* A high F1 score indicates a good balance between precision and recall, such as **Support Vector Machine**.

In [None]:
def plot_learning_curve(best_estimators, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw one learning-curve panel per estimator on a two-column grid.

    Each panel shows the mean training score and mean cross-validation score
    returned by ``learning_curve`` against the training-set size, with a shaded
    band of one standard deviation around each mean.

    Parameters
    ----------
    best_estimators : dict
        Mapping of display name -> estimator.
    X, y
        Data passed to ``learning_curve``.
    ylim : tuple, optional
        ``(low, high)`` limits for the shared y axis.
    cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        ``matplotlib.pyplot``, as before.
    """
    # Keep the 2x2 layout for up to four estimators, add rows beyond that instead
    # of indexing past the grid.
    n_rows = max(2, -(-len(best_estimators) // 2))
    f, axes = plt.subplots(n_rows, 2, figsize=(20, 7 * n_rows), sharey=True, squeeze=False)
    if ylim is not None:
        # The y axis is shared, so setting it on one panel applies to all.
        axes[0, 0].set_ylim(*ylim)

    panels = axes.ravel()
    for ax, (name, estimator) in zip(panels, best_estimators.items()):
        # Use a separate name for the returned absolute sizes so the train_sizes
        # argument is passed unchanged for every estimator.
        sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                                                          train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        ax.fill_between(sizes, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.1,
                        color="#ff9124")
        ax.fill_between(sizes, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.1, color="#2492ff")
        ax.plot(sizes, train_scores_mean, 'o-', color="#ff9124",
                label="Training score")
        ax.plot(sizes, test_scores_mean, 'o-', color="#2492ff",
                label="Cross-validation score")
        ax.set_title(f"{name} Learning Curve", fontsize=14)
        ax.set_xlabel('Training size (m)')
        ax.set_ylabel('Score')
        ax.grid(True)
        ax.legend(loc="best")

    # Hide any extra panel left over when an added row is only half used.
    if len(best_estimators) > 4:
        for unused_ax in panels[len(best_estimators):]:
            unused_ax.set_visible(False)
    return plt

In [None]:
# `cv` was never defined in this notebook; use the same 5-fold setting as the
# cross-validation scores above.
plot_learning_curve(best_estimators, X_train, y_train, cv=5, n_jobs=-1);


## Summary: 
<ul>
    <li> <b> Logistic Regression and Support Vector Machine</b> classifiers are more accurate than the other two classifiers in most cases. (We will further analyze Logistic Regression) </li>
<li><b> GridSearchCV </b> is used to determine the parameters that give the best predictive score for the classifiers. </li>
<li> Logistic Regression has the best Receiver Operating Characteristic (ROC) score, meaning that LogisticRegression pretty accurately separates <b> fraud </b> and <b> non-fraud </b> transactions.</li>
</ul>

## Learning Curves:
<ul>
<li>The <b>wider the  gap</b>  between the training score and the cross validation score, the more likely your model is <b>overfitting (high variance)</b>.</li>
<li> If the score is low in both the training and cross-validation sets, this is an indication that our model is <b>underfitting (high bias)</b>.</li>
<li><b> Support Vector Machine Classifier</b>  shows the best score in both training and cross-validating sets.</li>
<li><b> Logistic Regression Classifier</b>  shows the best F1 score which indicates a good balance between precision and recall.</li>

</ul>

### SMOTE Technique (Over-Sampling):
<b>SMOTE</b> stands for Synthetic Minority Over-sampling Technique.  Unlike Random UnderSampling, SMOTE creates new synthetic points in order to have an equal balance of the classes. This is another alternative for solving the "class imbalance problems". <br><br>


### Cross Validation Overfitting Mistake:
## Overfitting during Cross Validation:  
In our undersample analysis, there was a mistake. If you want to undersample or oversample your data, you should not do it before cross-validating. Why? Because you will be directly influencing the validation set before implementing cross-validation, causing a "data leakage" problem.

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
def evaluate_classifiers(X_train, y_train, X_test, y_test):
    """Tune four classifiers with SMOTE applied inside cross-validation.

    This redefines the earlier ``evaluate_classifiers`` in this notebook. Each
    classifier is placed in an imblearn pipeline after a SMOTE step, and the
    randomized search runs over that pipeline, so oversampling is fitted on the
    training part of every fold only, never on the validation part.

    For every classifier the function:
      1. runs ``RandomizedSearchCV`` (4 candidates) on the SMOTE + classifier
         pipeline and keeps ``best_estimator_`` (a fitted pipeline);
      2. records the mean 5-fold ``cross_val_score`` of that pipeline on the
         training data;
      3. records the ROC AUC of its predicted fraud probabilities on the test data;
      4. builds a ``classification_report`` from its test-set predictions.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.

    Returns
    -------
    tuple of dict
        ``(best_estimators, reports, cross_val_scores, roc_auc_scores)``, each
        keyed by classifier name.
    """
    # classifier name -> (estimator, hyperparameter distributions)
    search_spaces = {
        "LogisiticRegression": (
            # liblinear supports both penalties in the search space; the default
            # solver rejects 'l1'.
            LogisticRegression(solver='liblinear'),
            {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
        ),
        "KNearest": (
            KNeighborsClassifier(),
            {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
        ),
        "Support Vector Classifier": (
            # probability=True is required for predict_proba below.
            SVC(probability=True),
            {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
        ),
        "DecisionTreeClassifier": (
            DecisionTreeClassifier(),
            {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)), "min_samples_leaf": list(range(5,7,1))},
        ),
    }

    best_estimators = {}
    cross_val_scores = {}
    roc_auc_scores = {}
    reports = {}

    for name, (clf, params) in search_spaces.items():
        # SMOTE lives inside the pipeline that is being searched, so it is
        # re-fitted within each cross-validation split (no leakage into the
        # validation fold).
        pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority', random_state=42), clf)

        # Route each hyperparameter to the classifier step of the pipeline.
        clf_step = pipeline.steps[-1][0]
        pipeline_params = {f"{clf_step}__{key}": values for key, values in params.items()}

        rand_search = RandomizedSearchCV(pipeline, pipeline_params, n_iter=4, random_state=42)
        rand_search.fit(X_train, y_train)
        best = rand_search.best_estimator_
        best_estimators[name] = best

        # Evaluate the tuned pipeline using cross-validation on the training data.
        cross_val_scores[name] = cross_val_score(best, X_train, y_train, cv=5).mean()

        # ROC AUC on the test data, scored with the tuned pipeline itself rather
        # than with models re-fitted on folds of the test set.
        y_pred_proba = best.predict_proba(X_test)[:, 1]
        roc_auc_scores[name] = roc_auc_score(y_test, y_pred_proba)

        # Classification report from the tuned pipeline's test-set predictions.
        y_pred = best.predict(X_test)
        reports[name] = classification_report(y_test, y_pred, target_names=['nonFraud', 'Fraud'])

    return best_estimators, reports, cross_val_scores, roc_auc_scores

In [None]:
best_estimators, reports, cross_val_scores, roc_auc_scores = evaluate_classifiers(X_train, y_train, X_test, y_test)

separator = "---------------------------------------------\n"

print(separator)
# Print the classification reports
for name, report in reports.items():
    print(f"Classification Report for {name}:")
    print(report)

print(separator)

# Print the cross-validation scores.
# Format with an f-string spec so this works for both NumPy scalars and plain
# Python floats (the latter have neither .mean() nor .astype()).
for name, score in cross_val_scores.items():
    print(f"Cross-validation score for {name}: {score * 100:.2f}%")
print(separator)

# Print the ROC AUC scores
for name, score in roc_auc_scores.items():
    print(f"ROC AUC score for {name}: {score * 100:.2f}%")


### Conclusion: 
Implementing SMOTE on our imbalanced dataset increased the overall score in all our models. 
