# Credit Card Fraud Detection

![fraude](fraude.jpg)

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Important-libraries" data-toc-modified-id="Important-libraries-0">Important libraries</a></span></li><li><span><a href="#Loading-data" data-toc-modified-id="Loading-data-1">Loading data</a></span></li><li><span><a href="#important-functions" data-toc-modified-id="important-functions-2">important functions</a></span></li><li><span><a href="#Understanding-our-data" data-toc-modified-id="Understanding-our-data-3">Understanding our data</a></span></li><li><span><a href="#Correlation-analysis" data-toc-modified-id="Correlation-analysis-4">Correlation analysis</a></span></li><li><span><a href="#Fraud/-non-fraud-ratio" data-toc-modified-id="Fraud/-non-fraud-ratio-5">Fraud/ non-fraud ratio</a></span></li><li><span><a href="#Synthetic-Minority-Oversampling-Technique-(SMOTE)" data-toc-modified-id="Synthetic-Minority-Oversampling-Technique-(SMOTE)-6">Synthetic Minority Oversampling Technique (SMOTE)</a></span></li><li><span><a href="#Traditional-way-of-catching-fraud" data-toc-modified-id="Traditional-way-of-catching-fraud-7">Traditional way of catching fraud</a></span></li><li><span><a href="#XY-Split" data-toc-modified-id="XY-Split-8">XY Split</a></span></li><li><span><a href="#Logistic-Regression-with-imbalance-data" data-toc-modified-id="Logistic-Regression-with-imbalance-data-9">Logistic Regression with imbalance data</a></span></li><li><span><a href="#Decision-Trees" data-toc-modified-id="Decision-Trees-10">Decision Trees</a></span></li><li><span><a href="#Random-Forest" data-toc-modified-id="Random-Forest-11">Random Forest</a></span></li><li><span><a href="#AdaBoostClassifier" data-toc-modified-id="AdaBoostClassifier-12">AdaBoostClassifier</a></span></li><li><span><a href="#Model-Comparison" data-toc-modified-id="Model-Comparison-13">Model Comparison</a></span></li><li><span><a href="#Logistic-Regression-combined-with-SMOTE" data-toc-modified-id="Logistic-Regression-combined-with-SMOTE-14">Logistic Regression 
combined with SMOTE</a></span></li></ul></div>

## Important libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline 
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from collections import Counter
from sklearn.model_selection import GridSearchCV
import dtreeviz.trees
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier

## Loading data

In [None]:

df=pd.read_csv("Data/creditcard/creditcard_sampledata_3.csv",index_col=0)
df


## important functions

def plot_data(X,y):
    """Scatter-plot the first two columns of X, one series per class label.

    X is indexed as a 2-D array and y as a label array of matching length.
    Rows where y == 0 and rows where y == 1 are drawn separately (class 1 in
    red). Returns whatever ``plt.show()`` returns.
    """
    class_zero = X[y == 0]
    class_one = X[y == 1]
    marker_style = {'alpha': 0.5, 'linewidth': 0.15}

    plt.figure(figsize=(12, 8))
    plt.title('Data class representation')
    plt.scatter(class_zero[:, 0], class_zero[:, 1], label='Class #0', **marker_style)
    plt.scatter(class_one[:, 0], class_one[:, 1], label='Class #1', c='r', **marker_style)
    plt.legend()
    plt.grid(False)

    return plt.show()

## Understanding our data


In [None]:
df.describe()

In [None]:
df.info()

In [None]:
#Do we have any missing value?
df.isna().sum()

In [None]:
# According to the dataset description, the features "V1" to "V28" are the
# result of PCA (Principal Component Analysis), a technique that expects
# scaled inputs. "Amount" is still on its original scale, so a standardised
# copy is stored in "Amount_scaled" before continuing with the analysis.
# NOTE(review): the original note also named "Time", but this cell neither
# scales nor drops a "Time" column - confirm whether the CSV contains one.

df['Amount_scaled'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))



# Drop the unscaled "Amount" column now that "Amount_scaled" replaces it.
df=df.drop(columns=['Amount'],axis=1)






In [None]:
# Put the columns in a fixed order: the 28 PCA components V1..V28, then the
# scaled amount, with the target "Class" last.
ordered_columns = [f'V{i}' for i in range(1, 29)]
ordered_columns += ['Amount_scaled', 'Class']
df = df[ordered_columns]

df

## Correlation analysis

In [None]:
# Correlation of every column with the target "Class", largest value first.
class_corr = df.corr()['Class']
class_corr.sort_values(ascending=False)

In [None]:

df.corr()

In [None]:
# Correlation heatmap restricted to the lower triangle: the diagonal and the
# upper triangle are masked out because they only mirror the values below.
corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=float))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(30, 25))
    ax = sns.heatmap(
        corr,
        mask=mask,
        cmap='icefire',
        vmin=-1,
        vmax=1,
        annot=True,
        square=True,
    )

## Fraud/ non-fraud ratio

 The feature "Class" is our target variable. This variable has two possible values: 
 1 for fraudulent transactions and 0 for non-fraudulent transactions. 
A very common problem in classification datasets is class imbalance. This means that the dataset contains an imbalanced number of fraudulent and non-fraudulent transactions. ML algorithms work better when the different classes are equally represented.

In [None]:
y=df['Class'].value_counts()
y

In [None]:
#ratio of fraudulent transactions
y/len(df)

We can see that fraudulent transactions represent only 0.9901% of our dataset, while non-fraudulent transactions represent 99.0099%.
This confirms that we have class imbalance in our dataset.
 


In [None]:
# Visualizations are very useful for spotting class imbalance.
fig, ax = plt.subplots(figsize=(4, 4))
# The column is passed as the keyword `x`: seaborn >= 0.12 no longer accepts
# it as a positional argument.
sns.countplot(x='Class', data=df, palette="Set2", ax=ax)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)

In [None]:
df


In [None]:
# A scatter plot also shows the class imbalance. It needs plain arrays, so
# split the dataframe into a feature matrix (first 29 columns) and a label vector.

X = df.iloc[:, :29].to_numpy()
y = df['Class'].to_numpy()


In [None]:
X

In [None]:
def plot_data(X: np.ndarray, y: np.ndarray):
    """Scatter-plot the first two columns of X, one seaborn series per class.

    Args:
        X: 2-D feature array; only columns 0 and 1 are plotted.
        y: label array aligned with the rows of X; labels 0 and 1 are drawn.

    Returns:
        Whatever ``plt.show()`` returns.
    """
    sns.set_palette("Set2")
    for label in (0, 1):
        rows = X[y == label]
        # x and y must be keywords: seaborn >= 0.12 rejects positional data.
        sns.scatterplot(
            x=rows[:, 0],
            y=rows[:, 1],
            label=f"Class #{label}",
            alpha=0.5,
            linewidth=0.15,
        )

    plt.legend()

    return plt.show()

In [None]:
plot_data(X,y)

The plot helps us see the data imbalance problem very clearly.

##  Synthetic Minority Oversampling Technique (SMOTE)

In order to treat the data imbalance we can use oversampling and undersampling techniques. SMOTE is an oversampling technique.

In [None]:
print(f'X shape: {X.shape}\ny shape: {y.shape}')

In [None]:

# Define the resampling method
method = SMOTE(random_state=42)


In [None]:
# Create the resampled feature set
X_sm, y_sm = method.fit_resample(X, y)

In [None]:
# Class counts after resampling. The Series method is used because the
# top-level pd.value_counts function is deprecated in recent pandas releases.
pd.Series(y_sm).value_counts()


In [None]:
# Plot the resampled data
plot_data(X_sm, y_sm)

## Traditional way of catching fraud
First you'll define threshold values using common statistics, to split fraud and non-fraud. Then, use those thresholds on your features to detect fraud. This is common practice within fraud analytics teams.

Statistical thresholds are often determined by looking at the mean values of observations. Let's start this exercise by checking whether feature means differ between fraud and non-fraud cases. Then, you'll use that information to create common sense thresholds. Finally, you'll check how well this performs in fraud detection.


In [None]:
df.groupby('Class').mean()

In [None]:
# Rule-based flag: a transaction is marked as fraud (1) only when both
# V1 < -3 and V7 < -6 hold; every other row gets 0.
rule_hit = (df['V1'] < -3) & (df['V7'] < -6)
df['flag_as_fraud'] = rule_hit.astype(int)

In [None]:
# Create a crosstab of flagged fraud cases versus the actual fraud cases
print(pd.crosstab(df.Class, df.flag_as_fraud, rownames=['Actual Fraud'], colnames=['Flagged Fraud']))

With this first approach we have detected 170 of 492 fraudulent cases, but we got 1226 false positives. Now we will see how we can improve these numbers with ML

In [None]:
#We do not need the feature "flag as fraud" for this analysis, so I will delete it
df=df.drop(columns=['flag_as_fraud'])

## XY Split

In [None]:
X=df.drop('Class', axis=1)
y=df.Class


## Logistic Regression with imbalance data

In [None]:
#Create training and test set (XY split)
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.30, random_state=1000,stratify=y)

In [None]:
# Use RandomizedSearchCV to look for good hyperparameters for the logistic
# regression model. It samples candidate settings from the grid below instead
# of trying every combination the way GridSearchCV would.
hyperparameters = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],"solver" : ['liblinear', 'saga']}

# A fixed random_state makes the sampled candidates, and therefore the
# reported best estimator, repeatable between runs.
randomizedsearch = RandomizedSearchCV(LogisticRegression(), hyperparameters, random_state=42)
best_model_random = randomizedsearch.fit(X_train, y_train)
print(best_model_random.best_estimator_)

In [None]:

#Define our model
logreg=LogisticRegression(C=100,solver='liblinear')

#fit our the model with our training set
logreg.fit(X_train,y_train)
# Get predicting values

y_predicted_logreg=logreg.predict(X_test)
#predicted_y_train=model.predict(X_train)


In [None]:

#print classification report for the test set
print("Classification report for the test set")
print(classification_report(y_test,y_predicted_logreg))

In [None]:

print("Confusion matrix for the test set")
print(confusion_matrix(y_test, y_predicted_logreg))
conf_mat = confusion_matrix(y_test,y_predicted_logreg)
sns.heatmap(conf_mat, square=True, annot=True, cmap='icefire', fmt='d', cbar=False)

plt.show()



In [None]:
# Predicted probability of the positive (fraud) class for each test row
y_pred_probs_logreg = logreg.predict_proba(X_test)[:,1]


# Calculate roc_auc_score
print("roc_auc_score of logistic regression classifier: ",roc_auc_score(y_test, y_pred_probs_logreg))

# Generate ROC curve values: fpr, tpr, thresholds.
# This must use the logistic regression probabilities computed above; the
# name `y_pred_probs` is not defined until a later cell.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs_logreg)

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--')

# Plot tpr against fpr
plt.plot(fpr,tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for fraudulent transactions')
plt.show()





## Decision Trees

In [None]:
# Use RandomizedSearchCV (not GridSearchCV) to look for good hyperparameters
# for the decision tree. The grid below holds 2 criteria x 2 depths (2, 3)
# x 2 leaf sizes (5, 6) = 8 combinations.
hyperparameters = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)), 
              "min_samples_leaf": list(range(5,7,1))}

# Fit the search on the training split only.
randomizedsearch = RandomizedSearchCV(DecisionTreeClassifier(), hyperparameters)
best_model_random = randomizedsearch.fit(X_train, y_train)

# Show the estimator with the best cross-validated score.
print(best_model_random.best_estimator_)

In [None]:

# Define the decision tree with fixed hyperparameters (presumably the ones
# reported by the search above - confirm). No resampling is involved here:
# this is a plain classifier, not a SMOTE pipeline.
dtc= DecisionTreeClassifier(criterion='entropy',min_samples_leaf=5,max_depth=2)



In [None]:
##fit our pipeline with our training set
dtc.fit(X_train,y_train)

In [None]:
# Get predicting values
predicted=dtc.predict(X_test)

#predicted_y_train=model.predict(X_train)

In [None]:
#print classification report for the test set
print("Classification report for the test set")
print(classification_report(y_test,predicted))

In [None]:
# Confusion matrix of the decision tree on the test set, printed and drawn.
conf_mat = confusion_matrix(y_test, predicted)
print("Confusion matrix for the test set")
print(conf_mat)
sns.heatmap(conf_mat, square=True, annot=True, cmap='icefire', fmt='d', cbar=False)

plt.show()

# The train/test predictions used by the table below were never computed
# (the lines that would have done so are commented out), which raised a
# NameError. Compute them from the fitted decision tree.
predicted_y_train = dtc.predict(X_train)
predicted_y_test = dtc.predict(X_test)

# Accuracy, precision and recall side by side for the train and test splits.
performance_model = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, predicted_y_train),
                                         precision_score(y_train, predicted_y_train),
                                         recall_score(y_train, predicted_y_train)],
                               'Test': [accuracy_score(y_test, predicted_y_test),
                                        precision_score(y_test, predicted_y_test),
                                        recall_score(y_test, predicted_y_test)]})
display(performance_model)

In [None]:
# Predict probabilities
y_pred_probs = dtc.predict_proba(X_test)[:,1]


#Calculate roc_auc_score
print("roc_auc_score of decision tree classifier: ",roc_auc_score(y_test, y_pred_probs))

In [None]:
# Tree representation:
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (20,20))

# feature_names is passed as a plain list: some scikit-learn releases reject
# a pandas Index for this parameter.
plot_tree(dtc, filled=True, rounded=True, feature_names=list(X.columns),
          class_names=['No Fraud', "Fraud"], ax=axes)
plt.show()

# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--')

# Plot tpr against fpr
plt.plot(fpr,tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for fraudulent transactions')
plt.show()



# Calculate roc_auc_score. y_pred_probs holds the decision tree's
# probabilities at this point, so the label names the decision tree rather
# than logistic regression.
print("roc_auc_score of decision tree classifier: ",roc_auc_score(y_test, y_pred_probs))

## Random Forest

In [None]:
# Randomized search over random forest hyperparameters.
# 'sqrt' replaces 'auto' for max_features: for RandomForestClassifier 'auto'
# meant sqrt(n_features), and it was removed in scikit-learn 1.3.
hyperparameters = {'n_estimators': [1, 30],
              'max_features': ['sqrt', 'log2'], 
              'max_depth': [4, 8, 10, 12],
              'criterion': ['gini', 'entropy']}


randomizedsearch = RandomizedSearchCV(RandomForestClassifier(), hyperparameters)
best_model_random = randomizedsearch.fit(X_train, y_train)

# Show the estimator with the best cross-validated score.
print(best_model_random.best_estimator_)


In [None]:
#Define model with our best parameters
rfc= RandomForestClassifier(criterion='gini',max_depth=8,max_features='log2',n_estimators=30)

In [None]:
#fit our pipeline with our training set
rfc.fit(X_train,y_train)

In [None]:
# Get the random forest's predictions for the test set.
# The classification report and confusion matrix in the following cells read
# `predicted`, so the result must be stored under that name; the former typo
# (`predictedt`) left them reporting the decision tree's predictions.
predicted = rfc.predict(X_test)

In [None]:
#print classification report for the test set
print("Classification report for the test set")
print(classification_report(y_test,predicted))

In [None]:
print("Confusion matrix for the test set")
print(confusion_matrix(y_test, predicted))
conf_mat = confusion_matrix(y_test,predicted)
sns.heatmap(conf_mat, square=True, annot=True, cmap='icefire', fmt='d', cbar=False)

plt.show()


In [None]:
# Predict probabilities
y_pred_probs = rfc.predict_proba(X_test)[:,1]


#Calculate roc_auc_score
print("roc_auc_score of random forest classifier: ",roc_auc_score(y_test, y_pred_probs))

In [None]:
# Create a pd.Series of features importances
importances = pd.Series(data=rfc.feature_importances_,
                        index= X_train.columns)

# Sort importances
importances_sorted = importances.sort_values()

# Draw a horizontal barplot of importances_sorted

fig,ax=plt.subplots(figsize=(7,7))

importances_sorted.plot(kind='barh', color='#BCD8C1')
plt.title('Features Importances')
plt.show() 


## AdaBoostClassifier

## Model Comparison

In [None]:
# Compare the models with 6-fold cross-validation on the training split and
# draw the per-fold scores as one box per model.
models = {"Logistic Regression": logreg, "Decision Trees": dtc, "Random Forest": rfc}

# The same seeded, shuffled splitter is used for every model.
kf = KFold(n_splits=6, random_state=42, shuffle=True)
results = [
    cross_val_score(estimator, X_train, y_train, cv=kf)
    for estimator in models.values()
]
plt.boxplot(results, labels=models.keys())
plt.show()

In [None]:
# Mean 5-fold cross-validation score per model, keyed by display name.
model_pipeline = [logreg, dtc, rfc]
model_names = ['Logistic Regresion','Decision Tree','Random Forest']
scores = {
    name: np.mean(cross_val_score(estimator, X_train, y_train, cv=5))
    for name, estimator in zip(model_names, model_pipeline)
}
print(scores)

## Logistic Regression combined with SMOTE

In [None]:

# Define the resampling method and the model.
# SMOTE is seeded so the synthetic samples, and therefore the reported
# metrics, are repeatable between runs.
resampling = SMOTE(random_state=42)
model = LogisticRegression(C=0.1)

# Define the pipeline: SMOTE first, then the logistic regression model
pipeline = Pipeline([('SMOTE', resampling), ('Logistic Regression', model)])

# Train/test split (stratified on the target, 30% held out for testing)
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.30, random_state=1000,stratify=y)


# Fit the pipeline on the training set only
pipeline.fit(X_train, y_train) 

# Get predictions for the test set
predicted_sm = pipeline.predict(X_test)

In [None]:
#print classification report for the test set
print("Classification report for the test set")
print(classification_report(y_test,predicted_sm))

In [None]:
print("Confusion matrix for the test set")
print(confusion_matrix(y_test, predicted_sm))
conf_mat = confusion_matrix(y_test,predicted_sm)
sns.heatmap(conf_mat, square=True, annot=True, cmap='icefire', fmt='d', cbar=False)

plt.show()

In [None]:
# Predict fraud probabilities through the fitted pipeline rather than the
# bare `model` object, so this does not depend on the pipeline having fitted
# its final step in place.
y_pred_probs = pipeline.predict_proba(X_test)[:,1]

In [None]:
# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)

plt.plot([0, 1], [0, 1], 'k--')

# Plot tpr against fpr
plt.plot(fpr,tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for fraudulent transactions')
plt.show()



#Calculate roc_auc_score
print("roc_auc_score of logistic regression classifier: ",roc_auc_score(y_test, y_pred_probs))