# Credit Card Fraud Detection
#### By Shadi Bavar, Matthew Euliano, and Claire Parisi


##### Importing Required Libraries & Dataset

In [None]:
#Importing the libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report, f1_score, accuracy_score, precision_score, recall_score, make_scorer 


In [None]:
# Load the credit card transactions from the local CSV file into a DataFrame.
# Later cells read its 'Time', 'Amount' and 'Class' columns
# (Class == 1 is reported as fraud, Class == 0 as non-fraud).
data = pd.read_csv('creditcard.csv')
# Preview the first rows to confirm the file was parsed as expected.
data.head()

In [None]:
# Show per-column summary statistics to get a feel for the feature ranges
# before plotting and scaling.
data.describe()

##### Visualization and Data Conditioning

In [None]:
# Split the rows by class once so every plot below reuses the same subsets
is_fraud = data['Class'] == 1
is_normal = data['Class'] == 0
fraud_rows = data[is_fraud]
normal_rows = data[is_normal]
total_rows = len(data)
fraud_count = len(fraud_rows)
normal_count = len(normal_rows)

# Class balance: one bar per class label shows how skewed the data is
plt.hist(data['Class'], bins = [-.4, .4, .6, 1.4], color = 'red')
plt.xlabel('Class')
plt.xticks([0, 1])
plt.ylabel('Occurrences')
plt.show()

print('Number of fraud transactions:', fraud_count, '(', round(fraud_count/total_rows*100, 2), '%)')
print('Number of non-fraud transactions:', normal_count, '(', round(normal_count/total_rows*100, 2), '%)')

# Distribution of transaction amounts and times, first for all transactions
# and then for the fraudulent ones only. A colour of None keeps matplotlib's
# default bar colour for the time histograms.
histogram_specs = (
    (data['Amount'], 'Amount ($)', 'red'),
    (fraud_rows['Amount'], 'Amount ($)', 'red'),
    (data['Time'], 'Time (in seconds)', None),
    (fraud_rows['Time'], 'Time (in seconds)', None),
)
for values, axis_label, bar_color in histogram_specs:
    plt.hist(values, bins = 500, color = bar_color)
    plt.xlabel(axis_label)
    plt.ylabel('Occurrences')
    plt.show()

# Time vs Amount feature plot, with fraud drawn on top of normal transactions
plt.scatter(normal_rows['Time'], normal_rows['Amount'], color = 'blue', label = 'Normal')
plt.scatter(fraud_rows['Time'], fraud_rows['Amount'], color = 'red', marker = 'x', label = 'Fraud')
plt.xlabel('Time (in seconds)')
plt.ylabel('Amount ($)')
plt.legend()
plt.show()

In [27]:
# Standardize the 'Amount' feature in place. Only 'Amount' is rescaled here;
# 'Time' keeps its original values.
amount_scaler = StandardScaler()
amount_as_column = data['Amount'].values.reshape(-1, 1)  # the scaler expects a 2-D array
data['Amount'] = amount_scaler.fit_transform(amount_as_column)

# Class subsets taken after scaling so they carry the rescaled amounts
scaled_fraud = data[data['Class'] == 1]
scaled_normal = data[data['Class'] == 0]

# Re-plot the amount histograms (all transactions, then fraud only) to see
# whether scaling improves the visualization
for scaled_amounts in (data['Amount'], scaled_fraud['Amount']):
    plt.hist(scaled_amounts, bins = 500, color = 'red')
    plt.xlabel('Amount (regularized)')
    plt.ylabel('Occurrences')
    plt.show()
print('Min (scaled) amount:', data['Amount'].min())

# Time vs scaled Amount, fraud drawn on top of normal transactions
plt.scatter(scaled_normal['Time'], scaled_normal['Amount'], color = 'blue', label = 'Normal')
plt.scatter(scaled_fraud['Time'], scaled_fraud['Amount'], color = 'red', marker = 'x', label = 'Fraud')
plt.xlabel('Time (in seconds)')
plt.ylabel('Amount (regularized)')
plt.legend()
plt.show()


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

#### Undersample the Data to Balance the Classes

In [None]:
# Undersample the majority (non-fraud) class so that both classes are
# equally represented in the data used for modelling.
frauds = data[data['Class'] == 1]
n_frauds = len(frauds)
fraud_indx = np.array(frauds.index)

# Randomly select, without replacement, as many non-fraud index labels as
# there are fraud transactions.
# NOTE: the draw is unseeded, so the selected rows differ between runs.
random_normal_indx = np.random.choice(data[data['Class'] == 0].index, n_frauds, replace = False)
random_normal_indx = np.array(random_normal_indx)

# Concatenate the index labels of the fraud and sampled normal transactions
undersample_indx = np.concatenate([fraud_indx, random_normal_indx])
# These values are index *labels* (taken from .index), so select with the
# label-based .loc; positional .iloc is only equivalent while `data` still
# has its default 0..n-1 index.
undersample_data = data.loc[undersample_indx]

print("Percentage of Normal Transcations:", len(undersample_data[undersample_data['Class'] == 0])/len(undersample_data)*100, '%')
print("Percentage of Fraud Transcations:", len(undersample_data[undersample_data['Class'] == 1])/len(undersample_data)*100, '%')

# Split into the feature matrix (every column except the label) and the label
x = undersample_data.drop(['Class'], axis = 1)
y = undersample_data['Class']

#### Cross Validation 

In [38]:
# Hold out 20% of the undersampled data as a test set; the k-fold search
# below only ever sees the remaining training portion.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 4)

k = 5
k_fold = KFold(n_splits = k, shuffle = True, random_state = 4)
c = [0.01, 0.1, 1, 10, 100] #inverse regularization strengths (C) to try

# For each candidate C, fit on k-1 folds, score on the held-out fold, and
# report the metrics averaged over the k folds.
for c_value in c:

    recall = np.zeros(k)
    precision = np.zeros(k)
    f1 = np.zeros(k)
    for fold, (train_index, valid_index) in enumerate(k_fold.split(x_train)):
        lr = LogisticRegression(C = c_value, penalty= "l2", max_iter= 1000) #Logistic Regression model
        Xtrain = x_train.iloc[train_index]
        Ytrain = y_train.iloc[train_index]
        lr.fit(Xtrain, Ytrain) #Fitting the model

        # Slice the validation fold once and reuse it for prediction and scoring
        x_valid = x_train.iloc[valid_index]
        y_valid = y_train.iloc[valid_index]
        y_pred = lr.predict(x_valid) #Predicting on the validation set

        #Evaluating the model
        recall[fold] = recall_score(y_valid, y_pred)
        precision[fold] = precision_score(y_valid, y_pred)
        f1[fold] = f1_score(y_valid, y_pred)

    avg_recall = np.mean(recall)
    avg_precision = np.mean(precision)
    avg_f1 = np.mean(f1)
    print('C =', c_value, ', Recall:', avg_recall, 'Precision:', avg_precision, 'F1:', avg_f1)


C = 0.01 ,Recall: 0.8834103317669963 Precision: 0.983936345344796 F1: 0.9304858941580078
C = 0.1 ,Recall: 0.9084928134701457 Precision: 0.9700031937034742 F1: 0.9373996925178808
C = 1 ,Recall: 0.9107734596532309 Precision: 0.9678033724781692 F1: 0.9375995581814076
C = 10 ,Recall: 0.9108708622506334 Precision: 0.9604482531311799 F1: 0.9340893175657499
C = 100 ,Recall: 0.898239283303265 Precision: 0.9666355379770014 F1: 0.9305475238243119
