In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# Read creditcard.csv from the Kaggle input directory into a DataFrame.
creditcarddf = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv') # Load the dataset of interest

In [5]:
creditcarddf.head() # Preview the first five rows (head() defaults to n=5)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
creditcarddf.describe()

In [7]:
# Number of columns that contain at least one missing value (0 means the data is complete).
creditcarddf.isna().any().sum()

There aren't any missing values so we don't have to worry about filling in values/imputation.

Let's see how many transactions have been classified as fraud.

In [8]:
# Count each class once, then report its share of all transactions in percent.
class_counts = creditcarddf['Class'].value_counts()
n_transactions = len(creditcarddf)

no_fraud_pct = round(class_counts[0] / n_transactions * 100, 2)
fraud_pct = round(class_counts[1] / n_transactions * 100, 2)

print('No Fraud: {}%'.format(no_fraud_pct))
print('Fraud: {}%'.format(fraud_pct))

In [16]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Bar chart of how many transactions fall into each class of the full dataset.
# The column is passed as x= because recent seaborn releases accept only `data`
# positionally; passing 'Class' positionally together with data= fails there.
sns.countplot(x='Class', data=creditcarddf)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)')

In [23]:
# Plot the distributions of the raw Amount and Time columns side by side.
# sns.distplot is deprecated in seaborn, so sns.histplot is used instead:
# kde=True with stat='density' draws a density-normalised histogram with a KDE curve on top,
# and bins=50 keeps the bin count bounded (automatic bin selection can produce a very
# large number of bins on heavy-tailed data).
amount_val = creditcarddf['Amount'].values.ravel()
time_val = creditcarddf['Time'].values.ravel()

fig, ax = plt.subplots(1, 2, figsize=(18, 4))

distribution_panels = [
    (amount_val, ax[0], 'r', 'Transaction Amount Distribution'),
    (time_val, ax[1], 'b', 'Transaction Time Distribution'),
]
for values, axis, color, title in distribution_panels:
    sns.histplot(values, bins=50, kde=True, stat='density', color=color, ax=axis)
    axis.set_title(title)
    # Clip the x-axis to the observed range; ndarray.min()/max() avoid iterating
    # the array element by element with the Python built-ins.
    axis.set_xlim([values.min(), values.max()])

These are really large ranges so these two columns need to be scaled down.

In [24]:
from sklearn.preprocessing import RobustScaler

In [25]:
# Rescale Amount and Time with a RobustScaler, then remove the unscaled columns.
# fit_transform re-fits the scaler on every call, so after this cell `robust_scaler`
# holds the statistics of the Time column only.
robust_scaler = RobustScaler()

# The scaler expects a 2-D input, hence the single-column reshape.
amount_column = creditcarddf['Amount'].values.reshape(-1, 1)
time_column = creditcarddf['Time'].values.reshape(-1, 1)

scaled_amount = robust_scaler.fit_transform(amount_column)
scaled_time = robust_scaler.fit_transform(time_column)

# The raw columns are no longer needed once their scaled versions exist.
creditcarddf.drop(columns=['Amount', 'Time'], inplace=True)

In [27]:
# Put the scaled features in the first two column positions of the frame.
# DataFrame.insert raises if the column already exists, so this cell can only run once per load.
creditcarddf.insert(loc=0, column='scaled_amount', value=scaled_amount)
creditcarddf.insert(loc=1, column='scaled_time', value=scaled_time)

In [28]:
# Preview the frame now that scaled_amount and scaled_time sit in the first two columns.
creditcarddf.head()

Now we will split the data into X and y.

In [34]:
# Target vector and feature matrix taken from the FULL dataset.
# These are not undersampled yet: the class imbalance is still present at this point.
y = creditcarddf['Class']
X = creditcarddf.drop(columns=['Class'])

In [32]:
from sklearn.model_selection import StratifiedKFold

In [36]:
# Build stratified train/test index sets on the full (imbalanced) data so that each fold
# keeps the original class ratio.
# NOTE: random_state is intentionally omitted. With shuffle=False the folds are already
# deterministic, and recent scikit-learn releases raise a ValueError when random_state is
# set without shuffle=True.
stratified_split = StratifiedKFold(n_splits=5, shuffle=False)

# Every iteration overwrites the original_* variables, so only the split produced by the
# LAST fold is kept once the loop finishes.
for train_index, test_index in stratified_split.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# Convert the pandas objects of the retained fold into plain NumPy arrays.
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

In [39]:
# Count how often each label occurs in the retained train and test folds.
# np.unique returns the labels sorted, so the counts line up with those sorted labels.
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)

# Share of each class in percent; both splits should show nearly the same ratio.
train_class_pct = train_counts_label * 100 / len(original_ytrain)
test_class_pct = test_counts_label * 100 / len(original_ytest)

print(train_class_pct)
print(test_class_pct)

In [44]:
# Balance the dataset by random undersampling of the majority (non-fraud) class.

# Shuffle all rows first so the non-fraud rows kept below are a random selection.
creditcarddf = creditcarddf.sample(frac=1, random_state=0)

# Keep every fraud row, and exactly as many non-fraud rows as there are fraud rows.
# The count is derived from the data instead of being hard-coded, so the subsample
# stays balanced even if the number of fraud rows changes.
fraud = creditcarddf.loc[creditcarddf['Class'] == 1]
non_fraud = creditcarddf.loc[creditcarddf['Class'] == 0].iloc[:len(fraud)]

len(fraud), len(non_fraud)

In [45]:
# Stack the fraud rows on top of the equally sized non-fraud selection.
balanced_df = pd.concat((fraud, non_fraud), axis=0)

# Shuffle with a fixed seed so the two classes are interleaved reproducibly.
new_df = balanced_df.sample(frac=1, random_state=8)

new_df.head()

In [46]:
# Bar chart of the class counts in the balanced subsample.
# The column is passed as x= because recent seaborn releases accept only `data`
# positionally; passing 'Class' positionally together with data= fails there.
sns.countplot(x='Class', data=new_df)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)')

In [47]:
# Compare the correlation matrix of the full frame (top) with that of the balanced
# subsample (bottom). The subsample is the one to read correlations from.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(24, 20))

corr = creditcarddf.corr()
sub_sample_corr = new_df.corr()

heatmap_panels = [
    (corr, ax1, "Imbalanced Correlation Matrix \n (don't use for reference)"),
    (sub_sample_corr, ax2, 'SubSample Correlation Matrix \n (use for reference)'),
]
for matrix, axis, panel_title in heatmap_panels:
    sns.heatmap(matrix, cmap='coolwarm_r', annot_kws={'size': 20}, ax=axis)
    axis.set_title(panel_title, fontsize=14)

plt.show()

**Negative Correlations**: V14, V12, V10 and V9 are negatively correlated with `Class`. Notice how the lower these values are, the more likely the transaction is to be fraudulent.


**Positive Correlations**: V2, V4, V11, and V19 are positively correlated with `Class`. Notice how the higher these values are, the more likely the transaction is to be fraudulent.

In [49]:
# Box plots of the negatively correlated features, split by class, on the balanced subsample.
# For these features a lower value goes along with a higher chance of fraud.
f, axes = plt.subplots(ncols=4, figsize=(20, 5))

negative_panels = [
    ("V14", 'V14 vs Class Negative Correlation'),
    ("V12", 'V12 vs Class Negative Correlation'),
    ("V10", 'V10 vs Class Negative Correlation'),
    ("V9", 'V9 vs Class Negative Correlation'),
]
for axis, (feature, panel_title) in zip(axes, negative_panels):
    sns.boxplot(x="Class", y=feature, data=new_df, ax=axis)
    axis.set_title(panel_title)

plt.show()

In [50]:
# Box plots of the positively correlated features, split by class, on the balanced subsample.
# For these features a higher value goes along with a higher chance of fraud.
f, axes = plt.subplots(ncols=4, figsize=(20, 5))

positive_panels = [
    ("V2", 'V2 vs Class Positive Correlation'),
    ("V4", 'V4 vs Class Positive Correlation'),
    ("V11", 'V11 vs Class Positive Correlation'),
    ("V19", 'V19 vs Class Positive Correlation'),
]
for axis, (feature, panel_title) in zip(axes, positive_panels):
    sns.boxplot(x="Class", y=feature, data=new_df, ax=axis)
    axis.set_title(panel_title)

plt.show()

Now we will train some classifiers.

In [51]:
# Rebind X and y to the balanced subsample; from here on they no longer refer to the full dataset.
y = new_df['Class']
X = new_df.drop(columns='Class')

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced subsample for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline

# Candidate models, all with their default hyper-parameters.
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "Random Forest": RandomForestClassifier()
}

from sklearn.model_selection import cross_val_score

# Fit each model on the training split and report its mean 5-fold cross-validated accuracy.
for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    # Scale to percent BEFORE rounding: rounding first and multiplying afterwards can
    # print floating-point artefacts such as 56.99999999999999.
    mean_accuracy_pct = round(training_score.mean() * 100, 2)
    print("Classifiers: ", key,
          "has a training score of", mean_accuracy_pct,
          "% accuracy score")

Logistic Regression and the Support Vector Classifier achieved the best cross-validation scores on our training dataset. We will explore Logistic Regression further.

In [60]:
# Baseline logistic regression with default settings, fitted on the training split.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [74]:
from sklearn.model_selection import GridSearchCV

# Search grid: both penalty types crossed with three regularisation strengths.
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
}

# liblinear is chosen as the solver because it handles both the l1 and the l2 penalty.
lr = LogisticRegression(solver='liblinear')

# Exhaustive 5-fold search over the grid; keep the best estimator found.
cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=5)
log_reg = cv.fit(X_train, y_train).best_estimator_

In [75]:
# Show the penalty / C combination that the grid search selected as best.
cv.best_params_

In [77]:
# Re-score the tuned logistic regression with 5-fold cross-validation on the training split
# and report the mean accuracy as a percentage with two decimals.
log_reg_score = cross_val_score(estimator=log_reg, X=X_train, y=y_train, cv=5)
mean_score_pct = round(log_reg_score.mean() * 100, 2)
score_text = mean_score_pct.astype(str) + '%'
print('Logistic Regression Cross Validation Score: ', score_text)