In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors 
from sklearn import datasets, model_selection as ms
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import drive
drive.mount('/content/drive/',force_remount=True)
!ls "/content/drive/My Drive/COMP551/"

creditcard = pd.read_csv("/content/drive/My Drive/COMP551/creditcard.csv")

Mounted at /content/drive/
creditcard.csv


In [None]:
# Missing-data check (left disabled): per-column null counts and percentages.
#total = creditcard.isnull().sum().sort_values(ascending = False)
#percent = (creditcard.isnull().sum()/creditcard.isnull().count()*100).sort_values(ascending = False)
#pd.concat([total, percent], axis=1, keys=['Total', 'Percent']).transpose()
# Outcome noted by the author: there is no missing data.

In [None]:
# Split the data into features / labels, then into train / test partitions.
#
# Features: the columns at positions 1..29 (position 0 is deliberately skipped).
# Labels:   the column at position 30 (presumably "Class" - confirm against
#           the CSV header).
x = creditcard.iloc[:, 1:30]
y = creditcard.iloc[:, 30]

# Hold out 25% of the rows for testing, with a fixed seed for repeatability.
# stratify=y keeps the class proportions identical in both partitions; this
# notebook treats the label as strongly unbalanced, and without stratification
# the number of rare-class rows landing in the test set would vary by chance.
x_train, x_test, y_train, y_test = ms.train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=99,
    stratify=y,
)

In [None]:
# Class balance check: bar chart of how many transactions carry each label.

# Count the rows per label and lay the counts out as a two-column frame.
temp = creditcard["Class"].value_counts()
df = pd.DataFrame({'Class': temp.index, 'values': temp.values})

# One bar per class; the text on each bar is the raw count.
trace = go.Bar(
    x=df['Class'],
    y=df['values'],
    name="Credit Card Fraud Class - data unbalance (Not fraud = 0, Fraud = 1)",
    marker={'color': "Blue"},
    text=df['values'],
)
data = [trace]

# Figure-level settings: title, axis captions, hover behaviour and width.
layout = {
    'title': 'Credit Card Fraud Class - data unbalance (Not fraud = 0, Fraud = 1)',
    'xaxis': {'title': 'Class', 'showticklabels': True},
    'yaxis': {'title': 'Number of transactions'},
    'hovermode': 'closest',
    'width': 700,
}

# Plotly accepts a plain dict with "data" and "layout" keys as a figure.
fig = {'data': data, 'layout': layout}
iplot(fig, filename='class')

In [None]:
# Feature correlation: pairwise correlation of every column, drawn as a heatmap.
corr = creditcard.corr()

# Explicit figure/axes so the title and the heatmap target the same axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 14))
heatmap_ax.set_title('Credit Card Transactions features correlation plot (Pearson)')
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    linewidths=.1,
    cmap="Blues",
    ax=heatmap_ax,
)
plt.show()

In [None]:
# Isolation Forest: unsupervised outlier detector, evaluated here as a
# fraud classifier against the held-out labels.
# random_state pins the randomised tree construction so the printed metrics
# are repeatable from run to run.
IF = IsolationForest(
    n_estimators=50,
    max_samples=20,
    contamination=0.001,
    random_state=99,
)
# y_train is accepted for API consistency only; the estimator does not use it.
IF.fit(x_train, y_train)

# predict() returns -1 for outliers and 1 for inliers; remap to the dataset's
# label coding (1 = fraud, 0 = not fraud) before scoring.
y_pred = pd.Series(IF.predict(x_test)).map({-1: 1, 1: 0})
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ROC AUC must be computed from a continuous ranking score: feeding it hard
# 0/1 predictions collapses the curve to a single operating point.
# score_samples() is lower for more abnormal points, so negate it to make
# "higher = more likely fraud" line up with the positive class (1).
fraud_score = -IF.score_samples(x_test)
print(roc_auc_score(y_test, fraud_score))

In [None]:
# Local Outlier Factor: density-based outlier detector, evaluated here as a
# fraud classifier against the held-out labels.
# novelty=True is what allows predict()/score_samples() on data that was not
# part of the fit (x_test).
LOF = neighbors.LocalOutlierFactor(
    novelty=True,
    n_neighbors=20,
    contamination=0.001,
)
# y_train is accepted for API consistency only; the estimator does not use it.
LOF.fit(x_train, y_train)

# predict() returns -1 for outliers and 1 for inliers; remap to the dataset's
# label coding (1 = fraud, 0 = not fraud) before scoring.
y_pred2 = pd.Series(LOF.predict(x_test)).map({-1: 1, 1: 0})
print(accuracy_score(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

# ROC AUC must be computed from a continuous ranking score: feeding it hard
# 0/1 predictions collapses the curve to a single operating point.
# score_samples() is lower for more abnormal points, so negate it to make
# "higher = more likely fraud" line up with the positive class (1).
fraud_score2 = -LOF.score_samples(x_test)
print(roc_auc_score(y_test, fraud_score2))