<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/fraud_dection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fraud Detection Notebook Using Isolation Forest

In [0]:
# Use seaborn for the correlation heatmap
!pip install seaborn

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import gridspec

import pandas as pd
import seaborn as sns

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

# Datetime related libraries
import time
import datetime
import dateutil.parser

In [0]:
# File upload while using Google Colab.
# The google.colab package is only importable inside the Colab runtime; when it
# is missing, skip the upload so the remaining cells can still run against a
# data file that is already in the working directory.
try:
  from google.colab import files
except ImportError:
  uploaded = {}  # keep the name defined for any cell that inspects it
  print('google.colab is not available - skipping upload; put the data file in the working directory.')
else:
  uploaded = files.upload()

In [0]:
#Unzipping if needed
!unzip creditcard.zip

In [0]:
# Load the dataset into a DataFrame.
# NOTE(review): 'file_name' is a placeholder - presumably the CSV extracted from
# creditcard.zip; confirm and replace it with the real path.
df = pd.read_csv('file_name')
# The following cells refer to the working frame as `df_model1`, which no earlier
# cell assigns; bind it here so the notebook runs top to bottom on a fresh kernel.
# NOTE(review): confirm no intermediate filtering step was meant to sit in between.
df_model1 = df
df.describe()

In [0]:
# Dataset dimensions as a (row count, column count) tuple
data_shape = (len(df_model1.index), len(df_model1.columns))
print(data_shape)

In [0]:
# Number of missing values per column; display only the columns that have any.
# The counts must not be de-duplicated: two columns can share the same count,
# and dropping "duplicate" counts would silently hide all but the first of them.
unknown_count = df_model1.isna().sum()
unknown_count[unknown_count > 0]

In [0]:
# Correlation Analysis
# Computes pairwise correlations on a 30% sample of the data and ranks the
# column pairs that lie above / below THRESHOLD.
# Outputs used by later cells: df_model1_corr (correlation matrix) and
# top_10_corr (strongest pairs as (|coefficient|, column, column) tuples).

# Parameters
THRESHOLD = 0.7 # Correlation Coefficient Threshold of Interest
TOP_N = 10 # Number of strongest pairs kept in top_10_corr
GEO_TOKENS = ('longitude', 'latitude', 'zipcode') # Substrings marking geo columns to exclude
#printing formats
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Sample to calculate correlations (using filtered set of columns, run exclusion calculation first)
# NOTE(review): measurements_for_corr is not assigned in any cell above; it has to
# exist before this cell runs - confirm where it is supposed to come from.
df_model1_sample = measurements_for_corr.sample(frac=0.3, random_state=42)

#geo exclusions - keep only columns whose name contains none of the geo tokens
column_geo_include = [
  column for column in df_model1_sample.columns
  if not any(token in column for token in GEO_TOKENS)
]

#Frame restricted to the non-geo columns; built once and reused below
corr_input = df_model1_sample.filter(column_geo_include)
df_model1_sample_filtered = corr_input.columns
print('columns to correlate')
print(df_model1_sample_filtered)

start = time.time()

# Correlations, positive and negative. Round to 6 decimals and fill NaN with 0
# (NaN appears e.g. for constant columns, whose correlation is undefined)
df_model1_corr = corr_input.corr().round(6).fillna(0)

print("CORRELATION MATRIX")
print(df_model1_corr)

n_columns = df_model1_corr.shape[1]
column_list = list(df_model1_corr.columns)

#Upper triangle matrix of correlation coefficients (diagonal and lower half zeroed)
corr_matrix = df_model1_corr.values
triu_corr_matrix = np.triu(corr_matrix, k=1)

#printing technical data that is used in follow-up calculations or/and troubleshooting
print("TRIANGILATED CORRELATION MATRIX")
print(triu_corr_matrix)

#Every unordered column pair exactly once: (coefficient, later column, earlier column)
all_pairs = [
  (triu_corr_matrix[j][i], column_list[i], column_list[j])
  for i in range(n_columns)
  for j in range(i)
]

#Highly correlated pairs (|coefficient| above the threshold), ranked by signed value - High to Low
corr_ranking = [pair for pair in all_pairs if abs(pair[0]) > THRESHOLD]
corr_ranking.sort(reverse=True)
print("ranked correlation array")
print(corr_ranking)

#Remaining pairs, least correlated first. The sorted result is assigned (a bare
#sorted(...) call would be discarded), and <= keeps a pair lying exactly on the
#threshold from being dropped from both lists.
corr_ranking_smallest = sorted(
  (pair for pair in all_pairs if abs(pair[0]) <= THRESHOLD),
  key=lambda item: abs(item[0])
)
if len(corr_ranking_smallest) == 0:
  # nothing below the threshold: fall back to a copy of the highly correlated pairs
  corr_ranking_smallest = corr_ranking[:]

#printing technical data that is used in follow-up calculations or/and troubleshooting
print("most uncorrelated")
print(corr_ranking_smallest)

#Select top 10 correlated set by absolute value of the correlation coefficient.
#Ranking on |coefficient| puts strong negative correlations next to strong
#positive ones instead of at the end; the slice enforces the limit of TOP_N.
top_10_corr = sorted(
  ((abs(coef), col_a, col_b) for coef, col_a, col_b in corr_ranking),
  reverse=True
)[:TOP_N]

#printing technical data that is used in follow-up calculations or/and troubleshooting
print("top 10 coeff by value - {}".format(top_10_corr))
print("Running time in seconds =", time.time() - start)

In [0]:
    # Correlation plot related parameters calculation
    CORR_BAND_LOW = 0.5   # exclusive lower bound on |coefficient|
    CORR_BAND_HIGH = 0.9  # exclusive upper bound on |coefficient|

    # Names of the columns that have at least one |coefficient| strictly inside
    # the band with some *other* column, in correlation-matrix column order.
    abs_corr = np.abs(df_model1_corr.values)
    in_band = (abs_corr > CORR_BAND_LOW) & (abs_corr < CORR_BAND_HIGH)
    np.fill_diagonal(in_band, False)  # a column's correlation with itself does not count
    col_corr = list(df_model1_corr.columns[in_band.any(axis=0)])
    print(col_corr)

    # Columns taking part in any of the top correlated pairs;
    # each item of top_10_corr is (coefficient, column, column).
    top_ten_set = {name for item in top_10_corr for name in item[1:]}
    print("top_ten_set: {}".format(top_ten_set))

    # Restrict the frame to those columns. A set has no defined iteration order,
    # so walk df_model1.columns to obtain a stable, reproducible column order
    # (names absent from df_model1 are ignored, as DataFrame.filter would do).
    if top_ten_set:
      selected_columns = [name for name in df_model1.columns if name in top_ten_set]
      df_model1_filtered = df_model1.filter(items=selected_columns)
    else:
      # no pair exceeded the threshold: plot the whole frame
      df_model1_filtered = df_model1

In [0]:
# HeatMap Using Seaborn
# font_scale scales up all text, but does not affect annot (see annot_kws={'size': 14} below)
sns.set(font_scale=1.4)
# compute the correlation matrix
corr = df_model1_filtered.corr()
# plt.subplots returns a (figure, axes) pair: unpack both and draw on that axes explicitly
fig, ax = plt.subplots(figsize=(15,15))
# NOTE(review): this palette is built but is not passed to the heatmap (the default
# colormap is used); kept as is - decide whether it should be wired in or removed.
cmap = sns.diverging_palette(0, 359, as_cmap=True)
# take the labels from the matrix that is actually plotted, so their number and
# order always match the heatmap rows/columns
labels_list = list(corr.columns)
# annot controls the correlation values display, shrink scales the colorbar to the square plot;
# passing explicit tick labels makes seaborn label every row and column
ax = sns.heatmap(corr, square=True, cbar_kws={'shrink': 0.82}, annot=True, annot_kws={'size': 14},
                 xticklabels=labels_list, yticklabels=labels_list, ax=ax)
# this centers and prints horizontally the y labels
plt.setp(ax.get_yticklabels(), rotation=0, va='center')
# this rotates the x labels
plt.setp(ax.get_xticklabels(), rotation=45, va='top')
# the plotted values are signed coefficients, not absolute values - label the colorbar accordingly
ax.collections[0].colorbar.set_label('Correlation coefficient', rotation=-90, va='bottom')
plt.show()