# IEEE-CIS Fraud Detection

## Imports

In [1]:
# Dataset
import pandas as pd
import numpy as np

# Graph
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Pairwise distance metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances

import warnings
warnings.filterwarnings("ignore")

# Visualization
# from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from umap import UMAP

# Metric
from sklearn.metrics import silhouette_score
from sklearn.manifold import trustworthiness

# Math
from math import sqrt

from operator import itemgetter 

# Imbalanced dataset
from imblearn.over_sampling import SMOTE  # type: ignore

## Read files

In [2]:
!unzip 'data/train_transaction.zip' -d 'data'

Archive:  data/train_transaction.zip
replace data/train_transaction.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [16]:
# Load the training transactions CSV from the data/ directory.
train_transaction_df = pd.read_csv(filepath_or_buffer="data/train_transaction.csv")

In [8]:
# Preview the loaded frame (the rendered output elides middle rows and columns).
train_transaction_df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


Replace NaN values with 0

In [26]:
# Fill every missing entry with 0.
train_transaction_df = train_transaction_df.fillna(0)

In [27]:
# Features: every column whose name contains "V"; target: the isFraud flag.
# Both are restricted to the first 5000 rows.
X_pca_df = train_transaction_df.filter(like='V').iloc[:5000]
y_pca_df = train_transaction_df['isFraud'].iloc[:5000]

Analyzing the proportion of fraudulent samples:

In [28]:
# Class proportions of the target (normalize=True yields fractions, not counts).
y_pca_df.value_counts(normalize=True)

0    0.9782
1    0.0218
Name: isFraud, dtype: float64

Fraudulent transactions make up approximately 2% of this 5,000-row sample, so we are dealing with an imbalanced dataset.

We'll have to apply an oversampling or undersampling technique; here we oversample the minority class with SMOTE.

In [29]:
def balance_dataset(X, y, random_state=42):
    """Oversample the data with SMOTE and return the resampled pair.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix handed to ``SMOTE.fit_resample``.
    y : array-like or Series
        Class labels handed to ``SMOTE.fit_resample`` alongside ``X``.
    random_state : int, RandomState instance or None, default=42
        Seed forwarded to ``SMOTE`` so repeated runs produce the same
        synthetic samples. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` exactly as returned by
        ``SMOTE.fit_resample``.
    """
    oversample = SMOTE(random_state=random_state)
    # Bind the result to new names instead of overwriting the arguments.
    X_resampled, y_resampled = oversample.fit_resample(X, y)

    return X_resampled, y_resampled

In [30]:
# Resample the selected features and target to balance the classes.
X_pca, y_pca = balance_dataset(X=X_pca_df, y=y_pca_df)

In [32]:
# Class proportions of the resampled target (fractions, not counts).
y_pca.value_counts(normalize=True)

0    0.5
1    0.5
Name: isFraud, dtype: float64

## Visualizations

In [33]:
def plotFigure(data, y):
    """Build a 2-D scatter figure of embedded points coloured by label.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_components), n_components >= 2
        Embedded coordinates. Only the first two columns are plotted
        (column 0 on the x axis, column 1 on the y axis).
    y : array-like of length n_samples
        One label per point; used both for the marker colour and as the
        hover title.

    Returns
    -------
    plotly.graph_objects.Figure
        The configured figure; call ``.show()`` on it to render.

    Raises
    ------
    ValueError
        If ``data`` is non-empty and not 2-D with at least two columns.
    """
    coords = np.asarray(data)
    if coords.size == 0:
        # Keep empty input plottable: produce an empty two-column frame.
        coords = coords.reshape(0, 2)
    if coords.ndim != 2 or coords.shape[1] < 2:
        raise ValueError(
            "data must be 2-D with at least two columns, got shape "
            f"{coords.shape}"
        )

    # Slice the coordinate columns directly; the input argument is left as is.
    points_df = pd.DataFrame({'x': coords[:, 0], 'y': coords[:, 1]})

    # px.scatter places each marker at its exact (x, y); px.strip is a
    # categorical strip plot that jitters points, which is not wanted for an
    # embedding.
    fig = px.scatter(
        data_frame=points_df,
        x='x',
        y='y',
        hover_name=y,
        color=y,
        hover_data={'x': False, 'y': False},
    )

    # Both axes share one style: tick labels hidden, black mirrored border.
    axis_style = dict(
        color='white',
        showticklabels=False,
        showline=True,
        linewidth=1,
        linecolor='black',
        mirror=True,
    )
    fig.update_yaxes(**axis_style)
    fig.update_xaxes(**axis_style)
    fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)')

    return fig

In [34]:
# Fixed seed so the t-SNE embedding is reproducible across kernel restarts.
pca_tsne = TSNE(perplexity=500, random_state=42).fit_transform(X_pca)

In [35]:
# Swap the 0/1 target for readable legend labels, then plot the t-SNE embedding.
fig = plotFigure(
    data=pca_tsne,
    y=y_pca.replace({1: 'Fraud', 0: 'Not fraud'}),
)
print('T-distributed Stochastic Neighbor Embedding (TSNE)')
fig.show()

T-distributed Stochastic Neighbor Embedding (TSNE)


In [36]:
# Fixed seed so the UMAP embedding is reproducible across kernel restarts.
pca_umap = UMAP(n_neighbors=50, min_dist=0.1, random_state=42).fit_transform(X_pca)

In [37]:
# Swap the 0/1 target for readable legend labels, then plot the UMAP embedding.
fig = plotFigure(
    data=pca_umap,
    y=y_pca.replace({1: 'Fraud', 0: 'Not fraud'}),
)
print('Uniform Manifold Approximation and Projection (UMAP)')
fig.show()

Uniform Manifold Approximation and Projection (UMAP)
