# Data Understanding

In [None]:
# Standard Library Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 3rd Party Library Imports
import seaborn as sns
from numpy.core.defchararray import add
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization, Conv1D

## Data Description

load data

In [None]:
df = pd.read_csv("dataset.csv")

show the first 5 rows of the dataset

In [None]:
df.head() 

shows the last 5 rows of the dataset

In [None]:
df.tail() 

shows the statistical summary of the dataset

In [None]:
df.describe() 

shows the number of rows and columns

In [None]:
df.shape 

shows that there are no null values

In [None]:
df.info() 

## Exploratory Data Analysis (EDA)

#### `Feature Correlation Heatmap`

In [None]:
# Restrict the correlation analysis to the float64/int64 columns.
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Pairwise correlation between every pair of numeric features.
correlation_matrix = numeric_df.corr()

# Draw the matrix as a heatmap (cells are coloured only, not annotated).
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=correlation_matrix,
    cmap='plasma',
    annot=False,
    linewidths=0.5,
)
plt.title(label='Feature Correlation Heatmap', fontsize=15)

In [None]:
plt.show()

#### `Fraud vs Non-Fraud Distribution`

`1` -> fraud

`0` -> not fraud

In [None]:
df['label'].value_counts()

In [None]:
# Share of rows labelled as fraud (label == 1), expressed as a percentage.
fraud_rows = len(df[df['label'] == 1])
fraud_percentage = (float(fraud_rows) / len(df)) * 100
print('percentage of fraud transaction = ', fraud_percentage, "%")

In [None]:
import plotly.express as px

# Class frequencies as a two-column frame: the label value and its row count.
counts = df['label'].value_counts().reset_index()
counts.columns = ['label', 'total']

# Pie chart options collected in one place, then handed to plotly.
pie_options = {
    'names': 'label',
    'values': 'total',
    'title': 'Fraud vs Non-Fraud Distribution',
    'hover_data': ['total'],
    'color_discrete_sequence': px.colors.qualitative.Alphabet,
}
fig = px.pie(counts, **pie_options)

# Show each slice's percentage and place the legend at the given position.
fig.update_traces(textinfo='percent')
fig.update_layout(legend={'x': 0.15, 'y': 1.0})

fig.show()

# Data Preprocessing

#### `Validation Data`

In [None]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints how many columns ``df`` has and how many of them contain at
    least one missing value, then returns the per-column summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value
        (indexed by column name), with the columns ``'Missing Values'``
        (count) and ``'% of Total Values'`` (percentage of rows, rounded
        to one decimal), sorted by percentage in descending order.
    """
    # Count the nulls once and derive the percentage from that count.
    missing_count = df.isnull().sum()
    missing_percent = 100 * missing_count / len(df)

    summary = pd.concat([missing_count, missing_percent], axis=1)
    summary = summary.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Filter on the count, not the percentage: for a frame with zero rows
    # the percentage is 0/0 = NaN, and NaN != 0 would wrongly report every
    # column as containing missing values.
    summary = summary[summary['Missing Values'] != 0]
    summary = summary.sort_values(
        '% of Total Values', ascending=False).round(1)

    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
        "There are " + str(summary.shape[0]) +
          " columns that have missing values.")

    return summary

check for missing values

In [None]:
missing_values_table(df) 

check for duplicates

In [None]:
df.duplicated().sum()

Drop the following columns:
1. `visit_id` -> not used as a feature
2. `dx2_koo_k93`, `dx2_u00_u99`, `procv00_v89` -> show no correlation

In [None]:
# Columns removed from the frame before any further preprocessing.
drop = [
    'visit_id',
    'dx2_koo_k93',
    'dx2_u00_u99',
    'procv00_v89',
]
df.drop(columns=drop, inplace=True)

In [None]:
df.head()

change gender to numeric

`0` -> `L` (male, *laki-laki*)

`1` -> `P` (female, *perempuan*)

In [None]:
# Encode the 'jkpst' codes as integers: 'L' -> 0, 'P' -> 1.
# Assign the result back instead of calling replace(..., inplace=True) on the
# df['jkpst'] selection: that form is chained assignment, which pandas warns
# about and which leaves `df` unchanged once Copy-on-Write is enabled.
df['jkpst'] = df['jkpst'].replace({'L': 0, 'P': 1})

numeric data type

In [None]:
data_num = df.select_dtypes(include=[np.number])

categorical (object) data type

In [None]:
data_cat = df.select_dtypes(exclude=[np.number])

get dummies (data transformation)

In [None]:
transform_cat = pd.get_dummies(data_cat, prefix_sep='_', drop_first=True) 

In [None]:
# Join the one-hot encoded categorical columns and the numeric columns side by
# side. Both were selected from the same frame, so they share its row index
# and no helper key column is needed to line the rows up.
df = pd.concat([transform_cat, data_num], axis=1)

In [None]:
df.head()

In [None]:
df.columns

## Imbalance Data Handling

In [None]:
df['label'].value_counts()

In [None]:
# Separate the rows by class so each class can be resampled independently.
labels = df['label']
fraud = df[labels == 1]
non_fraud = df[labels == 0]

In [None]:
fraud.shape

In [None]:
non_fraud.shape

#### `Upsampling`

In [None]:
# Resample the non-fraud rows (with replacement) to the same number of rows as
# the fraud class, so both classes end up equally represented.
# random_state pins the draw so that re-running the notebook yields the same
# balanced set, in line with the fixed seed passed to train_test_split.
# NOTE(review): replace=True can draw the same row more than once, and this
# happens before the train/test split, so copies of one row may land in both
# splits - consider resampling the training split only.
non_fraud = non_fraud.sample(fraud.shape[0], replace=True, random_state=0)
non_fraud.shape

In [None]:
df = pd.concat([fraud, non_fraud], ignore_index=True)

In [None]:
df['label'].value_counts()

# Modeling

`feature selection` for determining `input` and `target features`

In [None]:
# Target vector: the class label column.
y = df['label']
# Input features: every remaining column.
X = df.drop(columns='label')

change the scale for each feature using `normalization` so that each value is on a `scale` between `0-1`.

In [None]:
# Rescale every feature column to the [0, 1] range. fit_transform returns a
# NumPy array by default, so X is no longer a DataFrame after this cell.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set min/max values influence the scaling - consider
# fitting on the training split only. A StandardScaler is also applied after
# the split, which may make this step redundant; confirm it is still needed.
X = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)

divide the dataset into `training data` and `test data`

In [None]:
# Hold out 20% of the rows as the test set; random_state=0 makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) 
# Display the resulting shapes as a quick check of the split sizes.
X_train.shape, X_test.shape

In [None]:
X_train.shape, X_test.shape 

In [None]:
X_train 

`standardization` rescales each feature to zero mean and unit variance. The scaler is fitted on the training data only and then applied unchanged to the test data.

In [None]:
scaler = StandardScaler()

# Learn the scaling statistics from the training features, then apply the
# same fitted scaler to the test features.
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

# Convert the label Series to plain NumPy arrays.
y_train, y_test = (labels.to_numpy() for labels in (y_train, y_test))

In [None]:
X_train.shape

In [None]:
# Append a trailing axis of length 1 so each sample has the
# (n_features, 1) layout passed to the Conv1D input.
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]

In [None]:
X_train.shape, X_test.shape

In [None]:
X_train[0].shape

## Model CNN

In [None]:
# 1D CNN for binary classification: two Conv1D stages (each followed by batch
# normalisation and dropout), then a dense head ending in a single sigmoid unit.

# Take the feature count from the training data instead of hard-coding it, so
# the model still builds when the number of columns changes (for example when
# the one-hot encoding produces a different set of columns).
n_features = X_train.shape[1]

model = Sequential()
model.add(Conv1D(32, 2, activation='relu', input_shape=(n_features, 1)))
model.add(BatchNormalization())
model.add(Dropout(0.1))

model.add(Conv1D(64, 2, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))

# One sigmoid output; it is used with the binary cross-entropy loss.
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy']) 

In [None]:
training_results = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), verbose=1) 

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2,  random_state=0) 

In [None]:
print('X_val', X_val.shape) 

In [None]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
loss_v, accuracy_v = model.evaluate(X_val, y_val, verbose=1)

In [None]:
# Report accuracy and loss for the validation and test evaluations.
print(f"Validation | accuracy = {accuracy_v:f}  ;  loss = {loss_v:f}")
print(f"Test       | accuracy = {accuracy:f}  ;  loss = {loss:f}")

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels by rounding to the nearest
# integer.
y_pred_cnn = np.round(model.predict(X_test)).astype(int)

# Confusion matrix of true vs predicted labels, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_cnn)
sns.heatmap(cm, fmt="d", annot=True, cbar=False)
plt.title('CNN Confusion Matrix')

# Save the figure to disk before displaying it.
plt.savefig('cnn_con_mat')
plt.show()

In [None]:
print(classification_report(y_test, y_pred_cnn))

In [None]:
print('Precision Score:', precision_score(y_test, y_pred_cnn))
print('Accuracy Score:', accuracy_score(y_test, y_pred_cnn))
print('Recall Score:', recall_score(y_test,y_pred_cnn))

In [None]:
print("Number of fraud points in the testing dataset =", sum(y_test))