## SVM
##### 10/12/21

In [250]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

# Seed NumPy's global random generator with a fixed value so that a fresh
# "Restart & Run All" repeats the same random draws.
# NOTE(review): presumably the split and `.sample` calls further down draw from
# this global state because they pass no `random_state` — verify.
np.random.seed(42)

#### Read the input file
###### Discard the `Time` variable

In [251]:
# Load the transactions and discard the `Time` column in one step.
# The column is dropped immediately, so it is not converted or downcast first:
# that work would be thrown away.
df = pd.read_csv("creditcard.csv", sep=',').drop(columns="Time")

#### Split the data into train and test sets

In [252]:
# Separate the target column `Class` from the feature columns.
y = df["Class"]
X = df.drop(columns="Class")

# Hold out 20% of the rows for testing, stratifying on the target.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
)

#### Scale the `Amount` column to the range [0, 1]

In [253]:
# The scaler is fitted on the training partition only; the test partition is
# then transformed with those same fitted parameters.
min_max_scaler = MinMaxScaler()

# `assign` builds new frames instead of writing a column into the frames handed
# back by `train_test_split`, which can raise pandas' SettingWithCopyWarning.
# `ravel()` flattens the (n, 1) scaler output into a plain 1-D column of values.
train_X = train_X.assign(
    Amount=min_max_scaler.fit_transform(train_X[["Amount"]].to_numpy()).ravel()
)
test_X = test_X.assign(
    Amount=min_max_scaler.transform(test_X[["Amount"]].to_numpy()).ravel()
)

##### Deal with the imbalance of the 2 classes by undersampling the majority class, in our case the legitimate transactions, so that each class is equally represented in the training dataset 

In [254]:
# Put features and target back together so rows can be filtered by class label.
df2 = pd.concat([train_X, train_Y], axis=1)

# Select each class explicitly by its label. Unpacking `value_counts()` instead
# would depend on which class happens to be more frequent, because the counts
# come back ordered by frequency rather than by label.
df2_legit = df2[df2["Class"] == 0]
df2_fraud = df2[df2["Class"] == 1]
n_legit, n_fraud = len(df2_legit), len(df2_fraud)

# Undersample the legitimate rows down to the number of fraud rows.
df2_temp = df2_legit.sample(n_fraud)

df_downsampled = pd.concat([df2_temp, df2_fraud])

# Shuffle data so the two classes are interleaved, then renumber the rows.
df_downsampled = df_downsampled.sample(frac=1).reset_index(drop=True)

train_X = df_downsampled.drop(columns="Class")
train_Y = df_downsampled["Class"]


In [255]:
# NOTE(review): the test partition is undersampled to a 50/50 class mix here,
# so the metrics computed on it describe a balanced sample rather than the
# class mix of the original data — confirm this is intended.
df3 = pd.concat([test_X, test_Y], axis=1)

# Select each class explicitly by its label. Unpacking `value_counts()` instead
# would depend on which class happens to be more frequent, because the counts
# come back ordered by frequency rather than by label.
df3_legit = df3[df3["Class"] == 0]
df3_fraud = df3[df3["Class"] == 1]
n_legit_t, n_fraud_t = len(df3_legit), len(df3_fraud)

# Undersample the legitimate rows down to the number of fraud rows.
df3_temp = df3_legit.sample(n_fraud_t)

df3_downsampled = pd.concat([df3_temp, df3_fraud])

# Shuffle data so the two classes are interleaved, then renumber the rows.
df3_downsampled = df3_downsampled.sample(frac=1).reset_index(drop=True)

test_X = df3_downsampled.drop(columns="Class")
test_Y = df3_downsampled["Class"]

#### Train the SVM classifier on the training data

In [256]:
# Hyper-parameters for the support vector classifier.
c, gamma_ = 100, 5
kernel_ = 'sigmoid'
# NOTE(review): per the scikit-learn docs `degree` applies to the 'poly' kernel
# only, so with the sigmoid kernel this value should have no effect — confirm.
deg = 5

clf = SVC(kernel=kernel_, C=c, gamma=gamma_, degree=deg)

# Fit on the undersampled training data. The call stays as the cell's last
# expression so the fitted estimator is displayed as the cell output.
clf.fit(train_X, train_Y)

SVC(C=100, degree=5, gamma=5, kernel='sigmoid')

#### Make Predictions 

In [257]:
# Predict a class label for every row of the test features with the fitted SVM.
y_pred = clf.predict(test_X)

#### Metrics

In [258]:
# Each entry pairs a printf-style template with the metric function that fills
# it; the differing field widths line the printed values up in one column.
metric_report = (
    ("Accuracy : %7.4f", accuracy_score),
    ("Recall : %9.4f", recall_score),
    ("Precision : %2.4f", precision_score),
    ("F1 Score : %7.4f", f1_score),
)

for template, metric in metric_report:
    print(template % metric(test_Y, y_pred))

Accuracy :  0.7398
Recall :    0.7857
Precision : 0.7196
F1 Score :  0.7512


In [259]:
# With labels=[0, 1] the matrix rows are the true classes and the columns the
# predicted classes, so flattening it yields TN, FP, FN, TP in that order.
cm = confusion_matrix(test_Y, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Print the four cells with a short description of what each count means.
for description, count in (
    ("(Actually Fraudulant transactions) TP:", tp),
    ("(Legit transactions classified as fraudulant) FP:", fp),
    ("(Actually Legit transactions) TN:", tn),
    ("(Fraudulant transactions classified as legitimate) FN:", fn),
):
    print(description, count)


(Actually Fraudulant transactions) TP: 77
(Legit transactions classified as fraudulant) FP: 30
(Actually Legit transactions) TN: 68
(Fraudulant transactions classified as legitimate) FN: 21
