In [5]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from datetime import datetime
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import StandardScaler

2023-04-09 22:19:52.435235: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Draw all matplotlib text, axis labels and tick marks in white.
# Comment this cell out if you don't want that.
COLOR = 'white'
for rc_key in ('text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color'):
    matplotlib.rcParams[rc_key] = COLOR

In [7]:
# Load the raw transaction table of the IEEE fraud-detection dataset
# (the path is resolved relative to the current working directory)
df_transaction = pd.read_csv(
    filepath_or_buffer='datasets/ieee-fraud-detection/train_transaction.csv'
)

In [8]:
# Columns kept for this experiment: the isFraud target followed by the two predictors
features = ['isFraud', 'TransactionDT', 'TransactionAmt']

# Working frame restricted to those columns
df = df_transaction[features]

In [9]:
# Assign each selected column a role: prediction target, categorical input or numeric input
target = 'isFraud'
# Categorical features (every column listed here is label-encoded further down)
# NOTE(review): TransactionDT appears to be a numeric time offset — confirm that
# treating it as categorical is intended
cat = ['TransactionDT']
# Numeric features
num = ['TransactionAmt']

In [10]:
# Discard rows with missing values, then keep only the first 10,000 remaining rows
df = df.dropna()[:10000]
# Target vector, taken from the same rows as df
y = df[target].values
df.head()  # NOTE: not rendered — a cell only displays its last expression
# Total number of elements in the frame (rows * columns)
df.size


30000

In [11]:
# Pull the categorical and the numeric columns out of the frame as two separate arrays
x_cat, x_num = (df.filter(items=columns).values for columns in (cat, num))


In [12]:
labelencoder_X = LabelEncoder()
# Encode into a private, writable copy: an array obtained from DataFrame.values can be a
# read-only view of the frame's data (pandas Copy-on-Write), in which case the in-place
# assignment below fails with "assignment destination is read-only".
x_cat = x_cat.copy()
# Label encode every categorical column. The single encoder is refit for each column,
# so after the loop it only holds the classes of the last column.
for i in range(len(cat)):
    x_cat[:, i] = labelencoder_X.fit_transform(x_cat[:, i])

In [13]:
# Assemble the full feature matrix X (before the train/test split):
# encoded categorical columns first, numeric columns after them
X = np.concatenate([x_cat, x_num], axis=1)

In [14]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Peek at the first ten training rows
X_train[:10]

array([[7.1210e+03, 2.4500e+01],
       [8.9390e+03, 1.5000e+02],
       [2.9030e+03, 3.0000e+02],
       [5.1120e+03, 5.0990e+01],
       [3.4600e+02, 2.0995e+02],
       [4.7000e+01, 3.5950e+00],
       [7.6440e+03, 5.9000e+01],
       [1.4890e+03, 1.0000e+02],
       [7.2800e+03, 5.0000e+01],
       [4.8300e+02, 1.0795e+02]])

In [15]:
# Feature scaling: standardize the features so they have similar scales.
# The scaler is fitted on the training split only and then applied to both splits;
# the class labels (y_train / y_test) are left untouched.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = (sc.transform(split) for split in (X_train, X_test))

In [16]:
# Linear-kernel SVM with regularization parameter C = 0.1, trained on the standardized features
model_linear = SVC(kernel='linear', C=0.1)
model_linear.fit(X_train_std, y_train)

SVC(C=0.1, kernel='linear')

In [17]:
# Score the linear model on the held-out split
# NOTE(review): if isFraud is heavily imbalanced, a high accuracy can also be reached by
# always predicting the majority class — consider reporting a confusion matrix as well
pred = model_linear.predict(X_test_std)
print(accuracy_score(y_true=y_test, y_pred=pred))

0.971


In [30]:
#### Make a function to do all of this
# inputs: lower bound, upper bound, step for C values, kernel type (linear, poly, rbf) 

def tuner(range, kernel):
    '''
    Fit one SVC per candidate C value and print the test accuracy of each.

    range (list of floats): [lower, upper, step] for the values of C. lower is
        included and upper is excluded, as with np.arange. (The name shadows
        the builtin range(); it is kept so existing calls keep working.)
    kernel (str): 'linear', 'poly', 'rbf' (could also include sigmoid)

    Trains on the notebook-level X_train_std / y_train and scores on
    X_test_std / y_test. Prints one line per C value as soon as that model has
    been evaluated; returns None.

    Raises ValueError if step is 0 or if any candidate C is not strictly
    positive (SVC requires C > 0).
    '''
    lower, upper, step = range[0], range[1], range[2]
    if step == 0:
        raise ValueError("step for C must be non-zero")

    # np.arange derives its length from ceil((upper - lower) / step), and floating-point
    # error can push that up by one (bounds 1 .. 1.3 with step 0.1 would include 1.3).
    # Rounding the quotient first keeps the upper bound reliably excluded.
    count = max(int(np.ceil(np.round((upper - lower) / step, 9))), 0)
    c_values = lower + step * np.arange(count)

    # Check every candidate before fitting anything, so a bad bound fails immediately
    # instead of after some (possibly slow) fits.
    if np.any(c_values <= 0):
        raise ValueError("every C value must be strictly positive, got range " + str(list(range)))

    for c in c_values:
        model = SVC(C=c, kernel=kernel)
        model.fit(X_train_std, y_train)
        pred = model.predict(X_test_std)
        accuracy = accuracy_score(y_test, pred)
        print(kernel + " model with regularization parameter " + str(round(c,3)) + " had accuracy " + str(round(accuracy,3)))

In [32]:
# Sweep C over 0.1, 0.3, ... (lower bound 0.1, upper bound 1, step 0.2) with the polynomial kernel
tuner([0.1, 1, 0.2], 'poly')

Notes: It is recommended to scale the data before fitting an SVM; this can be done with a pipeline. Next steps: try different kernel functions and look further into the run-time of the SVM.