## Importing dataset

In [1]:
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,KFold
from sklearn.metrics import mean_squared_error,confusion_matrix,recall_score
from sklearn.svm import SVC

In [2]:
df=pd.read_csv('PS_20174392719_1491204439457_log.csv')
df.shape

(6362620, 11)

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## In this step we will try to seek the relation between the two names column and target variable

### We will encode the two string columns then find the correlation between them and the target variable

Since the dataset is too large we will take half of it to perform the operation we tried before taking 10%,5% and 25% of the total data and the results were approxamitely the same

In [4]:
nameOrig_encoded=df.sample(frac=0.5)['nameOrig']=df['nameOrig'].astype('category').cat.codes
isFraud_encoded=df.sample(frac=0.5)['isFraud']=df['isFraud'].astype('category').cat.codes
nameDest_encoded=df.sample(frac=0.5)['nameDest']=df['nameDest'].astype('category').cat.codes

In [5]:
corr_1=nameOrig_encoded.corr(isFraud_encoded)
print("The correlation coefficient between column nameOrig and target variable is",corr_1)
corr_2=nameDest_encoded.corr(isFraud_encoded)
print("The correlation coefficient between column nameDest and target variable is",corr_2)

The correlation coefficient between column nameOrig and target variable is -0.0004637760228697384
The correlation coefficient between column nameDest and target variable is -0.021963429581900222


From the above calculations the column nameOrig does not have any relation with the target variable so it will be dropped from the dataset

In [6]:
df.drop('nameOrig',inplace=True,axis=1)
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Encoding column strings

In [7]:
df['nameDest'].nunique()

2722362

The column nameDest contain lots of unique values so if we encoded it using OHE or any other technique it will result to a massive dataset so we will take the first character from the column and encode it to reduce the unique values

In [8]:
df['nameDest'] = df['nameDest'].astype(str).str[0]

In [9]:
df['nameDest'].nunique()

2

Now the column nameDest is reduced to 2 unique values which can easily be encoded using any encoding technique

In [10]:
df=pd.get_dummies(df,columns=['type','nameDest'])
df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,nameDest_C,nameDest_M
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,0,1,0,0,1
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,0,1,0,0,1
2,1,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,0,1,1,0
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,0,1,0,0,0,1,0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,0,1,0,0,1


## Scaling data

### In this section we will scale the data using StandardScaler and drop the target variable "isFraud" 

In [11]:
df=df[['step','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest','isFlaggedFraud','type_CASH_IN',
      'type_CASH_OUT','type_DEBIT','type_PAYMENT','type_TRANSFER','nameDest_C','nameDest_M','isFraud']]

Since the data is too large so we will only take a sample of it to analyze it

In [12]:
df_sampled=df.sample(n=100000)
df_sampled.shape

(100000, 15)

In [13]:
scaler=StandardScaler()

In [14]:
scaler.fit(df_sampled.drop('isFraud',axis=1))
scaled_features = scaler.transform(df_sampled.drop('isFraud',axis=1))

In [15]:
df_scaled = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_scaled.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,nameDest_C,nameDest_M
0,-1.577725,0.873623,-0.287667,-0.291316,0.085377,0.297527,-0.003162,-0.528887,1.348867,-0.082499,-0.715343,-0.297839,0.715343,-0.715343
1,-0.559964,-0.166994,-0.268038,-0.291316,-0.315193,-0.302599,-0.003162,-0.528887,-0.741363,-0.082499,-0.715343,3.357521,0.715343,-0.715343
2,-1.549649,-0.283541,-0.287667,-0.291316,-0.315193,-0.322374,-0.003162,-0.528887,-0.741363,-0.082499,1.397931,-0.297839,-1.397931,1.397931
3,-0.728421,-0.278574,-0.287605,-0.291316,-0.315193,-0.322374,-0.003162,-0.528887,-0.741363,-0.082499,1.397931,-0.297839,-1.397931,1.397931
4,-0.054593,-0.244676,-0.287667,-0.291316,0.022827,-0.005144,-0.003162,-0.528887,1.348867,-0.082499,-0.715343,-0.297839,0.715343,-0.715343


Split data into 80% train 20% test

In [16]:
X_Train80, X_Test20, Y_Train80, Y_Test20 = train_test_split(scaled_features,df_sampled['isFraud'],
                                                    test_size=0.2)

In [17]:
Y_Train80=Y_Train80.to_numpy()

# SVM classification without using KFold

In [18]:
degrees=list(np.arange(2,9))
scores_without_kfold=[]
for deg in degrees:
    clf = SVC(kernel='poly',degree=deg)
    clf.fit(X_Train80,Y_Train80)
    Y_pred_without_kfold = clf.predict(X_Test20)
    scores_without_kfold.append(recall_score(Y_Test20,Y_pred_without_kfold))
max_deg_without_kfold=degrees[np.argmax(scores_without_kfold)]
print("Array of scores is",scores_without_kfold)
print("Length",len(scores_without_kfold))
## Displaying the highest score without the use of kFold 
print("Highest score is {} and degree of {}".format(max(scores_without_kfold),max_deg_without_kfold))

    
    

Array of scores is [0.21621621621621623, 0.21621621621621623, 0.2702702702702703, 0.32432432432432434, 0.32432432432432434, 0.35135135135135137, 0.3783783783783784]
Length 7
Highest score is 0.3783783783783784 and degree of 8


# SVM classification after using KFold

In [19]:
kf = KFold(n_splits=10)
arr_train_index=[]
arr_test_index=[]
scores=[]

x_train_arr=[]
x_cv_arr=[]
y_train_arr=[]
y_cv_arr=[]
degrees_arr=degrees*kf.get_n_splits(X_Train80)
for train_index, test_index in kf.split(X_Train80):
    print("TRAIN:", train_index, "TEST:", test_index)
    arr_train_index.append(train_index)
    arr_test_index.append(test_index)
    X_train_fold, X_cv_fold = X_Train80[train_index], X_Train80[test_index]
    y_train_fold, y_cv_fold = Y_Train80[train_index], Y_Train80[test_index]
    for degree in degrees:
        clf = SVC(kernel='poly',degree=degree)
        clf.fit(X_train_fold,y_train_fold)
        Y_pred = clf.predict(X_cv_fold)
        scores.append(recall_score(y_cv_fold,Y_pred))
        x_train_arr.append(X_train_fold)
        x_cv_arr.append(X_cv_fold)
        y_train_arr.append(y_train_fold)
        y_cv_arr.append(y_cv_fold)
        
max_x_train_data=x_train_arr[np.argmax(scores)]
max_y_train_data=y_train_arr[np.argmax(scores)]
max_x_cv_data=x_cv_arr[np.argmax(scores)]
max_y_cv_data=y_cv_arr[np.argmax(scores)]
max_deg=degrees_arr[np.argmax(np.argmax(scores))]
print("Highest score is {} for train data {} and {}, cv_data of {} and {} and degree of {}".format(max(scores),max_x_train_data,
                                                                                 max_y_train_data,max_x_cv_data,max_y_cv_data,
                                                                                 max_deg))
            
    
    
    
    
    

TRAIN: [ 8000  8001  8002 ... 79997 79998 79999] TEST: [   0    1    2 ... 7997 7998 7999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [ 8000  8001  8002 ... 15997 15998 15999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [16000 16001 16002 ... 23997 23998 23999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [24000 24001 24002 ... 31997 31998 31999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [32000 32001 32002 ... 39997 39998 39999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [40000 40001 40002 ... 47997 47998 47999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [48000 48001 48002 ... 55997 55998 55999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [56000 56001 56002 ... 63997 63998 63999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [64000 64001 64002 ... 71997 71998 71999]
TRAIN: [    0     1     2 ... 71997 71998 71999] TEST: [72000 72001 72002 ... 79997 79998 79999]
Array of scores is [0.16666666666666

We can see the score after the kfold is so much higher than before applying KFold and the degree is different than before applying kFold

## Testing with the optimum train data and cv data 

In [20]:
clf = SVC(kernel='poly',degree=max_deg)
clf.fit(max_x_train_data,max_y_train_data)
Y_pred_final = clf.predict(X_Test20)
final_score=recall_score(Y_Test20,Y_pred_final)

In [21]:
print("Final score is ",final_score)

Final score is  0.21621621621621623


Score is not that high due to the imbalance of the target variable as we only took 100k sample from a data that contains 6 millions records but we can enhance this score by applying the regularization parameter.

We can try to test with the original train and test data and monitor the score 

In [25]:
clf = SVC(kernel='poly',degree=max_deg_without_kfold)
clf.fit(X_Train80,Y_Train80)
Y_pred_without_kFold= clf.predict(X_Test20)
final_score_without_kFold=recall_score(Y_Test20,Y_pred_final)
print("Final score without Kfold for train and test data is",final_score_without_kFold)

Final score without Kfold for train and test data is 0.21621621621621623


In [23]:
clf = SVC(kernel='poly',degree=max_deg,C=10.24)
clf.fit(max_x_train_data,max_y_train_data)
Y_pred_final_after_reg = clf.predict(X_Test20)
final_score_after_reg=recall_score(Y_Test20,Y_pred_final_after_reg)

In [24]:
print("Final score after regularization is",final_score_after_reg)

Final score after regularization is 0.32432432432432434


We can observe an enhancement of the score after entering this parameter

## The scores before and after kFold are the same which is not regularly true but in this case it occured because the dataset's target isFraud is always labeled 0 