## Importing dataset

In [1]:
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,KFold
from sklearn.metrics import mean_squared_error,confusion_matrix,recall_score
from sklearn.svm import SVC

In [2]:
df=pd.read_csv('PS_20174392719_1491204439457_log.csv')
df.shape

(6362620, 11)

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## In this step we will try to seek the relation between the two names column and target variable

### We will encode the two string columns then find the correlation between them and the target variable

Since the dataset is too large we will take half of it to perform the operation we tried before taking 10%,5% and 25% of the total data and the results were approxamitely the same

In [4]:
nameOrig_encoded=df.sample(frac=0.5)['nameOrig']=df['nameOrig'].astype('category').cat.codes
isFraud_encoded=df.sample(frac=0.5)['isFraud']=df['isFraud'].astype('category').cat.codes
nameDest_encoded=df.sample(frac=0.5)['nameDest']=df['nameDest'].astype('category').cat.codes

In [5]:
corr_1=nameOrig_encoded.corr(isFraud_encoded)
print("The correlation coefficient between column nameOrig and target variable is",corr_1)
corr_2=nameDest_encoded.corr(isFraud_encoded)
print("The correlation coefficient between column nameDest and target variable is",corr_2)

The correlation coefficient between column nameOrig and target variable is -0.0004637760228697384
The correlation coefficient between column nameDest and target variable is -0.021963429581900222


From the above calculations the column nameOrig does not have any relation with the target variable so it will be dropped from the dataset

In [6]:
df.drop('nameOrig',inplace=True,axis=1)
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Encoding column strings

In [7]:
df['nameDest'].nunique()

2722362

The column nameDest contain lots of unique values so if we encoded it using OHE or any other technique it will result to a massive dataset so we will take the first character from the column and encode it to reduce the unique values

In [8]:
df['nameDest'] = df['nameDest'].astype(str).str[0]

In [9]:
df['nameDest'].nunique()

2

Now the column nameDest is reduced to 2 unique values which can easily be encoded using any encoding technique

In [10]:
df=pd.get_dummies(df,columns=['type','nameDest'])
df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,nameDest_C,nameDest_M
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,0,1,0,0,1
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,0,1,0,0,1
2,1,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,0,1,1,0
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,0,1,0,0,0,1,0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,0,1,0,0,1


## Scaling data

### In this section we will scale the data using StandardScaler and drop the target variable "isFraud" 

In [11]:
df=df[['step','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest','isFlaggedFraud','type_CASH_IN',
      'type_CASH_OUT','type_DEBIT','type_PAYMENT','type_TRANSFER','nameDest_C','nameDest_M','isFraud']]

Since the data is too large so we will only take a sample of it to analyze it

In [12]:
df_sampled=df.sample(n=100000)
df_sampled.shape

(100000, 15)

In [13]:
scaler=StandardScaler()

In [14]:
scaler.fit(df_sampled.drop('isFraud',axis=1))
scaled_features = scaler.transform(df_sampled.drop('isFraud',axis=1))

In [15]:
df_scaled = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_scaled.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,nameDest_C,nameDest_M
0,1.136747,-0.291359,-0.28794,-0.291627,0.210054,0.162927,-0.003162,-0.52802,1.351376,-0.081323,-0.713539,-0.303238,0.713539,-0.713539
1,-0.566683,0.116666,-0.285823,-0.291627,-0.325037,-0.267576,-0.003162,-0.52802,1.351376,-0.081323,-0.713539,-0.303238,0.713539,-0.713539
2,-0.279273,-0.305619,-0.262223,-0.269326,-0.325037,-0.335413,-0.003162,-0.52802,-0.739987,-0.081323,1.401465,-0.303238,-1.401465,1.401465
3,-0.132063,-0.273773,-0.28794,-0.291627,-0.325037,-0.335413,-0.003162,-0.52802,-0.739987,-0.081323,1.401465,-0.303238,-1.401465,1.401465
4,-0.419473,-0.187046,-0.28794,-0.291627,-0.237543,-0.233853,-0.003162,-0.52802,1.351376,-0.081323,-0.713539,-0.303238,0.713539,-0.713539


Split data into 80% train 20% test

In [16]:
X_Train80, X_Test20, Y_Train80, Y_Test20 = train_test_split(scaled_features,df_sampled['isFraud'],
                                                    test_size=0.2,random_state=0)

## SVM classification algorithm

Loop on multiple degrees to tune in the best parameter to the model

In [17]:

#  scores_kfold=[]
#  scores_stratifiedkFold=[]
# Mse=[]
#  for degree in degrees:
#      for kernel in kernels:
# clf = SVC(kernel='linear',degree=6)
# clf.fit(X_Train80,Y_Train80)
# Y_pred = clf.predict(X_Test20)
# Mse.append(mean_squared_error(Y_pred,Y_Test20))
# cv_kfold = KFold(n_splits=10, random_state=1, shuffle=True)
# cv_stratifiedKfold=StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
# Y_pred_kfold=cross_val_predict(clf, X_Test20, Y_Test20, cv=cv_kfold)
# Y_pred_stratifiedkFold=cross_val_predict(clf, X_Test20, Y_Test20, cv=cv_stratifiedKfold)
#          scores_kfold.append(cross_val_score(clf, X_Train80, Y_Train80, scoring='accuracy', cv=cv_kfold, n_jobs=-1))
#          scores_stratifiedkFold.append(cross_val_score(clf, X_Train80, Y_Train80, scoring='accuracy', cv=cv_stratifiedKfold, n_jobs=-1))
# cm=confusion_matrix(Y_Test20,Y_pred)
# accuracy = float(cm.diagonal().sum())/len(Y_Test20)
# print("Accuracy Of SVM For The Given Dataset without using any folds is : ",accuracy)
# cm_kfold=confusion_matrix(Y_Test20,Y_pred_kfold)
# print(cm_kfold)
# accuracy_kfold = float(cm_kfold.diagonal().sum())/len(Y_Test20)
# print("Accuracy Of SVM For The Given Dataset after applying kFold is : ", accuracy_kfold)
# cm_stratifiedkFold=confusion_matrix(Y_Test20,Y_pred_stratifiedkFold)
# accuracy_stratifiedKfold = float(cm_stratifiedkFold.diagonal().sum())/len(Y_Test20)
# print("Accuracy Of SVM For The Given Dataset after applying stratifiedKFold is : ", accuracy_kfold)
# min_deg=degrees[Mse.index(min(Mse))]
# min_kernel=kernels[Mse.index(min(Mse))]
# print('Accuracy: %.3f (%.3f)' % (np.mean(scores_kfold), np.std(scores_kfold)))
# print('Accuracy: %.3f (%.3f)' % (np.mean(scores_stratifiedkFold), np.std(scores_stratifiedkFold)))

# print("The minimal error is {} for kernel of {} and degree of {}".format(min(Mse),min_kernel,min_deg))
        

In [18]:
Y_Train80=Y_Train80.to_numpy()

### Score calculation without kfold

In [19]:
degrees=np.arange(2,6)
scores_without_kfold=[]
for deg in degrees:
    clf = SVC(kernel='poly',degree=deg)
    clf.fit(X_Train80,Y_Train80)
    Y_pred_without_kfold = clf.predict(X_Test20)
    scores_without_kfold.append(recall_score(Y_Test20,Y_pred_without_kfold))
max_deg_without_kfold=degrees[np.argmax(scores_without_kfold)]
print("Array of scores is",scores_without_kfold)
print("Length",len(scores_without_kfold))
print("Highest score is {} and degree of {}".format(max(scores_without_kfold),max_deg_without_kfold))
#print("The average recall score is ",np.mean(scores_without))     
    
    
    

Array of scores is [0.2, 0.25, 0.25, 0.25]
Length 4
Highest score is 0.25 and degree of 3


In [20]:
kf = KFold(n_splits=10)
arr_train_index=[]
arr_test_index=[]
scores=[]
for train_index, test_index in kf.split(X_Train80):
    print("TRAIN:", train_index, "TEST:", test_index)
    arr_train_index.append(train_index)
    arr_test_index.append(test_index)
    X_train_fold, X_cv_fold = X_Train80[train_index], X_Train80[test_index]
    y_train_fold, y_cv_fold = Y_Train80[train_index], Y_Train80[test_index]
    for degree in degrees:
        clf = SVC(kernel='poly',degree=degree)
        clf.fit(X_train_fold,y_train_fold)
        Y_pred = clf.predict(X_cv_fold)
        scores.append(recall_score(y_cv_fold,Y_pred))
        
        
print("Array of scores is",scores)
x=list(set(scores))
scores_unique=np.array(x)
print("Scores unique is ",scores_unique)
arr_train_index=np.array(arr_train_index)
print("Array train is",arr_train_index)

arr_test_index=np.array(arr_test_index)
print("Array test is",arr_test_index)
best_index=np.argmax(scores_unique)
print("The best index is",best_index)
max_train_index=arr_train_index[best_index]
max_test_index=arr_test_index[best_index]
print("Highest score is {} for train index {} test index {}".format(max(scores_unique),max_train_index,max_test_index))
#print("The average recall score is ",np.mean(scores))        
#Getting the best degree

            
    
    
    
    
    

TRAIN: [ 8000  8001  8002 ... 79997 79998 79999] TEST: [   0    1    2 ... 7997 7998 7999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [ 8000  8001  8002 ... 15997 15998 15999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [16000 16001 16002 ... 23997 23998 23999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [24000 24001 24002 ... 31997 31998 31999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [32000 32001 32002 ... 39997 39998 39999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [40000 40001 40002 ... 47997 47998 47999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [48000 48001 48002 ... 55997 55998 55999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [56000 56001 56002 ... 63997 63998 63999]
TRAIN: [    0     1     2 ... 79997 79998 79999] TEST: [64000 64001 64002 ... 71997 71998 71999]
TRAIN: [    0     1     2 ... 71997 71998 71999] TEST: [72000 72001 72002 ... 79997 79998 79999]
Array of scores is [0.55555555555555

We can see the score after the kfold is so much higher than before applying KFold