In [1]:
%matplotlib inline 

In [69]:
import os, shutil
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from seaborn import despine
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.multiclass import type_of_target
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
#loading the dataset
url = 'https://raw.githubusercontent.com/mn20781/auditing/main/audit_risk.csv'
url0 = 'https://raw.githubusercontent.com/mn20781/auditing/main/trial.csv'
#reading the data as a csv and storing dataframe in 'data' variable. 
risk = pd.read_csv(url, header=0,skiprows=0, delimiter = ',')  
trial = pd.read_csv(url, header=0,skiprows=0, delimiter = ',')  

In [4]:
#split data into test and training sets
train_set, test_set = train_test_split(trial, test_size=0.2,random_state=42)

In [16]:
train_set.columns

Index(['Sector_score', 'LOCATION_ID', 'PARA_A', 'Score_A', 'Risk_A', 'PARA_B',
       'Score_B', 'Risk_B', 'TOTAL', 'numbers', 'Score_B.1', 'Risk_C',
       'Money_Value', 'Score_MV', 'Risk_D', 'District_Loss', 'PROB', 'RiSk_E',
       'History', 'Prob', 'Risk_F', 'Score', 'Inherent_Risk', 'CONTROL_RISK',
       'Detection_Risk', 'Audit_Risk', 'Risk'],
      dtype='object')

In [17]:
numeric_feats = ['Audit_Risk', 'Sector_score', 'Risk', 'Inherent_Risk', 'Risk_D']
features_drop = ['LOCATION_ID', 'Money_Value']

In [18]:
X = train_set.drop(columns=features_drop, axis=1)

In [19]:
y= train_set['Risk']

In [20]:
colTransformer = ColumnTransformer([('num', StandardScaler(), numeric_feats)],
                                  remainder='passthrough')
x_scaled = colTransformer.fit_transform(X)

In [44]:
from sklearn.svm import LinearSVC
clf = LinearSVC(C=1, loss='hinge', max_iter=50000)

In [48]:
from sklearn.svm import SVC
clf_SVC = SVC(kernel='linear')

In [49]:
skf = StratifiedKFold(n_splits=2)

In [50]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf_SVC, x_scaled, y,
                         scoring="accuracy", cv=10)


In [52]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [53]:
display_scores(scores)

Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean: 1.0
Standard deviation: 0.0


In [57]:
from sklearn.model_selection import GridSearchCV
#implementing parameters for GRID SEARCH WITH THE SVM 
param_grid = {'C': [0.1, 1, 10, 100, 1000],
             'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
             'kernel':['rbf']}


In [59]:
grid = GridSearchCV(clf_SVC,  param_grid, refit=True, verbose=3)

In [61]:
#creating validation set
X_train, X_test, y_train, y_test = train_test_split(x_scaled,y, test_size=0.2, random_state=42 )

In [62]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.840 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.889 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.919 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.818 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.859 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.970 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.970 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.970 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=1.000 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.960 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.990 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [63]:
#best parameters
grid.best_params_

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [67]:
#check best score
grid.best_score_

1.0

In [65]:
#how the model looks after training
grid.best_estimator_

In [68]:
gCV_preds = grid.predict(X_test)

In [70]:
#print classification report 
print(classification_report(y_test, gCV_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        75
           1       1.00      1.00      1.00        49

    accuracy                           1.00       124
   macro avg       1.00      1.00      1.00       124
weighted avg       1.00      1.00      1.00       124



In [72]:
#print confusion matrix
confusion_matrix = confusion_matrix(y_test, gCV_preds)
confusion_matrix

array([[75,  0],
       [ 0, 49]], dtype=int64)

In [75]:
#get test set 
X_final = test_set.drop(columns=features_drop, axis=1)
X_final.columns

Index(['Sector_score', 'PARA_A', 'Score_A', 'Risk_A', 'PARA_B', 'Score_B',
       'Risk_B', 'TOTAL', 'numbers', 'Score_B.1', 'Risk_C', 'Score_MV',
       'Risk_D', 'District_Loss', 'PROB', 'RiSk_E', 'History', 'Prob',
       'Risk_F', 'Score', 'Inherent_Risk', 'CONTROL_RISK', 'Detection_Risk',
       'Audit_Risk', 'Risk'],
      dtype='object')

In [78]:
#labels from test set
y_final = test_set['Risk']

In [79]:
#transform input
x_scaled_final = colTransformer.fit_transform(X_final)

In [81]:
#final cross score
final_score = cross_val_score(grid, x_scaled_final, y_final,
                         scoring="accuracy", cv=10)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.643 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.607 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.607 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.607 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.607 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.964 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.964 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.964 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.929 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.964 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.857 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [84]:
pred = grid.predict(x_scaled)

In [82]:
#display metrics
display_scores(final_score)

Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean: 1.0
Standard deviation: 0.0


In [86]:
#classification report
#print(classification_report(y_final, pred))

In [55]:
#for train_index, test_index in skf.split(x_scaled, y):
 #       
    #print("TRAIN:", train_index, "TEST:", test_index)
  #      X_train = x_scaled.iloc[train_index]
    #    y_train = y.iloc[train_index]
   #X_test = x_scaled.iloc[test_index]
     #   y_test = y.iloc[test_index]