**Import Librarires And Dataset**

In [81]:
import warnings
import numpy as np
import pandas as pd
import time
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
#some settings to show data
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

#import dataset
audit_risk = pd.read_csv("datasets/audit_risk.csv")
trial = pd.read_csv("datasets/trial.csv")

**Show Data**

In [82]:
audit_risk.head(10)

In [83]:
trial.head(10)

**Lets See Values Of Two Dataset**

In [84]:
audit_risk.describe()

In [85]:
trial.describe()

**Analysis**

As you can see, two dataset are similarly same expect a bit difference. 
Firsty, SCORE_A AND SCORE_B in trial, multiply 10 with audit_risk Score_A and Score_B values, also that's capital. 
Second, Loss and Risk column in trial, completely different from audit_risk.

First of all, change capital column names like audit_risk columns, then divide by 10 to Score_A and Score_B;

In [86]:
trial.columns = ['Sector_score', 'LOCATION_ID', 'PARA_A', 'Score_A', 'PARA_B',
                 'Score_B', 'TOTAL', 'numbers', 'Marks',
                 'Money_Value', 'MONEY_Marks', 'District',
                 'Loss', 'LOSS_SCORE', 'History', 'History_score', 'Score', 'Risk_trial']

In [87]:
trial['Score_A'] = trial['Score_A'] / 10
trial['Score_B'] = trial['Score_B'] / 10

**Observe two dataset**

In [88]:
same_columns = np.intersect1d(audit_risk.columns, trial.columns)
same_columns

**Let's merge two dataset with same column**

In [89]:
merged_df = pd.merge(audit_risk, trial, how='outer',
                     on=['History', 'LOCATION_ID', 'Money_Value', 'PARA_A', 'PARA_B', 'Score', 'Score_A', 'Score_B',
                         'Sector_score', 'TOTAL', 'numbers'])
merged_df.columns

**Analysis**

As you can see some values in Risk_trial in trial and Risk in audit_risk are different, we can select Risk column in audit_risk because if you will click link https://api.openml.org/d/42931, you can see target value is Risk in audit_risk dataset. So delete that column.

In [90]:
df = merged_df.drop(['Risk_trial'], axis=1)

Check null values

In [91]:
df.isnull().sum()

As you can see, Money_Value column has a null value. Set average value,

In [92]:
df['Money_Value'] = df['Money_Value'].fillna(df['Money_Value'].median())

and Detection_Risk column is same value of Risk column, so delete it.

In [93]:
df = df.drop(['Detection_Risk'], axis=1)
df.info()

Up to now, everything is good, let's see location id

In [94]:
df["LOCATION_ID"].unique()

if you iterate to showed values, you will see end of the table there are some non numeric values, LOHARU, NUH and SAFIDON. How much that values in dataset

In [95]:
len(df[(df["LOCATION_ID"] == 'LOHARU') | (df["LOCATION_ID"] == 'NUH') | (df["LOCATION_ID"] == 'SAFIDON')])

In [96]:
len(df)

Only 3 rows we have non numerical rows, so they seem deletable, i deleted it.

In [97]:
df = df[(df.LOCATION_ID != 'LOHARU')]
df = df[(df.LOCATION_ID != 'NUH')]
df = df[(df.LOCATION_ID != 'SAFIDON')]

In [98]:
len(df)

Also i drop duplicate values,

In [99]:
df = df.drop_duplicates(keep='first')
print(f"Rows: {len(df)}")

i drop high correlation values;

In [100]:
import seaborn as sns

corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')
# 'RdBu_r' & 'BrBG' are other good diverging colormaps
cm = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
corr.style.background_gradient(cmap=cm)

In [101]:
df = df[['Risk_A', 'Risk_B', 'Risk_C', 'Risk_D', 'RiSk_E', 'Prob', 'Score', 'CONTROL_RISK', 'Audit_Risk', 'Risk', 'MONEY_Marks', 'Loss']]
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [102]:
df

# Data Clean Operation Is Done

# I will Implement Knn

In [103]:
import math

# Define a function to calculate the Euclidean distance between two points
def euclidean_distance(x1, x2):
    return math.sqrt(np.sum((x1 - x2) ** 2))

In [104]:
# Define the KNN function
def knn_classification_with_euclidean_distance(X_train, y_train, X_test, k):
    # List to store the predicted labels for the test set
    y_pred = []
    distances = []
    
    for i in range(len(X_test)):
        for j in range(len(X_train)):
            # Calculate the distance between the two points using euclidean_distance func where I defined above section
            dist = euclidean_distance(X_test[i], X_train[j])
            distances.append((dist, y_train[j]))

        distances.sort()
        neighbors = distances[:k] # Get the k nearest neighbors

        counts = {} # Count the votes for each class
        for neighbor in neighbors:
            label = neighbor[1]
            if label in counts:
                counts[label] += 1
            else:
                counts[label] = 1

        max_count = max(counts, key=counts.get) # Get the class with the most votes
        y_pred.append(max_count)

    return y_pred

In [105]:
# Define a function to calculate the Manhattan distance between two points
def manhattan_distance(x1, x2):
    return np.sum(np.abs(x1 - x2))

In [106]:
def knn_regressor_with_manhattan_distance(X_train, y_train, X_test, k):
    y_pred = []
    distances = []
    
    for i in range(len(X_test)):
        for j in range(len(X_train)):
            # Calculate the distance between the two points using manhattan_distance func where I defined above section
            dist = manhattan_distance(X_test[i], X_train[j])
            distances.append((dist, y_train[j]))

        distances.sort()
        neighbors = distances[:k]# Get the k nearest neighbors
        
        mean_val = np.mean(neighbors)# Get the mean from the neighbors
        y_pred.append(mean_val)

    return y_pred

# PART 3

In [127]:
from sklearn import svm

class_df = df.drop("Audit_Risk", axis=1)
classification_X = class_df.drop(["Risk"], axis=1)
classification_y = class_df["Risk"]

In [128]:
X_train, X_test, y_train, y_test = train_test_split(classification_X, classification_y, test_size=0.3, random_state=42)

**KNN Accuracy for k = 3 without k-fold**

In [129]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

start_time = time.time()
clf = svm.SVC(kernel='linear')  # Linear Kernel
clf.fit(X_train, y_train)
end_time = time.time()

y_pred = clf.predict(X_test)

In [130]:
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")

**Find Confusion Matrix**

In [131]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

**Find Confusion Matrix & And Roc Values For Each K-fold**

In [132]:
import pylab as pl
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

kf = KFold(n_splits=6, shuffle=True, random_state=42)

# List to store scores for each fold
rocs = []
roc_aucs = []
confusion_matrices = []

for train_index, val_index in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    clf = svm.SVC(kernel='linear')  # Linear Kernel
    

    clf.fit(X_train_fold, y_train_fold)
    y_pred_fold = clf.predict(X_val_fold)
    end_time = time.time()
    
    fpr, tpr, thresholds = roc_curve(y_val_fold, y_pred_fold)
    roc_auc = auc(fpr, tpr)
    roc_aucs.append(roc_auc)

    cm = confusion_matrix(y_val_fold, y_pred_fold)
    confusion_matrices.append(cm)

    ######### IMPORTANT INFO ###########
    # The optimal cut-off would be where tpr is high and fpr is low
    # tpr - (1-fpr) is zero or near to zero is the optimal cut off point
    ####################################

    i = np.arange(len(tpr))  # index for df
    roc = pd.DataFrame(
        {'fpr': pd.Series(fpr, index=i),
         'tpr': pd.Series(tpr, index=i),
         '1-fpr': pd.Series(1 - fpr, index=i),
         'tf': pd.Series(tpr - (1 - fpr), index=i),
         'thresholds': pd.Series(thresholds, index=i)
         })

    rocs.append(roc)

**Show Optimum threshold & ROC Curves** 

In [133]:
 for i, roc in enumerate(rocs):
    # Plot tpr vs 1-fpr
    fig, ax = pl.subplots()
    pl.plot(roc['tpr'], color='black')
    pl.plot(roc['1-fpr'], color='red')
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(f"Area under the {i + 1}. ROC curve : {roc_aucs[i]:.3f}")
    pl.legend(["tpr", "1-fpr"], loc="lower right")

    plt.figure(figsize=(3, 3))
    sns.heatmap(confusion_matrices[i], annot=True)
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('False Label')
    plt.plot()

    roc_optimum_threshold = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]
    print(f"{i + 1}. ROC Optimum threshold= {list(roc_optimum_threshold['thresholds'])}")

    plt.show()
    ax.set_xticklabels([])

**Runtime Performance**

Area Under Curve (AUC) is closer to 1 are more good, robust etc model than others. So as you can see, i get the 1 which optimal thresold 1 to.

In [134]:
print(f'Predict Runtime: {end_time - start_time:.6f} seconds')