**Import Libraries and Dataset**

**Import Libraries and Dataset**

In [81]:
import warnings
import numpy as np
import pandas as pd
import time

# Notebook-wide settings: suppress all warnings and show up to 50 columns
# when a DataFrame is displayed.
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = 50

# Load the two CSV files from the local "datasets" folder.
audit_risk = pd.read_csv(filepath_or_buffer="datasets/audit_risk.csv")
trial = pd.read_csv(filepath_or_buffer="datasets/trial.csv")

In [104]:
def knn_classification_with_euclidean_distance(X_train, y_train, X_test, k):
    """Classify each test point by majority vote among its k nearest training points.

    Distances are computed with ``euclidean_distance``, which is expected to be
    defined in an earlier notebook cell.

    Args:
        X_train: Indexable collection of training points.
        y_train: Labels aligned with ``X_train`` (``y_train[j]`` labels ``X_train[j]``).
            Labels must be hashable because they are used as dictionary keys.
        X_test: Indexable collection of points to classify.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        list: One predicted label per test point, in ``X_test`` order.

    Raises:
        ValueError: If there are no neighbours to vote (e.g. ``k`` is 0 or
            ``X_train`` is empty), raised by ``max`` on the empty vote table.
    """
    y_pred = []

    for i in range(len(X_test)):
        # Build a fresh distance list for every test point. Sharing one list
        # across test points would let distances measured for earlier points
        # leak into the neighbour set of later ones.
        distances = [
            (euclidean_distance(X_test[i], X_train[j]), y_train[j])
            for j in range(len(X_train))
        ]

        # Rank on the distance only, so equal distances never fall back to
        # comparing labels (ties keep training-set order).
        distances.sort(key=lambda pair: pair[0])
        neighbors = distances[:k]

        # Count the votes for each class among the k nearest neighbours.
        counts = {}
        for _, label in neighbors:
            counts[label] = counts.get(label, 0) + 1

        # The class with the most votes wins; on a tie the class seen first wins.
        y_pred.append(max(counts, key=counts.get))

    return y_pred

In [105]:
def manhattan_distance(x1, x2):
    """Return the Manhattan (L1) distance between two points.

    The distance is the sum of the absolute element-wise differences
    between ``x1`` and ``x2``.
    """
    abs_diff = np.absolute(x1 - x2)
    return np.sum(abs_diff)

In [106]:
def knn_regressor_with_manhattan_distance(X_train, y_train, X_test, k):
    """Predict each test target as the mean target of its k nearest training points.

    Distances are computed with ``manhattan_distance``.

    Args:
        X_train: Indexable collection of training points.
        y_train: Numeric targets aligned with ``X_train``.
        X_test: Indexable collection of points to predict.
        k: Number of nearest neighbours whose targets are averaged.

    Returns:
        list: One predicted value per test point, in ``X_test`` order.
    """
    y_pred = []

    for i in range(len(X_test)):
        # Build a fresh distance list for every test point. Sharing one list
        # across test points would let distances measured for earlier points
        # leak into the neighbour set of later ones.
        distances = [
            (manhattan_distance(X_test[i], X_train[j]), y_train[j])
            for j in range(len(X_train))
        ]

        # Rank on the distance only (ties keep training-set order).
        distances.sort(key=lambda pair: pair[0])

        # Average the neighbours' targets only. Averaging the raw
        # (distance, target) pairs would mix the distances into the prediction.
        neighbor_targets = [target for _, target in distances[:k]]
        y_pred.append(np.mean(neighbor_targets))

    return y_pred

# PART 5

In [140]:
from sklearn import svm

# Classification data: drop the "Audit_Risk" column, then separate the
# features from the "Risk" target.
class_df = df.drop("Audit_Risk", axis=1)
classification_X = class_df.drop(["Risk"], axis=1)
classification_y = class_df["Risk"]

X_train, X_test, y_train, y_test = train_test_split(classification_X, classification_y, test_size=0.3, random_state=42)

# Standardise the features; the scaler is fitted on the training split only
# and then reused on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

clf = svm.SVC(kernel='rbf')  # RBF kernel
start_time = time.time()
# Fit and predict on the standardised features computed above, so this single
# run uses the same inputs as the k-fold evaluation of X_train_scaled.
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
end_time = time.time()

In [141]:
# Report the accuracy of the test-set predictions (y_pred) against y_test.
# NOTE(review): `metrics` is presumably sklearn.metrics imported in an earlier cell — confirm.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

**Find Confusion Matrix**

In [142]:
from sklearn.metrics import confusion_matrix

# Raw confusion matrix. It is printed explicitly because a notebook cell only
# displays the value of its last expression, so a bare call here is discarded.
print(confusion_matrix(y_test, y_pred))

# Labelled cross-tabulation of true vs. predicted labels, with row/column totals.
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

**Find Confusion Matrix and ROC Values for Each K-Fold**

In [143]:
import numpy as np
import pylab as pl
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

kf = KFold(n_splits=6, shuffle=True, random_state=42)

# Per-fold results, appended in fold order
rocs = []                # one ROC table (DataFrame) per fold
roc_aucs = []            # area under the ROC curve per fold
confusion_matrices = []  # confusion matrix per fold

for train_index, val_index in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    clf = svm.SVC(kernel='rbf')  # RBF kernel
    clf.fit(X_train_fold, y_train_fold)
    y_pred_fold = clf.predict(X_val_fold)

    # A ROC curve is traced by sweeping a threshold over continuous scores.
    # Hard 0/1 predictions collapse it to a single operating point, so the
    # SVC decision values are used here instead of the predicted labels.
    y_score_fold = clf.decision_function(X_val_fold)
    fpr, tpr, thresholds = roc_curve(y_val_fold, y_score_fold)
    roc_auc = auc(fpr, tpr)
    roc_aucs.append(roc_auc)

    # The confusion matrix is still based on the predicted labels.
    cm = confusion_matrix(y_val_fold, y_pred_fold)
    confusion_matrices.append(cm)

    ######### IMPORTANT INFO ###########
    # The optimal cut-off is where tpr is high and fpr is low, i.e. the row
    # where tpr - (1 - fpr) (column 'tf') is zero or closest to zero.
    ####################################
    roc = pd.DataFrame(
        {'fpr': fpr,
         'tpr': tpr,
         '1-fpr': 1 - fpr,
         'tf': tpr - (1 - fpr),
         'thresholds': thresholds
         })

    rocs.append(roc)

**Show Optimum threshold & ROC Curves**

In [144]:
 for i, roc in enumerate(rocs):
    # Plot tpr vs 1-fpr
    fig, ax = pl.subplots()
    pl.plot(roc['tpr'], color='black')
    pl.plot(roc['1-fpr'], color='red')
    pl.xlabel('1-False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(f"Area under the {i + 1}. ROC curve : {roc_aucs[i]}")
    pl.legend(["tpr", "1-fpr"], loc="lower right")

    plt.figure(figsize=(3, 3))
    sns.heatmap(confusion_matrices[i], annot=True)
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('False Label')
    plt.plot()
    ax.set_xticklabels([])

    roc_optimum_threshold = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]
    print(f"{i + 1}-fold ROC Optimum threshold= {list(roc_optimum_threshold['thresholds'])}")

plt.show()


# PART 7

In [153]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

# Separate the "cnt" target from the remaining feature columns.
regression_df = bike_new
regression_y = regression_df["cnt"]
regression_X = regression_df.drop(columns=["cnt"])

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    regression_X,
    regression_y,
    test_size=0.3,
    random_state=42,
)

# Min-max scaling: fitted on the training split, reused on the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Plain numpy targets, so the k-fold indices used later can index them by position.
y_train, y_test = np.array(y_train), np.array(y_test)

# Time a single fit + predict pass of the decision tree.
dt = DecisionTreeRegressor(max_depth=5, min_impurity_decrease=0.1, random_state=1)
start_time = time.time()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
end_time = time.time()

**Apply K-Fold Cross Validation**

In [154]:
# Per-fold validation scores. For a regressor, .score() is the R^2 coefficient
# of determination, not a classification accuracy (the list name is kept as-is).
accuracy_scores = []
kf = KFold(n_splits=6, shuffle=True, random_state=42)

for fold_number, (train_index, val_index) in enumerate(kf.split(X_train), start=1):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    dt = DecisionTreeRegressor(random_state=1, max_depth=5, min_impurity_decrease=0.1)
    dt.fit(X_train_fold, y_train_fold)

    fold_r2 = dt.score(X_val_fold, y_val_fold)
    accuracy_scores.append(fold_r2)
    print(f"{fold_number}. Fold Cross Validation R2 score: {fold_r2}")

print(f"Mean Cross Validation R2 score: {np.mean(accuracy_scores)}")

# Cross-validation only estimates performance. Refit on the whole training
# split so that `dt` is not just the model of the last fold (which saw only
# 5/6 of the training data) when its rules are printed and it is tested later.
dt = DecisionTreeRegressor(random_state=1, max_depth=5, min_impurity_decrease=0.1)
dt.fit(X_train, y_train)

print("\n")
tree_to_set_of_rules(dt, regression_X.columns)  # helper defined earlier in the notebook

**Final Test**

In [155]:
# Score the current `dt` estimator on the held-out test split.
y_pred = dt.predict(X_test)
print("Final R2 Score After k-fold cross validation with test data:  {:.3f}".format(r2_score(y_test, y_pred)))

**Performance**
As you can see in the outputs above, the k-fold cross-validation scores are slightly higher than the final score on the test data.  

In [156]:
# start_time and end_time were taken around both dt.fit(...) and dt.predict(...)
# in the single-run cell, so the interval covers training as well as prediction.
print(f'Fit + Predict Runtime: {end_time - start_time:.6f} seconds')