**Import Librarires And Dataset**

In [81]:
import warnings
import numpy as np
import pandas as pd
import time

#some settings to show data
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

#import dataset
audit_risk = pd.read_csv("datasets/audit_risk.csv")
trial = pd.read_csv("datasets/trial.csv")

**Show Data**

In [82]:
audit_risk.head(10)

In [83]:
trial.head(10)

**Lets See Values Of Two Dataset**

In [84]:
audit_risk.describe()

In [85]:
trial.describe()

**Analysis**

As you can see, two dataset are similarly same expect a bit difference. 
Firsty, SCORE_A AND SCORE_B in trial, multiply 10 with audit_risk Score_A and Score_B values, also that's capital. 
Second, Loss and Risk column in trial, completely different from audit_risk.

First of all, change capital column names like audit_risk columns, then divide by 10 to Score_A and Score_B;

In [86]:
trial.columns = ['Sector_score', 'LOCATION_ID', 'PARA_A', 'Score_A', 'PARA_B',
                 'Score_B', 'TOTAL', 'numbers', 'Marks',
                 'Money_Value', 'MONEY_Marks', 'District',
                 'Loss', 'LOSS_SCORE', 'History', 'History_score', 'Score', 'Risk_trial']

In [87]:
trial['Score_A'] = trial['Score_A'] / 10
trial['Score_B'] = trial['Score_B'] / 10

**Observe two dataset**

In [88]:
same_columns = np.intersect1d(audit_risk.columns, trial.columns)
same_columns

**Let's merge two dataset with same column**

In [89]:
merged_df = pd.merge(audit_risk, trial, how='outer',
                     on=['History', 'LOCATION_ID', 'Money_Value', 'PARA_A', 'PARA_B', 'Score', 'Score_A', 'Score_B',
                         'Sector_score', 'TOTAL', 'numbers'])
merged_df.columns

**Analysis**

As you can see some values in Risk_trial in trial and Risk in audit_risk are different, we can select Risk column in audit_risk because if you will click link https://api.openml.org/d/42931, you can see target value is Risk in audit_risk dataset. So delete that column.

In [90]:
df = merged_df.drop(['Risk_trial'], axis=1)

Check null values

In [91]:
df.isnull().sum()

As you can see, Money_Value column has a null value. Set average value,

In [92]:
df['Money_Value'] = df['Money_Value'].fillna(df['Money_Value'].median())

and Detection_Risk column is same value of Risk column, so delete it.

In [93]:
df = df.drop(['Detection_Risk'], axis=1)
df.info()

Up to now, everything is good, let's see location id

In [94]:
df["LOCATION_ID"].unique()

if you iterate to showed values, you will see end of the table there are some non numeric values, LOHARU, NUH and SAFIDON. How much that values in dataset

In [95]:
len(df[(df["LOCATION_ID"] == 'LOHARU') | (df["LOCATION_ID"] == 'NUH') | (df["LOCATION_ID"] == 'SAFIDON')])

In [96]:
len(df)

Only 3 rows we have non numerical rows, so they seem deletable, i deleted it.

In [97]:
df = df[(df.LOCATION_ID != 'LOHARU')]
df = df[(df.LOCATION_ID != 'NUH')]
df = df[(df.LOCATION_ID != 'SAFIDON')]

In [98]:
len(df)

Also i drop duplicate values,

In [99]:
df = df.drop_duplicates(keep='first')
print(f"Rows: {len(df)}")

i drop high correlation values;

In [100]:
import seaborn as sns

corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')
# 'RdBu_r' & 'BrBG' are other good diverging colormaps
cm = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
corr.style.background_gradient(cmap=cm)

In [101]:
df = df[['Risk_A', 'Risk_B', 'Risk_C', 'Risk_D', 'RiSk_E', 'Prob', 'Score', 'CONTROL_RISK', 'Audit_Risk', 'Risk', 'MONEY_Marks', 'Loss']]
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [102]:
df

# Data Clean Operation Is Done

# I will Implement Knn

In [103]:
import math

# Define a function to calculate the Euclidean distance between two points
def euclidean_distance(x1, x2):
    return math.sqrt(np.sum((x1 - x2) ** 2))

In [104]:
# Define the KNN function
def knn_classification_with_euclidean_distance(X_train, y_train, X_test, k):
    # List to store the predicted labels for the test set
    y_pred = []
    distances = []
    
    for i in range(len(X_test)):
        for j in range(len(X_train)):
            # Calculate the distance between the two points using euclidean_distance func where I defined above section
            dist = euclidean_distance(X_test[i], X_train[j])
            distances.append((dist, y_train[j]))

        distances.sort()
        neighbors = distances[:k] # Get the k nearest neighbors

        counts = {} # Count the votes for each class
        for neighbor in neighbors:
            label = neighbor[1]
            if label in counts:
                counts[label] += 1
            else:
                counts[label] = 1

        max_count = max(counts, key=counts.get) # Get the class with the most votes
        y_pred.append(max_count)

    return y_pred

In [105]:
# Define a function to calculate the Manhattan distance between two points
def manhattan_distance(x1, x2):
    return np.sum(np.abs(x1 - x2))

In [106]:
def knn_regressor_with_manhattan_distance(X_train, y_train, X_test, k):
    y_pred = []
    distances = []
    
    for i in range(len(X_test)):
        for j in range(len(X_train)):
            # Calculate the distance between the two points using manhattan_distance func where I defined above section
            dist = manhattan_distance(X_test[i], X_train[j])
            distances.append((dist, y_train[j]))

        distances.sort()
        neighbors = distances[:k]# Get the k nearest neighbors
        
        mean_val = np.mean(neighbors)# Get the mean from the neighbors
        y_pred.append(mean_val)

    return y_pred

**I finished preprocessing to data. I will go to implementing functions, start Part1** 

# PART 1

In [107]:
from sklearn.model_selection import train_test_split

class_df = df.drop("Audit_Risk", axis=1)
classification_X = class_df.drop(["Risk"], axis=1)
classification_y = class_df["Risk"]

**I am seperate my data for train %70 and test %30, so i will use train_test_split func in model_selection library**

In [108]:
X_train, X_test, y_train, y_test = train_test_split(classification_X, classification_y, test_size=0.3, random_state=42)

**KNN Accuracy for k = 3 without k-fold**

In [109]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to numpy arrays for using in knn_classification_with_euclidean_distance func
y_train = np.array(y_train)
y_test = np.array(y_test)

start_time = time.time() # i calculate prediction performance via using start_time and end_time 
y_pred = knn_classification_with_euclidean_distance(X_train_scaled, y_train, X_test_scaled, 3)
end_time = time.time()
accuracy = accuracy_score(y_test, y_pred)

# Create a data frame to store k values and accuracies
results_df = pd.DataFrame({'k': 3, 'Accuracy': accuracy}, index=[0])
print(f"k value: {3}, where accuracy is: {accuracy}")


**Find Confusion Matrix**

In [110]:
from sklearn.metrics import confusion_matrix

y_pred = knn_classification_with_euclidean_distance(X_train_scaled, y_train, X_test_scaled, 3)
confusion_matrix(y_test, y_pred)
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

**Find Confusion Matrix With K-fold Cross Validation**

In [111]:
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

kf = KFold(n_splits=6, shuffle=True, random_state=42)

# Lists to store accuracy scores & confusion_matrices for each fold
accuracy_scores = []
confusion_matrices = []

# Perform k-fold cross-validation via kf.split(X_train_scaled), this function give me an indexes subset of X_train_scaled actualy X_train
for train_index, val_index in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    y_pred_fold = knn_classification_with_euclidean_distance(X_train_fold, y_train_fold, X_val_fold, 3)

    # Calculate accuracy and confusion_matrix for current fold
    accuracy = accuracy_score(y_val_fold, y_pred_fold)
    cm = confusion_matrix(y_val_fold, y_pred_fold)

    # Store accuracy score and confusion_matrix
    accuracy_scores.append(accuracy)
    confusion_matrices.append(cm)

**Classification Accuracy Performance**

In [112]:
for i, score in enumerate(accuracy_scores):
    plt.figure(figsize=(3, 3))
    sns.heatmap(confusion_matrices[i], annot=True, fmt='d', cmap='Greens')
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    i+=1
    plt.xlabel(f'{i}. Fold cross validation Score: {score:.3f}')
    plt.plot()

plt.show()

# Calculate average accuracy across all folds
avg_accuracy = np.mean(accuracy_scores)
print(f"Average accuracy: {avg_accuracy}")

**Runtime Performance**
As you can see to above plot graphics, we understood that last score is higher than others.

In [113]:
print(f'Predict Runtime: {end_time - start_time:.6f} seconds')

# PART 2

In [114]:
bike = pd.DataFrame(pd.read_csv("datasets/day.csv"))
print(bike.head())
print(bike.info())
print(bike.describe())
print(bike.shape)

# Conclusion of Data Analysis

Dataset has 730 rows and 16 columns.
Except one column, all others are either float or integer type.
One column is date type.

Looking at the data, it seems to be some fields that are categorical, but in integer/float type.
We will analyse to convert them to categorical as integer.

In [115]:
round(100 * (bike.isnull().sum() / len(bike)), 2).sort_values(ascending=False)

In [116]:
round((bike.isnull().sum(axis=1) / len(bike)) * 100, 2).sort_values(ascending=False)

**Analysis**

There are no missing / Null values either in columns or rows

In [117]:
bike_dup = bike.copy()
bike_dup = bike_dup.drop_duplicates(keep='first')
# we can assume same operation like this => bike_dup.drop_duplicates(subset=None, inplace=True)

print(bike_dup.shape)
print(bike.shape)

**Analysis**
The shape after running the drop duplicate command is same as the original dataframe.
Hence we can conclude that there were zero duplicate values in the dataset.

In [118]:
bike_dummy = bike.iloc[:, 1:16]

for col in bike_dummy:
    print(bike_dummy[col].value_counts(ascending=False), '\n\n\n')

**As you can see on above code output;**

instant, dteday, casual and registered columns are nonessential, so we can remove these columns. Because;

*instant* : its index,
*dteday* : it has the date,
*casual & registered* : i dont consider the columns that specify bike counts by customer categories since our objective is to determine the total bike count. Furthermore, we've introduced a new variable to represent the proportion of different customer types.

In [119]:
bike_new = bike[
    ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt']]
# bike_new = bike[['temp', 'atemp', 'hum', 'windspeed', 'cnt']] we can see that columns are numerical

In [120]:
bike_new.info()
bike_new.head()

**Creating Dummy Variables**
I can drop all categorical data, so ['temp', 'atemp', 'hum', 'windspeed', 'cnt'] columns are usefull. But i can create dummy variable. 
Dummy variables are usefull because they allow us to include categorical variables in our analysis, which would otherwise be difficult to include due to their non-numeric nature. 
They can also help us to control for confounding factors and improve the validity of our results.

In [121]:
bike_new['season'] = bike_new['season'].astype('category')
bike_new['weathersit'] = bike_new['weathersit'].astype('category')
bike_new['mnth'] = bike_new['mnth'].astype('category')
bike_new['weekday'] = bike_new['weekday'].astype('category')

In [122]:
bike_new = pd.get_dummies(bike_new, drop_first=True)
bike_new.info()

# Go

Splitting the data to Train and Test: - I am splitting the data into TRAIN and TEST (70:30 ratio), now,

In [123]:
from sklearn.model_selection import train_test_split

regression_df = bike_new
regression_X = regression_df.drop(["cnt"], axis=1)
regression_y = regression_df["cnt"]

X_train, X_test, y_train, y_test = train_test_split(regression_X, regression_y, test_size=0.3, random_state=42)

**Perform K-NN for k = 3 With K-fold Cross Validation**

In [124]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to numpy arrays for using in knn_classification_with_euclidean_distance func
y_train = np.array(y_train)
y_test = np.array(y_test)

kf = KFold(n_splits=6, shuffle=True, random_state=42)

r2_values = []
mse_values = []
rmse_values = []

# Perform k-fold cross-validation
for train_index, val_index in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]


    start_time = time.time()
    # Predict using KNN regression
    y_pred_fold = knn_regressor_with_manhattan_distance(X_train_fold, y_train_fold, X_val_fold, 3)
    end_time = time.time()
    
    # mean squared error, r2 score
    # r2 = r2_score(y_val_fold, y_pred_fold)
    mse = mean_squared_error(y_val_fold, y_pred_fold)
    rmse = np.sqrt(mse)

    # r2_values.append(r2)
    mse_values.append(mse)
    rmse_values.append(rmse)
    
# Calculate average mean
# print("Average R2 Score:", np.mean(r2_values))
print(f"Average Mean Squared Error: {np.mean(mse_values)}")
print(f"Average Root Mean Squared error: {np.mean(rmse_values)}")

In [125]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
k_neighbors_predictions = knn.predict(X_test_scaled)
accuracy_score(y_test, k_neighbors_predictions)

**Runtime Performance**

Mean Squared Error (MSE) is a metric commonly used to evaluate the performance of a regression model. It measures the average of the squares of the errors, 
which are the differences between actual values and predicted values. 

MSE quantifies the average squared difference between actual values and predicted values. A smaller MSE indicates better agreement between the predicted and actual values, 
whereas a larger MSE suggests poorer model performance.

In [126]:
print(f'Predict Runtime: {end_time - start_time:.6f} seconds')

# PART 3

In [127]:
from sklearn import svm

class_df = df.drop("Audit_Risk", axis=1)
classification_X = class_df.drop(["Risk"], axis=1)
classification_y = class_df["Risk"]

In [128]:
X_train, X_test, y_train, y_test = train_test_split(classification_X, classification_y, test_size=0.3, random_state=42)

**KNN Accuracy for k = 3 without k-fold**

In [129]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

start_time = time.time()
clf = svm.SVC(kernel='linear')  # Linear Kernel
clf.fit(X_train, y_train)
end_time = time.time()

y_pred = clf.predict(X_test)

In [130]:
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")

**Find Confusion Matrix**

In [131]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

**Find Confusion Matrix & And Roc Values For Each K-fold**

In [132]:
import pylab as pl
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

kf = KFold(n_splits=6, shuffle=True, random_state=42)

# List to store scores for each fold
rocs = []
roc_aucs = []
confusion_matrices = []

for train_index, val_index in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    clf = svm.SVC(kernel='linear')  # Linear Kernel
    

    clf.fit(X_train_fold, y_train_fold)
    y_pred_fold = clf.predict(X_val_fold)
    end_time = time.time()
    
    fpr, tpr, thresholds = roc_curve(y_val_fold, y_pred_fold)
    roc_auc = auc(fpr, tpr)
    roc_aucs.append(roc_auc)

    cm = confusion_matrix(y_val_fold, y_pred_fold)
    confusion_matrices.append(cm)

    ######### IMPORTANT INFO ###########
    # The optimal cut-off would be where tpr is high and fpr is low
    # tpr - (1-fpr) is zero or near to zero is the optimal cut off point
    ####################################

    i = np.arange(len(tpr))  # index for df
    roc = pd.DataFrame(
        {'fpr': pd.Series(fpr, index=i),
         'tpr': pd.Series(tpr, index=i),
         '1-fpr': pd.Series(1 - fpr, index=i),
         'tf': pd.Series(tpr - (1 - fpr), index=i),
         'thresholds': pd.Series(thresholds, index=i)
         })

    rocs.append(roc)

**Show Optimum threshold & ROC Curves** 

In [133]:
 for i, roc in enumerate(rocs):
    # Plot tpr vs 1-fpr
    fig, ax = pl.subplots()
    pl.plot(roc['tpr'], color='black')
    pl.plot(roc['1-fpr'], color='red')
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(f"Area under the {i + 1}. ROC curve : {roc_aucs[i]:.3f}")
    pl.legend(["tpr", "1-fpr"], loc="lower right")

    plt.figure(figsize=(3, 3))
    sns.heatmap(confusion_matrices[i], annot=True)
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('False Label')
    plt.plot()

    roc_optimum_threshold = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]
    print(f"{i + 1}. ROC Optimum threshold= {list(roc_optimum_threshold['thresholds'])}")

    plt.show()
    ax.set_xticklabels([])

**Runtime Performance**

Area Under Curve (AUC) is closer to 1 are more good, robust etc model than others. So as you can see, i get the 1 which optimal thresold 1 to.

In [134]:
print(f'Predict Runtime: {end_time - start_time:.6f} seconds')

# PART 4

**Get Data**

In [135]:
from sklearn.model_selection import train_test_split

regression_df = bike_new
regression_X = regression_df.drop(["cnt"], axis=1)
regression_y = regression_df["cnt"]

X_train, X_test, y_train, y_test = train_test_split(regression_X, regression_y, test_size=0.3, random_state=42)

**Regressor based on the linear SVM**

In [136]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

svr = SVR(kernel='linear', C=6)
start_time = time.time()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
end_time = time.time()

#r2 score, mean squared error & rmse
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Single R2 Score (Before k-fold cross validation): {r2:.3f}')
print(f'Mean Squared Error: {mse:.3f}')
print(f'Root Mean Squared error: {rmse:.3f}')
print(f'Test Score: {svr.score(X_test, y_test):.3f}')
print(f'Train Score: {svr.score(X_train, y_train):.3f}')

In [137]:
# Convert y_train and y_test to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# Initialize the Linear SVR model
svr = SVR(kernel='linear', C=6)
kf = KFold(n_splits=6, shuffle=True, random_state=42)

r2_values = []
mse_values = []
rmse_values = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Predict using LSVM regression
    svr.fit(X_train_fold, y_train_fold)
    y_pred_fold = svr.predict(X_val_fold)

    # mean squared error, r2 score
    r2 = r2_score(y_val_fold, y_pred_fold)
    mse = mean_squared_error(y_val_fold, y_pred_fold)
    rmse = np.sqrt(mse)

    r2_values.append(r2)
    mse_values.append(mse)
    rmse_values.append(rmse)

# Mean squared error and R2 scores
mse_mean = np.mean(mse_values)
r2_mean = np.mean(r2_values)

print(f'Mean Squared Error: {mse_mean:.3f}')
print(f'R2 Score Mean After 6-fold cross validation: {r2_mean:.3f}')

# Final Test

In [138]:
y_pred = svr.predict(X_test)
print(f"Final R2 Score After k-fold cross validation with test data:  {r2_score(y_test, y_pred):.3f}")

**Runtime Performance**

Firstly, I find mse which after k-fold cross validation is better than before one.  

In [139]:
print(f'Predict Runtime: {end_time - start_time:.6f} seconds')

# PART 5

In [140]:
from sklearn import svm

class_df = df.drop("Audit_Risk", axis=1)
classification_X = class_df.drop(["Risk"], axis=1)
classification_y = class_df["Risk"]

X_train, X_test, y_train, y_test = train_test_split(classification_X, classification_y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert y_train and y_test to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

clf = svm.SVC(kernel='rbf')  # Rbf Kernel
start_time = time.time()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
end_time = time.time()

In [141]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

**Find Confusion Matrix**

In [142]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

**Find Confusion Matrix & And Roc Values For Each K-fold**

In [143]:
import numpy as np
import pylab as pl
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

kf = KFold(n_splits=6, shuffle=True, random_state=42)

# List to store scores for each fold
rocs = []
roc_aucs = []
confusion_matrices = []

for train_index, val_index in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    clf = svm.SVC(kernel='rbf')  # Linear Kernel
    clf.fit(X_train_fold, y_train_fold)
    y_pred_fold = clf.predict(X_val_fold)

    fpr, tpr, thresholds = roc_curve(y_val_fold, y_pred_fold)
    roc_auc = auc(fpr, tpr)
    roc_aucs.append(roc_auc)

    cm = confusion_matrix(y_val_fold, y_pred_fold)
    confusion_matrices.append(cm)

    ######### IMPORTANT INFO ###########
    # The optimal cut-off would be where tpr is high and fpr is low
    # tpr - (1-fpr) is zero or near to zero is the optimal cut off point
    ####################################

    i = np.arange(len(tpr))  # index for df
    roc = pd.DataFrame(
        {'fpr': pd.Series(fpr, index=i),
         'tpr': pd.Series(tpr, index=i),
         '1-fpr': pd.Series(1 - fpr, index=i),
         'tf': pd.Series(tpr - (1 - fpr), index=i),
         'thresholds': pd.Series(thresholds, index=i)
         })

    rocs.append(roc)

**Show Optimum threshold & ROC Curves** 

In [144]:
 for i, roc in enumerate(rocs):
    # Plot tpr vs 1-fpr
    fig, ax = pl.subplots()
    pl.plot(roc['tpr'], color='black')
    pl.plot(roc['1-fpr'], color='red')
    pl.xlabel('1-False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(f"Area under the {i + 1}. ROC curve : {roc_aucs[i]}")
    pl.legend(["tpr", "1-fpr"], loc="lower right")

    plt.figure(figsize=(3, 3))
    sns.heatmap(confusion_matrices[i], annot=True)
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('False Label')
    plt.plot()
    ax.set_xticklabels([])

    roc_optimum_threshold = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]
    print(f"{i + 1}-fold ROC Optimum threshold= {list(roc_optimum_threshold['thresholds'])}")

plt.show()


**Runtime Performance**

In [145]:
print(f'Predict Runtime: {end_time - start_time:.6f} seconds')

# PART 6

In [146]:
from sklearn.tree import DecisionTreeClassifier

class_df = df.drop("Audit_Risk", axis=1)
classification_X = class_df.drop(["Risk"], axis=1)
classification_y = class_df["Risk"]

X_train, X_test, y_train, y_test = train_test_split(classification_X, classification_y, test_size=0.3, random_state=42)

dt = DecisionTreeClassifier(random_state=1)
start_time = time.time()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
end_time = time.time()

accuracy = dt.score(X_test, y_test)  # we can also find accuracy: print(accuracy_score(y_test, y_pred))
print(f'Test set accuracy: {accuracy:.3f}')

**Pruning Strategies**

I am try to use min_impurity_decrease and max_depth parameters. 
The **min_impurity_decrease** parameter controls for how much the impurity of a node must be reduced by splitting it, and the **max_depth** parameter controls the maximum depth of the tree. 
By setting these parameters, appropriately, i can prune the tree to prevent it from overfitting the data.

**Different 2 Pruning Strategies**

In [147]:
# first second pruning strategies using min_impurity_decrease
dt_min_impurity = DecisionTreeClassifier(min_impurity_decrease=0.01, random_state=1)
dt_min_impurity.fit(X_train, y_train)

# second pruning strategies using max_depth
dt_max_depth = DecisionTreeClassifier(max_depth=5, random_state=1)
dt_max_depth.fit(X_train, y_train)

print(f'Test set accuracy (min_impurity_decrease): {dt_min_impurity.score(X_test, y_test):.3f}')
print(f'Test set accuracy (max_depth): {dt_max_depth.score(X_test, y_test):.3f}')

**Apply K-Fold Cross Validation**

In [148]:
import numpy as np
from sklearn.model_selection import KFold

kf = KFold(n_splits=6, shuffle=True, random_state=42)
accuracy_scores = []
fold_number = 1

X_train, X_test, y_train, y_test = train_test_split(classification_X, classification_y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

y_train = np.array(y_train)
y_test = np.array(y_test)

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    dt = DecisionTreeClassifier(random_state=1)
    dt.fit(X_train_fold, y_train_fold)
    accuracy = dt.score(X_val_fold, y_val_fold)
    accuracy_scores.append(accuracy)
    
    print(f"{fold_number}. Fold Cross Validation accuracy: {accuracy}")
    fold_number +=1

print(f"\nK-Fold Cross Validation accuracy mean: {np.mean(accuracy_scores)}")

In [149]:
from sklearn.tree import _tree

def tree_to_set_of_rules(tree, features):
    tree_ = tree.tree_
    feature_name = [
        features[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def predict({}):".format(", ".join(features)))

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, np.round(threshold, 2)))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, np.round(threshold, 2)))
            recurse(tree_.children_right[node], depth + 1)
        else:
            # print("{}return {} {} {}".format(indent, tree_.value[node], "samples", tree_.n_node_samples[node])) #> if you wanna see classification
            print("{}return {} {}".format(indent, tree_.n_node_samples[node], "samples"))  # I will use this, because a bit simple and clean

    recurse(0, 1)

In [150]:
X_train, X_test, y_train, y_test = train_test_split(classification_X, classification_y, test_size=0.3, random_state=42)
model = dt.fit(X_train, y_train)
tree_to_set_of_rules(dt, X_train.columns)

In [151]:
# to compare my output and library function's output,
from sklearn import tree
text_representation = tree.export_text(dt)
print(text_representation)

**Report**

In [152]:
print(f'Predict Runtime: {end_time - start_time:.6f} seconds')

# PART 7

In [153]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

regression_df = bike_new
regression_X = regression_df.drop(["cnt"], axis=1)
regression_y = regression_df["cnt"]

X_train, X_test, y_train, y_test = train_test_split(regression_X, regression_y, test_size=0.3, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

y_train = np.array(y_train)
y_test = np.array(y_test)

# Detect single runtime performance
dt = DecisionTreeRegressor(random_state=1, max_depth=5, min_impurity_decrease=0.1)
start_time = time.time()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
end_time = time.time()

**Apply K-Fold Cross Validation**

In [154]:
fold_number = 1
accuracy_scores = []
kf = KFold(n_splits=6, shuffle=True, random_state=42)

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
    dt = DecisionTreeRegressor(random_state=1, max_depth=5, min_impurity_decrease=0.1)
    dt.fit(X_train_fold, y_train_fold)
    y_pred = dt.predict(X_val_fold)
    
    accuracy = dt.score(X_val_fold, y_val_fold)  # we can also find accuracy: print(accuracy_score(y_test, y_pred))
    print(f"{fold_number}. Fold Cross Validation accuracy: {accuracy}")
    fold_number +=1

print("\n")
tree_to_set_of_rules(dt, regression_X.columns) # we can use again same tree_to_code function

**Final Test**

In [155]:
y_pred = dt.predict(X_test)
print(f"Final R2 Score After k-fold cross validation with test data:  {r2_score(y_test, y_pred):.3f}")

**Performance**
As you see in the above outputs, K-cross validation's predict value is a bit higher than final predeict value.  

In [156]:
print(f'Predict Runtime: {end_time - start_time:.6f} seconds')