In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from scipy.spatial.distance import euclidean
from IPython.display import Image
plt.rcParams['figure.figsize'] = [10, 5]

# For results repeatability
np.random.seed(0)

# Dataset preprocessing

## Dataset 1

A function to load and preprocess data in the "adult" format. As a first approach, we simply remove rows with missing values.

In [None]:
def get_clean_dataset1(data_file, test=False):
    """
    Load a header-less CSV file in the "adult" format and turn it into a numeric feature table.

    Input: path (or file-like object) of the data file, and a flag telling whether the file uses the
    test-set spelling of the positive label (">50K." with a trailing dot) instead of ">50K".

    Output: (features, labels). features is a dataframe holding every column except "salary", where
    text columns are one-hot encoded and numeric columns are kept unchanged. labels is a 0/1 series,
    1 meaning the salary is above 50K. Rows containing a missing value ("?") are dropped.
    """
    dataset = pd.read_csv(data_file, header=None)
    dataset.columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","salary"]

    # Strip BEFORE filtering so the missing marker is found whatever whitespace surrounds it
    df_strings = dataset.select_dtypes(['object'])
    dataset[df_strings.columns] = df_strings.apply(lambda x: x.str.strip())
    dataset = dataset[(dataset != "?").all(axis=1)].reset_index(drop=True)

    # Collect the encoded columns and concatenate once (all pieces share the same index)
    pieces = []
    for col in dataset.columns[:-1]:
        if dataset[col].dtype == 'O':
            pieces.append(pd.get_dummies(dataset[col], prefix=col))
        else:
            pieces.append(dataset[col])
    clean_dataset = pd.concat(pieces, axis=1)

    positive_label = ">50K." if test else ">50K"
    labels = (dataset["salary"] == positive_label) * 1
    return clean_dataset, labels

### Training / Validation dataset

In [None]:
dataset_adult, labels_adult = get_clean_dataset1("data/adult.data")

### Test dataset

In [None]:
test_dataset_adult, test_labels_adult = get_clean_dataset1("data/adult.test", test=True)

In [None]:
# The adult test split contains nobody born in Holand-Netherlands, so one-hot encoding produces no
# 'native-country_Holand-Netherlands' column. Reindex on the training columns so that the test set has
# exactly the same features, in the same order, as the training set (the missing one is filled with 0).
test_dataset_adult = test_dataset_adult.reindex(columns=dataset_adult.columns, fill_value=0)

### Understanding the dataset

We try to visualize if some basic features one can easily think of seem to have an influence on salary (like sex or education).

In [None]:
# The one-hot encoded `dataset_adult` no longer holds the raw "sex" / "salary" columns, so the raw
# file is reloaded here (rows with a missing value are dropped, as in the preprocessing step).
adult_columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","salary"]
raw_adult = pd.read_csv("data/adult.data", header=None, names=adult_columns, skipinitialspace=True)
raw_adult = raw_adult[(raw_adult != "?").all(axis=1)]

is_male = raw_adult["sex"] == "Male"
is_female = raw_adult["sex"] == "Female"
is_low = raw_adult["salary"] == "<=50K"
is_high = raw_adult["salary"] == ">50K"

male_low = int((is_male & is_low).sum())
male_high = int((is_male & is_high).sum())
female_low = int((is_female & is_low).sum())
female_high = int((is_female & is_high).sum())
fisher_tab = np.array([[male_low, male_high], [female_low, female_high]])

barWidth = 0.2
bars1 = [female_high, male_high]
bars2 = [female_low, male_low]
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]

# Legend labels match the two salary categories of the dataset
plt.bar(r1, bars1, width = barWidth, color = 'blue', edgecolor = 'black', label='>50K')
plt.bar(r2, bars2, width = barWidth, color = 'cyan', edgecolor = 'black', label='<=50K')

plt.xticks([r + barWidth for r in range(len(bars1))], ['Female', 'Male'])
plt.legend()

plt.show()

In [None]:
# The one-hot encoded `dataset_adult` no longer holds the raw "native-country" / "salary" columns, so
# the raw file is reloaded here (rows with a missing value are dropped, as in the preprocessing step).
adult_columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","salary"]
raw_adult = pd.read_csv("data/adult.data", header=None, names=adult_columns, skipinitialspace=True)
raw_adult = raw_adult[(raw_adult != "?").all(axis=1)]

is_US = raw_adult["native-country"] == "United-States"
is_low = raw_adult["salary"] == "<=50K"
is_high = raw_adult["salary"] == ">50K"

US_low = int((is_US & is_low).sum())
US_high = int((is_US & is_high).sum())
non_US_low = int((~is_US & is_low).sum())
non_US_high = int((~is_US & is_high).sum())

barWidth = 0.2
bars1 = [US_high, non_US_high]
# Low-salary bars: one per group (the second entry used to be US_high by mistake)
bars2 = [US_low, non_US_low]
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]

# Legend labels match the two salary categories of the dataset
plt.bar(r1, bars1, width = barWidth, color = 'blue', edgecolor = 'black', label='>50K')
plt.bar(r2, bars2, width = barWidth, color = 'cyan', edgecolor = 'black', label='<=50K')

plt.xticks([r + barWidth for r in range(len(bars1))], ['US', 'Non US'])
plt.legend()

plt.show()

## Dataset 2

In [None]:
def get_clean_dataset2(data_file):
    """
    Load a header-less CSV file in the king-rook-vs-king ("krkopt") format and turn it into a numeric
    feature table.

    Input: path (or file-like object) of the data file.

    Output: (features, labels). features is a dataframe holding every column except "outcome", where
    text columns are one-hot encoded and numeric columns are kept unchanged. labels is a 0/1 series,
    1 meaning the outcome is anything other than "draw".
    """
    dataset = pd.read_csv(data_file, header=None)
    dataset.columns = ["white_king_column","white_king_row","white_rook_column","white_rook_row","black_king_column","black_king_row", "outcome"]
    df_strings = dataset.select_dtypes(['object'])
    dataset[df_strings.columns] = df_strings.apply(lambda x: x.str.strip())

    # Concatenate the encoded columns side by side. Left-joining them onto an empty DataFrame keeps
    # the empty index of the left operand and therefore yields a table with no rows at all.
    pieces = []
    for col in dataset.columns[:-1]:
        if dataset[col].dtype == 'O':
            pieces.append(pd.get_dummies(dataset[col], prefix=col))
        else:
            pieces.append(dataset[col])
    clean_dataset = pd.concat(pieces, axis=1)

    labels = (dataset["outcome"] != "draw") * 1
    return clean_dataset, labels

### Training / Validation dataset

In [None]:
dataset_chess, labels_chess = get_clean_dataset2("data/krkopt.data")

### Test dataset

In [None]:
test_dataset_chess, test_labels_chess = get_clean_dataset2("data/krkopt.test")

# Utils

### Plotting function

In [None]:
def plotMeanAndStd(stats, x, color='b', ax = None, legend=None):
    """
    Plot a mean curve surrounded by a shaded band of one standard deviation on each side.

    Input : stats, a sequence of (mean, std) tuples, and x, their x coordinates
            color : colour used for both the curve and the band
            ax : axes to draw on; when None the current pyplot figure is used
            legend : label attached to the curve
    Output : None
    """
    mean = np.array([s[0] for s in stats])
    standard_dev = np.array([s[1] for s in stats])
    # pyplot exposes the same plot / fill_between calls as an Axes object
    target = plt if ax is None else ax
    target.plot(x, mean, c=color, label=legend)
    target.fill_between(x, mean-standard_dev, mean+standard_dev, alpha=0.2, color=color)

### Error measure

In [None]:
def error(predicted_labels, real_labels, loss="euclidean"):
    """
    Input: the labels an algorithm predicted and the real labels corresponding to the data (array-like,
    compared position by position), and the type of loss we want to use ("euclidean" or "manhattan").

    Output: the computed loss. For 0/1 labels the manhattan loss is the number of misclassified
    samples and the euclidean loss is its square root.

    Raises: ValueError if the loss name is unknown.
    """
    if loss == "euclidean":
        return euclidean(predicted_labels, real_labels)
    if loss == "manhattan":
        # Convert to plain arrays so that a pandas index never drives the alignment
        return np.abs(np.asarray(predicted_labels) - np.asarray(real_labels)).sum()
    raise ValueError(f"Unknown loss '{loss}', expected 'euclidean' or 'manhattan'")
    
    

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

### Features normalization

We add the possibility to perform data normalization with scikit-learn's scaler to avoid having overweighted features.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
def normalize(dataset):
    """
    Input: a dataset (pandas dataframe)

    Output: the same dataframe, whose columns have been overwritten in place with the values produced
    by a scikit-learn StandardScaler fitted on this very dataset
    """
    feature_names = dataset.columns
    standardizer = StandardScaler().fit(dataset)
    dataset[feature_names] = standardizer.transform(dataset[feature_names])
    return dataset

### Features weighting

In [None]:
def weight_features(dataset, weights):
    """
    Input: a dataset (pandas dataframe), and a dictionary mapping feature (column) names to the weight
    reflecting their importance

    Output: None, the dataset is modified in place: every listed column is multiplied by its weight
    """
    for feature, factor in weights.items():
        dataset[feature] = dataset[feature] * factor
        

# Cross validation

In [None]:
def cross_validation(algo, dataset_, labels_, loss_="manhattan", folds = 5, algo_kwargs={}):
    """
    Evaluate a predictor with a k-fold cross validation.

    Input : algo : predictor function called as algo(training set, training labels, set to predict,
                   **algo_kwargs) and returning the predicted labels
            dataset_ and labels_ : the data and the corresponding labels
            loss_ : name of the loss forwarded to the error function
            folds : number of folds
            algo_kwargs : a dict with additional params for the algo : ex. {'n_neighbors':5}
    Output : tuple (mean of training precisions, standard deviation of training precisions,
                    mean of validation precisions, standard deviation of validation precisions)
    """
    n_samples = dataset_.shape[0]
    # Give every row a fold number (0 .. folds-1, as balanced as possible), then shuffle the assignment
    fold_of_row = np.tile(np.arange(folds), int(n_samples/folds)+1)[:n_samples]
    np.random.shuffle(fold_of_row)

    def precision(predicted, expected):
        # Share of the samples that is not counted as an error by the chosen loss
        return (len(expected) - error(predicted, expected, loss = loss_))/len(expected)

    training_precisions = []
    validation_precisions = []
    for fold in range(folds):
        held_out = fold_of_row == fold
        kept = fold_of_row != fold
        fit_features, fit_labels = dataset_[kept], labels_[kept]
        held_features, held_labels = dataset_[held_out], labels_[held_out]

        # Precision on the data the model was trained on
        fit_predictions = algo(fit_features, fit_labels, fit_features, **algo_kwargs)
        training_precisions.append(precision(fit_predictions, fit_labels))

        # Precision on the held-out fold
        held_predictions = algo(fit_features, fit_labels, held_features, **algo_kwargs)
        validation_precisions.append(precision(held_predictions, held_labels))

    return (np.mean(training_precisions), np.std(training_precisions),
            np.mean(validation_precisions), np.std(validation_precisions))

In [None]:
def data_size_influence(algo, dataset_, labels_, Ns=[], loss_="manhattan", folds = 5, algo_kwargs={}, show_time=False, visualize=True):
    """
    Study how the size of the training / validation dataset influences the cross-validated precision.

    Input : algo : predictor function, as expected by cross_validation
            dataset_ and labels_ : the data and the corresponding labels
            Ns : list of sample sizes to evaluate
            loss_, folds, algo_kwargs : forwarded to cross_validation
            show_time : whether computation times are measured, plotted and returned
            visualize : whether the curves are plotted
    Output : (mean_trainings, std_trainings, mean_validations, std_validations), one entry per size
             in Ns, followed by the list of computation times (seconds) when show_time is True
    """
    nrows = dataset_.shape[0]
    mean_trainings = []
    std_trainings = []
    mean_validations = []
    std_validations = []
    times = []

    for N in Ns:
        # Sample distinct rows: duplicated rows would end up in both the training and the validation
        # folds and inflate the validation precision. Replacement is only used if N exceeds nrows.
        indices = np.random.choice(nrows, N, replace=bool(N > nrows))
        d = dataset_.iloc[indices]
        l = labels_.iloc[indices]

        t = time.time()
        mean_training, std_training, mean_validation, std_validation = cross_validation(algo, d, l, loss_=loss_, folds = folds, algo_kwargs=algo_kwargs)
        times.append(time.time() - t)

        mean_trainings.append(mean_training)
        std_trainings.append(std_training)
        mean_validations.append(mean_validation)
        std_validations.append(std_validation)

    if visualize:
        fig = plt.figure(figsize=(10,7))
        fig.suptitle("Influence of training/validation dataset size")
        ax1 = fig.gca()
        plotMeanAndStd(list(zip(mean_trainings, std_trainings)), Ns, ax= ax1, legend = "Training", color='red')
        plotMeanAndStd(list(zip(mean_validations, std_validations)), Ns, ax= ax1, legend = "Validation", color='blue')
        ax1.legend(loc="lower right")
        ax1.set_ylabel("Precision")
        ax1.set_xlabel("Dataset sample size")
        ax1.grid()

        if show_time:
            # Computation time shares the x axis but gets its own y axis on the right
            ax5 = ax1.twinx()
            ax5.set_ylabel("Computation time (s)")
            ax5.plot(Ns, times, label ="Computation time", c="orange")

        fig.tight_layout()
        plt.show()

    if show_time: return (mean_trainings, std_trainings, mean_validations, std_validations, times)
    else: return (mean_trainings, std_trainings, mean_validations, std_validations)

# Knn

### Knn algorithm implementation

We use scikit-learn's knn function to design a knn classifier.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def knn(training_features, training_labels, to_predict_features,
        n_neighbors=5, weights = "uniform", algorithm="auto", p=2):
    """
    Input: training data (features and labels), features for which we want to predict the labels,
    number of neighbors k for the knn algorithm, neighbors weighting system ('uniform' or 'distance'),
    algorithm used to find the k closest neighbors, and p, the power used in the Minkowski distance
    (p=1 gives a Manhattan distance, p=2 a Euclidean distance).

    Output: Numpy array containing the labels predicted by KNN for the given 'to_predict_features'
    """
    classifier = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        algorithm=algorithm,
        p=p,
    )
    # fit returns the fitted classifier itself, so the prediction can be chained
    return classifier.fit(training_features, training_labels).predict(to_predict_features)

### Let us study the influence of the hyperparameter K

Let us study the influence of the hyperparameter k (number of neighbors) on KNN algorithm's performance.

In [None]:
def n_neighbors_influence_multiple_datasize(Ks, dataset_, labels_, Ns, loss_="manhattan",
                                            weights = "uniform", folds = 5, show_time=False, 
                                            visualize=True):
    """
    Study the KNN precision for several values of k and several dataset sample sizes.

    Input : Ks : list of numbers of neighbors to evaluate
            dataset_ and labels_ : the data and the corresponding labels
            Ns : list of sample sizes, forwarded to data_size_influence
            loss_, folds : forwarded to data_size_influence
            weights : neighbors weighting system used by knn ('uniform' or 'distance')
            show_time : whether computation times are measured, plotted and returned
            visualize : whether the curves are plotted
    Output : four dicts indexed by k (training means, training stds, validation means, validation
             stds, each value being a list with one entry per size in Ns), followed by a dict of
             computation times when show_time is True
    """
    mean_training_per_k = dict()
    std_training_per_k = dict()
    mean_validation_per_k = dict()
    std_validation_per_k = dict()
    time_per_k = dict()

    for k in Ks:
        print(k)  # progress indicator, each k can take a while
        outcome = data_size_influence(knn, dataset_, labels_, Ns, loss_=loss_, folds = folds,
                                      algo_kwargs={"n_neighbors":k, "weights":weights},
                                      show_time=show_time, visualize=False)
        mean_training_per_k[k] = outcome[0]
        std_training_per_k[k] = outcome[1]
        mean_validation_per_k[k] = outcome[2]
        std_validation_per_k[k] = outcome[3]
        # data_size_influence appends the times as a fifth element only when show_time is set
        if show_time: time_per_k[k] = outcome[4]

    if visualize:
        n_panels = 3 if show_time else 2
        fig = plt.figure(figsize=(15,10))
        fig.suptitle("Influence of K (in k nearest neighbor) and training/validation dataset size")
        cmap = plt.get_cmap("tab10")

        ax1 = fig.add_subplot(1, n_panels, 1)
        ax1.title.set_text("Training precision")
        for curveID, k in enumerate(Ks):
            # Cycle through the colormap so that more than cmap.N curves still get valid colours
            plotMeanAndStd(list(zip(mean_training_per_k[k], std_training_per_k[k])), Ns,
                           legend="K = "+str(k), ax = ax1, color = cmap(curveID % cmap.N))
        ax1.set_ylim(0,1.05)
        ax1.set_xlim(Ns[0],Ns[-1])
        ax1.grid()
        ax1.legend()

        ax2 = fig.add_subplot(1, n_panels, 2)
        ax2.title.set_text("Validation precision")
        for curveID, k in enumerate(Ks):
            plotMeanAndStd(list(zip(mean_validation_per_k[k], std_validation_per_k[k])), Ns,
                           legend="K = "+str(k), ax = ax2, color = cmap(curveID % cmap.N))
        ax2.set_ylim(0,1.05)
        ax2.set_xlim(Ns[0],Ns[-1])
        ax2.grid()
        ax2.legend()

        if show_time:
            ax5 = fig.add_subplot(1, n_panels, 3)
            ax5.title.set_text("Computation time")
            for k in Ks:
                ax5.plot(Ns, time_per_k[k], label=f"k={k}")
            ax5.grid()
            ax5.legend()

        fig.tight_layout()
        plt.show()

    if show_time: return (mean_training_per_k, std_training_per_k, mean_validation_per_k, std_validation_per_k, time_per_k)
    else: return (mean_training_per_k, std_training_per_k, mean_validation_per_k, std_validation_per_k)
    

In [None]:
nrows = dataset_adult.shape[0]
Ks = list(range(1, 50, 2))
Ns = [int(fraction * nrows) for fraction in [0.25, 0.5, 0.75, 1]]
# Long computation, disabled by default: its saved output is displayed in the next cell.
# n_neighbors_influence_multiple_datasize(Ks, dataset_adult, labels_adult, Ns, loss_="manhattan",
#                                         folds = 5, show_time=True, visualize=True)



In [None]:
Image("img/dataset_size_influence.png")

Considering the above results, when working on hyperparameter selection we use a fixed dataset size and only vary k.

In [None]:
def n_neighbors_influence_fixed_datasize(Ks, dataset_, labels_, N, loss_="manhattan",
                                         weights = "uniform",folds = 5, show_time=False,
                                         visualize=True):
    """
    Study the KNN precision for several values of k, on samples of a fixed size.

    Input : Ks : list of numbers of neighbors to evaluate
            dataset_ and labels_ : the data and the corresponding labels
            N : size of the sample drawn (anew for every k) from the dataset
            loss_, folds : forwarded to cross_validation
            weights : neighbors weighting system used by knn ('uniform' or 'distance')
            show_time : whether computation times are measured, plotted and returned
            visualize : whether the curves are plotted
    Output : (mean_trainings, std_trainings, mean_validations, std_validations), one entry per k,
             followed by the list of computation times (seconds) when show_time is True
    """
    nrows = dataset_.shape[0]
    mean_trainings = []
    std_trainings = []
    mean_validations = []
    std_validations = []
    times = []

    for k in Ks:
        print(k)  # progress indicator, each k can take a while
        # Sample distinct rows: duplicated rows would end up in both the training and the validation
        # folds and inflate the validation precision. Replacement is only used if N exceeds nrows.
        indices = np.random.choice(nrows, N, replace=bool(N > nrows))
        d = dataset_.iloc[indices]
        l = labels_.iloc[indices]

        t = time.time()
        mean_training, std_training, mean_validation, std_validation = cross_validation(knn, d, l, loss_=loss_, folds = folds, algo_kwargs={"n_neighbors": k, "weights":weights})
        times.append(time.time() - t)

        mean_trainings.append(mean_training)
        std_trainings.append(std_training)
        mean_validations.append(mean_validation)
        std_validations.append(std_validation)

    if visualize:
        fig = plt.figure(figsize=(15,10))
        ax1 = fig.gca()
        plotMeanAndStd(list(zip(mean_trainings, std_trainings)), Ks, ax = ax1, color = 'b', legend="Training")
        plotMeanAndStd(list(zip(mean_validations, std_validations)), Ks, ax = ax1, color = 'r', legend="Validation")
        ax1.set_ylim(0,1.1)
        ax1.grid()
        ax1.title.set_text("Precision")
        ax1.set_xlabel("K")
        ax1.set_ylabel("Precision")
        ax1.legend()
        ax1.set_xlim(min(Ks), max(Ks))
        if show_time:
            # Secondary y axis: a separate subplot would be drawn on top of the precision curves
            ax5 = ax1.twinx()
            ax5.set_ylabel("Computation time (s)")
            ax5.plot(Ks, times, label="Computation time", c="orange")

        plt.show()

    if show_time: return (mean_trainings, std_trainings, mean_validations, std_validations, times)
    else: 
        return (mean_trainings, std_trainings, mean_validations, std_validations)
    

### Knn results for dataset 1

First we create a normalized version of the dataset, as well as a version where, after normalization, features are weighted according to the importance a decision tree algorithm gives them. As a trade-off between computation time and precision, we only work with a subset whose size is 75% of the original one.

In [None]:
nrows = dataset_adult.shape[0]
N = int(0.75 * nrows)
# Draw the 75% subset without replacement so that it contains N distinct rows
indices = np.random.choice(nrows, N, replace=False)
dataset_adult_75 = dataset_adult.iloc[indices]
labels_adult_75 = labels_adult.iloc[indices]

# Fit the scaler on the training subset only, and reuse it for the test set: both sets must go
# through the same transformation for distances between them to be meaningful.
adult_scaler = StandardScaler().fit(dataset_adult_75)
normalized_dataset_adult = pd.DataFrame(adult_scaler.transform(dataset_adult_75),
                                        columns=dataset_adult_75.columns,
                                        index=dataset_adult_75.index)

# Same columns, in the same order, as the training subset
aligned_test_dataset_adult = test_dataset_adult.reindex(columns=dataset_adult_75.columns, fill_value=0)
normalized_test_dataset_adult = pd.DataFrame(adult_scaler.transform(aligned_test_dataset_adult),
                                             columns=aligned_test_dataset_adult.columns,
                                             index=aligned_test_dataset_adult.index)

In [None]:
# NOTE(review): features_importance is defined in a later cell of this notebook (section "Identify the
# features considered as most important from decision tree"); that cell must be run before this one.
weighted_dataset_adult = normalized_dataset_adult.copy(deep=True)
# One weight per feature: the importance given by a decision tree fitted on the normalized training data
weights = features_importance(normalized_dataset_adult, labels_adult_75, criterion='gini', splitter='best')
weight_features(weighted_dataset_adult, weights)


# The test features are scaled with the weights computed on the training data
weighted_test_dataset_adult = normalized_test_dataset_adult.copy(deep=True)
weight_features(weighted_test_dataset_adult, weights)


#### Selecting best hyperparameter

###### Uniform weights, no normalization

In [None]:
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 15)), dataset_adult_75, labels_adult_75, N, loss_="manhattan",
                                     weights="uniform", folds = 5, show_time=False, visualize=True)
print(f"Best result is {max(mean_validations)} for k = {mean_validations.index(max(mean_validations)) + 1}")

Considering the above results, we choose to use the hyperparameter k =.

###### Uniform weights, normalization

In [None]:
# normalized_dataset_adult is built from the 75% subset, so it must be paired with labels_adult_75
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 15)), normalized_dataset_adult, labels_adult_75, N, loss_="manhattan",
                                     weights="uniform", folds = 5, show_time=False, visualize=True)
print(f"Best result is {max(mean_validations)} for k = {mean_validations.index(max(mean_validations)) + 1}")

Considering the above results, we choose to use the hyperparameter k =.

###### Uniform weights, normalization and weighted features

In [None]:
# weighted_dataset_adult is built from the 75% subset, so it must be paired with labels_adult_75
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 15)), weighted_dataset_adult, labels_adult_75, N, loss_="manhattan",
                                     weights="uniform", folds = 5, show_time=False, visualize=True)

print(f"Best result is {max(mean_validations)} for k = {mean_validations.index(max(mean_validations)) + 1}")

Considering the above results, we choose to use the hyperparameter k =.

###### Distance weights, no normalization 

In [None]:
(mean_trainings, std_trainings, mean_validations, std_validations) =n_neighbors_influence_fixed_datasize(list(range(1, 15)), dataset_adult_75, labels_adult_75, N, loss_="manhattan",
                                     weights="distance", folds = 5, show_time=False, visualize=True)
print(f"Best result is {max(mean_validations)} for k = {mean_validations.index(max(mean_validations)) + 1}")

Considering the above results, we choose to use the hyperparameter k =.

###### Distance weights, normalization 

In [None]:
# normalized_dataset_adult is built from the 75% subset, so it must be paired with labels_adult_75
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 15)), normalized_dataset_adult, labels_adult_75, N, loss_="manhattan",
                                     weights="distance", folds = 5, show_time=False, visualize=True)
print(f"Best result is {max(mean_validations)} for k = {mean_validations.index(max(mean_validations)) + 1}")

Considering the above results, we choose to use the hyperparameter k =.

###### Distance weights, normalization and weighted features

In [None]:
# weighted_dataset_adult is built from the 75% subset, so it must be paired with labels_adult_75.
# This section studies distance-weighted neighbors, hence weights="distance".
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 15)), weighted_dataset_adult, labels_adult_75, N, loss_="manhattan",
                                     weights="distance", folds = 5, show_time=False, visualize=True)
print(f"Best result is {max(mean_validations)} for k = {mean_validations.index(max(mean_validations)) + 1}")

Considering the above results, we choose to use the hyperparameter k =.

#### Results on test dataset

##### Uniform weights, no normalization

In [None]:
k1 = 3

In [None]:
predicted_labels1 = knn(dataset_adult_75, labels_adult_75, test_dataset_adult, n_neighbors=k1, weights = "uniform", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision1 = (len(test_labels_adult) - error(predicted_labels1, test_labels_adult, loss="manhattan"))/len(test_labels_adult)
print(f"Precision on the test dataset is {precision1}")
disp = ConfusionMatrixDisplay(confusion_matrix(test_labels_adult, predicted_labels1))
disp.plot()

##### Uniform weights, normalization

In [None]:
k2 = 5

In [None]:
predicted_labels2 = knn(normalized_dataset_adult, labels_adult_75, normalized_test_dataset_adult, n_neighbors=k2, weights = "uniform", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision2 = (len(test_labels_adult) - error(predicted_labels2, test_labels_adult, loss="manhattan"))/len(test_labels_adult)
print(f"Precision on the test dataset is {precision2}")
ConfusionMatrixDisplay.from_predictions(test_labels_adult, predicted_labels2)

##### Uniform weights, normalization and weighted features

In [None]:
k3 = 5

In [None]:
predicted_labels3 = knn(weighted_dataset_adult, labels_adult_75, weighted_test_dataset_adult, n_neighbors=k3, weights = "uniform", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision3 = (len(test_labels_adult) - error(predicted_labels3, test_labels_adult, loss="manhattan"))/len(test_labels_adult)
print(f"Precision on the test dataset is {precision3}")
ConfusionMatrixDisplay.from_predictions(test_labels_adult, predicted_labels3)

##### Distance weights, no normalization

In [None]:
k4 = 5

In [None]:
# Features and labels must come from the same 75% subset
predicted_labels4 = knn(dataset_adult_75, labels_adult_75, test_dataset_adult, n_neighbors=k4, weights = "distance", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision4 = (len(test_labels_adult) - error(predicted_labels4, test_labels_adult, loss="manhattan"))/len(test_labels_adult)
print(f"Precision on the test dataset is {precision4}")
ConfusionMatrixDisplay.from_predictions(test_labels_adult, predicted_labels4)

##### Distance weights,  normalization

In [None]:
k5 = 5

In [None]:
predicted_labels5 = knn(normalized_dataset_adult, labels_adult_75, normalized_test_dataset_adult, n_neighbors=k5, weights = "distance", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision5 = (len(test_labels_adult) - error(predicted_labels5, test_labels_adult, loss="manhattan"))/len(test_labels_adult)
print(f"Precision on the test dataset is {precision5}")
ConfusionMatrixDisplay.from_predictions(test_labels_adult, predicted_labels5)

##### Distance weights,  normalization and weighted features

In [None]:
k6 = 5

In [None]:
predicted_labels6 = knn(weighted_dataset_adult, labels_adult_75, weighted_test_dataset_adult, n_neighbors=k6, weights = "distance", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision6 = (len(test_labels_adult) - error(predicted_labels6, test_labels_adult, loss="manhattan"))/len(test_labels_adult)
print(f"Precision on the test dataset is {precision6}")
ConfusionMatrixDisplay.from_predictions(test_labels_adult, predicted_labels6)

In [None]:
# TO DO : print precision of test dataset, add confusion matrix

### Knn results for dataset 2

First we create a normalized version of the dataset, as well as a version where, after normalization, features are weighted according to the importance a decision tree algorithm gives them.

In [None]:
# Fit the scaler on the training data only, and reuse it for the test set: both sets must go
# through the same transformation for distances between them to be meaningful.
chess_scaler = StandardScaler().fit(dataset_chess)
normalized_dataset_chess = pd.DataFrame(chess_scaler.transform(dataset_chess),
                                        columns=dataset_chess.columns,
                                        index=dataset_chess.index)

# Same columns, in the same order, as the training set
aligned_test_dataset_chess = test_dataset_chess.reindex(columns=dataset_chess.columns, fill_value=0)
normalized_test_dataset_chess = pd.DataFrame(chess_scaler.transform(aligned_test_dataset_chess),
                                             columns=aligned_test_dataset_chess.columns,
                                             index=aligned_test_dataset_chess.index)

In [None]:
# NOTE(review): features_importance is defined in a later cell of this notebook (section "Identify the
# features considered as most important from decision tree"); that cell must be run before this one.
weighted_dataset_chess = normalized_dataset_chess.copy(deep=True)
# One weight per feature: the importance given by a decision tree fitted on the normalized training data
weights = features_importance(normalized_dataset_chess, labels_chess, criterion='gini', splitter='best')
weight_features(weighted_dataset_chess, weights)


# The test features are scaled with the weights computed on the training data
weighted_test_dataset_chess = normalized_test_dataset_chess.copy(deep=True)
weight_features(weighted_test_dataset_chess, weights)


#### Selecting best hyperparameter

##### Uniform weights, no normalization

In [None]:
# Sample size is the chess dataset's own row count (`nrows` holds the adult dataset's one)
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 50)), dataset_chess, labels_chess, dataset_chess.shape[0], loss_="manhattan",
                                     weights="uniform", folds = 5, show_time=False, visualize=True)

Considering the above results, we choose to use the hyperparameter k =.

##### Uniform weights, normalization

In [None]:
# Sample size is the chess dataset's own row count (`nrows` holds the adult dataset's one)
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 50)), normalized_dataset_chess, labels_chess, normalized_dataset_chess.shape[0], loss_="manhattan",
                                     weights="uniform", folds = 5, show_time=False, visualize=True)

Considering the above results, we choose to use the hyperparameter k =.

##### Uniform weights,  normalization and weighted features

In [None]:
# Sample size is the chess dataset's own row count (`nrows` holds the adult dataset's one)
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 50)), weighted_dataset_chess, labels_chess, weighted_dataset_chess.shape[0], loss_="manhattan",
                                     weights="uniform", folds = 5, show_time=False, visualize=True)

Considering the above results, we choose to use the hyperparameter k =.

##### Distance weights, no normalization

In [None]:
# Sample size is the chess dataset's own row count (`nrows` holds the adult dataset's one)
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 50)), dataset_chess, labels_chess, dataset_chess.shape[0], loss_="manhattan",
                                     weights="distance", folds = 5, show_time=False, visualize=True)

Considering the above results, we choose to use the hyperparameter k =.

##### Distance weights, normalization

In [None]:
# Sample size is the chess dataset's own row count (`nrows` holds the adult dataset's one)
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 50)), normalized_dataset_chess, labels_chess, normalized_dataset_chess.shape[0], loss_="manhattan",
                                     weights="distance", folds = 5, show_time=False, visualize=True)

Considering the above results, we choose to use the hyperparameter k =.

##### Distance weights, normalization and weighted features

In [None]:
# Sample size is the chess dataset's own row count (`nrows` holds the adult dataset's one)
(mean_trainings, std_trainings, mean_validations, std_validations) = n_neighbors_influence_fixed_datasize(list(range(1, 50)), weighted_dataset_chess, labels_chess, weighted_dataset_chess.shape[0], loss_="manhattan",
                                     weights="distance", folds = 5, show_time=False, visualize=True)

Considering the above results, we choose to use the hyperparameter k =.

In [None]:
# TO DO : plot the performance depending on K for the entire dataset, choose simplest best K

#### Results on test dataset

##### Uniform weights, no normalization

In [None]:
k1 = 5

In [None]:
predicted_labels1 = knn(dataset_chess, labels_chess, test_dataset_chess, n_neighbors=k1, weights = "uniform", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision1 = (len(test_labels_chess) - error(predicted_labels1, test_labels_chess, loss="manhattan"))/len(test_labels_chess)
print(f"Precision on the test dataset is {precision1}")
ConfusionMatrixDisplay.from_predictions(test_labels_chess, predicted_labels1)

##### Uniform weights, normalization

In [None]:
k2 = 5

In [None]:
predicted_labels2 = knn(normalized_dataset_chess, labels_chess, normalized_test_dataset_chess, n_neighbors=k2, weights = "uniform", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision2 = (len(test_labels_chess) - error(predicted_labels2, test_labels_chess, loss="manhattan"))/len(test_labels_chess)
print(f"Precision on the test dataset is {precision2}")
ConfusionMatrixDisplay.from_predictions(test_labels_chess, predicted_labels2)

##### Uniform weights, normalization and weighted features

In [None]:
k3 = 5

In [None]:
# This is the chess section: train and predict on the chess datasets
predicted_labels3 = knn(weighted_dataset_chess, labels_chess, weighted_test_dataset_chess, n_neighbors=k3, weights = "uniform", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision3 = (len(test_labels_chess) - error(predicted_labels3, test_labels_chess, loss="manhattan"))/len(test_labels_chess)
print(f"Precision on the test dataset is {precision3}")
ConfusionMatrixDisplay.from_predictions(test_labels_chess, predicted_labels3)

##### Distance weights, no normalization

In [None]:
k4 = 5

In [None]:
predicted_labels4 = knn(dataset_chess, labels_chess, test_dataset_chess, n_neighbors=k4, weights = "distance", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision4 = (len(test_labels_chess) - error(predicted_labels4, test_labels_chess, loss="manhattan"))/len(test_labels_chess)
print(f"Precision on the test dataset is {precision4}")
ConfusionMatrixDisplay.from_predictions(test_labels_chess, predicted_labels4)

##### Distance weights,  normalization

In [None]:
k5 = 5

In [None]:
predicted_labels5 = knn(normalized_dataset_chess, labels_chess, normalized_test_dataset_chess, n_neighbors=k5, weights = "distance", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision5 = (len(test_labels_chess) - error(predicted_labels5, test_labels_chess, loss="manhattan"))/len(test_labels_chess)
print(f"Precision on the test dataset is {precision5}")
ConfusionMatrixDisplay.from_predictions(test_labels_chess, predicted_labels5)

##### Distance weights,  normalization and weighted features

In [None]:
k6 = 5

In [None]:
predicted_labels6 = knn(weighted_dataset_chess, labels_chess, weighted_test_dataset_chess, n_neighbors=k6, weights = "distance", algorithm="auto", p=2)

In [None]:
# Use the manhattan loss (number of misclassified samples), as in cross_validation: the default
# euclidean loss is the square root of that count and would overestimate the precision.
precision6 = (len(test_labels_chess) - error(predicted_labels6, test_labels_chess, loss="manhattan"))/len(test_labels_chess)
print(f"Precision on the test dataset is {precision6}")
ConfusionMatrixDisplay.from_predictions(test_labels_chess, predicted_labels6)

In [None]:
# TO DO : print precision of test dataset, add confusion matrix

In [None]:
# TO DO : print precision of test dataset, add confusion matrix

# Decision tree

We use scikit-learn's decision tree function to design a decision tree classifier.

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

In [None]:
def decision_tree(training_features, training_labels, to_predict_features,
                  criterion='gini', splitter='best'):
    """
    :param training_features: training features (x)
    :param training_labels: training labels (y)
    :param to_predict_features: features for which we want to predict the labels
    :param criterion: split quality measure handed to DecisionTreeClassifier, e.g. "gini" (default,
        Gini impurity) or "entropy" (information gain)
    :param splitter: split strategy handed to DecisionTreeClassifier, "best" (default) to choose the
        best split or "random" to choose the best random split
    :return: Numpy array containing the labels predicted by the decision tree for the given
        'to_predict_features'
    """
    tree = DecisionTreeClassifier(criterion=criterion, splitter=splitter)
    # fit returns the fitted classifier itself, so the prediction can be chained
    return tree.fit(training_features, training_labels).predict(to_predict_features)

In [None]:
# TO DO : see influence of cost function on precision (same graph as for KNN : dataset size on x axis, one trace for each cost function)

In [None]:
# TO DO : see influence of max depth on precision (same graph as for KNN : fixed data set size, max depth on x axis, precision on y)

### Decision tree results for dataset 1

##### Selecting best hyperparameter

In [None]:
#TO DO plot precision depending on max depth, one line for each cost function, selecting simplest best model

##### Results on dataset 1

In [None]:
# Print precision and confusion matrix on test dataset

### Decision tree results for dataset 2

##### Selecting best hyperparameter

In [None]:
#TO DO plot precision depending on max depth, one line for each cost function, selecting simplest best model

#### Results on dataset 2

In [None]:
# Print precision and confusion matrix on test dataset

### Identify the features considered as most important from decision tree

The goal of this section is to identify the data features that decision trees consider most significant. This is useful in particular for weighting features according to their importance when performing KNN classification.

In [None]:
def features_importance(training_features, training_labels, criterion='gini',
                        splitter='best', visualize = False, nb_top=5):
    """
    Input: training features (pandas dataframe) and labels, decision tree's parameters, whether we
    want to visualize features' importance, and how many top-ranked features are printed in that case

    Output: a dictionary giving each feature (column name) a score according to its importance in
    the decision tree
    """

    classifier = DecisionTreeClassifier(criterion=criterion, splitter=splitter)
    classifier.fit(training_features, training_labels)
    features_importances = classifier.feature_importances_

    # feature_importances_ follows the column order of the training data
    importance = dict(zip(training_features.columns, features_importances))

    if visualize:
        # Say which features are most important
        ordered_features = sorted(importance.keys(), key = lambda k:(importance[k]))
        ordered_features.reverse()
        # Slicing copes with datasets that have fewer than nb_top features
        for rank, feature in enumerate(ordered_features[:nb_top], start=1):
            print(f"Feature ranked {rank} is {feature} with score {importance[feature]}")

        # Plot feature importance
        plt.bar([x for x in range(len(features_importances))], features_importances)
        plt.show()

    return importance

# Influence of dataset's size

Now we study the influence of the size of the training / validation dataset on the training / validation errors.

In [None]:
# Sample sizes from 50 to 9050 by steps of 1000. They must be given as a list through `Ns`: passed as
# three positional numbers they would be taken for Ns, loss_ and folds.
data_size_influence(knn, dataset_adult, labels_adult, Ns=list(range(50, 10000, 1000)), loss_="manhattan", folds = 5, algo_kwargs={"n_neighbors":3}, show_time=True, visualize=True)
pass