In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean

# For results repeatability
np.random.seed(0)

# Dataset preprocessing

## Dataset 1

In [2]:
dataset = pd.read_csv("data/adult.data", header=None)

In [3]:
dataset.columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","salary"]

In [4]:
dataset = dataset[(dataset!=" ?").all(axis=1)].reset_index(drop=True)
df_strings = dataset.select_dtypes(['object'])
dataset[df_strings.columns] = df_strings.apply(lambda x: x.str.strip())

In [5]:
clean_dataset = pd.DataFrame(dataset["age"])
for col in dataset.columns[1:-1]:
    if(dataset[col].dtype =='O'):
        clean_dataset = clean_dataset.join(pd.get_dummies(dataset[col], prefix=col))
    else:
        clean_dataset = clean_dataset.join(dataset[col])
labels = (dataset["salary"]==">50K")*1
dataset = clean_dataset

## Dataset 2

# Error measure

In [6]:
def error(predicted_labels, real_labels, loss="euclidean"):
    """
    Input: numpy array containing respectively the labels an algorithm predicted, and the real labels corresponding
    to the data. Type of loss we want to use.
    
    Output: float, the computed loss.
    """
    if loss == "euclidean": return euclidean(predicted_labels, real_labels)
    elif loss == "manhattan": return sum(abs(predicted_labels - real_labels))
    
    

# Knn

We use scikit-learn's knn function to design a knn classifier.

In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [8]:
def knn(training_features, training_labels, to_predict_features,
        n_neighbors=5, weights = "uniform", algorithm="auto", p=2):
    
    """
    Input: Training data, features for which we want to predict the labels, number of neighbors k for knn algo,
    features weights system (uniform or distance), algorithm usewd to find closer k neighbors, p is the value
    used in the computation of the minkowski distance that is used here, p=1 gives a manhattan distance, p=2 a
    euclidian distance.
    
    Output: Numpy array containing the labels predicted by KNN for the given 'to_predict_features'
    """
    neigh = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, p=p)
    neigh.fit(training_features, training_labels)
    
    return neigh.predict(to_predict_features)

In [9]:
K = 2

training_features = dataset[0:1000]
training_labels = labels[0:1000]
to_predict_features = dataset[1000:1200]
predicted_labels = knn(training_features, training_labels, to_predict_features, K)
real_labels = labels[1000:1200]
print(error(predicted_labels, real_labels, loss="manhattan"))


54


Now we study the influence of the size of the training / validation dataset on the training / validation errors.

# Decision tree

# Cross validation

In [16]:
def cross_validation(algo, dataset_, labels_, loss_="manhattan", folds = 5, algo_kwargs={}):
    """
    Input : Predictor function that works by supplying training set and labels and test set and return predicted labels
            dataset  and corresponding labels
            folds
            algo_kwargs : a dict with additional params for the algo : ex. {'n_neighbors':5}
    Output : Precision mean and variance
    """
    dataset_size = dataset_.shape[0]
    group_ids = np.random.choice(folds+1, size=dataset_size)
    training_precisions = []
    validation_precisions = []
    for N in range(folds):
        training_set = dataset_[group_ids != N]
        training_labels = labels_[group_ids != N]
        test_set = dataset_[group_ids == N]
        test_labels = labels_[group_ids == N]
        
        # Training error
        training_predicted_labels = algo(training_set, training_labels, training_set, **algo_kwargs)
        training_precisions += [(len(training_labels) - error(training_predicted_labels, training_labels, loss = loss_))/len(training_labels)]
    
        
        # Validation error
        validation_predicted_labels = algo(training_set, training_labels, test_set, **algo_kwargs)
        validation_precisions += [(len(test_labels) - error(validation_predicted_labels, test_labels, loss = loss_))/len(test_labels)]
    
    
    return (np.mean(training_precisions), np.std(training_precisions), np.mean(validation_precisions), np.std(validation_precisions))

In [19]:
cross_validation(knn, dataset[:1000], labels[:1000], folds = 10, algo_kwargs={'n_neighbors':2})

(0.838475527595383,
 0.0023740329442958315,
 0.7552687446448404,
 0.037238998391097206)