In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean

# Dataset preprocessing

## Dataset 1

In [2]:
# Read the raw records. header=None tells pandas the first line is data, not
# column names, so the columns get integer labels until they are renamed.
dataset = pd.read_csv(
    "data/adult.data",
    header=None,
)

In [3]:
# Names for the 15 columns, in file order; the last one ("salary") is the target.
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "salary",
]
dataset.columns = column_names

In [4]:
# Keep only the rows in which no cell equals the raw marker " ?" (the leading
# space is part of the marker because values have not been stripped yet), then
# renumber the surviving rows from 0.
rows_without_missing = dataset.ne(" ?").all(axis=1)
dataset = dataset.loc[rows_without_missing].reset_index(drop=True)

# Trim surrounding whitespace from every object-dtype (string) column.
df_strings = dataset.select_dtypes(['object'])
dataset[df_strings.columns] = df_strings.apply(lambda column: column.str.strip())

In [5]:
# Build the feature matrix column by column, starting from "age" (first column)
# and stopping before "salary" (last column), which is the target.
clean_dataset = dataset["age"].to_frame()
for feature_name in dataset.columns[1:-1]:
    feature = dataset[feature_name]
    # Object-dtype columns are one-hot encoded (dummy columns are prefixed with
    # the original column name); every other column is copied through as is.
    encoded = pd.get_dummies(feature, prefix=feature_name) if feature.dtype == 'O' else feature
    clean_dataset = clean_dataset.join(encoded)

# Binary target: 1 where salary is ">50K", 0 otherwise.
labels = (dataset["salary"] == ">50K") * 1

# From here on `dataset` refers to the encoded feature matrix (no "salary" column).
dataset = clean_dataset

In [6]:
# Show the binary target built above: 1 where salary is ">50K", 0 otherwise.
labels

0        0
1        0
2        0
3        0
4        0
        ..
30157    0
30158    1
30159    0
30160    0
30161    1
Name: salary, Length: 30162, dtype: int64

## Dataset 2

# Error measure

In [7]:
def error(predicted_labels, real_labels, loss="euclidean"):
    """
    Compute the distance between predicted labels and the real labels.

    Both inputs are converted to NumPy arrays before being compared, so they are
    always matched element by element in order. Without this, subtracting two
    pandas Series that carry different indexes aligns them on the index and
    yields NaN instead of the loss.

    Input:
        predicted_labels: 1-D array-like with the labels an algorithm predicted.
        real_labels: 1-D array-like with the real labels for the same samples,
            in the same order.
        loss: name of the loss to use, either "euclidean" (default) or
            "manhattan".

    Output: the computed loss as a scalar (a float for "euclidean"; for
    "manhattan" the sum of absolute differences, in the labels' dtype).

    Raises:
        ValueError: if the two inputs do not have the same shape, or if `loss`
            is not one of the supported names.
    """
    predicted = np.asarray(predicted_labels)
    real = np.asarray(real_labels)

    # Fail loudly instead of letting NumPy broadcast mismatched inputs.
    if predicted.shape != real.shape:
        raise ValueError(
            "predicted_labels and real_labels must have the same shape, got "
            f"{predicted.shape} and {real.shape}"
        )

    if loss == "euclidean":
        return euclidean(predicted, real)
    if loss == "manhattan":
        # Vectorised sum; the builtin sum() would loop in Python.
        return np.abs(predicted - real).sum()

    # An unknown loss name used to fall through and return None silently.
    raise ValueError(f"Unknown loss {loss!r}; expected 'euclidean' or 'manhattan'")
    
    

# KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [9]:
def knn(training_features, training_labels, to_predict_features,
        n_neighbors, weights = "uniform", algorithm="auto", p=2):
    """
    Fit a k-nearest-neighbours classifier and predict labels for new samples.

    Input:
        training_features: features of the training samples.
        training_labels: labels of the training samples.
        to_predict_features: features of the samples whose labels we want.
        n_neighbors: number of neighbours k used by the classifier.
        weights: neighbour weighting scheme ("uniform" or "distance").
        algorithm: algorithm used to find the k closest neighbours.
        p: power of the Minkowski distance; p=1 gives the Manhattan distance,
            p=2 the Euclidean distance.

    Output: Numpy array containing the labels predicted by KNN for the given
    'to_predict_features'.
    """
    classifier_options = {
        "n_neighbors": n_neighbors,
        "weights": weights,
        "algorithm": algorithm,
        "p": p,
    }
    classifier = KNeighborsClassifier(**classifier_options)

    # fit() returns the fitted estimator, so the prediction can be chained.
    return classifier.fit(training_features, training_labels).predict(to_predict_features)

In [10]:
# NOTE(review): K is close to the training-set size used below (300 rows), so
# with uniform weights most of the training set votes on every prediction.
K = 250

# Row ranges (positional, end-exclusive) for the training and evaluation samples.
train_rows = slice(0, 300)
test_rows = slice(1000, 1200)

training_features, training_labels = dataset.iloc[train_rows], labels.iloc[train_rows]
to_predict_features, real_labels = dataset.iloc[test_rows], labels.iloc[test_rows]

predicted_labels = knn(training_features, training_labels, to_predict_features, K)

# With 0/1 labels the Manhattan loss is the number of misclassified rows.
print(error(predicted_labels, real_labels, loss="manhattan"))

54


# Decision tree

# Cross validation