# What to classify ?

In Lecture 3, we saw the k-nearest neighbors (KNN) algorithm, which classifies samples into classes.

In our dataset, several electronic devices consume power. Each device belongs to one category of devices:

- multimedia = [computer, 3D_printer, internet_router, laptop, phone_charger, printer, screen, tv, sound_system]
- kitchen = [boiler, coffee, freezer, fridge, micro_wave_oven]
- cooling = [air_conditioner, fan]
- other = [air_purifier, dehumidifier, radiator, solar_panel, vacuum]

Devices in the same category might share similar features, which could allow us to classify an unknown device. For example, in a category such as "washing" (which is not one of the categories listed above), we would expect all devices to consume power mainly during off-peak hours, about once a week.

1- Divide each device's data by week so that we get more samples. For each device, each week starts on Monday at 00:00:01. Save the results to new files so that we don't have to rerun the computation every time.

2- Compute relevant components  
Plot each component for each class (for example with boxplots) to check whether the classes already show some differences, i.e. whether the components are relevant.

3-Create X and Y : use the "plug_name" of the file "0_smart_plugs..."  

4- Run KNN on all the components (multi-dimensional KNN, with more than 2 dimensions). Rescale the components before running KNN.  
For the KNN: plot the accuracy and the graphs for different numbers of neighbors (see Lecture 3).

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 2- Relevant components

- maximum power
- time of use
- period of use
- used during the night or not
- number of times used during a day

In [None]:
# Extract the relevant components (features) of one device from its data file.
# NOTE(review): this function is an unfinished stub -- the assignment to `f_0`
# at the end has no right-hand side, so this cell does not parse until it is completed.
def components(file_device):
    "function that takes the 0_smart_plugs_devices.csv and the device file and extract the relevant components"
    "file device : path to the file, ex : household_power_consumption/solar_panel_325.csv"
    # NOTE(review): only the first string literal above is the docstring; the second
    # one is a bare expression statement that documents the `file_device` argument.
    
    # TODO: finish this assignment -- presumably meant to load 0_smart_plugs_devices.csv; confirm.
    f_0=
    

# 3- KNN and classification

We now have components that can distinguish between classes of devices. We run the KNN algorithm on them to classify the devices.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


#Simple knn

# X: feature array of shape (n_devices, n_components), e.g.
#    np.array([[x_1_1, x_1_2, x_1_3, x_1_4], ..., [x_40_1, x_40_2, x_40_3, x_40_4]])
#    for 40 devices described by 4 components.
# y: label array of shape (n_devices,), e.g. np.array([y_1, ..., y_40]), where each
#    y_i is an int indicating which class the device belongs to.

# Stratify so that every class appears in both subsets, and fix the seed so that
# the printed accuracy is reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# KNN is distance-based, so components measured on different scales must be
# rescaled first; otherwise the component with the largest range dominates.
# Putting the scaler in a pipeline fits it on the training data only.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
knn.fit(X_train, y_train)
print("accuracy: {:.2f}".format(knn.score(X_test, y_test)))


#Influence of the number of neighbors
#With two major components, we can plot the 2D classification for different numbers of neighbors
#We can pick two components to represent it

def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None, alpha=1):
    """Show the class predicted by `classifier` over the plane of X's first two columns.

    The classifier is evaluated on a 1000 x 1000 grid that spans the range of
    X[:, 0] and X[:, 1], widened by `eps` on every side, and the predicted
    labels are displayed as an image on `ax`. Works for multiclass predictions.

    Parameters
    ----------
    classifier : object with a `predict` method accepting an (n, 2) array.
    X : array indexed as X[:, 0] and X[:, 1]; only used to size the grid.
    fill : accepted but not used by this function.
    ax : axes to draw on; defaults to the current pyplot axes.
    eps : margin added around the data; defaults to half of X.std().
    alpha : opacity passed to `imshow`.

    Returns nothing; the axes limits are set to the grid extent and ticks are removed.
    """
    margin = X.std() / 2. if eps is None else eps
    target = plt.gca() if ax is None else ax

    first, second = X[:, 0], X[:, 1]
    x_min = first.min() - margin
    x_max = first.max() + margin
    y_min = second.min() - margin
    y_max = second.max() + margin

    # One grid point per pixel of the image that will be displayed.
    grid_x, grid_y = np.meshgrid(np.linspace(x_min, x_max, 1000),
                                 np.linspace(y_min, y_max, 1000))
    grid_points = np.column_stack([grid_x.ravel(), grid_y.ravel()])
    predicted = classifier.predict(grid_points)

    # origin='lower' puts the first grid row (y_min) at the bottom of the image.
    target.imshow(predicted.reshape(grid_x.shape),
                  extent=(x_min, x_max, y_min, y_max),
                  aspect='auto', origin='lower', alpha=alpha)
    target.set_xlim(x_min, x_max)
    target.set_ylim(y_min, y_max)
    target.set_xticks(())
    target.set_yticks(())
    
# Decision regions can only be drawn in two dimensions, while X may hold more
# components: pick the two columns to display and fit the classifiers on those
# only, so that the 2-column grid built by plot_2d_classification can be predicted.
feature_pair = [0, 1]
# Standardize the two components so that KNN distances, and the equal aspect
# ratio used below, are not dominated by the component with the largest range.
X_2d = StandardScaler().fit_transform(X[:, feature_pair])

fig, axes = plt.subplots(2, 2, figsize=(4, 4))
for ax, n_neighbors in zip(axes.ravel(), [2, 5, 10, 50]):
    # KNN cannot query more neighbors than there are fitted samples.
    k = min(n_neighbors, len(X_2d))
    ax.set_title("n_neighbors={}".format(k))
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_2d, y)
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y, edgecolor='k')
    plot_2d_classification(clf, X_2d, ax=ax, alpha=.5)
    ax.set_aspect("equal")
    
#Model complexity

# Candidate numbers of neighbors (odd values from 1 to 29).
neighbors = range(1, 30, 2)

training_scores = []
test_scores = []
# Stratify so that every class appears in both subsets; the seed keeps the curve reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=13)

for n_neighbors in neighbors:
    # Rescale inside a pipeline: KNN is distance-based, and the scaler is
    # fitted on the training data only.
    knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=n_neighbors))
    knn.fit(X_train, y_train)
    training_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

# Training vs. test accuracy as a function of the number of neighbors.
plt.figure()
plt.plot(neighbors, training_scores, label="training scores")
plt.plot(neighbors, test_scores, label="test scores")
plt.ylabel("accuracy")
plt.xlabel("n_neighbors")
plt.legend()


#Improving the results : Cross validation

from sklearn.model_selection import cross_val_score

# Seeded, stratified split: the printed scores are reproducible and every
# class is represented in both the training and the test subset.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
cross_val_scores = []

for i in neighbors:
    # The scaler lives inside the pipeline so that it is re-fitted on the
    # training part of each fold and never sees the validation samples.
    knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=i))
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    cross_val_scores.append(np.mean(scores))

print("best cross-validation score: {:.3f}".format(np.max(cross_val_scores)))
best_n_neighbors = neighbors[np.argmax(cross_val_scores)]
print("best n_neighbors:", best_n_neighbors)

# Refit on the whole training set with the selected number of neighbors and
# evaluate once on the held-out test set.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=best_n_neighbors))
knn.fit(X_train, y_train)
print("test-set score: {:.3f}".format(knn.score(X_test, y_test)))


#Improving again the complexity of the model : GridSearchCV

from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# make_pipeline names each step after its lowercased class name, so the KNN
# parameter is addressed as "<step>__<parameter>".
n_neighbors_param = 'kneighborsclassifier__n_neighbors'
param_grid_knn = {n_neighbors_param: np.arange(1, 15, 2)}

# Rescale inside the pipeline: KNN is distance-based, and the scaler is
# re-fitted on the training part of every cross-validation fold.
grid = GridSearchCV(make_pipeline(StandardScaler(), KNeighborsClassifier()),
                    param_grid=param_grid_knn,
                    cv=10, return_train_score=True)
grid.fit(X_train, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))

print("test-set score: {:.3f}".format(grid.score(X_test, y_test)))

import pandas as pd
results = pd.DataFrame(grid.cv_results_)
print(results)

#ploting the results

# Draw on a fresh figure instead of the current axes, which would otherwise be
# the ones of a previously drawn plot.
grid_fig, grid_ax = plt.subplots()
n_neighbors_values = results['param_' + n_neighbors_param].astype(int)
for split in ('train', 'test'):
    mean_scores = results['mean_{}_score'.format(split)]
    std_scores = results['std_{}_score'.format(split)]
    # Mean score with a +/- one standard deviation band across the folds.
    grid_ax.plot(n_neighbors_values, mean_scores, label='mean_{}_score'.format(split))
    grid_ax.fill_between(n_neighbors_values,
                         mean_scores + std_scores,
                         mean_scores - std_scores, alpha=0.2)
grid_ax.set_xlabel('n_neighbors')
grid_ax.set_ylabel('accuracy')
grid_ax.legend()