In [1]:
!pip install scikit-learn



In [19]:
# joblib is a standalone package (installed as a scikit-learn dependency);
# the old `from sklearn.externals import joblib` path no longer exists.
import joblib
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [20]:
# Load the raw data set. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapping is needed.
file = pd.read_csv('Data.csv')
file

Unnamed: 0,period,temperature,hours before sunrise,hours before sunset,demand
0,1,8.4,6.016667,17.633333,496.0
1,2,8.1,5.516667,17.133333,535.0
2,3,7.8,5.016667,16.633333,511.0
3,4,7.5,4.516667,16.133333,496.0
4,5,7.3,4.016667,15.633333,490.0
...,...,...,...,...,...
52555,52556,12.4,-15.516667,-3.800000,
52556,52557,12.3,-16.016667,-4.300000,
52557,52558,12.2,-16.516667,-4.800000,
52558,52559,11.9,-17.016667,-5.300000,


In [21]:
# Keep only the rows that have a demand value; rows with a missing demand
# cannot be used as labelled examples.
has_demand = file['demand'].notnull()
df = file.loc[has_demand]
df

Unnamed: 0,period,temperature,hours before sunrise,hours before sunset,demand
0,1,8.4,6.016667,17.633333,496.0
1,2,8.1,5.516667,17.133333,535.0
2,3,7.8,5.016667,16.633333,511.0
3,4,7.5,4.516667,16.133333,496.0
4,5,7.3,4.016667,15.633333,490.0
...,...,...,...,...,...
48235,48236,13.2,-17.666667,-1.183333,998.0
48236,48237,12.1,-18.166667,-1.683333,867.0
48237,48238,12.1,-18.666667,-2.183333,730.0
48238,48239,12.1,-19.166667,-2.683333,608.0


In [22]:
# Target vector: the demand column of the filtered frame.
y = df['demand']
y

0        496.0
1        535.0
2        511.0
3        496.0
4        490.0
         ...  
48235    998.0
48236    867.0
48237    730.0
48238    608.0
48239    517.0
Name: demand, Length: 48240, dtype: float64

In [23]:
# Feature matrix: every column except the period counter and the target.
X = df.drop(columns=['period', 'demand'])
X

Unnamed: 0,temperature,hours before sunrise,hours before sunset
0,8.4,6.016667,17.633333
1,8.1,5.516667,17.133333
2,7.8,5.016667,16.633333
3,7.5,4.516667,16.133333
4,7.3,4.016667,15.633333
...,...,...,...
48235,13.2,-17.666667,-1.183333
48236,12.1,-18.166667,-1.683333
48237,12.1,-18.666667,-2.183333
48238,12.1,-19.166667,-2.683333


In [24]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [25]:
def _make_pipeline(clf):
    """Wrap ``clf`` in the preprocessing chain shared by every candidate.

    All candidates use the same two preprocessing steps, standardisation
    followed by a 2-component PCA, so only the final estimator differs
    between the pipelines being compared.

    Parameters
    ----------
    clf : estimator
        Final estimator, registered under the step name ``'clf'``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'scl'``, ``'pca'`` and ``'clf'``.
    """
    return Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2)),
                     ('clf', clf)])


# NOTE(review): the target `demand` is displayed as float64 and the test
# accuracies shown in this notebook are around 0.003. Classifiers treat every
# distinct demand value as its own class, so this looks like a regression
# problem -- TODO confirm and consider regressors with a regression metric.
pipe_lr = _make_pipeline(LogisticRegression(random_state=42))

pipe_svm = _make_pipeline(svm.SVC(random_state=42))

pipe_dt = _make_pipeline(tree.DecisionTreeClassifier(random_state=42))

# The two boosting models are stochastic; seed them like the other
# estimators so that reruns give the same fitted models and scores.
pipe_adaboost = _make_pipeline(AdaBoostClassifier(random_state=42))

pipe_gradientboosting = _make_pipeline(GradientBoostingClassifier(random_state=42))

pipe_knn = _make_pipeline(KNeighborsClassifier(n_neighbors=3))

In [26]:
# Candidate pipelines collected in one list so they can be fitted and scored
# in a loop. The position in this list is the key used to look up the
# display name in pipe_dict.
pipelines = [
    pipe_lr,
    pipe_svm,
    pipe_dt,
    pipe_adaboost,
    pipe_gradientboosting,
    pipe_knn,
]

In [27]:
# Dictionary mapping a pipeline's position (0-based) to the display name of
# its classifier, for ease of reference when reporting results.
pipe_dict = dict(enumerate([
    'LogisticRegression',
    'Support Vector Machine',
    'Decision tree',
    'AdaBoostClassifier',
    'GradientBoosting',
    'KNearestNeighbors',
]))

In [30]:
# Train every candidate pipeline on the training split. fit() updates each
# pipeline object in place, so the list afterwards holds the fitted models.
for pipeline in pipelines:
    pipeline.fit(X_train, y_train)

In [31]:
# Report the held-out score of each fitted pipeline, labelled with the
# display name stored under the same position in pipe_dict.
for position, fitted in enumerate(pipelines):
    accuracy = fitted.score(X_test, y_test)
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[position], accuracy))

Support Vector Machine pipeline test accuracy: 0.003
KNearestNeighbors pipeline test accuracy: 0.002


In [None]:
#identify the most accurate model on test data
# Score each pipeline exactly once; scoring runs a full prediction pass over
# the test set, so repeating it per comparison is wasted work.
test_scores = [candidate.score(X_test, y_test) for candidate in pipelines]

# max() returns the first index holding the highest score, so ties are still
# won by the earlier pipeline. Unlike a running "score > 0.0" comparison, a
# real pipeline is always selected, even when every score is 0.0.
best_clf = max(range(len(test_scores)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]
print('Classifier with best accuracy:%s'%pipe_dict[best_clf])

#save pipeline to file
# Requires `import joblib` in the imports cell.
joblib.dump(best_pipe,'best_pipeline.pkl',compress=1)
print('Saved %s pipeline to file'% pipe_dict[best_clf])