# 1. Import packages

In [1]:
import sys
sys.path.append('..')
from modules import cross_validation, preprocess
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.neural_network import MLPClassifier

Invoking __init__.py for modules
loading region bounding boxes for computing carbon emissions region, this may take a moment...
 454/454... rate=445.20 Hz, eta=0:00:00, total=0:00:01 
Done!


# 2. Load dataset

In [2]:
# Read the occupancy CSV through the project's loader and preview the first rows.
DATASET_PATH = '../dataset/New_Occupancy_Estimation.csv'
df = preprocess.load_csv_data(DATASET_PATH)
df.head()

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,25.0,24.75,24.5,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,25.0,24.75,24.56,25.44,121,34,53,40,0.41,0.1,0.1,0.09,390,0.388462,0,0,1
4,25.0,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1


# 3. Tuning Hyperparameters: Cross Validation

In [3]:
# Separate the predictors from the target. Which columns each helper selects is
# decided inside the `preprocess` module (presumably Room_Occupancy_Count is the
# target -- confirm).
# NOTE(review): the df.head() preview shows an 'Unnamed: 0' index column; verify
# that get_features() does not pass it to the models as a feature.
X, y = preprocess.get_features(df), preprocess.get_output(df)

# Partition into training and test subsets. The split ratio and any
# shuffling/seeding are determined inside preprocess.split_dataset.
(X_train, X_test,
 y_train, y_test) = preprocess.split_dataset(X, y)

## 1. Logistic Regression

In [4]:
# Baseline for Logistic Regression: standardize the features, fit the estimator
# with its default hyperparameters, then score on the test split.
lg = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', LogisticRegression()),
])
y_pred = lg.fit(X_train, y_train).predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9970384995064165


In [4]:
# Exhaustive hyperparameter search for Logistic Regression ("LR") on the
# training split, via the project's grid-search helper (presumably a wrapper
# around scikit-learn's GridSearchCV -- confirm in the cross_validation module).
# The final argument, 5, appears to be the number of CV folds; the cell output
# reports "Fitting 5 folds".
# NOTE(review): the log lists candidates with penalty=l1 and solver=lbfgs, a
# combination scikit-learn's LogisticRegression does not support; those fits
# likely fail immediately. Verify the parameter grid defined in the helper.
cross_validation.training_with_grid("LR", X_train, y_train, 5)

Fitting 5 folds for each of 42 candidates, totalling 210 fits
[CV] END estimator__C=0.001, estimator__penalty=l1, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.001, estimator__penalty=l1, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.001, estimator__penalty=l1, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.001, estimator__penalty=l1, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.001, estimator__penalty=l1, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.001, estimator__penalty=l1, estimator__solver=liblinear; total time=   0.1s
[CV] END estimator__C=0.001, estimator__penalty=l1, estimator__solver=liblinear; total time=   0.0s
[CV] END estimator__C=0.001, estimator__penalty=l1, estimator__solver=liblinear; total time=   0.0s
[CV] END estimator__C=0.001, estimator__penalty=l1, estimator__solver=liblinear; total time=   0.0s
[CV] END estimator__C=0.001, estimator__penalty=l1, estima

In [5]:
# Randomized hyperparameter search for Logistic Regression ("LR") on the
# training split, via the project's helper (presumably a wrapper around
# scikit-learn's RandomizedSearchCV -- confirm in the cross_validation module).
# The final argument, 5, appears to be the number of CV folds; the cell output
# reports 10 sampled candidates x 5 folds.
# NOTE(review): the log again shows penalty=l1 paired with solver=lbfgs, which
# LogisticRegression does not support; verify the search space in the helper.
cross_validation.training_with_randomized("LR", X_train, y_train, 5)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END estimator__C=100, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.3s
[CV] END estimator__C=100, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.3s
[CV] END estimator__C=100, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.3s
[CV] END estimator__C=100, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.2s
[CV] END estimator__C=100, estimator__penalty=l2, estimator__solver=lbfgs; total time=   0.1s
[CV] END estimator__C=0.01, estimator__penalty=l1, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.01, estimator__penalty=l1, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.01, estimator__penalty=l1, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.01, estimator__penalty=l1, estimator__solver=lbfgs; total time=   0.0s
[CV] END estimator__C=0.01, estimator__penalty=l1, estimator__solver=lbfgs; total time=  

## 2. Random Forest (RF)

In [4]:
# Baseline for Random Forest: standardize the features, fit the estimator with
# its default hyperparameters, then score on the test split.
# NOTE: RandomForestClassifier() is created without a random_state, so the
# reported accuracy can differ slightly between runs.
rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier()),
])
y_pred = rf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9995064165844028


In [5]:
# Exhaustive hyperparameter search for Random Forest ("RF") on the training
# split, via the project's grid-search helper (presumably a wrapper around
# scikit-learn's GridSearchCV -- confirm in the cross_validation module).
# The final argument, 5, appears to be the number of CV folds.
# NOTE: per the cell output this grid has 1944 candidates, i.e. 9720 fits, so
# expect this cell to dominate the notebook's run time.
cross_validation.training_with_grid("RF", X_train, y_train, 5)

Fitting 5 folds for each of 1944 candidates, totalling 9720 fits
[CV] END estimator__bootstrap=True, estimator__max_depth=3, estimator__max_features=log2, estimator__max_leaf_nodes=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=25; total time=   0.1s
[CV] END estimator__bootstrap=True, estimator__max_depth=3, estimator__max_features=log2, estimator__max_leaf_nodes=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=25; total time=   0.1s
[CV] END estimator__bootstrap=True, estimator__max_depth=3, estimator__max_features=log2, estimator__max_leaf_nodes=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=25; total time=   0.1s
[CV] END estimator__bootstrap=True, estimator__max_depth=3, estimator__max_features=log2, estimator__max_leaf_nodes=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=25; total time=   0.1s
[CV] END estimator_

In [6]:
# Randomized hyperparameter search for Random Forest ("RF") on the training
# split, via the project's helper (presumably a wrapper around scikit-learn's
# RandomizedSearchCV -- confirm in the cross_validation module).
# The final argument, 5, appears to be the number of CV folds; the cell output
# reports 10 sampled candidates x 5 folds.
cross_validation.training_with_randomized("RF", X_train, y_train, 5)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END estimator__bootstrap=False, estimator__max_depth=9, estimator__max_features=sqrt, estimator__max_leaf_nodes=6, estimator__min_samples_leaf=4, estimator__min_samples_split=2, estimator__n_estimators=100; total time=   0.5s
[CV] END estimator__bootstrap=False, estimator__max_depth=9, estimator__max_features=sqrt, estimator__max_leaf_nodes=6, estimator__min_samples_leaf=4, estimator__min_samples_split=2, estimator__n_estimators=100; total time=   0.3s
[CV] END estimator__bootstrap=False, estimator__max_depth=9, estimator__max_features=sqrt, estimator__max_leaf_nodes=6, estimator__min_samples_leaf=4, estimator__min_samples_split=2, estimator__n_estimators=100; total time=   0.5s
[CV] END estimator__bootstrap=False, estimator__max_depth=9, estimator__max_features=sqrt, estimator__max_leaf_nodes=6, estimator__min_samples_leaf=4, estimator__min_samples_split=2, estimator__n_estimators=100; total time=   0.4s
[CV] END estima

## 3. Support Vector Machine (SVM)

In [7]:
# Baseline for the Support Vector Classifier: standardize the features, fit
# svm.SVC with its default hyperparameters, then score on the test split.
svc = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', svm.SVC()),
])
y_pred = svc.fit(X_train, y_train).predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.998025666337611


In [8]:
# Exhaustive hyperparameter search for the SVM model ("SVM") on the training
# split, via the project's grid-search helper (presumably a wrapper around
# scikit-learn's GridSearchCV -- confirm in the cross_validation module).
# The final argument, 5, appears to be the number of CV folds; the cell output
# reports 25 candidates x 5 folds.
cross_validation.training_with_grid("SVM", X_train, y_train, 5)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END estimator__C=0.1, estimator__gamma=1, estimator__kernel=rbf; total time=   1.0s
[CV] END estimator__C=0.1, estimator__gamma=1, estimator__kernel=rbf; total time=   1.1s
[CV] END estimator__C=0.1, estimator__gamma=1, estimator__kernel=rbf; total time=   0.8s
[CV] END estimator__C=0.1, estimator__gamma=1, estimator__kernel=rbf; total time=   0.8s
[CV] END estimator__C=0.1, estimator__gamma=1, estimator__kernel=rbf; total time=   0.8s
[CV] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf; total time=   0.3s
[CV] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf; total time=   0.3s
[CV] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf; total time=   0.3s
[CV] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf; total time=   0.4s
[CV] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf; total time=   0.2s
[CV] END estimator__C=0.1, estimator__

In [10]:
# Randomized hyperparameter search for the SVM model ("SVM") on the training
# split, via the project's helper (presumably a wrapper around scikit-learn's
# RandomizedSearchCV -- confirm in the cross_validation module).
# The final argument, 5, appears to be the number of CV folds; the cell output
# reports 10 sampled candidates x 5 folds.
cross_validation.training_with_randomized("SVM", X_train, y_train, 5)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END estimator__C=10, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.3s
[CV] END estimator__C=10, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.3s
[CV] END estimator__C=10, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.3s
[CV] END estimator__C=10, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.3s
[CV] END estimator__C=10, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.3s
[CV] END estimator__C=1000, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=1000, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=1000, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=1000, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.1s
[CV] END estimator__C=1000, estimator__gamma=0.0001, estimator__kernel=rbf; total time=   0.1s

## 4. Multilayer Perceptron (MLP)

In [11]:
# Baseline for the Multilayer Perceptron: standardize the features, fit
# MLPClassifier with its default hyperparameters, then score on the test split.
# The pipeline is stored under its own name (`mlp`) so it stays distinct from
# the SVM pipeline held in `svc`.
# NOTE: MLPClassifier() is created without a random_state, so the reported
# accuracy can differ slightly between runs.
mlp = Pipeline([('scaler', StandardScaler()), ('estimator', MLPClassifier())])
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9985192497532083


In [5]:
# Exhaustive hyperparameter search for the Multilayer Perceptron ("MLP") on the
# training split, via the project's grid-search helper (presumably a wrapper
# around scikit-learn's GridSearchCV -- confirm in the cross_validation module).
# The final argument, 5, appears to be the number of CV folds.
# NOTE: per the cell output this grid has 216 candidates, i.e. 1080 fits, each
# taking seconds, so this cell is slow to re-run.
cross_validation.training_with_grid("MLP", X_train, y_train, 5)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END estimator__activation=identity, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(50, 50, 50), estimator__learning_rate=constant, estimator__solver=sgd; total time=   2.8s
[CV] END estimator__activation=identity, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(50, 50, 50), estimator__learning_rate=constant, estimator__solver=sgd; total time=   4.5s
[CV] END estimator__activation=identity, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(50, 50, 50), estimator__learning_rate=constant, estimator__solver=sgd; total time=   4.6s
[CV] END estimator__activation=identity, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(50, 50, 50), estimator__learning_rate=constant, estimator__solver=sgd; total time=   2.4s
[CV] END estimator__activation=identity, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(50, 50, 50), estimator__learning_rate=constant, estimator__solver=sgd; total time=   2.9s
[

In [6]:
# Randomized hyperparameter search for the Multilayer Perceptron ("MLP") on the
# training split, via the project's helper (presumably a wrapper around
# scikit-learn's RandomizedSearchCV -- confirm in the cross_validation module).
# The final argument, 5, appears to be the number of CV folds; the cell output
# reports 10 sampled candidates x 5 folds.
cross_validation.training_with_randomized("MLP", X_train, y_train, 5)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END estimator__activation=logistic, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(100,), estimator__learning_rate=invscaling, estimator__solver=sgd; total time=   0.4s
[CV] END estimator__activation=logistic, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(100,), estimator__learning_rate=invscaling, estimator__solver=sgd; total time=   1.4s
[CV] END estimator__activation=logistic, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(100,), estimator__learning_rate=invscaling, estimator__solver=sgd; total time=   0.4s
[CV] END estimator__activation=logistic, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(100,), estimator__learning_rate=invscaling, estimator__solver=sgd; total time=   0.6s
[CV] END estimator__activation=logistic, estimator__alpha=0.0001, estimator__hidden_layer_sizes=(100,), estimator__learning_rate=invscaling, estimator__solver=sgd; total time=   0.4s
[CV] END estimator__acti