# [Room Occupancy] Classification


### Prepare Workspace

In [1]:
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV  
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import precision_recall_fscore_support

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

sys.path.append('../code')
import prepare as pr

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
# Directory holding the prepared data. Set the ROOM_OCCUPANCY_DATA_DIR environment
# variable to point at another location; the original local path remains the default.
# os.path.join(..., '') guarantees a trailing separator, because later cells build
# file paths by plain string concatenation (data_dir + fn_prepared).
data_dir = os.path.join(
    os.environ.get('ROOM_OCCUPANCY_DATA_DIR', 'C:/Users/makayser/Desktop/wattx_local/'),
    '',
)
# File name of the prepared data set inside data_dir.
fn_prepared = 'prepared.csv'

### Prepare Data

In [4]:
# Load the prepared data and parse the timestamp column into datetimes.
df = pd.read_csv(data_dir + fn_prepared).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)
# Index the rows by (device, timestamp); both columns also stay in the frame.
tuples = [(device, stamp) for device, stamp in zip(df['device'], df['timestamp'])]
df.index = pd.MultiIndex.from_tuples(tuples, names=['device', 'timestamp'])

In [5]:
# Show the dtype of every column in the loaded frame.
df.dtypes

device                       int64
timestamp           datetime64[ns]
device.1                     int64
device_activated             int64
timestamp.1                 object
occupied                     int64
weekday                      int64
timeperiod                   int64
time                        object
dtype: object

## Multivariate Classification

### Prepare Data

In [6]:
# Read the prepared data again and keep only the label ('occupied') and the three
# model features. The timestamp column is not part of this selection, so it is
# not parsed here.
df_c = pd.read_csv(data_dir + fn_prepared)
df_c = df_c[['occupied', 'device', 'weekday', 'timeperiod']]

In [7]:
# Features (X) and label (Y)
X = df_c[['device', 'weekday', 'timeperiod']]
Y = df_c['occupied']

# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

### Prepare Hyperparameter Tuning

### Run Tests

In [8]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
             target='occupied'):
    """Fit an XGBoost classifier on ``dtrain`` and print a training-set report.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to fit. It is modified in place: ``n_estimators`` may be
        overwritten (see ``useTrainCV``) and the model is fitted.
    dtrain : pandas.DataFrame
        Training data containing the ``predictors`` columns and the ``target`` column.
    predictors : list of str
        Names of the feature columns.
    useTrainCV : bool, default True
        If true, run ``xgb.cv`` with AUC as the metric and early stopping, then set
        ``n_estimators`` to the number of rows in the returned CV result.
    cv_folds : int, default 5
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``xgb.cv``.
    target : str, default 'occupied'
        Name of the label column in ``dtrain``. Passing it explicitly removes the
        dependency on a notebook-level ``target`` variable defined in a later cell.

    Returns
    -------
    None
        The accuracy and ROC AUC on the training data are printed, not returned.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        # One row per boosting round kept by the CV run.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    # Predict training set: hard labels and the probability of the positive class
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report (these are training-set scores, not held-out scores)
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

In [9]:
target = 'occupied'
# Attach the label as a column so features and label travel in one frame.
# assign() builds a new DataFrame instead of inserting a column into the frame
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning and
# makes this cell safe to re-run.
X_train = X_train.assign(**{target: y_train})
# Every column except the label is a feature.
predictors = [column for column in X_train.columns if column != target]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
# Baseline configuration; modelfit() replaces n_estimators with the number of
# rounds found by cross-validation before fitting.
xgb1_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb1 = XGBClassifier(**xgb1_params)
modelfit(xgb1, X_train, predictors)


Model Report
Accuracy : 0.8885
AUC Score (Train): 0.944900


### TEST 1  - max depth, min child weight

In [11]:
# Grid over tree complexity: max_depth in {3, 5, 7, 9} x min_child_weight in {1, 3, 5}.
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 6, 2),
)
# 5-fold cross-validated search scored by ROC AUC.
# NOTE(review): the `iid` argument looks tied to an older scikit-learn release; confirm
# the pinned version before upgrading.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch1.fit(X_train[predictors], X_train[target])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=5, min_child_weight=1,
                                     missing=None, n_estimators=140, n_jobs=1,
                                     nthread=4, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=27, silent=None,
                                     subsample=0.8, verbosity=1),
             iid=False, n_jobs=4,
             param_grid={'max_depth': range(3, 10, 2),
                         'min_child_weight': range(1, 6, 2)},
             pre_dispatch='2*n_jobs', refit=True, retur

In [12]:
# Tabulate the cross-validation results of the first grid search, one row per candidate.
pd.DataFrame(gsearch1.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.620729,0.447111,0.010602,0.001744,3,1,"{'max_depth': 3, 'min_child_weight': 1}",0.947998,0.936374,0.93252,0.95038,0.935567,0.940568,0.007195,1
1,0.703922,0.007831,0.008003,0.001547,3,3,"{'max_depth': 3, 'min_child_weight': 3}",0.948137,0.936059,0.932913,0.949583,0.934729,0.940284,0.007088,2
2,0.694818,0.004066,0.010001,0.001675,3,5,"{'max_depth': 3, 'min_child_weight': 5}",0.947672,0.93616,0.932462,0.949242,0.934113,0.93993,0.007078,3
3,0.909399,0.037882,0.011205,0.002483,5,1,"{'max_depth': 5, 'min_child_weight': 1}",0.948248,0.934435,0.932656,0.948858,0.935438,0.939927,0.007102,4
4,1.002194,0.060439,0.010201,0.001328,5,3,"{'max_depth': 5, 'min_child_weight': 3}",0.948445,0.933832,0.931723,0.948613,0.934474,0.939417,0.007495,5
5,0.885601,0.01606,0.0088,0.001165,5,5,"{'max_depth': 5, 'min_child_weight': 5}",0.947848,0.935238,0.931896,0.948541,0.933003,0.939305,0.007341,6
6,1.114507,0.005775,0.009403,0.001017,7,1,"{'max_depth': 7, 'min_child_weight': 1}",0.948035,0.933069,0.931848,0.948531,0.934947,0.939286,0.007414,7
7,1.096103,0.010923,0.009403,0.001018,7,3,"{'max_depth': 7, 'min_child_weight': 3}",0.948048,0.933215,0.931122,0.948401,0.933609,0.938879,0.007678,9
8,1.048767,0.0133,0.0088,0.0004,7,5,"{'max_depth': 7, 'min_child_weight': 5}",0.947652,0.934971,0.930708,0.947921,0.932186,0.938688,0.007555,11
9,1.194831,0.015187,0.009203,0.000979,9,1,"{'max_depth': 9, 'min_child_weight': 1}",0.94772,0.932907,0.931678,0.948408,0.934511,0.939045,0.007422,8


In [13]:
# Best mean cross-validated ROC AUC and the parameter combination that achieved it.
gsearch1.best_score_, gsearch1.best_params_ 

(0.940567567971845, {'max_depth': 3, 'min_child_weight': 1})

### TEST 2 - gamma

In [14]:
# Grid over gamma: 0.0, 0.1, 0.2, 0.3, 0.4.
param_test2 = dict(
    gamma=[step / 10.0 for step in range(5)],
)
# Same 5-fold ROC AUC search as before, with max_depth=3 and min_child_weight=1.
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=3,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch2.fit(X_train[predictors], X_train[target])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=140, n_jobs=1,
                                     nthread=4, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=27, silent=None,
                                     subsample=0.8, verbosity=1),
             iid=False, n_jobs=4,
             param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', ve

In [15]:
# Best mean cross-validated ROC AUC and the gamma value that achieved it.
gsearch2.best_score_, gsearch2.best_params_ 

(0.940654853194637, {'gamma': 0.2})

### TEST 3  - subsample, colsample

In [16]:
# Grid over row and column subsampling: 0.75, 0.80 and 0.85 for each.
param_test3 = dict(
    subsample=[pct / 100.0 for pct in range(75, 90, 5)],
    colsample_bytree=[pct / 100.0 for pct in range(75, 90, 5)],
)
# 5-fold ROC AUC search with gamma=0.2 and n_estimators=177.
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=3,
        min_child_weight=1,
        gamma=0.2,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch3.fit(X_train[predictors], X_train[target])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0.2,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=177, n_jobs=1,
                                     nthread=4, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=27, silent=None,
                                     subsample=0.8, verbosity=1),
             iid=False, n_jobs=4,
             param_grid={'colsample_bytree': [0.75, 0.8, 0.85],
                         'subsample': [0.75, 0.8, 0.85]},
             pre_dispatch='2*n_jobs', refit=True

In [17]:
# Best mean cross-validated ROC AUC and the subsampling settings that achieved it.
gsearch3.best_score_, gsearch3.best_params_ 

(0.9406893600357581, {'colsample_bytree': 0.75, 'subsample': 0.75})

### TEST 4 - alpha

In [18]:
# Grid over the reg_alpha regularisation parameter.
param_test4 = dict(
    reg_alpha=[0, 0.001, 0.005, 0.01, 0.05],
)
# 5-fold ROC AUC search with subsample=0.75 and colsample_bytree=0.75.
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=3,
        min_child_weight=1,
        gamma=0.2,
        subsample=0.75,
        colsample_bytree=0.75,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch4.fit(X_train[predictors], X_train[target])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.75, gamma=0.2,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=177, n_jobs=1,
                                     nthread=4, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=27, silent=None,
                                     subsample=0.75, verbosity=1),
             iid=False, n_jobs=4,
             param_grid={'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='

In [19]:
# Best mean cross-validated ROC AUC and the reg_alpha value that achieved it.
gsearch4.best_score_, gsearch4.best_params_ 

(0.9407037221047648, {'reg_alpha': 0.05})

### TEST X - Final Test

In [20]:
# Final configuration combining the tuned gamma, subsample, colsample_bytree and reg_alpha.
# NOTE(review): max_depth is 6 here although the first grid search reported
# max_depth=3 as best; confirm this is intentional.
xgb_final_params = {
    'learning_rate': 0.1,
    'n_estimators': 500,
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 0.2,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'reg_alpha': 0.05,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb_final = XGBClassifier(**xgb_final_params)
modelfit(xgb_final, X_train, predictors)


Model Report
Accuracy : 0.8902
AUC Score (Train): 0.945409


In [21]:
# Refit the final classifier on the training features and labels.
# modelfit() has already called set_params(n_estimators=...) on this object, so this
# fit uses the round count chosen by cross-validation rather than the 500 configured.
xgb_final.fit(X_train[predictors], y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75, gamma=0.2,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=77, n_jobs=1,
              nthread=4, objective='binary:logistic', random_state=0,
              reg_alpha=0.05, reg_lambda=1, scale_pos_weight=1, seed=27,
              silent=None, subsample=0.75, verbosity=1)

In [22]:
# make predictions for test data
y_pred = xgb_final.predict(X_test)
predictions = [round(value) for value in y_pred]

In [23]:
# evaluate predictions
accuracy = metrics.accuracy_score(y_test, predictions)
pr, re, fs, _ = precision_recall_fscore_support(y_test, predictions)
print(f"Accuracy: {accuracy:.2%}")
print(f"F1-Score: {fs}")

Accuracy: 89.24%
F1-Score: [0.9321366  0.73993289]


### Save Best Performing Model

In [24]:
# Serialise the fitted final classifier with pickle into the assets folder.
with open('../assets/classify.ml', mode='wb') as model_file:
    pickle.dump(xgb_final, model_file)