In [25]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier


## Loading training and test data

In [26]:
# Folder that holds the three CSV-formatted text files used below.
datadir = "./data/"

# Training frame first, then the two held-out test frames.
dft = pd.read_csv(f"{datadir}datatraining.txt")
df1 = pd.read_csv(f"{datadir}datatest.txt")
df2 = pd.read_csv(f"{datadir}datatest2.txt")

## Naive Bayes

#### Train Naive Bayes

In [27]:
# Column roles in the training frame: the first column (date) is dropped,
# the last column is the occupancy label, everything in between is a feature.
Xcols = dft.columns.values[1:-1]
ycol = dft.columns.values[-1]

# Training inputs and targets as NumPy arrays.
X = dft[Xcols].to_numpy()
y = dft[ycol].to_numpy()

print(Xcols)
print(X.shape)
print(ycol)
print(y.shape)

# The two test frames are sliced with the same feature / label columns.
X_test1, y_test1 = df1[Xcols].to_numpy(), df1[ycol].to_numpy()
X_test2, y_test2 = df2[Xcols].to_numpy(), df2[ycol].to_numpy()

['Temperature' 'Humidity' 'Light' 'CO2' 'HumidityRatio']
(8143, 5)
Occupancy
(8143,)


In [28]:
# Gaussian Naive Bayes with default settings, fitted on the training arrays.
# fit() returns the estimator itself, so the chained call binds the fitted model.
gnb = GaussianNB().fit(X, y)

# Bare expression so the notebook still displays the estimator summary.
gnb


0,1,2
,priors,
,var_smoothing,1e-09


#### Test Naive Bayes

On Training set

In [29]:

# Hard predictions on the training inputs and the mean squared difference to
# the labels (each squared difference is 0 or 1 when labels are 0/1).
yhat = gnb.predict(X)
error2 = np.square(yhat - y).sum() / len(y)

# Predicted probabilities, columns: [P_unoccupied, P_occupied].
# Keep P_occupied only, split by the true label.
proba_hat = gnb.predict_proba(X)
proba_hat_occupied = proba_hat[y == 1, 1]
proba_hat_unoccupied = proba_hat[y == 0, 1]

# One semi-transparent, percent-normalised histogram per true class, overlaid.
fig = go.Figure(
    data=[
        go.Histogram(x=probs, name=label, histnorm="percent", opacity=0.75)
        for label, probs in (
            ("Occupied", proba_hat_occupied),
            ("Unoccupied", proba_hat_unoccupied),
        )
    ]
)
fig.update_layout(
    title='Probability of predicted Occupancy on occupied and unoccupied rooms (Training Set)',
    template='plotly_white',
    barmode='overlay',
)
fig.show()

print(f"Squared error on training set : {round(error2*100, 1)} %")

Squared error on training set : 2.2 %


On testing set 1

In [30]:

# Hard predictions on test set 1 and the mean squared difference to its labels
# (each squared difference is 0 or 1 when labels are 0/1).
yhat_test1 = gnb.predict(X_test1)
error2_test1 = np.square(yhat_test1 - y_test1).sum() / len(y_test1)

# Predicted probabilities, columns: [P_unoccupied, P_occupied].
# Keep P_occupied only, split by the true label.
proba_hat_test1 = gnb.predict_proba(X_test1)
proba_hat_occupied_test1 = proba_hat_test1[y_test1 == 1, 1]
proba_hat_unoccupied_test1 = proba_hat_test1[y_test1 == 0, 1]

# One semi-transparent, percent-normalised histogram per true class, overlaid.
fig = go.Figure(
    data=[
        go.Histogram(x=probs, name=label, histnorm="percent", opacity=0.75)
        for label, probs in (
            ("Occupied", proba_hat_occupied_test1),
            ("Unoccupied", proba_hat_unoccupied_test1),
        )
    ]
)
fig.update_layout(
    title='Probability of predicted Occupancy on occupied and unoccupied rooms (Testing Set 1)',
    template='plotly_white',
    barmode='overlay',
)
fig.show()

print(f"Squared error on testing set 1 : {round(error2_test1*100, 1)} %")


Squared error on testing set 1 : 2.3 %


On testing set 2

In [31]:

# Hard predictions on test set 2 and the mean squared difference to its labels
# (each squared difference is 0 or 1 when labels are 0/1).
yhat_test2 = gnb.predict(X_test2)
error2_test2 = np.square(yhat_test2 - y_test2).sum() / len(y_test2)

# Predicted probabilities, columns: [P_unoccupied, P_occupied].
# Keep P_occupied only, split by the true label.
proba_hat_test2 = gnb.predict_proba(X_test2)
proba_hat_occupied_test2 = proba_hat_test2[y_test2 == 1, 1]
proba_hat_unoccupied_test2 = proba_hat_test2[y_test2 == 0, 1]

# One semi-transparent, percent-normalised histogram per true class, overlaid.
fig = go.Figure(
    data=[
        go.Histogram(x=probs, name=label, histnorm="percent", opacity=0.75)
        for label, probs in (
            ("Occupied", proba_hat_occupied_test2),
            ("Unoccupied", proba_hat_unoccupied_test2),
        )
    ]
)
fig.update_layout(
    title='Probability of predicted Occupancy on occupied and unoccupied rooms (Testing Set 2)',
    template='plotly_white',
    barmode='overlay',
)
fig.show()

print(f"Squared error on testing set 2 : {round(error2_test2*100, 1)} %")


Squared error on testing set 2 : 1.2 %


## Random Forest

#### Train Random Forest

In [32]:

# Random forest of 101 trees split on the entropy criterion.
# random_state is pinned so that re-running the notebook rebuilds the same
# forest and therefore reproduces the same error figures.
rf = RandomForestClassifier(n_estimators=101, criterion='entropy', random_state=0)
rf.fit(X, y)


0,1,2
,n_estimators,101
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


#### Test Random Forest

In [33]:
# Evaluate the fitted random forest on the training set and on both test sets.
# The metric is the mean squared difference between predicted and true labels;
# assuming 0/1 labels this is the share of misclassified rows.

# on training set
yhat_rf = rf.predict(X)
error2_rf = np.sum((yhat_rf - y)**2)/len(y)

print(f"Squared error on training set : {round(error2_rf*100, 1)} %")


# on testing set 1
yhat_rf_test1 = rf.predict(X_test1)
error2_rf_test1 = np.sum((yhat_rf_test1 - y_test1)**2)/len(y_test1)

# Label names the split actually evaluated (was printed as "training set").
print(f"Squared error on testing set 1 : {round(error2_rf_test1*100, 1)} %")


# on testing set 2
yhat_rf_test2 = rf.predict(X_test2)
error2_rf_test2 = np.sum((yhat_rf_test2 - y_test2)**2)/len(y_test2)

# Label names the split actually evaluated (was printed as "training set").
print(f"Squared error on testing set 2 : {round(error2_rf_test2*100, 1)} %")

Squared error on training set : 0.0 %
Squared error on training set : 4.5 %
Squared error on training set : 3.9 %


## Testing all hyperparameter combinations for the random forest

In [34]:
# Exhaustive sweep over a small random-forest hyperparameter grid
# (split criterion x number of trees x maximum depth). For every combination
# the forest is refit on the training arrays and scored on the training set
# and on both test sets; the test errors are collected for plotting.
#
# NOTE(review): configurations are compared on the test sets themselves, so
# the best error found here is an optimistic estimate; a separate validation
# split would be cleaner.
# NOTE(review): per the scikit-learn docs "entropy" and "log_loss" both select
# Shannon information gain, so with a fixed seed those runs are expected to
# coincide - confirm and consider dropping one of them.
criterias = ["gini", "entropy", "log_loss"]
ntrees = [25, 50, 100, 150, 200]
max_depths = [4, 5, 10, None]

# Same seed for every configuration: differences between runs then come from
# the hyperparameters rather than from random bootstrap / feature sampling,
# and the sweep gives the same numbers on every re-run.
rf_seed = 0

# Parallel result lists, one entry per tested configuration.
parameters = []
err1 = []
err2 = []

for c in criterias:
    for n in ntrees:
        for d in max_depths:
            # Rebinds `rf` (and the yhat_rf* / error2_rf* names) on every
            # iteration; after the loop they refer to the last configuration.
            rf = RandomForestClassifier(
                n_estimators=n, criterion=c, max_depth=d, random_state=rf_seed
            )
            rf.fit(X, y)
            print("\n===================")
            print(f"Testing with parameters {n} trees, {c}, depth {d}")
            # on training set
            yhat_rf = rf.predict(X)
            error2_rf = np.sum((yhat_rf - y)**2)/len(y)
            print(f"Squared error on training set  : {round(error2_rf*100, 1)} %")
            # on testing set 1
            yhat_rf_test1 = rf.predict(X_test1)
            error2_rf_test1 = np.sum((yhat_rf_test1 - y_test1)**2)/len(y_test1)
            print(f"Squared error on testing set 1 : {round(error2_rf_test1*100, 1)} %")
            # on testing set 2
            yhat_rf_test2 = rf.predict(X_test2)
            error2_rf_test2 = np.sum((yhat_rf_test2 - y_test2)**2)/len(y_test2)
            print(f"Squared error on testing set 2 : {round(error2_rf_test2*100, 1)} %")

            # store results (test errors in percent, rounded to 2 decimals)
            parameters.append(f"trees:{n}, crit:{c}, depth:{d}")
            err1.append(round(error2_rf_test1*100, 2))
            err2.append(round(error2_rf_test2*100, 2))
        


Testing with parameters 25 trees, gini, depth 4
Squared error on training set  : 1.0 %
Squared error on testing set 1 : 2.1 %
Squared error on testing set 2 : 1.3 %

Testing with parameters 25 trees, gini, depth 5
Squared error on training set  : 1.0 %
Squared error on testing set 1 : 2.3 %
Squared error on testing set 2 : 2.6 %

Testing with parameters 25 trees, gini, depth 10
Squared error on training set  : 0.2 %
Squared error on testing set 1 : 4.0 %
Squared error on testing set 2 : 2.8 %

Testing with parameters 25 trees, gini, depth None
Squared error on training set  : 0.0 %
Squared error on testing set 1 : 5.1 %
Squared error on testing set 2 : 2.7 %

Testing with parameters 50 trees, gini, depth 4
Squared error on training set  : 1.0 %
Squared error on testing set 1 : 2.2 %
Squared error on testing set 2 : 1.2 %

Testing with parameters 50 trees, gini, depth 5
Squared error on training set  : 0.9 %
Squared error on testing set 1 : 6.2 %
Squared error on testing set 2 : 3.8 %


In [35]:
# Plot the grid-search results: one marker per tested configuration.
xparams = np.arange(len(parameters))
# Error metric: mean of the two test errors weighted by test-set size, i.e.
# (up to the rounding applied when err1 / err2 were stored) the error rate
# over both test sets pooled together.
yparams = (np.array(err1)*len(y_test1) + np.array(err2)*len(y_test2)) / (len(y_test1) + len(y_test2))


df = pd.DataFrame({"trial": xparams, "error %": yparams, "params": parameters})

# Bin the error into 1-point-wide ranges. include_lowest=True keeps an error
# of exactly 0 % inside the first bin; with the default it would fall outside
# the right-closed interval (0, 1] and become NaN.
df['Error range'] = pd.cut(df['error %'], [0, 1, 2, 3, 4, 5, np.inf], include_lowest=True)
# Fitness grows as the error shrinks; the +1 keeps the worst trial's value
# (and hence its marker size) strictly positive.
df["fitness"] = df["error %"].max() - df["error %"] + 1

fig = px.scatter(
    df,
    x="trial",
    y="error %",
    hover_data=['params'],
    template='plotly_white',
    color="error %",  # alternative: color="Error range" for discrete bins
    size="fitness",
    size_max=40,
    title="Random forest grid search: pooled test error per configuration",
)
fig.show()

## Benchmark all classifiers