In [1]:
import os
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
user = 'user5'
all_data = []
for filename in os.listdir(user):
    if filename.startswith('data-'):
        print(filename)
        data = pd.read_csv(user+'/'+filename,sep=',', header='infer', parse_dates=[0], infer_datetime_format=True)
        outdoor_name = filename.replace('data-'+user, 'environment-outdoor')
        print(outdoor_name)
        try:
            outdoor = pd.read_csv('outdoor/%s'%outdoor_name,sep=',', header=None, 
                                  names=['Time','Temperature','RH'],parse_dates=[0], infer_datetime_format=True)
        except IOError:
            print('Error: no corresponding outdoor file')
        out_average = pd.DataFrame(np.array([outdoor[['Temperature', 'RH']][(outdoor['Time']>t-dt.timedelta(minutes=30)) & (outdoor['Time']<t)]
                                                 .median() for t in data['Time']]), columns=['Outdoor_T (C)','Outdoor_RH (%)'])
        all_data.append(pd.concat([data, out_average], axis=1))
        

data-user5-2018-3-10.csv
environment-outdoor-2018-3-10.csv
data-user5-2018-3-19.csv
environment-outdoor-2018-3-19.csv
data-user5-2018-3-20.csv
environment-outdoor-2018-3-20.csv
data-user5-2018-3-22.csv
environment-outdoor-2018-3-22.csv
data-user5-2018-3-25.csv
environment-outdoor-2018-3-25.csv
data-user5-2018-3-4.csv
environment-outdoor-2018-3-4.csv
data-user5-2018-3-8.csv
environment-outdoor-2018-3-8.csv


In [3]:
df = pd.concat(all_data, axis=0).reset_index(drop=True).fillna(method='ffill')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 7 columns):
Time               226 non-null datetime64[ns]
Sensation          226 non-null int64
Satisfaction       226 non-null int64
Temperature (C)    226 non-null float64
RH (%)             226 non-null float64
Outdoor_T (C)      226 non-null float64
Outdoor_RH (%)     226 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 12.4 KB


In [5]:
df.head()

Unnamed: 0,Time,Sensation,Satisfaction,Temperature (C),RH (%),Outdoor_T (C),Outdoor_RH (%)
0,2018-03-10 16:25:00,1,-1,23.9,19.3,7.9,26.2
1,2018-03-10 16:31:00,1,-1,24.2,19.45,7.6,26.5
2,2018-03-10 16:36:00,1,-1,24.7,19.3,7.6,26.6
3,2018-03-10 16:41:00,1,-1,24.9,18.8,7.6,26.6
4,2018-03-10 16:46:00,1,-1,25.0,18.75,7.6,26.7


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [7]:
features = df[['Temperature (C)', 'RH (%)', 'Outdoor_T (C)', 'Outdoor_RH (%)']]
scaler = StandardScaler()
scaler.fit(features)
features_std = scaler.transform(features)

In [8]:
features_std

array([[-8.57434793e-02,  1.70646096e-01, -8.71252055e-02,
        -3.20230241e-01],
       [ 1.25389873e-02,  2.11549035e-01, -1.49337832e-01,
        -3.02522522e-01],
       [ 1.76343098e-01,  1.70646096e-01, -1.49337832e-01,
        -2.96619950e-01],
       [ 2.41864743e-01,  3.43029662e-02, -1.49337832e-01,
        -2.96619950e-01],
       [ 2.74625565e-01,  2.06686532e-02, -1.49337832e-01,
        -2.90717377e-01],
       [ 3.23766798e-01, -2.02342857e-02, -1.70075375e-01,
        -2.78912232e-01],
       [ 4.71190498e-01, -7.47715376e-02, -1.90812917e-01,
        -2.55301941e-01],
       [ 4.38429676e-01, -7.47715376e-02, -2.11550459e-01,
        -2.49399368e-01],
       [ 5.36712142e-01, -1.02040164e-01, -2.11550459e-01,
        -2.37594222e-01],
       [ 7.00516253e-01, -2.11114667e-01, -2.32288002e-01,
        -2.25789077e-01],
       [ 7.33277075e-01, -2.11114667e-01, -2.32288002e-01,
        -2.19886504e-01],
       [ 3.72908031e-01,  6.15715921e-02, -2.32288002e-01,
      

In [125]:
forest = RandomForestClassifier(n_estimators = 50) 
X_train, X_test, y_train, y_test = train_test_split(features_std, df['Sensation'], test_size=0.25, shuffle=True)

In [126]:
forest.fit(X_train, y_train)
prediction_rf = forest.predict(X_test)

In [127]:
acc = sum(prediction_rf-y_test==0)/len(y_test)
acc

0.7719298245614035

In [34]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [86]:
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

In [123]:
# 5-folder cv
label = df['Sensation']
predicted_label = cross_val_predict(forest, features_std, label, cv=4)
# accuracy of the model
accuracy = metrics.accuracy_score(label, predicted_label)

In [14]:
from numpy.random import permutation

In [25]:
features = df[['Temperature (C)', 'RH (%)', 'Outdoor_T (C)', 'Outdoor_RH (%)']]
scaler = StandardScaler()
scaler.fit(features)
features_std = scaler.transform(features)
labels = df['Sensation']

In [26]:
n = len(label)
idx_shuffle = permutation(n)

In [35]:
# self-implemented 5 folder cv
k = 5
idx_all = set(idx_shuffle)
forest = RandomForestClassifier(n_estimators = 50)
neigh = KNeighborsClassifier(n_neighbors=3)
logreg = LogisticRegression(C=1e5)
err_rf, err_knn, err_lr = 0, 0, 0
for i in range(k):
    idx_v = idx_shuffle[n//k*i:n//k*(i+1)]
    idx_train = list(idx_all-set(idx_v))
    X_train = features_std[idx_train]
    y_train = labels[idx_train]
    X_v = features_std[idx_v]
    y_v = labels[idx_v]
    forest.fit(X_train, y_train)
    prediction_rf = forest.predict(X_v)
    err_rf += sum(prediction_rf-y_v!=0)
    
    neigh.fit(X_train, y_train)
    prediction_knn = neigh.predict(X_v)
    err_knn += sum(prediction_knn-y_v!=0)
    
    logreg.fit(X_train, y_train)
    prediction_lr = logreg.predict(X_v)
    err_lr += sum(prediction_lr-y_v!=0)
    

acc_rf = 1-err_rf/n
acc_knn = 1-err_knn/n
acc_lr = 1-err_lr/n
    

In [36]:
acc_rf

0.8628318584070797

In [37]:
acc_knn

0.7743362831858407

In [39]:
acc_lr

0.6017699115044248