In [None]:
!pip install gcsfs

In [80]:
import pandas as pd
import gcsfs
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import TimeSeriesSplit

In [None]:
Reading the data file from Google Cloud Storage

In [81]:
# Location of the occupancy CSV inside the Google Cloud Storage bucket.
GCP_PROJECT = 'omina-gcp-resource'
OCCUPANCY_CSV_PATH = 'omina-test-set/occupancy-data/occupancy_data.csv'

# Open the remote file through gcsfs and let pandas parse it; the context
# manager closes the handle as soon as the frame has been read.
fs = gcsfs.GCSFileSystem(project=GCP_PROJECT)
with fs.open(OCCUPANCY_CSV_PATH) as csv_handle:
    df = pd.read_csv(csv_handle)

Converting the date column to pandas datetime values

In [83]:
# Convert the 'date' column to pandas datetimes so time-based operations work.
df['date'] = pd.to_datetime(df['date'])
# Index the rows by timestamp; the 'date' column itself is kept for now.
df.index = df['date']
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# errors='ignore' keeps this cell from crashing when the column is already gone,
# e.g. when the cell is executed a second time.
df = df.drop(columns='Unnamed: 0', errors='ignore')

Sorting the dataframe by date

In [84]:
# The index carries the same timestamps as the 'date' column (df.index = df['date']),
# so order the rows chronologically via the index. sort_values(by='date') is
# ambiguous here because 'date' names both the index and a column, and recent
# pandas versions raise a ValueError for that.
df = df.sort_index()

Creating the NSM and WS variables [NSM is the number of seconds since midnight for each timestamp; WS indicates a weekday (1) or a weekend day (0)]

In [85]:
# Daily grouping of the rows; it is not used by the feature code in this cell.
df_group = df.groupby(pd.Grouper(key='date', freq='D'))

# NSM: seconds elapsed since midnight of the same day. dt.normalize() yields
# each timestamp's midnight, so the difference is computed in one vectorized
# step (no per-row Python lambda) and includes any sub-second component.
df['NSM'] = (df['date'] - df['date'].dt.normalize()).dt.total_seconds()

# WS: 1 for Monday-Friday, 0 for Saturday/Sunday (dayofweek: Monday=0 ... Sunday=6).
df['WS'] = (df['date'].dt.dayofweek < 5).astype(int)

# The timestamps remain available as the index, so drop the duplicate column.
del df['date']
df.head()

Unnamed: 0_level_0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,color,NSM,WS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1,limegreen,51540.0,1
2015-02-02 14:19:59,23.718,26.29,578.4,760.4,0.004773,1,limegreen,51599.0,1
2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1,limegreen,51660.0,1
2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1,limegreen,51720.0,1
2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1,limegreen,51780.0,1


Generating the correlation table over the entire data set

In [86]:
# Pearson correlation between the numeric columns. They are selected explicitly
# because the frame also holds the text column 'color': older pandas dropped
# such columns from corr() silently, while pandas >= 2.0 raises on them.
df_corr = df.select_dtypes(include='number').corr()
# Display the whole matrix; head(7) would hide the last (WS) row.
df_corr

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,NSM,WS
Temperature,1.0,-0.156964,0.688571,0.449989,0.20528,0.55561,0.235422,0.4124
Humidity,-0.156964,1.0,-0.029459,0.299746,0.932724,0.04624,0.114961,-0.215841
Light,0.688571,-0.029459,1.0,0.448105,0.223329,0.91485,0.095023,0.26715
CO2,0.449989,0.299746,0.448105,1.0,0.477965,0.501582,0.127287,0.372715
HumidityRatio,0.20528,0.932724,0.223329,0.477965,1.0,0.257324,0.201703,-0.051735
Occupancy,0.55561,0.04624,0.91485,0.501582,0.257324,1.0,0.103522,0.341949
NSM,0.235422,0.114961,0.095023,0.127287,0.201703,0.103522,1.0,-0.004215


In [87]:
def corrcoef_loop(df):
    """Return the matrix of Pearson-correlation p-values between the columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are compared pairwise (columns are picked by
        position with ``iloc``).

    Returns
    -------
    numpy.ndarray
        Symmetric array of shape ``(n_columns, n_columns)``. Entry ``[i, j]``
        holds the p-value reported by ``scipy.stats.pearsonr`` for columns
        ``i`` and ``j``; the diagonal is left at 0.
    """
    n_columns = df.shape[1]
    pvalue_matrix = np.zeros((n_columns, n_columns))

    # Visit every unordered column pair once (strict upper triangle) and
    # mirror the result so the matrix stays symmetric.
    upper_rows, upper_cols = np.triu_indices(n_columns, k=1)
    for row, col in zip(upper_rows, upper_cols):
        pvalue = stats.pearsonr(df.iloc[:, row], df.iloc[:, col])[1]
        pvalue_matrix[row, col] = pvalue
        pvalue_matrix[col, row] = pvalue

    return pvalue_matrix

Calculate p-values

In [88]:
# Restrict the full data set to the feature columns, then tabulate the
# pairwise Pearson p-values with the column names on both axes.
df1 = df.loc[:, ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'NSM', 'WS']]
pval_matrix = corrcoef_loop(df1)
df_pval = pd.DataFrame(pval_matrix, index=df1.columns, columns=df1.columns)
df_pval.head(7)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,NSM,WS
Temperature,0.0,1.5363820000000002e-113,0.0,0.0,1.728531e-194,6.865398e-257,0.0
Humidity,1.5363820000000002e-113,0.0,2.392676e-05,0.0,0.0,1.957769e-61,2.745444e-215
Light,0.0,2.392676e-05,0.0,0.0,1.002775e-230,1.8813769999999998e-42,0.0
CO2,0.0,0.0,0.0,0.0,0.0,5.239133e-75,0.0
HumidityRatio,1.728531e-194,0.0,1.002775e-230,0.0,0.0,1.0582399999999999e-187,1.147644e-13
NSM,6.865398e-257,1.957769e-61,1.8813769999999998e-42,5.239133e-75,1.0582399999999999e-187,0.0,0.5456448
WS,0.0,2.745444e-215,0.0,0.0,1.147644e-13,0.5456448,0.0


Generating the training and testing data sets by splitting the time series into date ranges

In [89]:
# Training window: label-based slice on the datetime index (both endpoints inclusive).
df_train = df.loc['2015-02-04 17:51:00':'2015-02-10 09:33:00']

In [90]:
# First test window: label-based slice on the datetime index (both endpoints inclusive).
df_test = df.loc['2015-02-02 14:19:00':'2015-02-04 10:43:00']

In [91]:
# Second test window: label-based slice on the datetime index (both endpoints inclusive).
df_test1 = df.loc['2015-02-11 14:48:00':'2015-02-18 09:19:00']

Computing the correlation table for the training dataset

In [92]:
# Keep only the feature columns of the training window and compute their
# pairwise Pearson correlations.
df1 = df_train.loc[:, ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'NSM', 'WS']]
df_train_corr = df1.corr()
df_train_corr.head(7)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,NSM,WS
Temperature,1.0,-0.141759,0.649942,0.559894,0.151762,0.259958,0.418657
Humidity,-0.141759,1.0,0.037828,0.439023,0.955198,0.016974,0.108551
Light,0.649942,0.037828,1.0,0.664022,0.23042,0.085417,0.279519
CO2,0.559894,0.439023,0.664022,1.0,0.626556,0.209348,0.394834
HumidityRatio,0.151762,0.955198,0.23042,0.626556,1.0,0.095987,0.243146
NSM,0.259958,0.016974,0.085417,0.209348,0.095987,1.0,-0.010865
WS,0.418657,0.108551,0.279519,0.394834,0.243146,-0.010865,1.0


Calculating p-values for training dataset

In [93]:
# p-values of the pairwise correlations for the feature frame currently held
# in df1, labelled with its column names on both axes.
train_pval_matrix = corrcoef_loop(df1)
df_train_pval = pd.DataFrame(train_pval_matrix, index=df1.columns, columns=df1.columns)
df_train_pval.head(7)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,NSM,WS
Temperature,0.0,8.045843999999999e-38,0.0,0.0,3.731142e-43,6.898404000000001e-126,0.0
Humidity,8.045843999999999e-38,0.0,0.0006396081,0.0,0.0,0.1256261,8.957692e-23
Light,0.0,0.0006396081,0.0,0.0,1.384224e-98,1.155777e-14,4.848840999999999e-146
CO2,0.0,0.0,0.0,0.0,0.0,2.50712e-81,5.602596e-302
HumidityRatio,3.731142e-43,0.0,1.384224e-98,0.0,0.0,3.943321e-18,6.771405e-110
NSM,6.898404000000001e-126,0.1256261,1.155777e-14,2.50712e-81,3.943321e-18,0.0,0.3269109
WS,0.0,8.957692e-23,4.848840999999999e-146,5.602596e-302,6.771405e-110,0.3269109,0.0
