# Notebook with a CVAE model conditioned on calendar features or temperature, using the Kaggle bike-sharing dataset

## Loading libraries

In [1]:
#import external libraries
import sys
import os
import datetime
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt
import seaborn as sn
from scipy import stats
import cv2 #from open-cv, to convert array to images

In [2]:
# Paths in the git repository.

# Root folder of the git repo. It can be overridden without editing the notebook
# by setting the MAROTA_CVAE_ROOT environment variable (e.g. to '/home/jovyan');
# the historical local checkout remains the default.
path_main_folder = os.environ.get('MAROTA_CVAE_ROOT',
                                  '/home/goubetcle/Documents/CVAE/marota_cvae')

# Make the repo root and its source folder importable. The membership test keeps
# sys.path free of duplicate entries when the cell is re-run.
for import_dir in (path_main_folder, path_main_folder + '/src'):
    if import_dir not in sys.path:
        sys.path.append(import_dir)

In [3]:
#import class and methods from src
from keras import backend as K
from CVAE.callbacks import NEpochLogger,callbackWeightLoss
#from CVAE.cvae import compile_cvae, run_cvae
from CVAE.cvae_model import CVAE, CVAE_emb, CAE
from conso.load_shape_data import *  

import Visualisation.buildProjector
from Visualisation.buildProjector import *
from FeaturesScore.scoring import *
#from conso.load_shape_data import get_x_conso_autoencoder
from conso.conso_helpers import plot_latent_space_projection, pyplot_latent_space_projection_temp, pyplot_latent_space_projection_error
from sklearn.manifold import TSNE
%load_ext autoreload
%autoreload

Using TensorFlow backend.


In [4]:
# Directories that store the trained model and the related projector.
# os.path.join builds the paths without hand-written separators (the model path
# previously contained a stray '//').
expe_log_dir = os.path.join(path_main_folder, "notebooks", "logs", "Expe-bike", "CVAE_W_M_H")
log_dir_projector = os.path.join(expe_log_dir, "projector")
log_dir_model = os.path.join(expe_log_dir, "model")

# exist_ok=True creates the folders only when missing, so the cell can be re-run
# safely and there is no gap between the existence check and the creation.
for log_dir in (log_dir_projector, log_dir_model):
    os.makedirs(log_dir, exist_ok=True)

# Table of contents:
- Load Data
- Make Training Set
- Define and Train Model
- Build Projector
- Compute Feature Scores in latent space
- Study reconstruction Error
- Study Holidays prediction
- Detect atypical events
- Conclusion

## Load dataset

In [120]:
# Locate the bike-sharing CSV files inside the repository
path_data = os.path.join(path_main_folder, 'data/bike-sharing')
dataset_train, dataset_test = (os.path.join(path_data, csv_name)
                               for csv_name in ("train.csv", "test.csv"))

# Read both files into dataframes (train first, then test) and preview the training one
train, test = (pd.read_csv(csv_path) for csv_path in (dataset_train, dataset_test))
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [121]:
# Convert the 'datetime' column to pandas datetimes so that the .dt accessor can be used below
train['datetime'] = pd.to_datetime(train['datetime'])

## Make training set of daily renting shares profiles and conditions

We use some conditions from expert knowledge we previously recovered (temperature, month, days of the week) to learn a new residual latent space.

In [122]:
# Datetime accessor over the record timestamps
timeserie = train.datetime.dt
# Number of distinct calendar dates present in the records
nPoints = len(set(timeserie.date))
nPoints

456

In [123]:
# Keep a handle on the column index of the training frame and display it
columns=train.columns
columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [124]:
# Columns cast to float64: the signal ('count') plus the candidate conditions.
value_columns = ['count', 'temp', 'humidity', 'windspeed', 'holiday']

# Select, cast and rename in a single chain so that x_conso is a fresh frame.
# Assigning into a column subset of `train` instead would write to a possible
# copy of `train` (pandas SettingWithCopyWarning).
x_conso = (
    train[['datetime'] + value_columns]
    .astype({column: 'float64' for column in value_columns})
    .rename(columns={'datetime':'ds', 'count':'consumption', 'temp':'temperature', 'holiday':'is_holiday_day'})
)
x_conso.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 6 columns):
ds                10886 non-null datetime64[ns]
consumption       10886 non-null float64
temperature       10886 non-null float64
humidity          10886 non-null float64
windspeed         10886 non-null float64
is_holiday_day    10886 non-null float64
dtypes: datetime64[ns](1), float64(5)
memory usage: 510.4 KB


In [125]:
# Wrap the frame in a dict keyed by split name, the container handed to normalize_xconso
dict_xconso = {'train': x_conso}
# Key of the split selected later when building calendar_info
name_set_plot = 'train'
version = '-v1'
# Normalize input variables
# The second returned value is discarded (presumably the fitted scaler - TODO confirm).
# In the recorded run this call printed ['consumption'].
dict_xconso, _ = normalize_xconso(dict_xconso, type_scaler = 'standard')

['consumption']


In [126]:
# Signal to reconstruct
type_x = ['conso']
# Conditions requested from get_dataset_autoencoder. In the recorded run they were reported
# in this order with 7, 12, 24 and 24 columns, followed by a total shape of (456, 67).
list_cond = ['day','month','humidity', 'windspeed']
dataset = get_dataset_autoencoder(dict_xconso=dict_xconso, type_x=type_x, list_cond=list_cond)

day (456, 7)
month (456, 12)
humidity (456, 24)
windspeed (456, 24)
(456, 67)


In [128]:
# Number of missing values per column of the first training array
np.isnan(dataset['train']['x'][0]).sum(axis=0)

array([ 1,  2,  8, 23, 14,  4,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0])

In [137]:
# Hours without information: replace each missing value by the mean of its column.
profiles = dataset['train']['x'][0]
mask = np.isnan(profiles)
# Broadcasting the column means over the rows replaces the explicit ones().dot() outer product.
profiles[mask] = np.broadcast_to(np.nanmean(profiles, axis=0), profiles.shape)[mask]

# The condition matrix gets the same treatment. It holds hourly humidity / windspeed
# columns, which presumably miss the same hours as the profiles (TODO confirm); any NaN
# left there reaches the network inputs and makes training fail with NaN values.
conditions = dataset['train']['x'][1]
cond_mask = np.isnan(conditions)
if cond_mask.any():
    conditions[cond_mask] = np.broadcast_to(np.nanmean(conditions, axis=0), conditions.shape)[cond_mask]

# A column that is entirely NaN cannot be imputed by its mean: fail here rather than during training.
assert not np.isnan(conditions).any(), "conditions still contain NaN after mean imputation"
sum(np.isnan(dataset['train']['x'][0]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [139]:
# Split the concatenated condition matrix into one array per condition.
conditions = dataset['train']['x'][1]
nPoints = conditions.shape[0]

# Column widths in the order of list_cond: day (7), month (12), humidity (24), windspeed (24).
cond_widths = [7, 12, 24, 24]
# This cell overwrites dataset['train']['x'] below. If it is run a second time, index 1
# holds the 7-column day array and the slices would silently come out empty.
assert conditions.ndim == 2 and conditions.shape[1] == sum(cond_widths), (
    "expected the concatenated condition matrix with %d columns, got shape %s; "
    "rebuild `dataset` before re-running this cell" % (sum(cond_widths), conditions.shape,)
)

#get conditions in array (np.split returns views, like the equivalent column slices)
days_emb, month_emb, humidity_emb, windspeed_emb = np.split(
    conditions, np.cumsum(cond_widths)[:-1], axis=1)

# Legacy names kept for the cells that use them. With the current list_cond,
# `temp_emb` holds the humidity columns and `hum_emb` the windspeed columns.
temp_emb = humidity_emb
hum_emb = windspeed_emb

to_emb = conditions

x = dataset['train']['x'][0]

# Model inputs: the profiles followed by one array per condition
dataset['train']['x'] = [x, days_emb, month_emb, temp_emb, hum_emb]

In [140]:
# Calendar features attached to each profile of the selected split.
calendar_info = pd.DataFrame(dataset[name_set_plot]['ds'])
calendar_info['month'] = calendar_info.ds.dt.month
calendar_info['weekday'] = calendar_info.ds.dt.weekday
# Monday-Friday -> 1, Saturday/Sunday -> 0
calendar_info['is_weekday'] = (calendar_info.weekday < 5).astype(int)

# x_conso holds one row per recorded hour. Matching on the exact timestamp would only
# pick up the 00:00 record of each day, so a day without a midnight record would be
# flagged as non-holiday. The flag is therefore aggregated per calendar day first
# (a day is a holiday if any of its records is flagged).
daily_holiday = x_conso.groupby(x_conso.ds.dt.normalize())['is_holiday_day'].max()
calendar_info['is_holiday_day'] = calendar_info.ds.dt.normalize().map(daily_holiday)
# Days absent from x_conso are treated as non-holidays
calendar_info.loc[calendar_info['is_holiday_day'].isna(),'is_holiday_day'] = 0
# Same 0..n-1 index as the merge-based construction produced
calendar_info = calendar_info.reset_index(drop=True)

In [141]:
# Preview the first rows of the calendar features
calendar_info.head()

Unnamed: 0,ds,month,weekday,is_weekday,is_holiday_day
0,2011-01-01,1,5,0,0.0
1,2011-01-02,1,6,0,0.0
2,2011-01-03,1,0,1,0.0
3,2011-01-04,1,1,1,0.0
4,2011-01-05,1,2,1,0.0


In [142]:
# Daily weather indicators.
# `train.weather` has one value per recorded hour whereas calendar_info has one row per
# day: joining the hourly dummies on the positional index would pair day i with hour i
# of the dataset. The hourly codes are therefore reduced to the dominant (most frequent)
# code of each calendar day, one-hot encoded, then aligned on the date.
weather_names = {1: 'clear', 2: 'mist', 3: 'significant', 4: 'extreme'}
hourly_day = pd.to_datetime(train.datetime).dt.normalize()
# Series.mode() is sorted, so a tie resolves to the lowest weather code.
daily_weather = train.groupby(hourly_day)['weather'].agg(lambda codes: codes.mode().iloc[0])
weather_dummies = (
    pd.get_dummies(daily_weather)
    .reindex(columns=list(weather_names), fill_value=0)  # keep the four columns even if a code never dominates
    .rename(columns=weather_names)
)
# One row per profile, matched on its calendar day; days without weather records get zeros.
weather_by_profile = weather_dummies.reindex(calendar_info.ds.dt.normalize()).fillna(0).astype(int)
weather_by_profile.index = calendar_info.index
# Dropping weather columns left by a previous run keeps the cell re-runnable.
calendar_info = (
    calendar_info
    .drop(columns=list(weather_names.values()), errors='ignore')
    .join(weather_by_profile)
)
calendar_info.head()

Unnamed: 0,ds,month,weekday,is_weekday,is_holiday_day,clear,mist,significant,extreme
0,2011-01-01,1,5,0,0.0,1,0,0,0
1,2011-01-02,1,6,0,0.0,1,0,0,0
2,2011-01-03,1,0,1,0.0,1,0,0,0
3,2011-01-04,1,1,1,0.0,1,0,0,0
4,2011-01-05,1,2,1,0.0,1,0,0,0


# Build and train model CVAE

In [143]:
# we save the dataset
# (path_out is the model log directory; it is passed to the model as its `output` argument)
path_out = log_dir_model

In [144]:
# Parameters for autoencoder
e_dims=[48,35,24,12]# encoder dimensions
d_dims=[48,35,24,12]# decoder dimensions. Dense blocks in skip connections can make the dimensions bigger when layers are concatenated with the previous one
to_emb_dim=[7,12,24,24] # input dimension of each condition array (7, 12, 24 and 24 columns)
cond_pre_dim = 0
# Number of columns of the first training array (the profiles)
input_dim = dataset['train']['x'][0].shape[1]
# Passed to the model as z_dim
z_dim= 4
# Wrapped in a backend variable and passed to the model as `beta`
lambda_val = 0.001 # hyper-parameter whose value was selected after cross-validation

In [145]:
# Name given to the model; it is also used as prefix of the TensorBoard log directory
name_model = 'cvae_sharings-W_M_T_1H-journalier'
#name_model = 'cvae_classification'

In [146]:
#if needs to relaod model classes after modification wothout restarting the kernel

import CVAE.cvae_model
import CVAE.callbacks
import importlib
importlib.reload(CVAE.cvae_model)
importlib.reload(CVAE.callbacks)

%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [147]:
# Weight given to the model as `beta`, held in a backend variable
Lambda = K.variable(lambda_val, dtype='float32')

# Constructor arguments gathered in one place before building the model
cvae_kwargs = dict(
    input_dim=input_dim,
    e_dims=e_dims,
    d_dims=d_dims,
    cond_pre_dim=cond_pre_dim,
    z_dim=z_dim,
    beta=Lambda,
    name=name_model,
    output=path_out,
    to_emb_dim=to_emb_dim,
    # these dimensions define the layer dimensions of the conditional network
    emb_dims=[[5,3],[6,3],[12,4],[12,4]],
    emb_to_z_dim=[5,5],
    is_L2_Loss=False,
    has_BN=2,
)
model = CVAE.cvae_model.CVAE_emb(**cvae_kwargs)

5
L1 loss
False
complete model: 
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
emb_input_0 (InputLayer)        (None, 7)            0                                            
__________________________________________________________________________________________________
emb_input_1 (InputLayer)        (None, 12)           0                                            
__________________________________________________________________________________________________
emb_input_2 (InputLayer)        (None, 24)           0                                            
__________________________________________________________________________________________________
emb_input_3 (InputLayer)        (None, 24)           0                                            
____________________________________________________________________________

## Training model

In [117]:
from time import time
from keras.callbacks import TensorBoard

#embeddingsMetadata = {'dec_dense_0': 'metadata.tsv'}
# The current timestamp is appended to the model name so that each execution logs to its own directory
tensorboard_log_dir = "logs/" + name_model + str(time())
tensorboard = TensorBoard(log_dir=tensorboard_log_dir, write_graph=True)#,write_images=True,embeddings_freq=10, embeddings_layer_names=['dec_dense_0'],embeddings_metadata= embeddingsMetadata)


In [148]:
import warnings
warnings.filterwarnings('ignore')

lambda_decreaseRate=0.0
lambda_min=0.0001 #p

# Set runTraining to True to train the model. Otherwise an already trained model can be loaded below.
runTraining=True
# This callback computes feature scores, which is a bit long
runBatchCallback=True

# Training a neural network requires some computing power and the CPUs in the MyBinder
# environment can be a bit slow. Training without the batch callback can be faster.
if runTraining:
    training_callbacks = [tensorboard]
    if runBatchCallback:
        out_batch = NEpochLogger(x_train_data=dataset['train']['x'], display=100,x_conso=x_conso,calendar_info=calendar_info)
        training_callbacks.append(out_batch)

    # verbose=1 shows the training logs at every epoch; it is only enabled together with the batch callback
    model.main_train(dataset,
                     training_epochs=1500,
                     batch_size=32,
                     verbose=1 if runBatchCallback else 0,
                     callbacks=training_callbacks,
                     validation_split=0.1)




--- START TRAINING ---

Train on 410 samples, validate on 46 samples
Epoch 1/1500
emb_input_0
emb_input_1
emb_input_2
emb_input_3
[nan nan nan nan]


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:

# Preview of the calendar features handed to the training callback (cell not executed in the saved notebook)
calendar_info.head()

In [50]:
# NOTE(review): in the recorded run this call raised
# "AttributeError: module 'CVAE' has no attribute 'fit'": `CVAE` was bound to a module, not to a model.
# The training cell of this notebook trains through model.main_train(dataset, ...) instead;
# this cell looks like a leftover experiment - TODO confirm and remove or rewrite it.
CVAE.fit(dataset['train']['x'], training_epochs=10, batch_size=100)

AttributeError: module 'CVAE' has no attribute 'fit'

In [119]:
# Scratch expression: concatenates two list literals; it does not use any variable of the analysis
["test1", "test2"]+["abra"]

['test1', 'test2', 'abra']