In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False
tf.random.set_seed(13)

## Reading and Cleaning data

In [2]:
def evo_cleaning(data):
    # only travels with more than 30 minutes of duration, that is the limit of cancellation of a reserve
    data = data.loc[(data['duration'] > 30) | (data['distance'] > 3)]

    data.Start_time = pd.to_datetime(data.Start_time)
    data.End_time = pd.to_datetime(data.End_time)

    # Colleting vehicle ids
    car_ids = list(data.Id.unique())

    # Removing uncommon ids
    # Ex: 4c5865a3-4b03-40f6-a3a8-d4e94aae3b17
    ids_uncommon = [id for id in car_ids if id.find('-') != -1]
    car_ids = [id for id in car_ids if id.find('-') == -1]

    data = data.loc[~data.Id.isin(ids_uncommon)]
    
    # Removing microseconds from the dates
    data.Start_time = data.Start_time.apply(lambda x: x.replace(microsecond=0))
    data.End_time = data.End_time.apply(lambda x: x.replace(microsecond=0))
    
    return data

In [3]:
# reading data
evo_data = pd.read_csv('data/evo_travels.csv')
modo_data = pd.read_csv('data/modo_travels.csv')
c2g_data = pd.read_csv('data/car2go_travels.csv', index_col=0)

In [4]:
#cleaning process
evo_data = evo_cleaning(evo_data)

modo_data.start = modo_data.start.apply(lambda x: datetime.fromtimestamp(x))
modo_data.end = modo_data.end.apply(lambda x: datetime.fromtimestamp(x))

c2g_data.init_time = c2g_data.init_time.apply(lambda x: datetime.fromtimestamp(x))
c2g_data.final_time = c2g_data.final_time.apply(lambda x: datetime.fromtimestamp(x))

In [5]:
init_period = '05-25'
end_period = '06-15'

evo_data = evo_data[(evo_data.Start_time >= '2018-'+init_period) & (evo_data.End_time <= '2018-'+end_period)]
modo_data = modo_data[(modo_data.start >= '2018-'+init_period) & (modo_data.end <= '2018-'+end_period)]
c2g_data = c2g_data[(c2g_data.init_time >= '2017-'+init_period) & (c2g_data.final_time <= '2017-'+end_period)]

## LSTM Data preparation

In [6]:
def univariate_data(dataset, start_index, end_index, history_size, target_size):
    """
    Reshape the data to usual representation of [batch size, history size, dimensionality]
    batch size - length of the data
    history size - size of the past window of information.
    dimensionality - number of variables
    
    target_size - Is how far in the future does the model need to learn to predict.
    The target_size is the label that needs to be predicted
    """
    data = []
    labels = []

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i-history_size, i)
        # Reshape data from (history_size,) to (history_size, 1)
        data.append(np.reshape(dataset[indices], (history_size, 1)))
        labels.append(dataset[i+target_size])
    return np.array(data), np.array(labels)

In [7]:
def train_size(data, size=0.7):
    # 70% of the data to use as train set
    train_split = int(len(data) * size)
    return train_split

In [8]:
def norm_data(data):
    values = data.values
    norm_data = (values - values.min())/(values.max() - values.min())
    return norm_data

For this analysis the aim is to predict the number of travels using a univariate LSTM model

In [9]:
# organization of the data for the model
evo_travel_count = evo_data.Start_time.value_counts(sort=False)
modo_travel_count = modo_data.start.value_counts(sort=False)
c2g_travel_count = c2g_data.init_time.value_counts(sort=False)

evo_norm = norm_data(evo_travel_count)
modo_norm = norm_data(modo_travel_count)
c2g_norm = norm_data(c2g_travel_count)

In [10]:
def train_val_split(data, train_split, history_length=60, future_target=0, batch_size=256, buffer_size=10000):
    
    univariate_past_history = history_length
    univariate_future_target = future_target

    x_train_uni, y_train_uni = univariate_data(data, 0, train_split,
                                               univariate_past_history,
                                               univariate_future_target)
    x_val_uni, y_val_uni = univariate_data(data, train_split, None,
                                           univariate_past_history,
                                           univariate_future_target)

    train_univariate = tf.data.Dataset.from_tensor_slices((x_train_uni, y_train_uni))
    train_univariate = train_univariate.cache().shuffle(buffer_size).batch(batch_size).repeat()

    val_univariate = tf.data.Dataset.from_tensor_slices((x_val_uni, y_val_uni))
    val_univariate = val_univariate.batch(batch_size).repeat()
    
    shape = x_train_uni.shape[-2:]
    
    return train_univariate, val_univariate, shape

In [11]:
def lstm_model(train_data, validate_data, shape, epochs=10, evaluation_interval=200):
    simple_lstm_model = tf.keras.models.Sequential([
        tf.keras.layers.LSTM(8, input_shape=shape),
        tf.keras.layers.Dense(1)
    ])

    simple_lstm_model.compile(optimizer='adam', loss='mae')
    
    simple_lstm_model.fit(train_data, epochs=epochs,
                      steps_per_epoch=evaluation_interval,
                      validation_data=validate_data, validation_steps=50)
    
    return simple_lstm_model

In [12]:
# split data on train and validate sets
evo_train, evo_val, evo_shape = train_val_split(data=evo_norm, train_split=train_size(evo_norm))
modo_train, modo_val, modo_shape = train_val_split(data=modo_norm, train_split=train_size(modo_norm))
c2g_train, c2g_val, c2g_shape = train_val_split(data=c2g_norm, train_split=train_size(c2g_norm))

In [13]:
# generating the model for each dataset
print('Evo Model')
evo_model = lstm_model(evo_train, evo_val, evo_shape)
print('Modo Model')
modo_model = lstm_model(modo_train, modo_val, modo_shape)
print('Car2Go Model')
c2g_model = lstm_model(c2g_train, c2g_val, c2g_shape)

Evo Model
Train for 200 steps, validate for 50 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Modo Model
Train for 200 steps, validate for 50 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Car2Go Model
Train for 200 steps, validate for 50 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Plotting prediction

In [22]:
a = evo_val.take(1)

In [28]:
for x, y in evo_val.take(1):
    print(x[0].numpy(), y[0].numpy())

[[0.01234568]
 [0.02314815]
 [0.04475309]
 [0.02623457]
 [0.        ]
 [0.02160494]
 [0.03395062]
 [0.04012346]
 [0.03395062]
 [0.02314815]
 [0.02932099]
 [0.02314815]
 [0.00925926]
 [0.0617284 ]
 [0.03703704]
 [0.02623457]
 [0.04012346]
 [0.02469136]
 [0.01080247]
 [0.00925926]
 [0.02314815]
 [0.01851852]
 [0.        ]
 [0.00617284]
 [0.02932099]
 [0.05864198]
 [0.02160494]
 [0.02469136]
 [0.02160494]
 [0.02160494]
 [0.02777778]
 [0.02777778]
 [0.03240741]
 [0.00925926]
 [0.00308642]
 [0.04475309]
 [0.        ]
 [0.02006173]
 [0.0462963 ]
 [0.02623457]
 [0.03858025]
 [0.03549383]
 [0.04783951]
 [0.0308642 ]
 [0.03240741]
 [0.02469136]
 [0.04166667]
 [0.0308642 ]
 [0.02469136]
 [0.00462963]
 [0.01388889]
 [0.02623457]
 [0.00154321]
 [0.02160494]
 [0.        ]
 [0.00154321]
 [0.00462963]
 [0.00308642]
 [0.06790123]
 [0.02777778]] 0.007716049382716049
