In [1]:
#Importing essentials libraries
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import *

from sklearn import preprocessing

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from sklearn.preprocessing import MinMaxScaler

import collections
import datetime
import scipy

import graphviz
import pydot
import plotly
from plotly.graph_objs import Scatter, Heatmap, Layout
plotly.offline.init_notebook_mode(connected=True)


Using TensorFlow backend.


In [2]:
#Essential functions for the code

#Funciton for retrieving the factors of a numbers. It's used to optimize the batch size.
def get_factors(x):
    facts = [] #array that contains the factors.
    
    for i in range(1, x + 1):
        if x % i == 0:
            facts.append(i)
    
    return facts



In [3]:
#Read dataset file
file = pd.read_csv("improved_spvm_2015-2018.csv")

#Manipulating the dataset to reflect the usage
file["DATE"] =  pd.to_datetime(file['JOUR'], format='%Y-%m-%d') #to transform the column into a datetime type

#transform the quart section to numeric (should be 0,1,2 for morning, afternoon and evening)
file = file.replace('jour',0)
file = file.replace('soir',1)
file = file.replace('nuit',2)

#split the 'DATE' columns into multiple ones.
file['JOUR'] = pd.DatetimeIndex(file['DATE']).day
file['MOIS'] = pd.DatetimeIndex(file['DATE']).month
file['ANNEE'] = pd.DatetimeIndex(file['DATE']).year

#Manipulating the 'CATEGORIE' column to transform it into a numeric
values_categories = np.unique(file['CATEGORIE'])
file = file.replace(values_categories, np.array([0, 1, 2, 3, 4 ,5])) 

#quick overview on the file to make sure it has been loaded properly
print(file.info())
#print(file.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93078 entries, 0 to 93077
Data columns (total 10 columns):
CATEGORIE    93078 non-null int64
JOUR         93078 non-null int32
QUART        93078 non-null int64
PDQ          93078 non-null int64
LAT          93078 non-null float64
LONG         93078 non-null float64
PLACE_ID     93078 non-null int64
DATE         93078 non-null datetime64[ns]
MOIS         93078 non-null int32
ANNEE        93078 non-null int32
dtypes: datetime64[ns](1), float64(2), int32(3), int64(4)
memory usage: 6.0 MB
None


In [4]:
#Firsts steps in the analysis : correlation analysis.
# In fact, we'll use several features for the SVM but before that, we need to make sure that each fetures
df = file

donnees = list(df.corr()) #getting the columns names.
vals = df.corr().values #getting the computed values for the correlation between each feature.

#print(df.corr())

plotly.offline.iplot({
    "data": [Heatmap(z=vals, y=donnees, x=donnees, colorscale='Viridis')],
    "layout": Layout(title='Correlation values')
})

In [5]:
#Building the dataset : 70% training and 30% test set.

scaler = MinMaxScaler(feature_range=(0,1))

datas = pd.DataFrame(file, columns=['JOUR', 'QUART', 'LAT', 'LONG', 'PLACE_ID', 'MOIS', 'ANNEE'])
values = np.asarray(datas.values,dtype=np.uint8)
size = len(datas)
x = int(0.70*size)

trainset_x = values[0:x]
testset_x = values[x+1:-1]

values = np.asarray(file['CATEGORIE'],dtype=np.uint8)
trainset_y = values[0:x]
testset_y = values[x+1:-1]

#print(trainset_x.shape)
#print(testset_x.shape)
#print(trainset_y.shape)
#print(testset_y.shape)

trainX = np.reshape(trainset_x, (trainset_x.shape[0], 1, trainset_x.shape[1]))
trainY = np.reshape(trainset_y, (trainset_y.shape[0],1))

testX = np.reshape(testset_x, (testset_y.shape[0], 1, testset_x.shape[1]))
testY = np.reshape(testset_y, (testset_y.shape[0],1))

print(trainX.shape)
print(trainY.shape)


(65154, 1, 7)
(65154, 1)


In [61]:
#Building the model

factors = get_factors(trainX.shape[0])
print("Possible batch size =" + str(factors) + " \n\n" )

model = Sequential()
model.add(keras.layers.Embedding(trainX.shape[0],1))
model.add(keras.layers.GlobalAveragePooling1D())
model.add( keras.layers.Dense(len(values_categories), activation=tf.nn.softmax))

model.summary() #output model

model.compile(optimizer=tf.train.AdamOptimizer(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#model = Sequential()
#model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
#model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
#model.add(MaxPooling1D(pool_size=2))
#model.add(LSTM(100))
#model.add(Dense(1, activation='sigmoid'))

history = model.fit(trainX, trainY, epochs=30,batch_size=factors[4],verbose=1,shuffle=True)


Possible batch size =[1, 2, 3, 6, 10859, 21718, 32577, 65154] 


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 1)           65154     
_________________________________________________________________
global_average_pooling1d_7 ( (None, 1)                 0         
_________________________________________________________________
dense_77 (Dense)             (None, 6)                 12        
Total params: 65,166
Trainable params: 65,166
Non-trainable params: 0
_________________________________________________________________


ValueError: Error when checking input: expected embedding_10_input to have 2 dimensions, but got array with shape (65154, 1, 7)

In [37]:
#Metrics evaluations + graphics

test_loss, test_acc = model.evaluate(testX, testY)
print('Test accuracy:', test_acc)

#SVG(model_to_dot(model).create(prog='dot', format='svg'))

# Plot training & validation accuracy values
plt.plot(history.history['acc'])
#plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
#plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

#f1_score(testset_y, predictions, average='macro')  
#f1_score(testset_y, predictions, average='micro')  
#f1_score(testset_y, predictions, average='weighted')  


Test accuracy: 0.312907384858


KeyError: 'val_acc'

In [None]:
# Make predictions here
