In [132]:
#Importing essentials libraries

import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC

import collections
import datetime
import scipy
from scipy import stats
import graphviz
import pydot
import plotly
import plotly.plotly as py
from plotly.graph_objs import Scatter, Heatmap, Layout
plotly.offline.init_notebook_mode(connected=True)

In [4]:
#Function for building the model

# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
  dataX, dataY = [], []
  for i in range(len(dataset)-look_back-1):
    a = dataset[i:(i+look_back), 0]
    dataX.append(a)
    dataY.append(dataset[i + look_back, 0])
  return numpy.array(dataX), numpy.array(dataY)


def remove_extreme_vals(keys,values):
    quartiles = numpy.percentile(values, [5,95])
    minVal = quartiles[0] #5% of the values are inferior or equal to his values.
    maxVal = quartiles[1] #95% of the values are inferior or equal to this value
    jours_x = []
    jours_y = []
    
    print(minVal)
    print(maxVal)
    
    for i in range(0, len(values) ):
        if minVal < values[i] < maxVal:
            jours_x.append(keys[i])
            jours_y.append(values[i])
            
    return jours_x,jours_y

def dividers(x):
    liste_dividers = []
    
    for i in range(1,x):
        if ( x%i == 0 ):
            liste_dividers.append(i)
            
    return liste_dividers

In [78]:
#Read dataset file
file = pandas.read_csv("../data/improved_spvm_2015-2019.csv")

#Manipulating the dataset to reflect the usage
file["DATE"] =  pd.to_datetime(file['JOUR'], format='%Y-%m-%d') #to transform the column into a datetime type

#transform the quart section to numeric (should be 0,1,2 for morning, afternoon and evening)
file = file.replace('jour',0)
file = file.replace('soir',1)
file = file.replace('nuit',2)

#split the 'DATE' columns into multiple ones.
file['JOUR'] = pd.DatetimeIndex(file['DATE']).day
file['MOIS'] = pd.DatetimeIndex(file['DATE']).month
file['ANNEE'] = pd.DatetimeIndex(file['DATE']).year - 2015

#Manipulating the 'CATEGORIE' column to transform it into a numeric
values_categories = np.unique(file['CATEGORIE'])
file = file.replace(values_categories, np.array([0, 1, 2, 3, 4 ,5])) 

#quick overview on the file to make sure it has been loaded properly
#print(file.info())
#print(file.head())

x_values = np.stack((file['JOUR'], file['MOIS'], file['ANNEE'], file['QUART'], file['LAT'], file['LONG']), axis=-1)
y_values = file['CATEGORIE']


In [97]:
#Preparing the dataset : 70% trainset ; 30% test set.

x_values = np.array(x_values, dtype=np.float)
y_values = np.array(y_values, dtype=np.float)

y_values = y_values.reshape(-1,1)


#Just to make sure we did not drop any values on the road.
assert len(x_values) == len(y_values)

size = int(0.70 * len(x_values) )

x_train = x_values[0 : size]
x_test = x_values[size+1:-1]

y_train = y_values[0 : size]
y_test = y_values[size+1:-1]

assert len(x_train) == len(y_train)
assert len(y_test) == len(x_test)

#Data Standardisation

scaler = MinMaxScaler(feature_range=(0,1))

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

y_train_scaled = scaler.fit_transform(y_train)
y_test_scaled = scaler.fit_transform(y_test)


print("Preparing dataset finished !")

Preparing dataset finished !


In [133]:
# Training the models.

#clf = LogisticRegression(random_state=0, solver='newton-cg',multi_class='multinomial').fit(x_train, np.ravel(y_train))
#clf = SGDClassifier( loss = 'log', n_iter = 1000 )
clf = SVC(c = 2.0)
clf.fit(x_train, np.ravel(y_train))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [134]:
#Training Metrics
print("Training Score = %s" % clf.score(x_train, np.ravel(y_train)))

pred = clf.predict(x_test[0])

print( "Prediction = %s | Actual = %s" % ( pred, y_test[0]) )

Training Score = 0.371390770533
Prediction = [ 1.] | Actual = [ 4.]



Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.



In [290]:
#Showing Predictions

predictions = (scaler.inverse_transform(testPredict).flatten() / 100).round()
normal_vals = (scaler.inverse_transform(x_test)).flatten()
normal_vals = normal_vals[look_back:-1]

random_x = jours_x[y+1:-1]


# Create traces
trace_pred = Scatter(
    x = random_x,
    y = predictions.tolist(),
    mode = 'markers',
    name = 'Predictions'
)
trace_normal = Scatter(
    x = random_x,
    y = normal_vals.tolist(),
    mode = 'markers',
    name = 'Actual Values'
)

data = [trace_pred, trace_normal]

plotly.offline.iplot(data, filename='Res')

In [291]:
equals = numpy.intersect1d(predictions, normal_vals)
print(len(equals))
print(len(predictions))
print(len(normal_vals))
print(max(predictions))
print(max(normal_vals))

0
90
90
56.0
101.0
