### 1- Importer des librairies

In [3]:
!pip install tensorflow
!pip install keras



In [4]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import math

import tensorflow as tf
import keras

from keras import Sequential
from keras.layers import Dense

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

#### 2- Choix des Villes et les Intervalles de Coordonnées Géographiques

In [5]:
# One entry per studied city, each holding four geographic bounds.
# NOTE(review): the values look like [lat_min, lat_max, lon_min, lon_max] — confirm.
cities = dict(
    LosAngeles=[33.700615, 34.353627, -118.683511, -118.074559],
    Houston=[29.497907, 30.129003, -95.797178, -94.988191],
    Austin=[30.079327, 30.596764, -97.968881, -97.504838],
    Dallas=[32.559567, 33.083278, -97.036586, -96.428928],
    Charlotte=[34.970168, 35.423667, -81.060925, -80.622687],
    Atlanta=[33.612410, 33.916999, -84.575600, -84.231911],
)

#### 3- Pre-processing de tous les datasets

In [6]:
# Read the geohash lookup table, then number its distinct geohashes 0, 1, 2, ...
# in the order pandas' unique() returns them; geo_dict maps geohash -> number.
geohash_map = pd.read_csv("data/geohash_to_poi_vec.csv")
geo_dict = {
    geohash: code
    for code, geohash in enumerate(geohash_map.Geohash.unique())
}

In [7]:
def clean_data(filepath, cityname):
    """Clean one city's raw sample file and write the result to disk.

    Processing steps, in order:
      1. print the share of rows whose raw ``T-Accident`` value equals 0;
      2. add ``geohash_code``, the integer code of ``Geohash`` taken from the
         module-level ``geo_dict``;
      3. recode ``DOW`` to 1 when it is < 5, otherwise 0;
      4. recode ``HOD`` into buckets: 0 for [6, 10), 1 for [10, 15),
         2 for [15, 18), 3 for [18, 22) and 4 for every other value;
      5. binarise ``T-Accident`` (1 when > 0, otherwise 0);
      6. for each ``Geohash`` group, add ``predicted_accident`` = the
         ``T-Accident`` value of the following row, and drop the group's last
         row (it has no following row).

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    cityname : str
        City label used in the printed message and in the output file name.

    Returns
    -------
    None
        The cleaned frame is written to
        ``data/Clean_TW_Data/<cityname>_Clean_TW_Data.csv``.

    Raises
    ------
    KeyError
        If a ``Geohash`` value of the file is missing from ``geo_dict``.
    """
    df = pd.read_csv(filepath)

    # Computed on the raw counts, i.e. before T-Accident is binarised below.
    zero_rate = float((df['T-Accident'] == 0).sum()) / df.shape[0]
    print ("Le taux de zero accident dans la ville de {} est égal à :".format(cityname), zero_rate)

    # Plain dict lookups keep the KeyError on unknown geohashes while avoiding
    # a slow row-wise DataFrame.apply.
    df['geohash_code'] = [geo_dict[geohash] for geohash in df['Geohash']]

    # Column-wise (vectorised) recoding instead of one Python call per row.
    df['DOW'] = (df['DOW'] < 5).astype(int)

    hod = df['HOD']
    hod_bucket = pd.Series(4, index=df.index)  # default bucket: outside [6, 22)
    hod_bucket.loc[(hod >= 6) & (hod < 10)] = 0
    hod_bucket.loc[(hod >= 10) & (hod < 15)] = 1
    hod_bucket.loc[(hod >= 15) & (hod < 18)] = 2
    hod_bucket.loc[(hod >= 18) & (hod < 22)] = 3
    df['HOD'] = hod_bucket

    df['T-Accident'] = (df['T-Accident'] > 0).astype(int)

    # NOTE(review): the "next row" is taken in the file's row order within each
    # geohash; nothing here sorts by TimeStep — confirm the input is ordered as
    # intended before trusting predicted_accident as a next-interval label.
    labelled_groups = []
    for _, geohash_rows in df.groupby('Geohash'):
        # assign() builds a new frame, so the groupby slices are never mutated.
        labelled = geohash_rows.assign(
            predicted_accident=geohash_rows['T-Accident'].shift(-1)
        )
        labelled_groups.append(labelled.iloc[:-1])  # last row has no successor
    df = pd.concat(labelled_groups)

    return df.to_csv("data/Clean_TW_Data/{}_Clean_TW_Data.csv".format(cityname), index=False)

In [8]:
# Load the cleaned per-city datasets from data/Clean_TW_Data/.
# NOTE(review): files with these names are written by clean_data(), whose
# calling loop sits in a later cell; on a fresh kernel (or a fresh data folder)
# that loop has to run before this cell can succeed.
LosAngeles = pd.read_csv("data/Clean_TW_Data/LosAngeles_Clean_TW_Data.csv")
Houston = pd.read_csv("data/Clean_TW_Data/Houston_Clean_TW_Data.csv")
Austin = pd.read_csv("data/Clean_TW_Data/Austin_Clean_TW_Data.csv")
Dallas = pd.read_csv("data/Clean_TW_Data/Dallas_Clean_TW_Data.csv")
Charlotte = pd.read_csv("data/Clean_TW_Data/Charlotte_Clean_TW_Data.csv")
Atlanta = pd.read_csv("data/Clean_TW_Data/Atlanta_Clean_TW_Data.csv")

In [34]:
LosAngeles.tail()

Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Circle,Turning_Loop,geohash_code,predicted_accident
20574,9qh52,8122,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,307,0.0
20575,9qh52,7611,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,307,0.0
20576,9qh52,7734,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,307,0.0
20577,9qh52,1197,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,307,1.0
20578,9qh52,59,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,307,1.0


In [36]:
for city in cities:
    clean_data("data/Sample_TW_Event_Poi_vectors/Sample_Geohash_{}_TW_Event_Poi.csv".format(city), city)

Le taux de zero accident dans la ville de LosAngeles est égal à : 0.6269592476489029
Le taux de zero accident dans la ville de Houston est égal à : 0.6895820072140887
Le taux de zero accident dans la ville de Austin est égal à : 0.752924823352253
Le taux de zero accident dans la ville de Dallas est égal à : 0.7944267905157664
Le taux de zero accident dans la ville de Charlotte est égal à : 0.7106155023504974
Le taux de zero accident dans la ville de Atlanta est égal à : 0.8316274309109519


### 3-1- Concaténation des datasets

In [37]:
# For every city: reload its cleaned file, remove the geohash string column and
# the "T-*" columns listed below, and save the reduced frame under
# data/clean_twpoi_data/.
for city in cities:
    df = pd.read_csv("data/Clean_TW_Data/{}_Clean_TW_Data.csv".format(city))
    df = df.drop(
        columns=[
            "Geohash",
            "T-Accident",
            "T-BrokenVehicle",
            "T-Congestion",
            "T-Construction",
            "T-Event",
            "T-FlowIncident",
            "T-RoadBlocked",
            "T-Other",
        ]
    )
    df.to_csv("data/clean_twpoi_data/{}.csv".format(city), index=False)

In [38]:
df = pd.read_csv("data/clean_twpoi_data/Atlanta.csv")

In [39]:
df.tail()

Unnamed: 0,TimeStep,DOW,HOD,DayLight,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,...,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Circle,Turning_Loop,geohash_code,predicted_accident
15559,8206,0,1,1,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,0.0
15560,8529,1,3,0,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,0.0
15561,7149,1,1,1,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,1.0
15562,1126,1,2,1,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,1.0
15563,4500,1,3,0,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,1.0


In [40]:
# Stack the six per-city frames into one dataset (row indexes are kept as-is).
# NOTE(review): LosAngeles ... Atlanta are the frames read from
# data/Clean_TW_Data/, which still contain "Geohash" and the "T-*" columns; they
# are not the column-reduced files saved under data/clean_twpoi_data/ — confirm
# this is the intended input for the combined file.
dataset = pd.concat([LosAngeles, Houston, Austin, Dallas, Charlotte, Atlanta])

In [41]:
dataset.to_csv("data/clean_twpoi_data/clean_twpoi_data.csv", index=False)

In [42]:
df = pd.read_csv("data/clean_twpoi_data/clean_twpoi_data.csv")

In [43]:
print ("Le taux de zero accident  est égal à :" ,float(df[df['predicted_accident']==0].shape[0])/df.shape[0])

Le taux de zero accident  est égal à : 0.7257439159812539


In [44]:
df = pd.read_csv("data/clean_twpoi_data/TrafficWeatherEvent_June18_Aug18_Publish.csv")
df.tail()

Unnamed: 0,TimeStep,DOW,HOD,DayLight,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,...,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Circle,Turning_Loop,geohash_code,predicted_accident
106257,8206,0,1,1,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,0.0
106258,8529,1,3,0,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,0.0
106259,7149,1,1,1,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,1.0
106260,1126,1,2,1,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,1.0
106261,4500,1,3,0,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,1.0


### 3-2- Importation du dataset

In [58]:
tw = pd.read_csv("data/clean_twpoi_data/TrafficWeatherEvent_June18_Aug18_Publish.csv")
tw.tail()

Unnamed: 0,TimeStep,DOW,HOD,DayLight,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,...,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Circle,Turning_Loop,geohash_code,predicted_accident
106257,8206,0,1,1,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,0.0
106258,8529,1,3,0,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,0.0
106259,7149,1,1,1,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,1.0
106260,1126,1,2,1,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,1.0
106261,4500,1,3,0,0,0,0,0,0,0,...,20,0,0,0,8,5,71,0,489,1.0


### 4- Modèles de prédiction

#### 4-1- Modèle de Régression Logistique

In [46]:
def logistic_regression(filepath, cityname):
    """Fit a logistic regression on one dataset and print its evaluation.

    The CSV at ``filepath`` must contain a ``predicted_accident`` column, used
    as the target; every other column is used as a feature. The data is split
    80/20 (stratified on the target, ``random_state=0``), features are
    standardised with statistics learned on the training part only, and the
    train/test accuracy, confusion matrix and classification report are
    printed.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    cityname : str
        Label used in the printed messages.

    Returns
    -------
    None
    """
    df = pd.read_csv(filepath)
    X = df.drop(columns="predicted_accident")
    # 1-D target: scikit-learn expects y of shape (n_samples,); a one-column
    # DataFrame only works through an implicit conversion (and a warning).
    y = df["predicted_accident"]

    # Split in train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 0)

    # Standardize: fit on the training split only, then apply to both splits
    sc_x = StandardScaler()
    X_train = sc_x.fit_transform(X_train)
    X_test = sc_x.transform(X_test)

    # Logistic regression model
    print("{},  logistic regression  ... ".format(cityname))
    classifier = LogisticRegression( )
    classifier.fit(X_train, y_train)
    print()
    print("{}, score de performance du modèle (train) : {}  ".format(cityname, classifier.score(X_train, y_train)))
    print("{}, score de performance du modèle (test) : {}  ".format(cityname, classifier.score(X_test, y_test)))
    print()
    predictions = classifier.predict(X_test)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print()
    print("Classification Report")
    print(classification_report(y_test, predictions))
    print("....Done")
    print()

In [47]:
c = {"TrafficWeatherEvent_June18_Aug18_Publish": 1}

In [48]:
for city in c :
    logistic_regression("data/clean_twpoi_data/{}.csv".format(city), city)

TrafficWeatherEvent_June18_Aug18_Publish,  logistic regression  ... 

TrafficWeatherEvent_June18_Aug18_Publish, score de performance du modèle (train) : 0.7850580526767754  
TrafficWeatherEvent_June18_Aug18_Publish, score de performance du modèle (test) : 0.780501576248059  

Confusion Matrix:
[[14326  1098]
 [ 3567  2262]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.80      0.93      0.86     15424
         1.0       0.67      0.39      0.49      5829

    accuracy                           0.78     21253
   macro avg       0.74      0.66      0.68     21253
weighted avg       0.77      0.78      0.76     21253

....Done



In [49]:
for city in cities :
    logistic_regression("data/clean_twpoi_data/{}.csv".format(city), city)  

LosAngeles,  logistic regression  ... 

LosAngeles, score de performance du modèle (train) : 0.7235619267448217  
LosAngeles, score de performance du modèle (test) : 0.7074829931972789  

Confusion Matrix:
[[2192  377]
 [ 827  720]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.73      0.85      0.78      2569
         1.0       0.66      0.47      0.54      1547

    accuracy                           0.71      4116
   macro avg       0.69      0.66      0.66      4116
weighted avg       0.70      0.71      0.69      4116

....Done

Houston,  logistic regression  ... 

Houston, score de performance du modèle (train) : 0.8166913049321328  
Houston, score de performance du modèle (test) : 0.8132222520827734  

Confusion Matrix:
[[2313  237]
 [ 458  713]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.83      0.91      0.87      2550
         1.0       0.75      0.61      0.67      11

#### 4-2- Modèle de Gradient Boosting Classifier

In [51]:
def gradient_boosting(filepath, cityname):
    """Fit a gradient-boosting classifier with fixed settings and print its evaluation.

    The CSV at ``filepath`` must contain a ``predicted_accident`` column, used
    as the target; every other column is used as a feature. The data is split
    70/30 (stratified on the target, ``random_state=42``), features are
    standardised with statistics learned on the training part only, and a
    ``GradientBoostingClassifier(n_estimators=90, learning_rate=0.95,
    random_state=0)`` is trained. Train/test accuracy, confusion matrix and
    classification report are printed.

    NOTE: a later cell of this notebook defines another function with this same
    name (grid-search variant); once that cell runs it replaces this one.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    cityname : str
        Label used in the printed messages.

    Returns
    -------
    None
    """
    df = pd.read_csv(filepath)
    X = df.drop(columns="predicted_accident")
    # 1-D target: scikit-learn expects y of shape (n_samples,); a one-column
    # DataFrame only works through an implicit conversion (and a warning).
    y = df["predicted_accident"]

    # Split in train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 42)

    # Standardize: fit on the training split only, then apply to both splits
    sc_x = StandardScaler()
    X_train = sc_x.fit_transform(X_train)
    X_test = sc_x.transform(X_test)

    # Gradient Boosting Classifier
    print("{},  Gradient Boosting Classifier  ... ".format(cityname))

    gb_clf2 = GradientBoostingClassifier(n_estimators = 90, learning_rate=0.95,
                                        random_state=0)
    gb_clf2.fit(X_train, y_train)
    print()
    print("Accuracy score (training): {0:.3f}".format(gb_clf2.score(X_train, y_train)))
    print("Accuracy score (test): {0:.3f}".format(gb_clf2.score(X_test, y_test)))
    print()

    predictions = gb_clf2.predict(X_test)

    print()
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))

    print()
    print("Classification Report")
    print(classification_report(y_test, predictions))
    print("....Done")
    print()

In [52]:
for city in c :
    gradient_boosting("data/clean_twpoi_data/{}.csv".format(city), city)

TrafficWeatherEvent_June18_Aug18_Publish,  Gradient Boosting Classifier  ... 

Accuracy score (training): 0.833
Accuracy score (test): 0.827


Confusion Matrix:
[[20665  2471]
 [ 3058  5685]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.87      0.89      0.88     23136
         1.0       0.70      0.65      0.67      8743

    accuracy                           0.83     31879
   macro avg       0.78      0.77      0.78     31879
weighted avg       0.82      0.83      0.82     31879

....Done



In [53]:
def gradient_boosting(filepath, cityname):
    """Grid-search a gradient-boosting classifier and print its evaluation.

    The CSV at ``filepath`` must contain a ``predicted_accident`` column, used
    as the target; every other column is used as a feature. The data is split
    70/30 (stratified on the target, ``random_state=42``) and features are
    standardised with statistics learned on the training part only.
    ``n_estimators`` and ``learning_rate`` are selected with ``GridSearchCV``;
    the best model, refit by the search on the whole training split, is the
    single model used for every reported metric (accuracy, confusion matrix,
    classification report).

    NOTE: this redefines the ``gradient_boosting`` function declared in an
    earlier cell of this notebook; after this cell runs, the name refers to
    this grid-search variant.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    cityname : str
        Label used in the printed messages.

    Returns
    -------
    None
    """
    df = pd.read_csv(filepath)
    X = df.drop(columns="predicted_accident")
    # 1-D target: scikit-learn expects y of shape (n_samples,); a one-column
    # DataFrame only works through an implicit conversion (and a warning).
    y = df["predicted_accident"]

    # Split in train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 42)

    # Standardize: fit on the training split only, then apply to both splits
    sc_x = StandardScaler()
    X_train = sc_x.fit_transform(X_train)
    X_test = sc_x.transform(X_test)

    # Gradient Boosting Classifier
    print("{},  Gradient Boosting Classifier  ... ".format(cityname))
    parameters = {'n_estimators':[20, 30, 40, 50, 70, 100, 150, 200],
                  "learning_rate":[1.0, 0.9, 0.8, 0.7, 0.6, 0.5]}
    # Seed the searched estimator: the search and the final model are then
    # reproducible, and no separate seeded re-training is needed afterwards.
    gb_clf = GradientBoostingClassifier(random_state=0)
    gb_clf_best = GridSearchCV(gb_clf, parameters)
    gb_clf_best.fit(X_train, y_train)
    # With the default refit=True, best_estimator_ is already trained on all of
    # X_train using the best parameters.
    best_model = gb_clf_best.best_estimator_
    print()
    print("Best parameters: ", gb_clf_best.best_params_)
    print()
    print("Accuracy score (training): {0:.3f}".format(best_model.score(X_train, y_train)))
    print("Accuracy score (test): {0:.3f}".format(best_model.score(X_test, y_test)))
    print()
    # Same model as for the accuracies above, so all metrics agree.
    predictions = best_model.predict(X_test)

    print()
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))

    print()
    print("Classification Report")
    print(classification_report(y_test, predictions))
    print("....Done")
    print()

In [54]:
for city in cities :
    gradient_boosting("data/clean_twpoi_data/{}.csv".format(city), city)  

LosAngeles,  Gradient Boosting Classifier  ... 

Best parameters:  {'learning_rate': 0.5, 'n_estimators': 20}

Accuracy score (training): 0.756
Accuracy score (test): 0.751


Confusion Matrix:
[[3087  766]
 [ 769 1552]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.80      0.80      0.80      3853
         1.0       0.67      0.67      0.67      2321

    accuracy                           0.75      6174
   macro avg       0.74      0.73      0.73      6174
weighted avg       0.75      0.75      0.75      6174

....Done

Houston,  Gradient Boosting Classifier  ... 

Best parameters:  {'learning_rate': 0.7, 'n_estimators': 20}

Accuracy score (training): 0.842
Accuracy score (test): 0.838


Confusion Matrix:
[[3421  404]
 [ 498 1258]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.87      0.89      0.88      3825
         1.0       0.76      0.72      0.74      1756

    accuracy    

In [60]:
def model_ann(filepath, cityname) :
    """Train a small fully-connected network on one dataset and print its evaluation.

    The CSV at ``filepath`` must contain a ``predicted_accident`` column, used
    as the target; every other column is used as a feature. The data is split
    80/20 (stratified on the target, ``random_state=0``) and features are
    standardised with statistics learned on the training part only. The network
    is Dense(512, relu) -> Dense(256, relu) -> Dense(1, sigmoid), trained for
    10 epochs with Adam and binary cross-entropy, holding out 20 % of the
    training data for validation. Test loss/accuracy, confusion matrix and
    classification report are printed.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    cityname : str
        Label used in the printed messages.

    Returns
    -------
    None
    """
    df = pd.read_csv(filepath)
    X = df.loc[:, df.columns != "predicted_accident"]
    y = df.loc[:, df.columns == "predicted_accident"]
    y = y.values  # (n_samples, 1) array, matching the single sigmoid output
    # Split in train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 0)

    # Standardize: fit on the training split only, then apply to both splits
    sc_x = StandardScaler()
    X_train = sc_x.fit_transform(X_train)
    X_test = sc_x.transform(X_test)
    # ANN model
    print("{},  Artifical Neural Network (ANN) train ... ".format(cityname))
    print()
    # Take the input width from the data instead of hard-coding it, so the
    # model still builds when the number of feature columns changes.
    n_features = X_train.shape[1]
    classifier = keras.Sequential([
                        keras.layers.Dense(512, activation = tf.nn.relu, input_dim = n_features),
                        keras.layers.Dense(256, activation=tf.nn.relu),
                        keras.layers.Dense(1, activation=tf.nn.sigmoid),
                    ])
    # Compiling the ANN
    classifier.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

    # Fitting the ANN to the Training set
    classifier.fit(X_train, y_train,  batch_size = 10, validation_split= 0.2,  epochs = 10)
    print("....Done")
    print()
    print("{},  Artifical Neural Network (ANN) Score test  ... ".format(cityname))
    # evaluate() returns [loss, accuracy] for the metrics compiled above; print
    # them so this section reports the test score instead of discarding it.
    test_loss, test_accuracy = classifier.evaluate(X_test, y_test)
    print("Test loss: {:.4f} - test accuracy: {:.4f}".format(test_loss, test_accuracy))
    # Threshold the sigmoid output at 0.5 and flatten to 1-D class labels.
    predictions = (classifier.predict(X_test) > 0.5).astype(int).ravel()
    y_true = y_test.ravel()
    print("....Done")
    print()
    print("{},  Artifical Neural Network (ANN) Confusion Matrix et Classification Report  ... ".format(cityname))
    print()
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, predictions))
    print()
    print("Classification Report")
    print(classification_report(y_true, predictions))
    print("....Done")
    print()


    

In [56]:
for city in cities :
    model_ann("data/clean_twpoi_data/{}.csv".format(city), city)  

LosAngeles,  Artifical Neural Network (ANN) train ... 

Train on 13170 samples, validate on 3293 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
....Done

LosAngeles,  Artifical Neural Network (ANN) Score test  ... 
....Done

LosAngeles,  Artifical Neural Network (ANN) Confusion Matrix et Classification Report  ... 

Confusion Matrix:
[[2089  480]
 [ 621  926]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.77      0.81      0.79      2569
         1.0       0.66      0.60      0.63      1547

    accuracy                           0.73      4116
   macro avg       0.71      0.71      0.71      4116
weighted avg       0.73      0.73      0.73      4116

....Done

Houston,  Artifical Neural Network (ANN) train ... 

Train on 11905 samples, validate on 2977 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoc

....Done

Dallas,  Artifical Neural Network (ANN) Score test  ... 
....Done

Dallas,  Artifical Neural Network (ANN) Confusion Matrix et Classification Report  ... 

Confusion Matrix:
[[2355  212]
 [ 340  333]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.87      0.92      0.90      2567
         1.0       0.61      0.49      0.55       673

    accuracy                           0.83      3240
   macro avg       0.74      0.71      0.72      3240
weighted avg       0.82      0.83      0.82      3240

....Done

Charlotte,  Artifical Neural Network (ANN) train ... 

Train on 11635 samples, validate on 2909 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
....Done

Charlotte,  Artifical Neural Network (ANN) Score test  ... 
....Done

Charlotte,  Artifical Neural Network (ANN) Confusion Matrix et Classification Report  ... 

Confusion Matrix:
[[2273  304]
 [ 339  720

In [None]:
for city in c :
    model_ann("data/clean_twpoi_data/{}.csv".format(city), city)

TrafficWeatherEvent_June18_Aug18_Publish,  Artifical Neural Network (ANN) train ... 

Train on 68007 samples, validate on 17002 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10