## Data cleaning and normalization:
The first step was to select the data that we want to work with: the relevant attributes were selected and put into a dedicated dataframe.

In [None]:
#IMPORT LIBRARIES NECESSARY
from IPython.display import display_html
from IPython.display import Image 
from IPython.core.display import HTML
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests
from selenium import webdriver

import sys
import csv
import datetime as dt
import requests
import random
import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.datasets.samples_generator import make_blobs
from sklearn.utils import resample
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

print('Libraries imported.')

Open the CSV dataset provided by IBM

In [None]:
# Lift the cap on displayed columns so wide frames render completely, then
# read the collisions CSV into `df`.
pd.options.display.max_columns = None
df = pd.read_csv('Data-Collisions.csv', low_memory=False)

Taking a first look at the dataset. We need to see what kind of attributes we have to work with and the type of each of them. We also decided to take a more detailed look at the number of accidents associated with speeding.

In [None]:
# Structural overview of the raw frame: its dimensions, the dtype of every
# column, and how many records carry each SPEEDING value.
frame_shape = df.shape
column_types = df.dtypes
speeding_counts = df['SPEEDING'].value_counts()

print("Data frame shape:", frame_shape, '\n')

print("Data types:\n\n", column_types, '\n')

print("Amounts:\n", speeding_counts)

In [None]:
# Preview the first rows of the raw collisions frame.
df.head()

After looking at a small sample of the dataset, we can see that it needs some work. Since it is not a balanced labeled dataset, we need to process and normalize it. The first step, before anything else, is to choose the attributes that we want to work with.

In [None]:
# Attributes kept for the analysis: the severity outcome, the incident
# timestamp, the three condition attributes and the vehicle/person counts.
col = [
    "SEVERITYCODE",
    "INCDTTM",
    "WEATHER",
    "ROADCOND",
    "LIGHTCOND",
    "VEHCOUNT",
    "PERSONCOUNT",
    'PEDCOUNT',
    'PEDCYLCOUNT',
]
df_ac = df.loc[:, col]
df_ac.head()

##### Remove null or NaN values and reset the index

In [None]:
# Drop every row with a missing value in any selected attribute, then renumber
# the remaining rows from 0. drop=True discards the old index instead of
# inserting it as a stray 'index' column, which also keeps this cell safe to
# run more than once.
df_ac = df_ac.dropna().reset_index(drop=True)
df_ac.head()

##### Since the date format was not consistent across rows, we create a column with a normalized date (month-year)

In [None]:
# Parse INCDTTM into timestamps so the year and month can be extracted.
# NOTE(review): if the raw strings do not all share one layout, newer pandas
# versions may need an explicit `format=` argument here — confirm against the
# installed version.
df_ac['INCDTTM_dt'] = pd.to_datetime(df_ac['INCDTTM'])
df_ac['INCDTTM_year'] = df_ac['INCDTTM_dt'].dt.year
df_ac['INCDTTM_month'] = df_ac['INCDTTM_dt'].dt.month
df_ac['INCDTTM_year_str'] = df_ac['INCDTTM_year'].astype(str)
df_ac['INCDTTM_month_str'] = df_ac['INCDTTM_month'].astype(str)
# Build the 'month-year' key (month first, no zero padding) with vectorized
# string concatenation instead of a Python-level apply over every row; the
# resulting values are the same.
df_ac['INCDTTM_year_month'] = df_ac['INCDTTM_month_str'] + '-' + df_ac['INCDTTM_year_str']

##### Normalizing the data

In [None]:
# Integer codes for each categorical condition. A label that is not listed in
# its table is encoded as 0.
weather_dict = {
    'Overcast': 1,
    'Raining': 2,
    'Clear': 3,
    'Other': 4,
    'Snowing': 5,
    'Fog/Smog/Smoke': 6,
    'Sleet/Hail/Freezing Rain': 7,
    'Blowing Sand/Dirt': 8,
    'Severe Crosswind': 9,
    'Partly Cloudy': 10,
}
df_ac['WEATHER_int'] = df_ac['WEATHER'].map(weather_dict).fillna(0).astype(int)

roadcond_dict = {
    'Wet': 1,
    'Dry': 2,
    'Unknown': 3,
    'Snow/Slush': 4,
    'Ice': 5,
    'Other': 6,
    'Sand/Mud/Dirt': 7,
    'Standing Water': 8,
    'Oil': 9,
}
df_ac['ROADCOND_int'] = df_ac['ROADCOND'].map(roadcond_dict).fillna(0).astype(int)

light_dict = {
    'Daylight': 1,
    'Dark - Street Lights On': 2,
    'Dark - No Street Lights': 3,
    'Unknown': 4,
    'Dusk': 5,
    'Dawn': 6,
    'Dark - Street Lights Off': 7,
    'Other': 8,
    'Dark - Unknown Lighting': 9,
}
df_ac['LIGHTCOND_int'] = df_ac['LIGHTCOND'].map(light_dict).fillna(0).astype(int)


# Show the first five rows, now including the encoded columns.
df_ac.head()

Now all the data is normalized and ready to be used. We can take a quick look at the attributes that we want to compare with the severity of the accident. For this we will use the raw data to represent the three main attributes of this case study: Weather, Road and Light conditions.

#### Creating a few graphs to visualize the amount of accidents per year and condition (Weather, Road and Light) 

In [None]:
#Creating a base for the graphic representation
def create_barh_plot(df_ac, title, save, color):
    """Draw a horizontal bar chart of pre-computed counts.

    Parameters
    ----------
    df_ac : pandas.Series or pandas.DataFrame
        Values to plot; its index supplies the bar labels.
    title : str
        Text shown as the chart title.
    save : str
        Currently unused — nothing is written to disk. Kept so existing
        calls that pass it remain valid.
    color : str
        Colour passed through to the pandas plotting call.
    """
    # Create the axes explicitly and hand them to pandas so the 15x7 size
    # always applies to the chart that is actually drawn.
    fig, ax = plt.subplots(figsize=(15, 7))
    df_ac.plot(kind='barh', color=color, ax=ax)
    ax.set_xlabel('Counts', fontsize=18)
    ax.set_ylabel('Types', fontsize=18)
    ax.set_title(title, fontsize=20, fontweight='bold')
    plt.show()

In [None]:
# Number of collisions recorded for each severity code, printed and charted.
severity = df_ac['SEVERITYCODE'].value_counts()
print(severity)
create_barh_plot(df_ac=severity, title='Severity Type', save='SEVERITYCODE', color='indianred')

This gives a visualization of the accident severity types, where 2 represents "Injury", with more than 57.000 accidents, and 1 represents "Property damage", with more than 132.000. Unfortunately, the data does not provide further detail on other severity types, for example fatalities.

In [None]:
# Number of collisions recorded under each weather label, printed and charted.
weather = df_ac['WEATHER'].value_counts()
print(weather)
create_barh_plot(df_ac=weather, title='Weather Type', save='weather_counts', color='lightseagreen')

Normally one would expect the worst weather to cause the most accidents, but most of the accidents occurred in Clear weather (111.000 accidents), followed by 33.000 accidents in rainy conditions.

In [None]:
# Number of collisions recorded under each road-condition label, printed and charted.
roadcond = df_ac['ROADCOND'].value_counts()
print(roadcond)
create_barh_plot(df_ac=roadcond, title='Road Condition Type', save='roadcond_counts', color='burlywood')

Looking at the "Road condition" attribute, we see that most accidents happened on a dry road surface (124.000).

In [None]:
# Number of collisions recorded under each light-condition label, printed and charted.
lightcond = df_ac['LIGHTCOND'].value_counts()
print(lightcond)
create_barh_plot(df_ac=lightcond, title='Light Condition Type', save='lightcond_counts', color='lightcoral')

Regarding the "Light conditions", the largest group shows that most accidents happened in daylight.
To wrap up the three main attributes under study: accidents happened mostly during the day, on dry roads and under a clear sky. We now need to process the rest of the information to see the correlation between accident severity and the conditions represented above.

In [None]:
# Number of collisions recorded in each calendar year, printed and charted.
year = df_ac['INCDTTM_year_str'].value_counts()
print(year)
create_barh_plot(df_ac=year, title='Accidents per Year', save='count', color='darkseagreen')



Lastly, we can see a steady decrease in the number of accidents from 2006 to 2019 (not considering 2020, since the year has not ended). Even so, the complete year with the fewest accidents (2019) still shows around 9.000 accidents.

## Analysis:
We now start the analysis. First we save the attributes that we want to study in a dedicated dataframe, and from there we build the machine learning models.

In [None]:
# Columns carried into the modelling stage: the severity target, the
# month-year key and the three integer-coded condition attributes.
int_cols = [
    'SEVERITYCODE',
    'INCDTTM_year_month',
    'WEATHER_int',
    'ROADCOND_int',
    'LIGHTCOND_int',
]
predict = df_ac.loc[:, int_cols]
predict.head()

In [None]:
# Preview the first rows of the modelling frame.
predict.head()

### One Hot Encoding

Next we convert all the necessary attributes to a numerical encoding, so that we can process the data and create the model.

In [None]:
# Take the three integer-coded conditions and append one indicator column
# per distinct SEVERITYCODE value.
condition_codes = predict[['WEATHER_int', 'ROADCOND_int', 'LIGHTCOND_int']]
severity_indicators = pd.get_dummies(predict['SEVERITYCODE'])
Feature = pd.concat([condition_codes, severity_indicators], axis=1)

Feature.head()

In [None]:
# Alias the encoded frame as X and preview its first rows.
# NOTE(review): Feature also holds the SEVERITYCODE indicator columns (the
# target). X is reassigned from the three condition columns alone further
# down, before any model is fitted.
X = Feature
X[0:5]

Just for fun, we decided to look at the correlation between the attributes: first with the Severity code as a single variable, and then with the Severity code split into the two accident-severity types using the One Hot Encoding technique.

In [None]:
# Correlate only the numeric columns: `predict` also carries the string
# INCDTTM_year_month key, which cannot take part in a Pearson correlation
# (recent pandas versions raise on it instead of silently skipping it).
corr = predict.select_dtypes(include='number').corr()
sns.heatmap(corr,
        xticklabels=corr.columns,
        yticklabels=corr.columns)


Well, it seems that Road and Light conditions have the greatest correlation with the accident severity variable.

In [None]:
# Correlation matrix of the conditions together with the per-severity
# indicator columns, drawn as a heatmap labelled by column name.
corr = Feature.corr()
axis_labels = corr.columns
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

Surprisingly, it looks like the same attributes, Road and Light conditions, are important for both types of accident severity. Another pair with a strong correlation in both graphs is the Light and Weather conditions.

In [None]:
# Pairwise correlation between the three encoded condition features, drawn
# as an annotated heatmap.
condition_columns = ['WEATHER_int', 'ROADCOND_int', 'LIGHTCOND_int']
predict_corr = predict[condition_columns].corr()
ax = sns.heatmap(predict_corr, cmap='coolwarm', annot=True, fmt='.1f')
ax.figure.set_size_inches(15, 5)
ax.set_xlabel('Features', fontsize=14)
ax.set_ylabel('Features', fontsize=14)
graph_title = 'Pearson R Correlation for Possible Features'
ax.set_title(graph_title, fontsize=17)

plt.show()

As a last form of representation, we decided to use the Pearson R correlation to show the correlation between the variables that will be studied against the Severity attribute.

#### Prediction

First step is to get the Panda dataframe into a Numpy array, to be able to get the model running.

X - feature vector ("WEATHER_int", "ROADCOND_int" and "LIGHTCOND_int")

y - predicted variable ("SEVERITYCODE")

We decided to use four types of models, to see which one best answers our question: Decision Tree, K Nearest Neighbor (KNN), Logistic Regression and Support Vector Machine (SVM).

Some of the aspects to have in consideration regarding the models in use:
- Precision quantifies the number of positive class predictions that actually belong to the positive class.
- Recall quantifies the number of positive class predictions made out of all positive examples in the dataset.
- F-Measure provides a single score that balances both the concerns of precision and recall in one number.
- Jaccard Index compares members for two sets to see which members are shared and which are distinct. It’s a measure of similarity for the two sets of data, with a range from 0% to 100%. The higher the percentage, the more similar the two populations.

In [None]:
# Target as a NumPy array; selecting with a one-element list keeps it as a
# single-column 2-D array.
y = predict.loc[:, ['SEVERITYCODE']].values
y[:5]

In [None]:
# Feature matrix as a NumPy array: the three integer-coded conditions only.
X = predict.loc[:, ['WEATHER_int', 'ROADCOND_int', 'LIGHTCOND_int']].values
X[:5]

Time to divide the data set into training and test sets. We decided to assign 20% of the data set to the test set.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, jaccard_similarity_score, f1_score

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Report how many rows landed in each partition.
shift_ml = 'ML for Shift \n' + ''.join(
    '\t {}: {}\n'.format(part_name, part.shape[0])
    for part_name, part in (
        ('X_train', X_train),
        ('y_train', y_train),
        ('X_test', X_test),
        ('y_test', y_test),
    )
)
print(shift_ml)

In [None]:
#Defining the base for the visual representation of each section
def heatmap_for_confusion(df, title='Heatmap for Confusion Matrix for Machine Learning Algorithms'):
    """Draw a confusion matrix as an annotated green heatmap.

    Parameters
    ----------
    df : 2-D array-like of int
        The confusion matrix; cells are annotated with integer formatting.
        Rows are labelled 'Actual' and columns 'Predicted'.
    title : str, optional
        Title drawn above the heatmap. Defaults to the generic title this
        helper previously assigned to a local variable but never displayed.
    """
    # Draw on explicitly created axes so labels and title land on this chart.
    fig, ax = plt.subplots(figsize=(15, 7))
    sns.heatmap(df, annot=True, fmt='d', cmap='Greens', ax=ax)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title(title)
    plt.show()

#### Decision Tree classifier

In [None]:
# Decision tree with default settings: fit on the training split, predict the
# held-out rows, then show the confusion matrix and the per-class report.
model = DecisionTreeClassifier()
print(model)

y_DecisionTree = model.fit(X_train, y_train).predict(X_test)

matrix_DecisionTree = confusion_matrix(y_test, y_DecisionTree)
print(matrix_DecisionTree)

class_report = classification_report(y_test, y_DecisionTree)
print(class_report)
heatmap_for_confusion(matrix_DecisionTree)

#### K Nearest Neighbor(KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.metrics import accuracy_score

# Try k = 1..9 and record the test-set accuracy obtained with each k.
ks = range(1, 10)
mean_accuracy = []

for n in ks:
    # The classifier is given a flattened (1-D) copy of the training target.
    knn_model = knn(n_neighbors=n).fit(X_train, y_train.ravel())
    knn_yhat = knn_model.predict(X_test)
    mean_accuracy.append(accuracy_score(y_test, knn_yhat))

In [None]:
# One line per candidate k (starting at 1) with the accuracy it reached.
for k_value, score in enumerate(mean_accuracy, start=1):
    print("k = {} has a Score = {} ".format(k_value, score))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Line chart of the recorded test-set accuracy against each value of k tried.
plt.plot(ks, mean_accuracy)
plt.xlabel('Values of K')
plt.ylabel('Testing Accuracy Values')

In [None]:
# Final KNN classifier with k = 4: fit on the training split, predict the
# held-out rows, then show the per-class report and the confusion matrix.
model = KNN(n_neighbors=4)
print(model)

model.fit(X_train, y_train.ravel())
y_pred_knn = model.predict(X_test)
print(y_pred_knn[:5])

matrix = confusion_matrix(y_test, y_pred_knn)

class_report = classification_report(y_test, y_pred_knn)
print(class_report)
heatmap_for_confusion(matrix)

### Logistic Regression


In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Configure the estimator that is actually fitted with the lbfgs solver
# (a bare LogisticRegression(solver='lbfgs') expression has no effect on it).
LRmodel = LogisticRegression(solver='lbfgs')
print(LRmodel)
LRmodel.fit(X_train, np.ravel(y_train, order='C'))
# Predict with the logistic-regression model itself; `model` still refers to
# the KNN classifier fitted in an earlier cell.
y_pred_logReg = LRmodel.predict(X_test)

matrix_logReg = confusion_matrix(y_test, y_pred_logReg)
class_report = classification_report(y_test, y_pred_logReg)
print(class_report)

heatmap_for_confusion(matrix_logReg)


#### Support Vector Machine

In [None]:
from sklearn import svm

# Support vector classifier with an RBF kernel, fitted on the training split
# using a flattened (1-D) copy of the target.
SVM_model = svm.SVC(kernel='rbf')
SVM_model.fit(X_train, y_train.ravel())

In [None]:
# SVM predictions for the held-out rows; preview the first five.
yhat = SVM_model.predict(X_test)
yhat[:5]

In [None]:
# Reuse the SVM predictions already stored in `yhat` by the previous cell
# instead of running the costly SVC prediction over the test set a second time.
SVM_yhat = yhat
print("SVM Jaccard index: %.2f" % jaccard_similarity_score(y_test, SVM_yhat))
print("SVM F1-score: %.2f" % f1_score(y_test, SVM_yhat, average='weighted') )

### Results

#### Model Evaluation using Test set

In [None]:
# Assemble one text report with the weighted F1 and the Jaccard score of each
# model's test-set predictions, then print it.
ml_output_scores = 'Jaccard and F1 Scores \n'
for section_title, predictions in (
    ('Decision Trees Scores', y_DecisionTree),
    ('KNN Scores', y_pred_knn),
    ('Logistic Regression Scores', y_pred_logReg),
    ('Support Vector Machine', yhat),
):
    ml_output_scores += '\t ' + section_title + ': ' + '\n'
    ml_output_scores += '\t\t f1 score: ' + str(f1_score(y_test, predictions, average='weighted')) + '\n'
    ml_output_scores += '\t\t jaccard score: ' + str(jaccard_similarity_score(y_test, predictions)) + '\n'
    ml_output_scores += '\n'
print(ml_output_scores)

In [None]:
#Lets see how the models performed
# 5-fold cross-validated accuracy for each classifier with default settings.
models = [
          KNN(),
          LogisticRegression(),
          DecisionTreeClassifier(),
          svm.SVC()
]
CV=5
# y is a single-column 2-D array; flatten it so every estimator receives the
# 1-D target scikit-learn expects.
y_flat = np.ravel(y)
entries = []
# A dedicated loop name keeps the global `model` from earlier cells intact.
for estimator in models:
    model_name = estimator.__class__.__name__
    accuracies = cross_val_score(estimator, X, y_flat, scoring = 'accuracy', cv = CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
# Build the results frame once, after every model has been scored.
cv_df = pd.DataFrame(entries, columns = ['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Mean cross-validated accuracy per model, averaged over the folds.
cv_df.groupby('model_name').accuracy.mean()

In [None]:
# Per-fold accuracies of each model: a box plot with the individual fold
# scores overlaid as jittered points.
f, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(x='model_name', y='accuracy', data=cv_df, palette='rocket_r', ax=ax)
sns.stripplot(
    x='model_name',
    y='accuracy',
    data=cv_df,
    size=8,
    jitter=True,
    color='black',
    edgecolor='yellow',
    linewidth=2,
    ax=ax,
)
ax.set_title('Box and Whisker Plot of Machine Learning Performance', fontsize=20, fontweight='bold')
ax.set_xlabel('Machine Learning Technique', fontsize=14)
ax.set_ylabel('Accuracy', fontsize=14)

plt.show()