# 1. Obtaining and Viewing the Data

In [0]:
import pandas as pd
from sklearn.base import TransformerMixin
import numpy as np
import plotly.graph_objects as go
import numpy as np

#### Telangana Road Dataset

In [6]:
df_train = pd.read_csv(r"C:\Users\Dell\Desktop\Accident_train.csv")
df_test = pd.read_csv(r"C:\Users\Dell\Desktop\Accident_test.csv")

FileNotFoundError: ignored

#### Size of Training Dataset


In [2]:
print('Records:', df_train.shape[0], '\nColumns:', df_train.shape[1])

NameError: ignored

#### Size of Test Dataset

In [3]:
print('Records:', df_test.shape[0], '\nColumns:', df_test.shape[1])

NameError: ignored

# 2. Preprocessing the Data

In [0]:
print('Proportion of Missing Values in Training Dataset:', 
      round(df_train.isna().sum().sum()/len(df_train),3), '%')

Proportion of Missing Values in Training Dataset: 0.242 %


In [0]:
print('Proportion of Missing Values in Testing Dataset:', 
      round(df_test.isna().sum().sum()/len(df_test),3), '%')

Proportion of Missing Values in Testing Dataset: 0.277 %


In [0]:
train_values=df_train.values
test_values=df_test.values

In [0]:
X_train=pd.DataFrame(train_values)
X_test=pd.DataFrame(test_values)

###### 2.1 Handling Missing Values

In [0]:
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.
        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [0]:
df_train_transform = DataFrameImputer().fit_transform(X_train)
df_test_transform  = DataFrameImputer().fit_transform(X_test)

In [0]:
df_train_transform.columns=['Collision_Ref_No', 'Policing_Area', 'Collision_Severity', 'Weekday_of_Collision', 'Day_of_Collision',
                            'Month_of_Collision', 'Hour_of_Collision', 'Carriageway_Type', 'Speed_Limit', 'Junction_Detail', 
                            'Junction_Control', 'Ped_Crossing_HC', 'Ped_Crossing_PC', 'Light_Conditions', 'Weather_Conditions',
                            'Road_Surface_Conditions', 'Special_Conditions_at_Site']

In [0]:
df_test_transform.columns=['Collision_Ref_No', 'Policing_Area', 'Collision_Severity', 'Weekday_of_Collision', 'Day_of_Collision',
                            'Month_of_Collision', 'Hour_of_Collision', 'Carriageway_Type', 'Speed_Limit', 'Junction_Detail', 
                            'Junction_Control', 'Ped_Crossing_HC', 'Ped_Crossing_PC', 'Light_Conditions', 'Weather_Conditions',
                            'Road_Surface_Conditions', 'Special_Conditions_at_Site']

# 3. Exploratory Data Analysis (EDA)

##### On which major factors does accidents most likely to be caused?

###### 3.1 Preparing dataframe that calculates  accidents per weekday:

In [0]:
week_day_cnts = df_train_transform['Weekday_of_Collision'].value_counts()
groupby_week_day = week_day_cnts.to_dict()
labels = list(groupby_week_day.keys())
values = list(groupby_week_day.values())
fig = go.Figure(go.Bar(x=values, y=labels, orientation='h'))
fig.show()

NameError: ignored

###### 3.2 Road type when accident happened 

In [0]:
carriageway_types = {1 : 'Roundabout', 2: 'One way street', 10: 'Other / unknown', 11: 'Dual carriageway', 12: 'Motorway', 13: 'Single carriageway', 14: 'Slip road'}
ct = df_train_transform['Carriageway_Type'].value_counts()
groupby_carriageway_types = ct.to_dict()
labels = list(groupby_carriageway_types.keys())
final_labels = [carriageway_types[i] for i in labels]
values = list(groupby_carriageway_types.values())
fig = go.Figure(data=[go.Pie(labels=final_labels, values=values)])
fig.show()

###### 3.3 Accidents categorized by hour of collision

In [0]:
def when_was_it(hour):
    if hour >= 5.0 and hour < 10.0:
        return "morning rush (5.0-10.0)"
    elif hour >= 10.0 and hour < 15.0:
        return "office hours (10.0-15.0)"
    elif hour >= 15.0 and hour < 19.0:
        return "afternoon rush (15.0-19.0)"
    elif hour >= 19.0 and hour < 23.0:
        return "evening (19.0-23.0)"
    else:
        return "night (23.0-5.0)"

In [0]:
hr_of_col = df_train_transform['Hour_of_Collision'].value_counts()
groupby_hr_col = hr_of_col.to_dict()
labels = list(groupby_hr_col.keys())
final_labels = [when_was_it(i) for i in labels]
values = list(groupby_hr_col.values())
fig = go.Figure(data=[go.Pie(labels=final_labels, values=values)])
fig.show()

###### 3.4 Weather conditions at accident site

In [0]:
wthr_conds_at_site = {1: "Fine without high winds", 2: "Raining without high winds", 3: "Snowing without high winds", 4: "Fine with high winds", 5: "Raining with high winds", 6: "Snowing with high winds", 7: "Fog or mist - if hazard", 8: "Strong sun (glaring)", 9: "Other", 10: "Unknown"}
wthr_cond = df_train_transform['Weather_Conditions'].value_counts()
groupby_whtr_conds = wthr_cond.to_dict()
labels = list(groupby_whtr_conds.keys())
final_labels = [wthr_conds_at_site[i] for i in labels]
values = list(groupby_whtr_conds.values())
fig = go.Figure(data=[go.Pie(labels=final_labels, values=values)])
fig.show()

###### 3.5 Percentage of each category of accident severity 

In [0]:
col_sev_types = {1 : 'Fatal injury collision', 2: 'Serious injury collision', 3: 'Slight injury collision'}
col_sev = df_train_transform['Collision_Severity'].value_counts()
groupby_col_sev = col_sev.to_dict()
labels = list(groupby_col_sev.keys())
final_labels = [col_sev_types[i] for i in labels]
values = list(groupby_col_sev.values())
fig = go.Figure(data=[go.Pie(labels=final_labels, values=values, hole=.5)])
fig.show()

---

---

#### Encoding weekdays into numeric form

In [0]:
week_days = {'FRI': 5, 'MON': 1, 'SAT': 6, 'SUN': 7, 'THU': 4, 'TUE': 2, 'WED': 3}
df_train_transform['Weekday_of_Collision'] = [week_days[day] for day in df_train_transform['Weekday_of_Collision']]

##### Defining features as X and y as class labels

In [0]:
y = df_train_transform['Collision_Severity']
X = df_train_transform.drop(['Collision_Severity', 'Policing_Area'], axis=1)

In [0]:
print(len(X.columns))


15


---

# Define a cross validation strategy

I use the cross_val_score function of Sklearn. However this function has no shuffle attribute, I add one line of code, in order to shuffle the dataset prior to cross-validation.

For the performance metric, as I mentioned in the beginning, this is an imbalanced dataset. Thus, instead of using accuracy, I used F1-score as metric. F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false negatives into account.


---

Typical metrics used in multiclass are the same as the metrics used in the binary classification case. The metric is calculated for each class by treating it as a binary classification problem after grouping all the other classes as belonging to the second class. Then the binary metric is averaged over all the classes to get either a macro average (treat each class equally) or weighted average (weighted by class frequency) metric.

In [0]:
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import KFold,cross_val_score
n_folds = 5
def f1_cv(model):
    kf = KFold(n_folds, shuffle = True, random_state = 29).get_n_splits(X)
    scorer = make_scorer(f1_score, average = 'weighted')
    f1 = cross_val_score(model, X, y, scoring = scorer, cv = kf)
    return (f1)

# 4. Implementing Machine Learning algorithms 

###### 4.1 KNN
###### 4.2 Naive bayes classifier
###### 4.3 Random forest classifier
###### 4.4 Logistic regression
###### 4.5 Gradient boosting Machine
###### 4.6 SVM
###### 4.7 XGBoost

In [0]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3,weights='distance')

In [0]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [0]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 31, random_state = 32)

In [0]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression( max_iter=5000,penalty='l2', n_jobs=3,
                    solver='lbfgs', verbose=1)

In [0]:
from sklearn.ensemble import GradientBoostingClassifier
GBoost = GradientBoostingClassifier( n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   random_state =5)

In [0]:
from sklearn.svm import SVC
svc = SVC(kernel = 'sigmoid', gamma = 1.0)

In [0]:
from xgboost import XGBClassifier
xgboost = XGBClassifier(learning_rate =0.07, n_estimators=300,
                      class_weight="balanced_subsample",
                      max_depth=8, min_child_weight=1,
                      scale_pos_weight=7,
                      seed=27,subsample=0.8,colsample_bytree=0.8)

#### Predicting the Mean and Standard Deviation of each classifier

In [0]:
score = f1_cv(svc)
print ('\nSVC score: {:4f}({:4f})\n'.format(score.mean(), score.std()))


SVC score: 0.841893(0.000266)



In [0]:
score = f1_cv(gnb)
print ('gnb score: {:4f}({:4f})\n'.format(score.mean(), score.std()))

gnb score: 0.835021(0.007351)



In [0]:
score = f1_cv(rfc)
print ('rfc score: {:4f}({:4f})\n'.format(score.mean(), score.std()))

rfc score: 0.858807(0.004937)



In [0]:
score = f1_cv(lg)
print ('lg score: {:4f}({:4f})\n'.format(score.mean(), score.std()))

[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:   12.5s finished
[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:   11.6s finished
[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    4.5s finished
[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    6.7s finished


lg score: 0.841843(0.000303)



[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    6.0s finished


In [0]:
# score = f1_cv(GBoost)
# print ('\gboost score: {:4f}({:4f})\n'.format(score.mean(), score.std()))

In [0]:
# score = f1_cv(xgmodel)
# print ('\nxgmodel score: {:4f}({:4f})\n'.format(score.mean(), score.std()))

In [0]:
score = f1_cv(knn)
print ('\nKNN score: {:4f}({:4f})\n'.format(score.mean(), score.std()))


KNN score: 0.831996(0.003223)



#### Spliting the train data into  training set and test set in 70-30 ratio

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [0]:
def classifier(clf, X_train, y_train):    
    clf.fit(X_train, y_train)
def predictor(clf, X_test):
    return (clf.predict(X_test))

In [0]:
from sklearn.metrics import accuracy_score
clf = {'KNN': knn, 'GuassianNB':gnb, 'RandomForest':rfc,'SVC':svc,'LogisticRegression': lg, 'GradientBoostingMachine':GBoost, 'XGBoost':xgboost}
preds = {}
for key, value in clf.items():
    classifier(value, X_train, y_train)
    pred = predictor(value,X_test)
    preds[key] = accuracy_score(y_test,pred)

[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    6.2s finished


In [0]:
preds

{'KNN': 0.8536342515765019,
 'GuassianNB': 0.8679057417855958,
 'RandomForest': 0.8931297709923665,
 'SVC': 0.8904746100232327,
 'LogisticRegression': 0.8904746100232327,
 'GradientBoostingMachine': 0.8984400929306339,
 'XGBoost': 0.9010952538997676}

In [0]:
labels = list(preds.keys())
values = list(preds.values())
fig = go.Figure(go.Bar(x=labels, y=values))
fig.show()

---

---

In [0]:
print(X_test.values[1])
print(y_test.values[1])

[9.847e+03 1.000e+00 2.200e+01 6.000e+00 2.300e+01 2.000e+00 4.000e+01
 1.000e+00 1.000e+00 1.000e+00 1.000e+00 5.000e+00 1.000e+00 1.000e+00
 1.000e+00]
3


In [0]:
prediction = predictor(rfc, [X_test.values[1]])

In [0]:
prediction

array([3])

In [0]:
print(X_test.values[1744])
print(y_test.values[1744])

[4.045e+03 1.000e+00 8.000e+00 9.000e+00 1.600e+01 1.300e+01 6.000e+01
 1.000e+00 7.000e+00 1.000e+00 1.000e+00 2.000e+00 1.000e+00 1.000e+00
 1.000e+00]
2


In [0]:
prediction = predictor(rfc, [X_test.values[1744]])

In [0]:
prediction

array([2])

In [0]:
print(X_test.values[3010])
print(y_test.values[3010])

[1.1089e+04 4.0000e+00 1.0000e+01 9.0000e+00 2.0000e+01 1.3000e+01
 3.0000e+01 1.2000e+01 4.0000e+00 1.0000e+00 1.0000e+00 4.0000e+00
 1.0000e+01 1.0000e+00 1.0000e+00]
2


In [0]:
prediction = predictor(rfc, [X_test.values[3010]])
prediction

array([3])

---

In [0]:
df_test_transform.values[0]

array([812, 'DAST', 'Predict', 'MON', 17, 2, 21.0, 13, 60, 10.0, 7.0, 1.0,
       1.0, 5, 9, 6.0, 1.0], dtype=object)

In [0]:
# test = df_test_transform.drop(['Collision_Severity', 'Policing_Area'], axis=1)

In [0]:
week_days = {'FRI': 5, 'MON': 1, 'SAT': 6, 'SUN': 7, 'THU': 4, 'TUE': 2, 'WED': 3}
df_test_transform['Weekday_of_Collision'] = [week_days[day] for day in df_test_transform['Weekday_of_Collision']]

In [0]:
test = df_test_transform.drop(['Collision_Severity', 'Policing_Area'], axis=1)

In [0]:
test0 = [test.values[0]]

In [0]:
prediction = predictor(rfc, test0)

In [0]:
prediction

array([3])

In [0]:
prediction = predictor(rfc, [test.values[1]])

In [0]:
prediction

array([3])

In [0]:
prediction = predictor(rfc, [test.values[10]])
prediction

array([2])

In [0]:
# print(X_test.values[9])
# print(y_test.values[9])

# prediction = predictor(rfc, X_test)
# xtest =list(zip(y_test,prediction))

# for i in range(0, len(xtest)):
#     if xtest[i][0]==2 and xtest[i][0]==xtest[i][1]:
#         print(i)
# [i for i in xtest if i[0]==2 and i[0]==i[1]]