# Library imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Model Pipelines & Functions

In [2]:
def create_splits(X,y, test_size=0.15, random_state=101, stratify=None):
    X = X
    y = y
    random_state = random_state
    test_size = test_size
    stratify = stratify
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= test_size, random_state=random_state, stratify=stratify)
    return(X_train, X_test, y_train, y_test)

# Read Data and Basic Get to know the Data

In [3]:
crashesDf = pd.read_csv('https://raw.githubusercontent.com/nvyas1-git/Traffic_Crash_Project_602/main/data/Traffic_Crashes.csv')
crashesDf.head()

Unnamed: 0,OBJECTID,jurisdiction,number_of_lanes,fatal_count,young,deer,crash_id,lighting,occupants,b_level_count,...,highway_classification,crash_type,train,weather,property_damage,lane_departure,primary_road,alcohol,drug,elderly
0,1,4.0,2,0,0,0,7929715,1,2,0,...,9,4,0,2,1,0,HENDRIE,0,0,0
1,2,5.0,0,0,0,0,7931144,1,3,0,...,9,4,0,1,1,0,E STATE FAIR,0,0,0
2,3,4.0,2,0,0,0,7854992,1,0,0,...,9,5,0,2,1,0,FOREST AVE,0,0,0
3,4,4.0,2,0,0,0,7940235,1,3,0,...,9,8,0,1,1,0,W FOREST AVE,0,0,0
4,5,1.0,1,0,0,0,7932532,1,1,0,...,3,1,0,5,1,1,109C,0,0,0


In [51]:
crashesDf['jurisdiction'].nunique()

7

## Cleaning the Data

In [4]:
crashesDfClean = crashesDf.drop(columns=['OBJECTID', 'crash_id'], axis =1)

In [5]:
crashesDfNAClean = crashesDfClean.dropna(axis=0)

__Differentiating the columns based on their values such as boolean columns and columns having more than two values__

In [11]:
bi_value_cats = ['young', 'deer', 'pedestrian', 'dis_ctrl_i', 'school_bus', 'hit_and_run', 'bicycle', 'motorcycle',
                 'red_light_running', 'train', 'property_damage']

more_value_cats = ['lane_departure', 'weather', 'crash_type', 'highway_classification','weekday', 'most_severe_injury', 'a_level_count',
                   'b_level_count', 'c_level_count', 'number_of_units', 'road_condition', 'lighting', 'fatal_count', 'number_of_lanes',
                   'jurisdiction']

more_than_20_value_cats = ['hour','speed_limit', 'occupants']

## Labelling the target column

In [6]:
# Class Fragmentation as per target and then combining them into One Dataframe
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    dfADE = crashesDfNAClean[(crashesDfNAClean['alcohol']==1)&(crashesDfNAClean['drug']==1)&(crashesDfNAClean['elderly']==1)]
    dfADE['target'] = 'Alcohol & Drug Consumed Elder'
    dfAE = crashesDfNAClean[(crashesDfNAClean['alcohol']==1)&(crashesDfNAClean['drug']==0)&(crashesDfNAClean['elderly']==1)]
    dfAE['target'] = 'Alcohol Consumed Elder'
    dfDE = crashesDfNAClean[(crashesDfNAClean['alcohol']==0)&(crashesDfNAClean['drug']==1)&(crashesDfNAClean['elderly']==1)]
    dfDE['target'] = 'Drug Consumed Elder'
    dfE = crashesDfNAClean[(crashesDfNAClean['alcohol']==0)&(crashesDfNAClean['drug']==0)&(crashesDfNAClean['elderly']==1)]
    dfE['target'] = 'Non-Consumed Elder'
    dfADY = crashesDfNAClean[(crashesDfNAClean['alcohol']==1)&(crashesDfNAClean['drug']==1)&(crashesDfNAClean['elderly']==0)]
    dfADY['target'] = 'Alcohol & Drug Consumed Youth'
    dfAY = crashesDfNAClean[(crashesDfNAClean['alcohol']==1)&(crashesDfNAClean['drug']==0)&(crashesDfNAClean['elderly']==0)]
    dfAY['target'] = 'Alcohol Consumed Youth'
    dfDY = crashesDfNAClean[(crashesDfNAClean['alcohol']==0)&(crashesDfNAClean['drug']==1)&(crashesDfNAClean['elderly']==0)]
    dfDY['target'] = 'Drug Consumed Youth'
    dfY = crashesDfNAClean[(crashesDfNAClean['alcohol']==0)&(crashesDfNAClean['drug']==0)&(crashesDfNAClean['elderly']==0)]
    dfY['target'] = 'Non-Consumed Youth'

    crashesDfNAClean = pd.concat([dfADE,dfAE,dfDE,dfE,dfADY,dfAY, dfDY, dfY], axis=0)
crashesDfNAClean['target'].value_counts()

Non-Consumed Youth               116928
Non-Consumed Elder                15181
Alcohol Consumed Youth             2597
Drug Consumed Youth                 298
Alcohol & Drug Consumed Youth       289
Alcohol Consumed Elder              167
Drug Consumed Elder                  44
Alcohol & Drug Consumed Elder        40
Name: target, dtype: int64

__There is a huge class imbalance in the data, we will train the Machine Learning model with the imbalance and then will try to use upsampling methods to balance the classes__

# Modeling

## Column Selection for modeling

In [7]:
# Dropping the features that we won't use in the model
modelDF=crashesDfNAClean.drop(columns=['jurisdiction','deer','dis_ctrl_i','intersecting_road','primary_road','young',
                                       'datetime','alcohol','drug','elderly','school_bus','train'])
modelDF.columns

Index(['number_of_lanes', 'fatal_count', 'lighting', 'occupants',
       'b_level_count', 'pedestrian', 'road_condition', 'number_of_units',
       'speed_limit', 'hour', 'hit_and_run', 'bicycle', 'motorcycle',
       'a_level_count', 'c_level_count', 'most_severe_injury',
       'red_light_running', 'weekday', 'highway_classification', 'crash_type',
       'weather', 'property_damage', 'lane_departure', 'target'],
      dtype='object')

## Train Test Split

In [8]:
# Assignment of features to X and y
X = modelDF.drop(columns='target')
y = modelDF['target']

In [9]:
# Train Test Split
X_train, X_test, y_train, y_test = create_splits(X=X,y=y, stratify=y)

In [12]:
import mlflow
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
def mlflow_runLogR(model_params, exp_name='TrafficCrash', run_Name='run1'):
    pipeline = Pipeline(
        [
            ("clf", LogisticRegression())
        ]
    )
    mlflow.set_experiment(exp_name)
    with mlflow.start_run(run_name=run_Name):
        best_params = model_params
        pipeline.set_params(**best_params)
        pipeline.fit(X_train,y_train)
        prediction = pipeline.predict(X_test)
        Accuracy_score = round(metrics.accuracy_score(y_test, prediction)*100,2)
        
        print('Accuracy_score', Accuracy_score)
        print(best_params['clf__penalty'])
        mlflow.log_param("penalty", best_params['clf__penalty'])
        mlflow.log_param("max_iter", best_params['clf__max_iter'])
        mlflow.log_param("solver", best_params['clf__solver'])
        mlflow.log_metric("Accuracy_score", Accuracy_score)
        mlflow.log_metric("Recall", round(metrics.recall_score(y_test,prediction, average='weighted')*100,2))
        mlflow.log_metric("F1-score", round(metrics.f1_score(y_test,prediction,average='weighted')*100,2))


        mlflow.sklearn.log_model('clf', "model")

In [14]:
param_grid = {'clf__penalty': 'l2',
             'clf__solver': 'lbfgs',
             'clf__max_iter': 1000}
mlflow_runLogR(model_params=param_grid,exp_name='TrafficCrash', run_Name='run1' )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy_score 86.25
l2




In [52]:
!mlflow ui

^C


In [15]:
# Counts of classes with respect to their record occurrence in the data
y.value_counts()

Non-Consumed Youth               116928
Non-Consumed Elder                15181
Alcohol Consumed Youth             2597
Drug Consumed Youth                 298
Alcohol & Drug Consumed Youth       289
Alcohol Consumed Elder              167
Drug Consumed Elder                  44
Alcohol & Drug Consumed Elder        40
Name: target, dtype: int64

In [16]:
# SMOTE Upsampling
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X,y)

y_sm.value_counts()

Alcohol & Drug Consumed Elder    116928
Alcohol Consumed Elder           116928
Drug Consumed Elder              116928
Non-Consumed Elder               116928
Alcohol & Drug Consumed Youth    116928
Alcohol Consumed Youth           116928
Drug Consumed Youth              116928
Non-Consumed Youth               116928
Name: target, dtype: int64

In [17]:
# Train Test Splits
X_train, X_test, y_train, y_test = create_splits(X_sm,y_sm)

In [18]:
param_grid = {'clf__penalty': 'l2',
             'clf__solver': 'lbfgs',
             'clf__max_iter': 1000}
mlflow_runLogR(model_params=param_grid,exp_name='TrafficCrash', run_Name='run3')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy_score 37.96
l2


In [76]:
!mlflow ui

^C


## Class Reduction

__First we have fragmented the classes as per the three column values (alcohol, elderly and drug) and then merged them into one.__

In [19]:
# Class Fragmentation as per target and then combining them into One Dataframe
drug_consumptionDf = crashesDfNAClean[(crashesDfNAClean['alcohol']==0)&(crashesDfNAClean['drug']==1)]
print('Drug: ',drug_consumptionDf.shape)

alcohol_consumptionDf = crashesDfNAClean[(crashesDfNAClean['alcohol']==1)&(crashesDfNAClean['drug']==0)]
print('Alcohol: ',alcohol_consumptionDf.shape)

alcohol_drug_consumptionDf = crashesDfNAClean[(crashesDfNAClean['alcohol']==1)&(crashesDfNAClean['drug']==1)]
print('Alcohol Drug: ',alcohol_drug_consumptionDf.shape)

unconsumedDf = crashesDfNAClean[(crashesDfNAClean['alcohol']==0)&(crashesDfNAClean['drug']==0)]
print('Unconsumed: ',unconsumedDf.shape)

print('Summation of categorized records',(drug_consumptionDf.shape[0] + alcohol_consumptionDf.shape[0]+ alcohol_drug_consumptionDf.shape[0]+unconsumedDf.shape[0]))
print('CrashesDF records: ', crashesDfNAClean.shape[0])

with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    drug_consumptionDfNew = pd.concat([drug_consumptionDf,alcohol_drug_consumptionDf],axis=0)
    drug_consumptionDfNew['target'] = 'Drug'
    alcohol_consumptionDf['target'] = 'Alcohol'
    unconsumedDf['target']='Clean'

crashesEditedDf = pd.concat([drug_consumptionDfNew, alcohol_consumptionDf, unconsumedDf], axis=0)
crashesEditedDf['target'].value_counts()

Drug:  (342, 36)
Alcohol:  (2764, 36)
Alcohol Drug:  (329, 36)
Unconsumed:  (132109, 36)
Summation of categorized records 135544
CrashesDF records:  135544


Clean      132109
Alcohol      2764
Drug          671
Name: target, dtype: int64

In [78]:
# Total Columns
crashesEditedDf.columns

Index(['jurisdiction', 'number_of_lanes', 'fatal_count', 'young', 'deer',
       'lighting', 'occupants', 'b_level_count', 'pedestrian', 'dis_ctrl_i',
       'datetime', 'road_condition', 'number_of_units', 'school_bus',
       'speed_limit', 'hour', 'hit_and_run', 'bicycle', 'motorcycle',
       'a_level_count', 'c_level_count', 'most_severe_injury',
       'red_light_running', 'weekday', 'intersecting_road',
       'highway_classification', 'crash_type', 'train', 'weather',
       'property_damage', 'lane_departure', 'primary_road', 'alcohol', 'drug',
       'elderly', 'target'],
      dtype='object')

In [20]:
# Dropping the features which we don't want in the model
crashesEditedDf=crashesEditedDf.drop(columns=['jurisdiction','deer','dis_ctrl_i','intersecting_road','primary_road','young',
                                       'datetime','alcohol','drug','elderly','school_bus','train'])

In [21]:
# Assigning features to X and y
X = crashesEditedDf.drop(columns=['target'])
y = crashesEditedDf['target']

In [22]:
# SMOTE Upsampling
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X,y)

y_sm.value_counts()

Drug       132109
Alcohol    132109
Clean      132109
Name: target, dtype: int64

In [23]:
# Train Test Split
X_train, X_test, y_train, y_test = create_splits(X_sm,y_sm,stratify=y_sm)

In [24]:
param_grid = {'clf__penalty': 'l2',
             'clf__solver': 'lbfgs',
             'clf__max_iter': 1000}
mlflow_runLogR(model_params=param_grid,exp_name='TrafficCrash', run_Name='run4')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy_score 65.39
l2


In [34]:
!mlflow ui

^C


In [97]:
X_test.columns

Index(['jurisdiction', 'number_of_lanes', 'fatal_count', 'deer', 'lighting',
       'occupants', 'b_level_count', 'pedestrian', 'dis_ctrl_i',
       'road_condition', 'number_of_units', 'speed_limit', 'hour',
       'hit_and_run', 'bicycle', 'motorcycle', 'a_level_count',
       'c_level_count', 'most_severe_injury', 'red_light_running', 'weekday',
       'highway_classification', 'crash_type', 'weather', 'property_damage',
       'lane_departure', 'alcohol'],
      dtype='object')

## Deployment Process

In [42]:
import pickle
LRpipe = Pipeline([
                    ('std', StandardScaler()),
                    ('lr',LogisticRegression(C=1,penalty = 'l2', solver='lbfgs', max_iter=1000))
])
LRpipe.fit(X_train, y_train)
pickle.dump(LRpipe,open('model1.pkl', 'wb') )

In [43]:
loaded_model = pickle.load(open('model1.pkl', 'rb'))

In [46]:
loaded_model.predict()

array(['Alcohol'], dtype=object)

In [None]:
def pred_class(file):
    df = pd.read_csv(file)
    preds = loaded_model.predict(df)
    df['target'] = preds
    return(df)
    

In [33]:
X_test.head(10).to_csv('./testFile.csv')