In [1]:
import datetime
import os
from collections import Counter

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split

In [2]:
# Load the autoreload extension and render matplotlib figures inline.
# NOTE(review): no `%autoreload <mode>` line is visible in this notebook, so
# automatic module reloading may not actually be switched on — confirm.
%load_ext autoreload
%matplotlib inline

# Without 2020 data

## With resampling, TSCV

In [3]:
# Read the pre-processed accidents table (heavy file). Severity is kept as
# object dtype instead of letting pandas infer a numeric type.
# NOTE(review): the later sections of this notebook read the same file from
# "../input/us-accidents/" — confirm which location is the intended one.
accidents_csv = "US_accidents_processed.csv"
usAccidents = pd.read_csv(accidents_csv, dtype={'Severity': object})

# Parse the start timestamps so rows can be ordered chronologically.
start_times = pd.to_datetime(usAccidents['Start_Time'])
usAccidents['Start_Time'] = start_times

FileNotFoundError: [Errno 2] No such file or directory: '../input/us-accidents/US_accidents_processed.csv'

In [None]:
# Order the accidents chronologically by their start timestamp.
usAccidents = usAccidents.sort_values('Start_Time')

In [None]:
# Class balance before filtering.
print(usAccidents.Severity.value_counts())

# Remove every accident whose Year is 2020 (rows are dropped by index label).
rows_2020 = usAccidents.loc[usAccidents.Year == 2020].index
usAccidents = usAccidents.drop(rows_2020)

# Class balance after filtering.
print(usAccidents.Severity.value_counts())

# Now no severity 1
# Proportion changes: before 85% were severity 2, now 78%

In [None]:
# Cast every object-typed (or already categorical) column to pandas
# 'category' dtype.
for col_name, col_dtype in usAccidents.dtypes.items():
    is_categorical_like = col_dtype == 'object' or col_dtype.name == 'category'
    if not is_categorical_like:
        continue
    usAccidents[col_name] = usAccidents[col_name].astype('category')

In [None]:
# Undersample: show the class counts, then balance the classes.
counter = Counter(usAccidents['Severity'])
print(counter)

# Fixed seed so the randomly retained rows are the same on every run.
under = RandomUnderSampler(random_state=314)

X_res, y_res = under.fit_resample(usAccidents.drop(['Severity'], axis=1), usAccidents['Severity'])
print('Resampled dataset shape %s' % Counter(y_res))

# Put features and target back together so both can be ordered by time.
X_res['Severity'] = y_res
X_res = X_res.sort_values(by=['Start_Time'])
y_res = X_res['Severity']
X_res = X_res.drop(['Start_Time', 'Severity'], axis=1)

# It's a time series, so split with respect to time:
# the earliest 80% of rows train the model, the latest 20% test it.
split_idx = int(X_res.shape[0] * 0.8)
X_train = X_res.iloc[:split_idx]
y_train = y_res.iloc[:split_idx]

# Slice positionally (rather than anti-joining against X_train) so X_test
# keeps every remaining row, in order, and stays row-aligned with y_test even
# when identical feature rows occur in both parts.
X_test = X_res.iloc[split_idx:]
y_test = y_res.iloc[split_idx:]

## Find best params

## Algorithm

In [None]:
# Model configuration for the undersampled, time-ordered split. The values
# match the parameter list quoted in the comment below.
lgbm_params = dict(
    boosting_type='gbdt',
    learning_rate=0.037485092594051916,
    max_depth=7,
    n_estimators=396,
    num_leaves=32,
    subsample_for_bin=202824,
    metric='multi_logloss',
    num_class=3,  # No severity 1
    objective="multiclass",
)
lgbm = LGBMClassifier(**lgbm_params)

# Parameters and evaluation report recorded from an earlier run:
# [('boosting_type', 'gbdt'),
#              ('learning_rate', 0.037485092594051916),
#              ('max_depth', 7),
#              ('n_estimators', 396),
#              ('num_leaves', 32),
#              ('subsample_for_bin', 202824)])
# LightGBM Model accuracy score: 0.8333
#               precision    recall  f1-score   support

#            2       0.93      0.92      0.92      7995
#            3       0.67      0.71      0.69      2991
#            4       0.76      0.75      0.76      3583

#     accuracy                           0.83     14569
#    macro avg       0.79      0.79      0.79     14569
# weighted avg       0.84      0.83      0.83     14569
# Again better with 2

In [None]:
# Train the configured classifier on the training split.
lgbm.fit(X_train, y_train)

In [None]:
# Predict a severity label for every row of the held-out test split.
lgbm_prediction = lgbm.predict(X_test)

In [None]:
# Overall accuracy on the test split, computed once and reused for the print.
accuracy = accuracy_score(y_test, lgbm_prediction)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

# Per-class precision / recall / F1 and support.
print(classification_report(y_test, lgbm_prediction))

In [None]:
# Per-feature importances of the fitted model. LightGBM's default
# importance_type is 'split' (how many times a feature is used to split),
# so the plot is titled accordingly rather than as a Gini importance.
importances = lgbm.feature_importances_

# Label the values with the columns the model was fitted on; this avoids
# copying the full accidents frame just to read its column names.
lgbm_importances = pd.Series(importances, index=X_train.columns)
print(importances)

fig, ax = plt.subplots()
lgbm_importances.plot.bar(ax=ax)
ax.set_title("Split Importance")
ax.set_ylabel("Importance")
fig.tight_layout()

## Without resampling, TSCV

In [None]:
# Reload the pre-processed accidents table (heavy file). Severity is kept as
# object dtype instead of letting pandas infer a numeric type.
accidents_csv = "../input/us-accidents/US_accidents_processed.csv"
usAccidents = pd.read_csv(accidents_csv, dtype={'Severity': object})

# Parse the start timestamps so rows can be ordered chronologically.
start_times = pd.to_datetime(usAccidents['Start_Time'])
usAccidents['Start_Time'] = start_times

In [None]:
# Order the accidents chronologically by their start timestamp.
usAccidents = usAccidents.sort_values('Start_Time')

In [None]:
# Class balance before filtering.
print(usAccidents.Severity.value_counts())

# Remove every accident whose Year is 2020 (rows are dropped by index label).
rows_2020 = usAccidents.loc[usAccidents.Year == 2020].index
usAccidents = usAccidents.drop(rows_2020)

# Class balance after filtering.
print(usAccidents.Severity.value_counts())

# Now no severity 1
# Proportion changes: before 85% were severity 2, now 78%

In [None]:
# Cast every object-typed (or already categorical) column to pandas
# 'category' dtype.
for col_name, col_dtype in usAccidents.dtypes.items():
    is_categorical_like = col_dtype == 'object' or col_dtype.name == 'category'
    if not is_categorical_like:
        continue
    usAccidents[col_name] = usAccidents[col_name].astype('category')

In [None]:
# It's a time series, so split with respect to time (no resampling):
# the earliest 80% of rows train the model, the latest 20% test it.
X_res = usAccidents.drop(['Severity', 'Start_Time'], axis=1)
y_res = usAccidents['Severity']

split_idx = int(X_res.shape[0] * 0.8)
X_train = X_res.iloc[:split_idx]
y_train = y_res.iloc[:split_idx]

# Slice positionally (rather than anti-joining against X_train) so X_test
# keeps every remaining row, in order, and stays row-aligned with y_test even
# when identical feature rows occur in both parts.
X_test = X_res.iloc[split_idx:]
y_test = y_res.iloc[split_idx:]

## Find best params..

## Algorithm

In [None]:
# Model configuration for the time-ordered split without resampling. The
# values match the parameter list quoted in the comment below.
lgbm_params = dict(
    boosting_type='dart',
    learning_rate=0.29215143108583874,
    max_depth=3,
    n_estimators=789,
    num_leaves=52,
    subsample_for_bin=201458,
    metric='multi_logloss',
    num_class=3,
    objective="multiclass",
)
lgbm = LGBMClassifier(**lgbm_params)

# Parameters and evaluation report recorded from an earlier run:
# [('boosting_type', 'dart'),
#              ('learning_rate', 0.29215143108583874),
#              ('max_depth', 3),
#              ('n_estimators', 789),
#              ('num_leaves', 52),
#              ('subsample_for_bin', 201458)]
# LightGBM Model accuracy score: 0.9347
#               precision    recall  f1-score   support

#            2       0.95      0.99      0.97     45471
#            3       0.70      0.49      0.57      2701
#            4       0.73      0.47      0.57      2579

#     accuracy                           0.93     50751
#    macro avg       0.79      0.65      0.71     50751
# weighted avg       0.93      0.93      0.93     50751
# Again better with 2 but better than with resampling

In [None]:
# Train the configured classifier on the training split.
lgbm.fit(X_train, y_train)

In [None]:
# Predict a severity label for every row of the held-out test split.
lgbm_prediction = lgbm.predict(X_test)

In [None]:
# Overall accuracy on the test split, computed once and reused for the print.
accuracy = accuracy_score(y_test, lgbm_prediction)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

# Per-class precision / recall / F1 and support.
print(classification_report(y_test, lgbm_prediction))

In [None]:
# Per-feature importances of the fitted model. LightGBM's default
# importance_type is 'split' (how many times a feature is used to split),
# so the plot is titled accordingly rather than as a Gini importance.
importances = lgbm.feature_importances_

# Label the values with the columns the model was fitted on; this avoids
# copying the full accidents frame just to read its column names.
lgbm_importances = pd.Series(importances, index=X_train.columns)
print(importances)

fig, ax = plt.subplots()
lgbm_importances.plot.bar(ax=ax)
ax.set_title("Split Importance")
ax.set_ylabel("Importance")
fig.tight_layout()

## With KFold, whole data

In [None]:
# Reload the pre-processed accidents table (heavy file). Severity is kept as
# object dtype instead of letting pandas infer a numeric type.
accidents_csv = "../input/us-accidents/US_accidents_processed.csv"
usAccidents = pd.read_csv(accidents_csv, dtype={'Severity': object})

# Parse the start timestamps so rows can be ordered chronologically.
start_times = pd.to_datetime(usAccidents['Start_Time'])
usAccidents['Start_Time'] = start_times

In [None]:
# Order the accidents chronologically by their start timestamp.
usAccidents = usAccidents.sort_values('Start_Time')

In [None]:
# Class balance before filtering.
print(usAccidents.Severity.value_counts())

# Remove every accident whose Year is 2020 (rows are dropped by index label).
rows_2020 = usAccidents.loc[usAccidents.Year == 2020].index
usAccidents = usAccidents.drop(rows_2020)

# Class balance after filtering.
print(usAccidents.Severity.value_counts())

# Now no severity 1
# Proportion changes: before 85% were severity 2, now 78%

In [None]:
# Cast every object-typed (or already categorical) column to pandas
# 'category' dtype.
for col_name, col_dtype in usAccidents.dtypes.items():
    is_categorical_like = col_dtype == 'object' or col_dtype.name == 'category'
    if not is_categorical_like:
        continue
    usAccidents[col_name] = usAccidents[col_name].astype('category')

In [None]:
# Take 80% for train and 20% for test, no resampling.
# Shuffle the rows first; the fixed seed keeps the row order (and therefore
# the seeded split below) identical on every run.
usAccidents = usAccidents.sample(frac=1, random_state=314)
X_res = usAccidents.drop(['Severity', 'Start_Time'], axis=1)
y_res = usAccidents['Severity']

# Stratified random hold-out split. This is a single train/test split, not a
# KFold loop.
# NOTE(review): KFold is presumably used only in the hyper-parameter search,
# which is not shown in this section — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.20,
    random_state=314, stratify=y_res)

## Find best param..

## Algorithm

In [None]:
# Model configuration for the shuffled split on the whole data. The values
# match the parameter list quoted in the comment below.
lgbm_params = dict(
    boosting_type='gbdt',
    learning_rate=0.22222317015932044,
    max_depth=9,
    n_estimators=766,
    num_leaves=58,
    subsample_for_bin=200828,
    metric='multi_logloss',
    num_class=3,
    objective="multiclass",
)
lgbm = LGBMClassifier(**lgbm_params)

# Parameters and evaluation report recorded from an earlier run:
# [('boosting_type', 'gbdt'),
#              ('learning_rate', 0.22222317015932044),
#              ('max_depth', 9),
#              ('n_estimators', 766),
#              ('num_leaves', 58),
#              ('subsample_for_bin', 200828)]
# LightGBM Model accuracy score: 0.8611
#               precision    recall  f1-score   support

#            2       0.90      0.95      0.92     39620
#            3       0.63      0.47      0.54      6275
#            4       0.74      0.65      0.69      4856

#     accuracy                           0.86     50751
#    macro avg       0.75      0.69      0.72     50751
# weighted avg       0.85      0.86      0.85     50751
# Again better with sev 2

In [None]:
# Train the configured classifier on the training split.
lgbm.fit(X_train, y_train)

In [None]:
# Predict a severity label for every row of the held-out test split.
lgbm_prediction = lgbm.predict(X_test)

In [None]:
# Overall accuracy on the test split, computed once and reused for the print.
accuracy = accuracy_score(y_test, lgbm_prediction)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

# Per-class precision / recall / F1 and support.
print(classification_report(y_test, lgbm_prediction))

In [None]:
# Per-feature importances of the fitted model. LightGBM's default
# importance_type is 'split' (how many times a feature is used to split),
# so the plot is titled accordingly rather than as a Gini importance.
importances = lgbm.feature_importances_

# Label the values with the columns the model was fitted on; this avoids
# copying the full accidents frame just to read its column names.
lgbm_importances = pd.Series(importances, index=X_train.columns)
print(importances)

fig, ax = plt.subplots()
lgbm_importances.plot.bar(ax=ax)
ax.set_title("Split Importance")
ax.set_ylabel("Importance")
fig.tight_layout()

## With KFold, resampling

In [None]:
# Reload the pre-processed accidents table (heavy file). Severity is kept as
# object dtype instead of letting pandas infer a numeric type.
accidents_csv = "../input/us-accidents/US_accidents_processed.csv"
usAccidents = pd.read_csv(accidents_csv, dtype={'Severity': object})

# Parse the start timestamps so rows can be ordered chronologically.
start_times = pd.to_datetime(usAccidents['Start_Time'])
usAccidents['Start_Time'] = start_times

In [None]:
# Order the accidents chronologically by their start timestamp.
usAccidents = usAccidents.sort_values('Start_Time')

In [None]:
# Class balance before filtering.
print(usAccidents.Severity.value_counts())

# Remove every accident whose Year is 2020 (rows are dropped by index label).
rows_2020 = usAccidents.loc[usAccidents.Year == 2020].index
usAccidents = usAccidents.drop(rows_2020)

# Class balance after filtering.
print(usAccidents.Severity.value_counts())

# Now no severity 1
# Proportion changes: before 85% were severity 2, now 78%

In [None]:
# Cast every object-typed (or already categorical) column to pandas
# 'category' dtype.
for col_name, col_dtype in usAccidents.dtypes.items():
    is_categorical_like = col_dtype == 'object' or col_dtype.name == 'category'
    if not is_categorical_like:
        continue
    usAccidents[col_name] = usAccidents[col_name].astype('category')

In [None]:
# Undersample: show the class counts before balancing.
counter = Counter(usAccidents['Severity'])
print(counter)

# Fixed seed so the randomly retained rows are the same on every run.
under = RandomUnderSampler(random_state=314)

In [None]:
# Shuffle the rows; the fixed seed keeps the row order (and therefore the
# seeded split below) identical on every run.
usAccidents = usAccidents.sample(frac=1, random_state=314)

# Balance the classes by undersampling, with Start_Time removed from the
# features.
X_res, y_res = under.fit_resample(usAccidents.drop(['Severity', 'Start_Time'], axis=1), usAccidents['Severity'])

# Stratified random 80/20 hold-out split of the resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.20,
    random_state=314, stratify=y_res)

## Find best param..

In [None]:
# Disabled Bayesian hyper-parameter search (skopt BayesSearchCV over a
# 5-fold shuffled KFold, scored on accuracy).
# NOTE(review): presumably the source of the tuned values hard-coded in the
# model cell below — confirm before deleting.
# from skopt import BayesSearchCV
# from skopt.space import Real, Categorical, Integer

# lgbm_clf = LGBMClassifier(
#     metric='multi_logloss',
#     num_class = 3,
#     objective = "multiclass"
#     )

# search_space = {
#     "boosting_type": Categorical(['gbdt', 'dart','goss']),
#         "num_leaves": Integer(6, 60), 
#         "max_depth": Integer(-1, 10), 
#         "learning_rate": Real(0.001,1),
#         "n_estimators": Integer(2, 1000),
#         "subsample_for_bin": Integer(199000, 204000),
#     }

# cv = KFold(n_splits=5,shuffle=True)

# def on_step(val):
#     print("DONE")


# lgbm_bayes_search = BayesSearchCV(
#     estimator = lgbm_clf, 
#     search_spaces = search_space, 
#     n_iter=10, # specify how many iterations    
#     scoring="accuracy",
#     cv=cv,
#     verbose =0,
#     n_jobs= -1
# )
# lgbm_bayes_search.fit(X_train, y_train, callback = on_step) # callback=on_step will print after each iteration

In [None]:
# lgbm_bayes_search.best_params_

In [None]:
# lgbm_bayes_search.best_score_

## Algorithm

In [None]:
# Model configuration for the shuffled split on undersampled data. The
# values match the parameter list quoted in the comment below.
lgbm_params = dict(
    boosting_type='gbdt',
    learning_rate=0.39326998667103985,
    max_depth=6,
    n_estimators=657,
    num_leaves=30,
    subsample_for_bin=199317,
    metric='multi_logloss',
    num_class=3,
    objective="multiclass",
)
lgbm = LGBMClassifier(**lgbm_params)

# Parameters and evaluation report recorded from an earlier run:
# [('boosting_type', 'gbdt'),
#              ('learning_rate', 0.39326998667103985),
#              ('max_depth', 6),
#              ('n_estimators', 657),
#              ('num_leaves', 30),
#              ('subsample_for_bin', 199317)]
# LightGBM Model accuracy score: 0.7481
#               precision    recall  f1-score   support

#            2       0.81      0.74      0.77      4857
#            3       0.68      0.70      0.69      4856
#            4       0.76      0.81      0.78      4856

#     accuracy                           0.75     14569
#    macro avg       0.75      0.75      0.75     14569
# weighted avg       0.75      0.75      0.75     14569

In [None]:
# Train the configured classifier on the training split.
lgbm.fit(X_train, y_train)

In [None]:
# Predict a severity label for every row of the held-out test split.
lgbm_prediction = lgbm.predict(X_test)

In [None]:
# Overall accuracy on the test split, computed once and reused for the print.
accuracy = accuracy_score(y_test, lgbm_prediction)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

# Per-class precision / recall / F1 and support.
print(classification_report(y_test, lgbm_prediction))

In [None]:
# Per-feature importances of the fitted model. LightGBM's default
# importance_type is 'split' (how many times a feature is used to split),
# so the plot is titled accordingly rather than as a Gini importance.
importances = lgbm.feature_importances_

# Label the values with the columns the model was fitted on; this avoids
# copying the full accidents frame just to read its column names.
lgbm_importances = pd.Series(importances, index=X_train.columns)
print(importances)

fig, ax = plt.subplots()
lgbm_importances.plot.bar(ax=ax)
ax.set_title("Split Importance")
ax.set_ylabel("Importance")
fig.tight_layout()