In [None]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from imblearn.under_sampling import RandomUnderSampler 
from collections import Counter
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [None]:
# Load IPython's autoreload extension.
# NOTE(review): no `%autoreload` mode is set in the visible cells — confirm whether module reloading is actually wanted here.
%load_ext autoreload
# Ask matplotlib to draw figures inline in the notebook output.
%matplotlib inline

## Time-series cross-validation (TSCV), with resampling

In [None]:
# Read the pre-processed accidents table (heavy file). 'Severity' is forced to
# the object dtype on load, and 'Start_Time' is parsed into datetimes afterwards.
accidents_csv = "../input/us-accidents/US_accidents_processed.csv"
usAccidents = pd.read_csv(accidents_csv, dtype={"Severity": object})
usAccidents["Start_Time"] = pd.to_datetime(usAccidents["Start_Time"])

In [None]:
# Put the rows in chronological order (ascending 'Start_Time'), then display the frame.
usAccidents = usAccidents.sort_values("Start_Time")
usAccidents

In [None]:
# Cast every object-typed or already-categorical column to the pandas
# 'category' dtype (presumably so LightGBM can treat them as categorical
# features — confirm).
for col_name, col_dtype in usAccidents.dtypes.items():
    is_text_or_category = col_dtype == 'object' or col_dtype.name == 'category'
    if is_text_or_category:
        usAccidents[col_name] = usAccidents[col_name].astype('category')

In [None]:
# Undersample the classes, then rebuild a chronological 80/20 train/test split.
counter = Counter(usAccidents['Severity'])
print(counter)

# Fixed seed so the randomly selected rows (and therefore the reported scores)
# are the same on every run.
under = RandomUnderSampler(random_state=314)

X_res, y_res = under.fit_resample(usAccidents.drop(['Severity'], axis=1), usAccidents['Severity'])
print('Resampled dataset shape %s' % Counter(y_res))

# Put features and target back together so both can be ordered by time in one go
X_res['Severity'] = y_res
X_res = X_res.sort_values(by=['Start_Time'])
y_res = X_res['Severity']
X_res = X_res.drop(['Start_Time', 'Severity'], axis=1)

# It's a time series, so split with respect to time:
# the first 80% of the rows are the training set, the last 20% the test set.
split_idx = int(X_res.shape[0] * 0.8)
X_train = X_res.iloc[:split_idx]
y_train = y_res.iloc[:split_idx]

# Slice the tail positionally. The previous outer-merge anti-join on all
# feature columns ordered its result by the join keys and treated duplicate
# feature rows as matches, so X_test was no longer row-aligned (or even
# guaranteed to be the same length) with the time-ordered y_test.
X_test = X_res.iloc[split_idx:]
y_test = y_res.iloc[split_idx:]

## Find best param 

In [None]:
# from skopt import BayesSearchCV
# from skopt.space import Real, Categorical, Integer

# lgbm_clf = LGBMClassifier(
#     metric='multi_logloss',
#     num_class = 4,
#     objective = "multiclass"
#     )

# search_space = {
#     "boosting_type": Categorical(['gbdt', 'dart','goss']),
#         "num_leaves": Integer(6, 60), 
#         "max_depth": Integer(-1, 10), 
#         "learning_rate": Real(0.001,1),
#         "n_estimators": Integer(2, 1000),
#         "subsample_for_bin": Integer(199000, 204000),
#     }

# cv = TimeSeriesSplit(n_splits=5)#TSCV Remember to order by date
# #cv = KFold(n_splits=5,shuffle=True)

# def on_step(val):
#     print("DONE")


# lgbm_bayes_search = BayesSearchCV(
#     estimator = lgbm_clf, 
#     search_spaces = search_space, 
#     n_iter=10, # specify how many iterations    
#     scoring="accuracy",
#     cv=cv,
#     verbose =0,
#     n_jobs= -1
# )
# lgbm_bayes_search.fit(X_train, y_train, callback = on_step) # callback=on_step will print after each iteration

In [None]:
# lgbm_bayes_search.best_params_

In [None]:
# lgbm_bayes_search.best_score_

In [None]:
# from skopt.plots import plot_convergence
# DOESN'T WORK WITH BAYESSEARCH
# plot_convergence(result) 

## Algorithm

In [None]:
# LightGBM classifier for the undersampled, time-ordered split. The first six
# values are the same ones recorded in the parameter list kept in the comment
# that follows this constructor.
lgbm_params = {
    'boosting_type': 'dart',
    'learning_rate': 0.5067952604484551,
    'max_depth': 1,
    'n_estimators': 289,
    'num_leaves': 41,
    'subsample_for_bin': 202188,
    # Multiclass setup
    'metric': 'multi_logloss',
    'num_class': 4,
    'objective': "multiclass",
}
lgbm = LGBMClassifier(**lgbm_params)
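# With undersampling the accuracy is only about 36%; it changes a bit between runs because the sampling is random.
# So undersampling does not appear to be useful with TSCV.
# NOTE(review): if that figure was measured with an X_test built through an outer merge (which does not
# keep the rows aligned with y_test), it is unreliable — re-measure before relying on this conclusion.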
# [('boosting_type', 'dart'), #Undersampling
#              ('learning_rate', 0.5067952604484551),
#              ('max_depth', 1),
#              ('n_estimators', 289),
#              ('num_leaves', 41),
#              ('subsample_for_bin', 202188)]
# [('boosting_type', 'goss'), #Undersampling better
#              ('learning_rate', 0.03825306874972967),
#              ('max_depth', -1),
#              ('n_estimators', 379),
#              ('num_leaves', 7),
#              ('subsample_for_bin', 201681)]

In [None]:
# Train the classifier configured above on the training split.
lgbm.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features; the result is scored in the next cell.
lgbm_prediction = lgbm.predict(X_test)

In [None]:
# Score the test predictions: overall accuracy first, then the per-class
# precision / recall / F1 report.
from sklearn.metrics import accuracy_score, classification_report

# Compute the accuracy once, in scikit-learn's (y_true, y_pred) argument order,
# and reuse it for the printout instead of calling accuracy_score a second time.
accuracy = accuracy_score(y_test, lgbm_prediction)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

print(classification_report(y_test, lgbm_prediction))

In [None]:
# Bar chart of the fitted model's per-feature importances.
importances = lgbm.feature_importances_
# X_train holds exactly the columns the model was fitted on, so take the labels
# from it instead of building a full copy of usAccidents just to read its columns.
lgbm_importances = pd.Series(importances, index=X_train.columns)
print(importances)
fig, ax = plt.subplots()
lgbm_importances.plot.bar(ax=ax)
# The classifier is created without an importance_type, and LightGBM's default
# ('split') counts how often a feature is used in a split — it is not a Gini importance.
ax.set_title("Feature importance (split count)")
ax.set_ylabel("Number of splits")
fig.tight_layout()

## TSCV, no undersampling (whole data)

In [None]:
# Reload the pre-processed accidents table (heavy file) so this section starts
# from the data on disk. 'Severity' is forced to the object dtype on load, and
# 'Start_Time' is parsed into datetimes afterwards.
accidents_csv = "../input/us-accidents/US_accidents_processed.csv"
usAccidents = pd.read_csv(accidents_csv, dtype={"Severity": object})
usAccidents["Start_Time"] = pd.to_datetime(usAccidents["Start_Time"])

In [None]:
# Put the rows in chronological order (ascending 'Start_Time'), then display the frame.
usAccidents = usAccidents.sort_values("Start_Time")
usAccidents

In [None]:
# Cast every object-typed or already-categorical column to the pandas
# 'category' dtype (presumably so LightGBM can treat them as categorical
# features — confirm).
for col_name, col_dtype in usAccidents.dtypes.items():
    is_text_or_category = col_dtype == 'object' or col_dtype.name == 'category'
    if is_text_or_category:
        usAccidents[col_name] = usAccidents[col_name].astype('category')

In [None]:
# Chronological hold-out on the full data (no resampling): usAccidents was
# sorted by 'Start_Time' earlier, so the first 80% of the rows become the
# training set and the remaining 20% the test set.
X_res = usAccidents.drop(columns=['Severity', 'Start_Time'])
y_res = usAccidents['Severity']

# Row position where the test part starts
n_train = int(len(X_res) * 0.8)
X_train, X_test = X_res.iloc[:n_train], X_res.iloc[n_train:]
y_train, y_test = y_res.iloc[:n_train], y_res.iloc[n_train:]
X_test

## Find best Param ..

## Algorithm

In [None]:
# LightGBM classifier for the time-ordered split on the full (not resampled)
# data. The first six values are the same ones recorded in the parameter list
# kept in the comment that follows this constructor.
lgbm_params = {
    'boosting_type': 'dart',
    'learning_rate': 0.020739410170698216,
    'max_depth': 4,
    'n_estimators': 83,
    'num_leaves': 45,
    'subsample_for_bin': 201000,
    # Multiclass setup
    'metric': 'multi_logloss',
    'num_class': 4,
    'objective': "multiclass",
}
lgbm = LGBMClassifier(**lgbm_params)
#Without undersampling 98% (See later)
# [('boosting_type', 'dart'),#tscv not resampled
#              ('learning_rate', 0.020739410170698216),
#              ('max_depth', 4),
#              ('n_estimators', 83),
#              ('num_leaves', 45),
#              ('subsample_for_bin', 201000)]

#Predict always 2 with 2020 keeping the time sequence (Because we have lots of 2 (see support) and few 4)
#              precision    recall  f1-score   support

#            2       0.99      0.99      0.99    192411
#            3       0.00      0.00      0.00         0
#            4       0.06      0.03      0.04      2835

#     accuracy                           0.98    195246
#    macro avg       0.35      0.34      0.34    195246
# weighted avg       0.97      0.98      0.97    195246

In [None]:
# Train the classifier configured above on the training split.
lgbm.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features; the result is scored in the next cell.
lgbm_prediction = lgbm.predict(X_test)

In [None]:
# Score the test predictions: overall accuracy first, then the per-class
# precision / recall / F1 report.
from sklearn.metrics import accuracy_score, classification_report

# Compute the accuracy once, in scikit-learn's (y_true, y_pred) argument order,
# and reuse it for the printout instead of calling accuracy_score a second time.
accuracy = accuracy_score(y_test, lgbm_prediction)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

print(classification_report(y_test, lgbm_prediction))

In [None]:
# Bar chart of the fitted model's per-feature importances.
importances = lgbm.feature_importances_
# X_train holds exactly the columns the model was fitted on, so take the labels
# from it instead of building a full copy of usAccidents just to read its columns.
lgbm_importances = pd.Series(importances, index=X_train.columns)
print(importances)
fig, ax = plt.subplots()
lgbm_importances.plot.bar(ax=ax)
# The classifier is created without an importance_type, and LightGBM's default
# ('split') counts how often a feature is used in a split — it is not a Gini importance.
ax.set_title("Feature importance (split count)")
ax.set_ylabel("Number of splits")
fig.tight_layout()

## With KFold

In [None]:
# Reload the pre-processed accidents table (heavy file) so this section starts
# from the data on disk. 'Severity' is forced to the object dtype on load, and
# 'Start_Time' is parsed into datetimes afterwards.
accidents_csv = "../input/us-accidents/US_accidents_processed.csv"
usAccidents = pd.read_csv(accidents_csv, dtype={"Severity": object})
usAccidents["Start_Time"] = pd.to_datetime(usAccidents["Start_Time"])

In [None]:
# Put the rows in chronological order (ascending 'Start_Time'), then display the frame.
usAccidents = usAccidents.sort_values("Start_Time")
usAccidents

In [None]:
# Cast every object-typed or already-categorical column to the pandas
# 'category' dtype (presumably so LightGBM can treat them as categorical
# features — confirm).
for col_name, col_dtype in usAccidents.dtypes.items():
    is_text_or_category = col_dtype == 'object' or col_dtype.name == 'category'
    if is_text_or_category:
        usAccidents[col_name] = usAccidents[col_name].astype('category')

In [None]:
# Random (non-chronological), stratified hold-out on the full data:
# 80% for train and 20% for test, no resampling.
# Seed the shuffle as well: an unseeded sample() changes the row order on every
# run, so the random_state given to train_test_split alone did not make the
# resulting split reproducible.
usAccidents = usAccidents.sample(frac=1, random_state=314) #KFOLD
X_res = usAccidents.drop(['Severity', 'Start_Time'], axis=1)
y_res = usAccidents['Severity']

# stratify keeps the Severity class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.20,
    random_state=314, stratify=y_res)

## Find best param ...

## Algorithm

In [None]:
# LightGBM classifier for the shuffled split on the full (not resampled) data.
# The first six values are the same ones recorded in the parameter list kept in
# the comment that follows this constructor.
lgbm_params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.408870701417182,
    'max_depth': -1,
    'n_estimators': 225,
    'num_leaves': 19,
    'subsample_for_bin': 201124,
    # Multiclass setup
    'metric': 'multi_logloss',
    'num_class': 4,
    'objective': "multiclass",
}
lgbm = LGBMClassifier(**lgbm_params)

#Without undersampling and KFOLD 97%
# [('boosting_type', 'gbdt'), #KFold
#              ('learning_rate', 0.408870701417182),
#              ('max_depth', -1),
#              ('n_estimators', 225),
#              ('num_leaves', 19),
#              ('subsample_for_bin', 201124)]

# LightGBM Model accuracy score: 0.8892
#               precision    recall  f1-score   support

#            1       0.69      0.47      0.56      5081
#            2       0.92      0.97      0.94    165348
#            3       0.64      0.45      0.53     14950
#            4       0.63      0.40      0.49      9867

#     accuracy                           0.89    195246
#    macro avg       0.72      0.57      0.63    195246
# weighted avg       0.88      0.89      0.88    195246

#It's better but predict more 2 due to the high frequency

In [None]:
# Train the classifier configured above on the training split.
lgbm.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features; the result is scored in the next cell.
lgbm_prediction = lgbm.predict(X_test)

In [None]:
# Score the test predictions: overall accuracy first, then the per-class
# precision / recall / F1 report.
from sklearn.metrics import accuracy_score, classification_report

# Compute the accuracy once, in scikit-learn's (y_true, y_pred) argument order,
# and reuse it for the printout instead of calling accuracy_score a second time.
accuracy = accuracy_score(y_test, lgbm_prediction)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

print(classification_report(y_test, lgbm_prediction))

In [None]:
# Bar chart of the fitted model's per-feature importances.
importances = lgbm.feature_importances_
# X_train holds exactly the columns the model was fitted on, so take the labels
# from it instead of building a full copy of usAccidents just to read its columns.
lgbm_importances = pd.Series(importances, index=X_train.columns)
print(importances)
fig, ax = plt.subplots()
lgbm_importances.plot.bar(ax=ax)
# The classifier is created without an importance_type, and LightGBM's default
# ('split') counts how often a feature is used in a split — it is not a Gini importance.
ax.set_title("Feature importance (split count)")
ax.set_ylabel("Number of splits")
fig.tight_layout()

## With KFold, resampling

In [1]:
# Reload the pre-processed accidents table (heavy file) so this section starts
# from the data on disk. 'Severity' is forced to the object dtype on load, and
# 'Start_Time' is parsed into datetimes afterwards.
accidents_csv = "../input/us-accidents/US_accidents_processed.csv"
usAccidents = pd.read_csv(accidents_csv, dtype={"Severity": object})
usAccidents["Start_Time"] = pd.to_datetime(usAccidents["Start_Time"])

NameError: ignored

In [None]:
# Put the rows in chronological order (ascending 'Start_Time'), then display the frame.
usAccidents = usAccidents.sort_values("Start_Time")
usAccidents

In [None]:
# Cast every object-typed or already-categorical column to the pandas
# 'category' dtype (presumably so LightGBM can treat them as categorical
# features — confirm).
for col_name, col_dtype in usAccidents.dtypes.items():
    is_text_or_category = col_dtype == 'object' or col_dtype.name == 'category'
    if is_text_or_category:
        usAccidents[col_name] = usAccidents[col_name].astype('category')

In [None]:
# Undersample: first show how many rows each Severity class has before resampling
counter = Counter(usAccidents['Severity'])
print(counter)

# Fixed seed so the randomly selected rows (and therefore the reported scores)
# are the same on every run.
under = RandomUnderSampler(random_state=314)

In [None]:
# Random (non-chronological), stratified 80/20 hold-out on the undersampled data.
# Seed the shuffle: an unseeded sample() changes the row order on every run, so
# the random_state given to train_test_split alone did not make the split
# reproducible. (Full reproducibility also depends on how `under` was seeded
# when it was created.)
usAccidents = usAccidents.sample(frac=1, random_state=314) #KFOLD
X_res, y_res = under.fit_resample(usAccidents.drop(['Severity', 'Start_Time'], axis=1), usAccidents['Severity'])

# stratify keeps the Severity class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.20,
    random_state=314, stratify=y_res)

## Find best param..

## Algorithm

In [None]:
# LightGBM classifier for the shuffled split on the undersampled data. The first
# six values are the same ones recorded in the parameter list kept in the
# comment that follows this constructor.
lgbm_params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.5437101653406844,
    'max_depth': 5,
    'n_estimators': 663,
    'num_leaves': 31,
    'subsample_for_bin': 201484,
    # Multiclass setup
    'metric': 'multi_logloss',
    'num_class': 4,
    'objective': "multiclass",
}
lgbm = LGBMClassifier(**lgbm_params)
# [('boosting_type', 'gbdt'),#With kfold
#              ('learning_rate', 0.5437101653406844),
#              ('max_depth', 5),
#              ('n_estimators', 663),
#              ('num_leaves', 31),
#              ('subsample_for_bin', 201484)]
# LightGBM Model accuracy score: 0.7807
#               precision    recall  f1-score   support

#            1       0.89      0.92      0.90      5108
#            2       0.81      0.76      0.78      5108
#            3       0.71      0.70      0.71      5108
#            4       0.72      0.75      0.73      5108

#     accuracy                           0.78     20432
#    macro avg       0.78      0.78      0.78     20432
# weighted avg       0.78      0.78      0.78     20432
#The best so far

In [None]:
# Train the classifier configured above on the training split.
lgbm.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features; the result is scored in the next cell.
lgbm_prediction = lgbm.predict(X_test)

In [None]:
# Score the test predictions: overall accuracy first, then the per-class
# precision / recall / F1 report.
from sklearn.metrics import accuracy_score, classification_report

# Compute the accuracy once, in scikit-learn's (y_true, y_pred) argument order,
# and reuse it for the printout instead of calling accuracy_score a second time.
accuracy = accuracy_score(y_test, lgbm_prediction)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

print(classification_report(y_test, lgbm_prediction))

In [None]:
# Bar chart of the fitted model's per-feature importances.
importances = lgbm.feature_importances_
# X_train holds exactly the columns the model was fitted on, so take the labels
# from it instead of building a full copy of usAccidents just to read its columns.
lgbm_importances = pd.Series(importances, index=X_train.columns)
print(importances)
fig, ax = plt.subplots()
lgbm_importances.plot.bar(ax=ax)
# The classifier is created without an importance_type, and LightGBM's default
# ('split') counts how often a feature is used in a split — it is not a Gini importance.
ax.set_title("Feature importance (split count)")
ax.set_ylabel("Number of splits")
fig.tight_layout()

# Next step: try removing the 2020 data, which has a lot of severity-2 accidents and more data sources