In [None]:
#Importing dependencies
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score
from sklearn.dummy import DummyRegressor, DummyClassifier
import matplotlib.pyplot as plt
import seaborn as sns 
from imblearn.pipeline import Pipeline as ImPipeline

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Colab-only step: attach Google Drive to the runtime's filesystem.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
# Read the 'crashes_clean' file from the mounted Drive; the first CSV column
# becomes the row index. The preview below is the cell's displayed output.
crashes_path = '/content/drive/MyDrive/crashes_clean'
df = pd.read_csv(crashes_path, index_col=[0])
df.head()

In [None]:
"""
Here I'm creating my train-test-split and setting my target variable. 
'PRIM_CONTRIBUTORY_CAUSE' is the target since we are looking for the possible cause
of accidents on the road in relation to the other columns

"""
# Pull the label column out of the frame; everything else is a predictor.
target_column = 'PRIM_CONTRIBUTORY_CAUSE'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out a test set using train_test_split's default proportions; the fixed
# seed keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=19651609,
)

In [None]:
# Group feature names by dtype: object-typed columns are handled as
# categorical, every remaining column as numeric.
categorical_columns = X.select_dtypes(include=['object']).columns
numeric_columns = X.select_dtypes(exclude=['object']).columns

In [None]:
# Numeric branch: fill missing values with the column median (also emitting
# missing-value indicator columns), then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

Here I instantiate and fit a dummy classifier, which makes predictions without looking at the features, to see what score a model can reach without learning anything from the data. This step is not strictly necessary, but I use its accuracy as the baseline that the later models should beat.

In [None]:
# Baseline model with DummyClassifier's default strategy; fit() returns the
# estimator itself, so construction and fitting are chained.
dummy = DummyClassifier().fit(X_train, y_train)

# Accuracy of the baseline on the training split (displayed as cell output).
dummy.score(X_train, y_train)

In [None]:
# Accuracy of the baseline on the held-out test split.
dummy.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of the baseline on the test split.
# NOTE(review): plot_confusion_matrix is deprecated since scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is the replacement.
plot_confusion_matrix(estimator=dummy, X=X_test, y_true=y_test);

## Random Forest Iterations

In [None]:
# Un-tuned random forest, used as the reference point for the grid search.
# class_weight='balanced' reweights classes inversely to their frequency to
# counter the known class imbalance.
# random_state pins the forest's bootstrap/feature sampling so the scores
# below are reproducible (same seed as the train/test split).
# n_jobs=-1 builds the trees on all available cores; with a fixed
# random_state the fitted model is the same regardless of n_jobs.
rf_model = Pipeline(steps=[
    ('clean', preprocessor),
    ('rf', RandomForestClassifier(class_weight='balanced',
                                  random_state=19651609,
                                  n_jobs=-1)),
])
rf_model.fit(X_train, y_train)

print(f'This is the basic training score: {rf_model.score(X_train, y_train)}')
print(f'This is the basic testing score: {rf_model.score(X_test, y_test)}')

In [13]:
# First grid search: tune only the number of trees in the forest.
# random_state makes every candidate (and the refit best model) reproducible,
# so differences between grid points reflect n_estimators, not sampling noise.
# n_jobs=-1 on the forest builds trees on all cores using threads, which
# shortens each fit without copying the training data per worker.
rf_model = Pipeline(steps=[
    ('clean', preprocessor),
    ('rf', RandomForestClassifier(class_weight='balanced',
                                  random_state=19651609,
                                  n_jobs=-1)),
])

# Keys use the '<step name>__<parameter>' form to reach into the pipeline.
parameters = {'rf__n_estimators': [50, 100, 150, 200]}

# 5-fold cross-validation; error_score='raise' surfaces a failing fit
# immediately instead of recording it as a NaN score.
rf_gs = GridSearchCV(estimator=rf_model,
                     param_grid=parameters,
                     cv=5,
                     error_score='raise')

rf_gs.fit(X_train, y_train)

# score() delegates to the best estimator refit on the whole training split.
print(f'This is the 1st grid search training score: {rf_gs.score(X_train, y_train)}')
print(f'This is the 1st grid search testing score: {rf_gs.score(X_test, y_test)}')
# Stray apostrophe after "parameters" removed from the printed message.
print(f"This is the 1st grid search's best parameters: {rf_gs.best_params_}")

KeyboardInterrupt: ignored