In [1]:
from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from sklearn.model_selection import train_test_split
from utils.Common import Config
import pandas as pd

In [2]:
# Relative path to the raw KSI CSV file.
RAW_DATA_PATH = "../data/raw/KSI.csv"

# Read the whole file into a DataFrame; later cells take `df` as their input.
df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)


In [3]:
# Preprocess the raw frame. Per the original author's note, this step fills
# missing values, adds new columns and extracts the useful columns.
pc = Preprocessing(
    df,
    Config.binary_columns,
    Config.cat_attribs,
    Config.num_attribs,
    Config.label,
)

# The prepared frame that the following cells split into features and label.
new_df = pc.getFrame()

In [4]:
# Separate the features from the label.
# Feature columns are taken in this order: categorical, numeric, binary.
feature_columns = Config.cat_attribs + Config.num_attribs + Config.binary_columns

X = new_df[feature_columns]
Y = new_df[Config.label]

In [5]:
# Pass the features through the pipeline to convert them to numerical data.
dp = DataPipeline(Config.num_attribs, Config.cat_attribs)

# Select the feature columns from `new_df` here instead of writing
# `X = dp.process(X)`: that form would hand the already-converted output back
# to the pipeline if this cell were re-run. Reading from `new_df` gives the
# same result on the first run and on every later run.
X = dp.process(
    new_df[Config.cat_attribs + Config.num_attribs + Config.binary_columns]
)

In [6]:
# Hold out a test set of size Config.test_size, stratified on the label.
# random_state is pinned because the split is otherwise reshuffled on every
# run, which changes every number reported further down the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=Config.test_size,
    stratify=Y,
    random_state=42,  # arbitrary fixed seed; any integer works
)

In [7]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE using the "minority" sampling strategy.
# random_state is pinned so the synthetic samples, and therefore the fitted
# model and its scores, are the same on every run.
# NOTE(review): newer imbalanced-learn releases deprecate SMOTE's `n_jobs`
# argument - confirm against the installed version before upgrading.
smote_minority = SMOTE(
    n_jobs=-1,
    sampling_strategy="minority",
    random_state=42,  # arbitrary fixed seed; any integer works
)

# Only the training split is resampled; X_test / y_test are not touched here.
X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)



In [8]:
import joblib

# Location of the previously saved random-forest model.
RF_MODEL_PATH = "../models/best_model_random_forest.pkl"

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files that come from a trusted source.
rf_clf = joblib.load(RF_MODEL_PATH)

# Refit the loaded estimator on the SMOTE-resampled training data.
rf_clf.fit(X_train_sm, y_train_sm)

# Score on the same resampled data the model was just fitted on
# (a training score, not a held-out one).
rf_clf.score(X_train_sm, y_train_sm)

0.9405493155493155

In [9]:
# Map each transformed feature name to its importance in the fitted model.
# The names are read from `dp`, the DataPipeline instance created earlier.
# This cell used to read them from `dc`, a name that no visible cell defines,
# so it failed with NameError on a fresh kernel.
# NOTE(review): assumes DataPipeline exposes `converted_columns` - confirm.
feature_names = dp.converted_columns

# zip() silently truncates to the shorter input, so check the lengths first.
assert len(feature_names) == len(rf_clf.feature_importances_), (
    "feature names and feature importances have different lengths"
)
important_cols = dict(zip(feature_names, rf_clf.feature_importances_))

# Show (feature_name, importance) pairs, most important first.
sorted(important_cols.items(), key=lambda x: x[1], reverse=True)

[('cat__INVTYPE_Driver', 0.06929757342679758),
 ('scaler__LATITUDE', 0.04581676891646922),
 ('remainder__SPEEDING', 0.037463804297245024),
 ('remainder__TRUCK', 0.03507687630931674),
 ('scaler__YEAR', 0.033649513650228446),
 ('remainder__PEDESTRIAN', 0.031049465329402143),
 ('scaler__DAY', 0.030147752121445735),
 ('cat__IMPACTYPE_Pedestrian Collisions', 0.029821875690099513),
 ('scaler__LONGITUDE', 0.027563848514924338),
 ('cat__INVTYPE_Passenger', 0.027000621922909526),
 ('scaler__TIME', 0.02652934780333915),
 ('remainder__AG_DRIV', 0.02595183159123393),
 ('cat__INVAGE_25 to 29', 0.024923977483740596),
 ('cat__INVTYPE_Pedestrian', 0.02411586596771596),
 ('cat__VEHTYPE_Other', 0.02214085131819316),
 ('scaler__MONTH', 0.021386178636767558),
 ('cat__INVAGE_35 to 39', 0.020727854570170522),
 ('cat__IMPACTYPE_Rear End', 0.019811592257559307),
 ('cat__INVAGE_unknown', 0.01959925966477376),
 ('cat__INVAGE_45 to 49', 0.017642106362214364),
 ('remainder__AUTOMOBILE', 0.016511746555094877),
 ('