In [15]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from pipelines.DataPipeline import DataPipeline
from transformers.Preprocessing import Preprocessing
from utils.Common import Config
from utils.DataHelper import DataHelper


In [16]:
# Location of the raw KSI CSV, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/raw/KSI.csv"

# Read the whole raw file into a DataFrame; later cells derive everything from `df`.
df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)

In [17]:
# Run the project's Preprocessing step over the raw frame. Per the original
# author's note it fills missing values, adds new columns and extracts the
# useful columns; the column groups and label name come from Config.
pc = Preprocessing(
    df,
    Config.binary_columns,
    Config.cat_attribs,
    Config.num_attribs,
    Config.label,
)

# The cleaned frame that the rest of the notebook works from.
new_df = pc.getFrame()

In [18]:
# Separate features and label.
# Features are the categorical, numerical and binary column groups, in that order.
feature_columns = Config.cat_attribs + Config.num_attribs + Config.binary_columns
X = new_df[feature_columns]

# The target is the single label column named in Config.
Y = new_df[Config.label]

In [24]:
# Show the first feature row transposed, so each column reads as one line.
first_row = X.head(1)
print(first_row.T)

                                   0
VEHTYPE                        Other
ROAD_CLASS            Major Arterial
LOCCOORD                Intersection
DISTRICT       Toronto and East York
TRAFFCTL                  No Control
LIGHT                           Dark
RDSFCOND                         Wet
INVTYPE                    Passenger
IMPACTYPE                Approaching
INVAGE                      50 to 54
YEAR                            2006
TIME                               3
LATITUDE                   43.699595
LONGITUDE                 -79.318797
MONTH                              1
DAY                                6
PEDESTRIAN                         0
CYCLIST                            0
AUTOMOBILE                         1
TRUCK                              0
TRSN_CITY_VEH                      0
PASSENGER                          1
SPEEDING                           1
AG_DRIV                            1


In [7]:
# Pass the features to the project pipeline and convert them to numerical data.
# The input is re-selected from `new_df` (the same column selection that first
# defined `X`) instead of being read back from `X`, so re-running this cell
# never feeds already-processed data into the pipeline a second time.
feature_pipeline = DataPipeline(Config.num_attribs, Config.cat_attribs)
X = feature_pipeline.process(
    new_df[Config.cat_attribs + Config.num_attribs + Config.binary_columns]
)

In [8]:
# Sanity check: total number of missing values left after the pipeline
# (per-column NaN counts, then summed across columns).
missing_per_column = X.isna().sum()
missing_per_column.sum()

0

In [9]:
# Class balance of the label: occurrences of each distinct value, most frequent first.
class_counts = Y.value_counts()
class_counts.tolist()

[14246, 2201]

In [10]:
# Hold out a test set whose size comes from Config, stratified on the label so
# both splits keep the same class proportions.
# A fixed seed makes the split (and every score computed from it) reproducible
# across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=Config.test_size,
    stratify=Y,
    random_state=SPLIT_SEED,
)

In [11]:
# Oversample the minority class with SMOTE. Only the training split is
# resampled; X_test / y_test are not touched here.
# A fixed seed makes the generated synthetic samples reproducible between runs.
SMOTE_SEED = 42
smote_minority = SMOTE(sampling_strategy="minority", random_state=SMOTE_SEED)

X_train_sm, y_train_sm = smote_minority.fit_resample(X_train, y_train)


In [10]:

# Hyper-parameter grid explored for the SVC.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
}

# Exhaustive 10-fold cross-validated search over the grid, on the resampled
# training data, using up to 10 parallel jobs.
# NOTE(review): the folds are cut from data that was already SMOTE-resampled,
# so synthetic points can end up in validation folds next to the real samples
# they were interpolated from. The cross-validated score is therefore likely
# optimistic; consider putting SMOTE inside an imblearn Pipeline so that it is
# fitted per fold.
clf = GridSearchCV(estimator=SVC(), param_grid=param_grid, n_jobs=10, cv=10)
best_clf = clf.fit(X_train_sm, y_train_sm)

# Mean cross-validated score of the best parameter combination. Only the last
# expression of a cell is displayed, so the former bare
# `best_clf.best_estimator_` line had no effect and was dropped.
best_clf.best_score_

0.9731496000862181

In [14]:
# Take the estimator selected by the grid search and score it on the held-out
# test split.
best_model = best_clf.best_estimator_
best_model.score(X=X_test, y=y_test)

0.9088145896656535

In [15]:
# Winning hyper-parameter combination found by the grid search.
# Recorded result: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
selected_params = best_clf.best_params_
selected_params

{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}

In [14]:
# Destination of the serialized model, relative to the notebook's working directory.
MODEL_PATH = '../models/best_model_svc.pkl'

# Rebuild the SVC with the hyper-parameters reported by the grid search
# (C=100, gamma='scale', kernel='rbf') and fit it on the resampled training data.
# The values are hard-coded, so they must be updated by hand if the search is
# re-run and selects a different combination.
best_model = SVC(C=100, gamma='scale', kernel='rbf')
best_model.fit(X_train_sm, y_train_sm)

# joblib.dump does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, MODEL_PATH)


['../models/best_model_svc.pkl']