In [1]:
import pandas as pd

# Run-mode switch: True selects the light dataset, False the full one.
quick = True

# The year range is the same in both modes.
start_year, end_year = 2005, 2021

# `chk` is enabled only for the full run; `sampled` only for the quick run.
chk = not quick
sampled = quick

# Pickle file matching the selected mode.
filename = 'df-light.pkl' if quick else 'df-full.pkl'

from sklearn.model_selection import train_test_split

# Single seed for both the row sample and the train/test split, so that
# Restart & Run All reproduces exactly the same train and test rows.
SEED = 222
# Upper bound on the number of rows kept for this experiment.
N_SAMPLES = 10000

# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
df = pd.read_pickle(f'./{filename}')

# Cap at len(df): DataFrame.sample raises ValueError when asked for more rows
# than the frame holds (sampling without replacement).
df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED)

# Features are every column except the first; the label is 'grav'.
# NOTE(review): this assumes 'grav' is column 0 — if it is not, the label
# leaks into the features. TODO confirm against the pickle's column order.
data = df.iloc[:, 1:]
target = df['grav']

# 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=SEED)

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest
from category_encoders import TargetEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
import time

# Wall-clock start for the elapsed-time report printed at the end of the cell.
start_time = time.time()

# Column routing: 'dep' and 'age' are target-encoded; every other feature
# column is one-hot encoded.
cols_target_encoded = ['dep', 'age']
cols_onehot_encoded = X_train.columns.drop(cols_target_encoded)

# Categorical encoders, one per column group.
encoder_target = TargetEncoder(cols=cols_target_encoded)
encoder_onehot = OneHotEncoder(cols=cols_onehot_encoded)

# Scaling and univariate feature selection, both with default settings.
scaler   = StandardScaler()
selector = SelectKBest()

# Class rebalancing. RandomUnderSampler() and RandomOverSampler() (both
# imported above) are drop-in alternatives to SMOTE here.
sampler = SMOTE(random_state=42)

# Classifier whose hyper-parameters are tuned by the grid search below.
model = SVC()


# --- Encoding -----------------------------------------------------------------
# Every transformer is fitted on the training split only and then applied to
# the test split, so no test-set statistics reach the fitted transformers.
X_train_te = encoder_target.fit_transform(X_train, y_train)
X_test_te  = encoder_target.transform(X_test)

X_train_oh = encoder_onehot.fit_transform(X_train_te, y_train)
X_test_oh  = encoder_onehot.transform(X_test_te)

# --- Scaling ------------------------------------------------------------------
# The scaler returns a bare array; rebuild the DataFrame with the original
# column names AND row index so features stay label-aligned with y_train / y_test
# (previously they were aligned only by position).
X_train_sc = pd.DataFrame(scaler.fit_transform(X_train_oh), columns=X_train_oh.columns, index=X_train_oh.index)
X_test_sc  = pd.DataFrame(scaler.transform(X_test_oh), columns=X_test_oh.columns, index=X_test_oh.index)

# --- Feature selection --------------------------------------------------------
selected_columns = selector.fit(X_train_sc, y_train).get_feature_names_out()
X_train_sel = pd.DataFrame(selector.transform(X_train_sc), columns=selected_columns, index=X_train_sc.index)
X_test_sel  = pd.DataFrame(selector.transform(X_test_sc), columns=selected_columns, index=X_test_sc.index)

# --- Resampling ---------------------------------------------------------------
# Rebalance the TRAINING split only. The test split must keep its real class
# distribution: re-fitting SMOTE on it would score the model on synthetic,
# artificially balanced samples and make the report meaningless.
X_train_rs, y_train_rs = sampler.fit_resample(X_train_sel, y_train)

# Kept only so later cells that reference these names keep working; they are
# deliberately NOT resampled.
X_test_rs, y_test_rs = X_test_sel, y_test

# --- Hyper-parameter search ---------------------------------------------------
params = {
    'C' : [10, 100],
    # 'kernel' : ('rbf', 'linear', 'poly'),
    # 'gamma' : (0.01, 0.1, 0.5)
}

# NOTE(review): the CV folds are cut from data that was already resampled, so
# synthetic neighbours of validation rows can sit in the training folds and
# best_score_ is probably optimistic. Putting the sampler and model in an
# imblearn.pipeline.Pipeline would resample inside each fold instead.
grid = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=10)
grid.fit(X_train_rs, y_train_rs)

print('Best score  : ', grid.best_score_)
print('Best params : ', grid.best_params_)

# --- Evaluation on the untouched test split -----------------------------------
y_pred = grid.predict(X_test_sel)
print(classification_report_imbalanced(y_test, y_pred))

print("--- performed in %s seconds ---" % (time.time() - start_time))

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3; 1/2] START C=100.......................................................
[CV 1/3; 1/2] END ........................C=100;, score=0.714 total time=  12.2s
[CV 2/3; 1/2] START C=100.......................................................
[CV 2/3; 1/2] END ........................C=100;, score=0.726 total time=  15.4s
[CV 3/3; 1/2] START C=100.......................................................
[CV 3/3; 1/2] END ........................C=100;, score=0.711 total time=  13.5s
[CV 1/3; 2/2] START C=1000......................................................
[CV 1/3; 2/2] END .......................C=1000;, score=0.715 total time=  53.9s
[CV 2/3; 2/2] START C=1000......................................................
[CV 2/3; 2/2] END .......................C=1000;, score=0.733 total time= 1.4min
[CV 3/3; 2/2] START C=1000......................................................
[CV 3/3; 2/2] END .......................C=1000;, 