In [4]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

X_cols_drop = ['id', 'date', 'partlybad']
y_col = 'class4'

y = train_df[y_col]
X = train_df.drop(columns=[y_col] + X_cols_drop)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('model', None),
])

grid = [
    {
        "pca__n_components": [0.9, 0.95, 0.99],
        "model": [LogisticRegression(max_iter=10000, penalty="l2", class_weight='balanced')],
        "model__C": [0.01, 0.1, 1, 10, 100],
    },
    {
        "pca__n_components": [0.9, 0.95, 0.99],
        "model": [LogisticRegression(max_iter=10000, solver='saga', penalty="l1", class_weight='balanced')],
        "model__C": [0.01, 0.1, 1, 10, 100],
    },
    {
        "pca": ["passthrough"],
        "model": [LogisticRegression(max_iter=10000, solver='saga', penalty="l1", class_weight='balanced')],
        "model__C": [0.01, 0.1, 1, 10, 100],
    },
    {
        "pca": ["passthrough"],
        "model": [RandomForestClassifier(class_weight='balanced')],
        "model__n_estimators": [50, 100, 150, 200, 500],
        "model__max_depth": [None, 10, 20, 40],
        "model__max_features": ["sqrt", 0.1, 0.11, 0.12, 0.13, 0.2],
        "model__min_samples_leaf": [1, 2, 5],
    },
    {
        "pca__n_components": [0.9, 0.95, 0.99],
        "model__estimator__C": [0.1, 0.001, 1, 10, 100, 1000],
        "model": [CalibratedClassifierCV(LinearSVC(max_iter=50000, multi_class='ovr', penalty='l2'))]
    },
    {
        "pca__n_components": [0.9, 0.95, 0.99],
        "model__estimator__C": [0.1, 0.001, 1, 10, 100, 1000],
        "model": [CalibratedClassifierCV(LinearSVC(max_iter=50000, multi_class='ovr', penalty='l1', loss='squared_hinge', dual=False))]
    },
]

cv = StratifiedKFold(n_splits=10, shuffle=True)
gs = GridSearchCV(pipeline, grid, cv=cv, scoring="roc_auc_ovr", n_jobs=-1)
gs.fit(X, y)

best_model = gs.best_estimator_
print(best_model)
X_test = test_df.drop(columns=X_cols_drop)
prediction = best_model.predict(X_test)

y = y != 'nonevent'
gs = GridSearchCV(pipeline, grid, cv=cv, scoring="roc_auc_ovr", n_jobs=-1)
gs.fit(X, y)
best_model = gs.best_estimator_
print(best_model)
probas = best_model.predict_proba(X_test)[:, 1]

result_df = pd.DataFrame({
    "id": test_df["id"],
    "class4": prediction,
    "p": probas
})

result_df.to_csv('submission.csv', index=False)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', 'passthrough'),
                ('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=10,
                                        max_features=0.13, min_samples_leaf=5,
                                        n_estimators=50))])
Pipeline(steps=[('scaler', StandardScaler()), ('pca', 'passthrough'),
                ('model',
                 LogisticRegression(C=1, class_weight='balanced',
                                    max_iter=10000, penalty='l1',
                                    solver='saga'))])
