In [None]:
import pandas as pd
import numpy as np
import datetime

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV


train_df = pd.read_csv('data/train.csv')

# Balance the training set across calendar months: every month is downsampled
# to the size of the least-represented month.
months = pd.to_datetime(train_df['date'], format='%Y-%m-%d').dt.month
unique_months = months.value_counts().sort_index()
min_count = unique_months.min()
# Keep the *row labels* of the sampled entries. The values of `months` are the
# month numbers themselves (1..12), so they must not be used as row selectors;
# only the index of the sampled Series identifies the chosen rows.
uniform_idx = months.groupby(months).sample(n=min_count, random_state=42).index
uniform_train_df = train_df.loc[uniform_idx]
train_df = uniform_train_df

# Rows to be scored for the submission file.
test_df = pd.read_csv('data/test.csv')

# Name of the target column, and the columns that are excluded from the
# feature matrix of both the training and the test frame.
y_col = 'class4'
X_cols_drop = ['id', 'date', 'partlybad']

# Features are every remaining column; the target is kept as a separate Series.
X = train_df.drop(columns=[y_col, *X_cols_drop])
y = train_df[y_col]

# Fixed seed so that the shuffled fold assignment and the forest construction
# (and therefore the selected hyperparameters and predictions) are repeatable.
RANDOM_STATE = 42

# Candidate tree depths. The `estimator__` prefix routes the parameter through
# CalibratedClassifierCV to the random forest it wraps.
grid = {
    "estimator__max_depth": [5, 7, 8, 9]
}

# Random forest with sigmoid probability calibration, tuned by 10-fold
# stratified cross-validation on one-vs-rest ROC AUC.
search = GridSearchCV(
    CalibratedClassifierCV(
        RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
        method='sigmoid',
        cv=10,
        n_jobs=-1,
    ),
    grid,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE),
    scoring='roc_auc_ovr',
    n_jobs=-1
)

search.fit(X, y)
# best_estimator_ is the winning configuration as exposed by GridSearchCV after fitting.
model = search.best_estimator_

print(search.best_params_)
print(model)

# The test frame gets the same non-feature columns removed as the training frame.
X_test = test_df.drop(columns=X_cols_drop)
# Predicted class4 label for each test row.
prediction = model.predict(X_test)

# Collapse the class4 target into a boolean label: True for every class
# other than 'nonevent'.
y = y != 'nonevent'

# Separate binary model used only for the probability column: standardise,
# project with PCA(0.95), then fit an L1-penalised logistic regression whose
# regularisation strength is chosen by 10-fold cross-validation.
# The saga solver is stochastic, so it is seeded to make `p` reproducible.
model = make_pipeline(
    StandardScaler(),
    PCA(0.95),
    LogisticRegressionCV(
        cv=10,
        max_iter=10000,
        penalty='l1',
        solver='saga',
        scoring='roc_auc_ovr',
        n_jobs=-1,
        random_state=42,
    ),
)
model.fit(X, y)

# Column 1 of predict_proba belongs to the second fitted class, which is True
# for a boolean target.
probas = model.predict_proba(X_test)[:, 1]

# Submission: the class label comes from the calibrated forest (`prediction`),
# the probability from the logistic model above.
result_df = pd.DataFrame({
    "id": test_df["id"],
    "class4": prediction,
    "p": probas
})

result_df.to_csv('submission.csv', index=False)

In [None]:
# Display the hyperparameter combination selected by the grid search fitted in an earlier cell.
search.best_params_

In [None]:
import matplotlib.pyplot as plt

# Numeric (float64 / int64) columns of the training frame, excluding the 'id' column.
features = [col for col in train_df.columns if train_df[col].dtype in [np.float64, np.int64] and col not in ['id']]

# Line plots connect points in row order, so the rows are sorted by date first;
# otherwise the traces jump back and forth along the date axis.
plot_df = train_df.assign(date=pd.to_datetime(train_df['date'])).sort_values('date')
date = plot_df['date']

# One subplot per feature, laid out on a grid that is `ncols` wide.
n_features = len(features)
ncols = 5
nrows = int(np.ceil(n_features / ncols))
fig, axes = plt.subplots(nrows, ncols, figsize=(20, 4 * nrows), sharex=True)
axes = axes.flatten()

for ax, feature in zip(axes, features):
    ax.plot(date, plot_df[feature])
    ax.set_title(feature)
    ax.set_xlabel('date')
    ax.set_ylabel(feature)

# Remove the unused trailing axes of the grid (sliced by count, so this does
# not depend on a loop variable left over from the plotting loop).
for unused_ax in axes[n_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Histogram of every feature, using the same grid shape as the time-series figure.
train_df[features].hist(bins=100, figsize=(20, 20), layout=(nrows, ncols))
plt.tight_layout()
plt.show()

In [None]:
# Count the rows per class4 label, print the table, then show it as a bar chart.
class4_counts = train_df['class4'].value_counts()
print(class4_counts)

bar_ax = class4_counts.plot.bar(title='Distribution of class4')
bar_ax.set_xlabel('class4')
bar_ax.set_ylabel('Count')
plt.show()
