In [1]:
# !kaggle competitions download -c playground-series-s5e3
# !unzip -u *.zip

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import missingno

from sklearn import set_config
# Ask scikit-learn transformers to emit pandas DataFrames (column names kept)
# rather than bare NumPy arrays.
set_config(transform_output = "pandas")

from sklearn.model_selection import ShuffleSplit, KFold, StratifiedKFold
from sklearn.model_selection import cross_validate, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Flip to True when running on Kaggle: data is then read from the competition
# input directory instead of the current working directory.
KAGGLE_RUN = False
working_dir = (
    Path('/kaggle/input/playground-series-s5e3') if KAGGLE_RUN else Path.cwd()
)

In [None]:
# Read both competition CSVs from working_dir, using the 'id' column as index.
train_df, test_df = (
    pd.read_csv(working_dir / file_name, index_col='id')
    for file_name in ('train.csv', 'test.csv')
)


In [None]:
# Display the raw training frame (features plus target) for a visual sanity check.
train_df

In [None]:
# Column roles used by the preprocessing pipeline below.
# NOTE: 'temparature' is spelled exactly as the column is referenced in this
# notebook; do not "correct" it here without renaming the data column too.
NUMERIC_COLUMNS = [
    'day',
    'pressure',
    'maxtemp',
    'temparature',
    'mintemp',
    'dewpoint',
    'humidity',
    'cloud',
    'sunshine',
    'winddirection',
    'windspeed',
]
CATEGORIC_COLUMNS = []
TARGET_COLUMN = ['rainfall']
ALL_COLUMNS = [*NUMERIC_COLUMNS, *CATEGORIC_COLUMNS, *TARGET_COLUMN]

In [None]:
# feature engineering
#  add lag, Fourier features, spreads, binning, days with maxtemp < temparature, etc.


In [None]:

# Separate the label column(s) from the features. TARGET_COLUMN is a list, so
# `target` is a one-column DataFrame rather than a Series.
target = train_df.loc[:, TARGET_COLUMN]
train = train_df.drop(TARGET_COLUMN, axis=1)
# The test set carries no target column, so it is used as loaded (same object).
test = test_df


In [None]:
# Display the target column(s) split off from the training frame.
target

In [None]:
# Display the training features (training frame with the target column(s) dropped).
train

In [None]:
# Preprocessing: scale the numeric features to [0, 1] and one-hot encode the
# categorical ones. The column lists come from the NUMERIC_COLUMNS /
# CATEGORIC_COLUMNS constants so that adding a categorical feature there is
# enough for it to be encoded (a hard-coded empty list would silently let it
# fall through `remainder='passthrough'` unencoded).
transformer = ColumnTransformer(
    transformers=[
        ('numeric', MinMaxScaler(), NUMERIC_COLUMNS),
        ('categories', OneHotEncoder(sparse_output=False), CATEGORIC_COLUMNS),
    ],
    remainder='passthrough',  # columns in neither list are forwarded untouched
)

# Gradient-boosted classifier with library defaults; tuned later via grid search.
classifier = XGBClassifier()

# Full model: preprocessing followed by the classifier. The step names are
# referenced by the hyper-parameter grid ('classifier__...').
pipe = Pipeline(
    steps=[
        ('transform_columns', transformer),
        ('classifier', classifier),
    ]
)


In [None]:
# Baseline: cross-validated ROC AUC of the untuned pipeline on 3 folds.
# KFold is used without shuffling, so each fold is a contiguous block of rows.
cv_results = cross_validate(
    pipe,
    train,
    target,
    cv=KFold(n_splits=3),
    scoring="roc_auc",
    n_jobs=2
)

# "roc_auc" is already a higher-is-better score (unlike the neg_* error
# scorers), so the fold scores are summarised as-is, without a sign flip.
cv_auc_scores = pd.Series(
    cv_results["test_score"],
    name="roc_auc",
)
cv_auc_scores.describe()

In [None]:

# Hyper-parameter grid: only the classifier's n_estimators is searched.
n_estimators_grid = {'classifier__n_estimators': [100, 50, 10]}

# Exhaustive search over the grid, ranking candidates by ROC AUC.
cv_search = GridSearchCV(pipe, n_estimators_grid, scoring="roc_auc", n_jobs=3)

# fit() hands back the fitted search object, so `search_results` and
# `cv_search` refer to the same instance.
search_results = cv_search.fit(train, target)


In [None]:
# Show the parameter combination that scored best in the grid search.
cv_search.best_params_

In [None]:
# Build the submission frame, indexed like the test set.
# - The prediction column is named after the target (TARGET_COLUMN[0]) instead
#   of a column name unrelated to this dataset.
# - The model was selected on ROC AUC, which ranks scores, so the predicted
#   probability is submitted rather than hard labels from predict().
#   Column 1 of predict_proba is taken as the positive class; this assumes a
#   binary target whose positive label sorts last (e.g. 0/1) -- TODO confirm.
sub_df = pd.DataFrame(
    index=test.index,
    data={
        TARGET_COLUMN[0]: cv_search.predict_proba(test)[:, 1]
    }
)
sub_df


In [None]:
if KAGGLE_RUN:
    sub_df.to_csv("/kaggle/working/submission.csv")
    !head /kaggle/working/submission.csv