In [1]:
# !kaggle competitions download -c playground-series-s5e3
# !unzip -u *.zip

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import missingno

from sklearn import set_config
set_config(transform_output = "pandas")

from sklearn.model_selection import ShuffleSplit, KFold, StratifiedKFold
from sklearn.model_selection import cross_validate, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

KAGGLE_RUN = False
if KAGGLE_RUN:
    working_dir = Path('/kaggle/input/playground-series-s5e3')
else:
    working_dir = Path().cwd()

In [3]:
train_df = pd.read_csv(working_dir/'train.csv', index_col='id')
test_df = pd.read_csv(working_dir/'test.csv', index_col='id')


In [4]:
train_df

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2185,361,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,40.0,22.1,1
2186,362,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,50.0,35.3,1
2187,363,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,40.0,32.9,1
2188,364,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,40.0,18.0,1


In [5]:
NUMERIC_COLUMNS=['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed']
CATEGORIC_COLUMNS=[]
TARGET_COLUMN=['rainfall']
ALL_COLUMNS=NUMERIC_COLUMNS+CATEGORIC_COLUMNS+TARGET_COLUMN

In [6]:
# feature engineering
#  add lag, fourier features, spreads, binning, days with maxtemp< temaparature, etc.


In [7]:

target = train_df[TARGET_COLUMN]
train = train_df.drop(columns=TARGET_COLUMN)
test = test_df


In [8]:
target

Unnamed: 0_level_0,rainfall
id,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,0
...,...
2185,1
2186,1
2187,1
2188,1


In [9]:
train

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8
...,...,...,...,...,...,...,...,...,...,...,...
2185,361,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,40.0,22.1
2186,362,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,50.0,35.3
2187,363,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,40.0,32.9
2188,364,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,40.0,18.0


In [10]:
transformer = ColumnTransformer(
    transformers=[
        ('numeric', MinMaxScaler(), NUMERIC_COLUMNS),
        ('categories', OneHotEncoder(sparse_output=False), []),
    ], remainder='passthrough'
)

classifier = XGBClassifier()

pipe = Pipeline(
    steps=[
        ('transform_columns', transformer),
        ('classifier', classifier)
        ]
        )


In [11]:
cv_results = cross_validate(
    pipe,
    train,
    target,
    cv=StratifiedKFold(n_splits=5),
    scoring="roc_auc",
    n_jobs=2
)

errors_tree_regressor = pd.Series(
    cv_results["test_score"]
)
errors_tree_regressor.describe()

count    5.000000
mean     0.870617
std      0.014476
min      0.856061
25%      0.857997
50%      0.867565
75%      0.885101
max      0.886364
dtype: float64

In [None]:

cv_search = GridSearchCV(
    estimator = pipe,
    param_grid={
        'classifier__n_estimators':[100, 500],
        'classifier__n_estimators':[10, 50, 100, 500],
        'classifier__max_depth':[6, 10, 50, 100],
        'classifier__max_leaves':[0, 5, 10],
        'classifier__learning_rate':[0.1, 0.3, 0.5],
        # 'classifier__subsample':[0.8, 0.9, 1],
        # 'classifier__colsample_bytree':[0.8, 0.9, 1],
        # 'classifier__criterion':['squared_error', 'friedman_mse', 'poisson'],
    },
    scoring="roc_auc",
    n_jobs=3,
)

search_results = cv_search.fit(
    train,
    target
)


In [13]:
pd.DataFrame(cv_search.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
mean_fit_time,0.157598,0.366799,2.147476,0.322331,0.027417,0.036657,0.058531,1.426806,0.027504,0.043519,...,0.289351,0.844339,0.048148,0.06578,0.104791,0.63062,0.049951,0.167512,0.217397,0.53981
std_fit_time,0.10372,0.42437,1.075318,0.020948,0.007617,0.006402,0.022096,1.028354,0.003413,0.008982,...,0.043476,0.258424,0.015814,0.011361,0.011281,0.097856,0.014492,0.075264,0.032165,0.08225
mean_score_time,0.010913,0.025443,0.029347,0.019445,0.011698,0.010916,0.013655,0.012159,0.013802,0.009602,...,0.024792,0.031405,0.017195,0.025218,0.023873,0.031485,0.022693,0.025977,0.023556,0.028854
std_score_time,0.002769,0.010688,0.021164,0.00423,0.002302,0.001708,0.004559,0.001841,0.002936,0.001153,...,0.007168,0.011762,0.002166,0.00695,0.003806,0.00731,0.005584,0.005402,0.005683,0.009252
param_classifier__learning_rate,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,...,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3
param_classifier__max_depth,6,6,6,6,6,6,6,6,6,6,...,100,100,100,100,100,100,100,100,100,100
param_classifier__max_leaves,0,0,0,0,5,5,5,5,10,10,...,0,0,5,5,5,5,10,10,10,10
param_classifier__n_estimators,10,50,100,500,10,50,100,500,10,50,...,100,500,10,50,100,500,10,50,100,500
params,"{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...",...,"{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier...","{'classifier__learning_rate': 0.3, 'classifier..."
split0_test_score,0.864843,0.85665,0.856061,0.84537,0.8719,0.869332,0.87326,0.849832,0.857674,0.864254,...,0.853816,0.844921,0.8719,0.869332,0.87326,0.849832,0.857674,0.864254,0.859063,0.842088


In [14]:
cv_search.best_score_

np.float64(0.8869865319865321)

In [15]:
cv_search.best_params_

{'classifier__learning_rate': 0.3,
 'classifier__max_depth': 6,
 'classifier__max_leaves': 5,
 'classifier__n_estimators': 10}

In [16]:
sub_df = pd.DataFrame(
    index=test.index,
    data={
        'rainfall':cv_search.predict(test)
    }
)
sub_df    


Unnamed: 0_level_0,rainfall
id,Unnamed: 1_level_1
2190,1
2191,1
2192,1
2193,0
2194,0
...,...
2915,1
2916,1
2917,1
2918,1


In [17]:
if KAGGLE_RUN:
    sub_df.to_csv("/kaggle/working/submission.csv")
    !head /kaggle/working/submission.csv