In [1]:
# !kaggle competitions download -c playground-series-s5e3
# !unzip -u *.zip

In [2]:
from pathlib import Path
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import missingno

from sklearn import set_config
set_config(transform_output = "pandas")

from sklearn.model_selection import ShuffleSplit, KFold, StratifiedKFold
from sklearn.model_selection import cross_validate, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

KAGGLE_RUN = False
if KAGGLE_RUN:
    working_dir = Path('/kaggle/input/playground-series-s5e3')
else:
    working_dir = Path().cwd()

In [3]:
train_df = pd.read_csv(working_dir/'train.csv', index_col='id')
test_df = pd.read_csv(working_dir/'test.csv', index_col='id')


In [4]:
train_df

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2185,361,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,40.0,22.1,1
2186,362,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,50.0,35.3,1
2187,363,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,40.0,32.9,1
2188,364,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,40.0,18.0,1


In [5]:
test_df

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4
...,...,...,...,...,...,...,...,...,...,...,...
2915,361,1020.8,18.2,17.6,16.1,13.7,96.0,95.0,0.0,20.0,34.3
2916,362,1011.7,23.2,18.1,16.0,16.0,78.0,80.0,1.6,40.0,25.2
2917,363,1022.7,21.0,18.5,17.0,15.5,92.0,96.0,0.0,50.0,21.9
2918,364,1014.4,21.0,20.0,19.7,19.8,94.0,93.0,0.0,50.0,39.5


In [6]:
NUMERIC_COLUMNS=['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed']
CATEGORIC_COLUMNS=[]
TARGET_COLUMN=['rainfall']
ALL_COLUMNS=NUMERIC_COLUMNS+CATEGORIC_COLUMNS+TARGET_COLUMN

In [7]:
# feature engineering
#  add lag, fourier features, spreads, binning, days with maxtemp< temaparature, etc.

def group_agg_merge(df, by_column, agg_column):
    grouped_df = df.groupby(by=by_column).agg(
        **{
        f'{by_column}_average_{agg_column}':(agg_column, 'mean'),
        f'{by_column}_std_{agg_column}':(agg_column, 'std'),
        f'{by_column}_skew_{agg_column}':(agg_column, 'skew'),
        f'{by_column}_median_{agg_column}':(agg_column, 'median'),
        f'{by_column}_min_{agg_column}':(agg_column, 'min'),
        f'{by_column}_max_{agg_column}':(agg_column, 'max'),}
    )
    df = df.merge(grouped_df, on=by_column, how='left')
    return df

def add_features(df):
    df=df.fillna(0)
    df['temp_spread'] = df['maxtemp'] - df['mintemp']
    df['temp_diff_dew'] = df['temparature'] - df['dewpoint']
    df['pressure_bins'] = pd.cut(df['pressure'], bins=[998, 1010, 1020, 1035], labels=list(range(3)), include_lowest=True)
    df['humidity_bins'] = pd.cut(df['humidity'], bins=[0, 50, 80,100], labels=list(range(3)), include_lowest=True)
    df['cloud_bins'] = pd.cut(df['cloud'], bins=[0, 25, 50, 75, 100], labels=list(range(4)), include_lowest=True)
    df['sunshine_bins'] = pd.cut(df['sunshine'], bins=[-0.1, 3., 6., 9., 13.], labels=list(range(4)), include_lowest=True)
    df['winddirection_bins'] = pd.cut(df['winddirection'], bins=[-1., 90, 180, 270, 361.], labels=list(range(4)), include_lowest=True)
    df['windspeed_bins'] = pd.cut(df['windspeed'], bins=[0, 15, 30, 45, 60], labels=list(range(4)), include_lowest=True)

    df['max_temp_under_temp'] = df['maxtemp'] < df['temparature']
    df['min_temp_over_temp'] = df['mintemp'] > df['temparature']
    
    df['month'] = df['day'].apply(
        lambda x: 'Jan' if 1<=x<=31 
        else 'Feb' if 32<= x<= 59
        else 'Mar' if 60<= x<= 90
        else 'Apr' if 91<= x<= 120
        else 'May' if 121<= x<= 151
        else 'Jun' if 152<= x<= 181
        else 'Jul' if 182<= x<= 212
        else 'Aug' if 213<= x<= 243
        else 'Sep' if 244<= x<= 273
        else 'Oct' if 274<= x<= 304
        else 'Nov' if 305<= x<= 334
        else 'Dec' if 335<= x<= 366
        else 'Nan'
    )

    df['season'] = df['month'].apply(
        lambda x: 'Winter' if x in ['Dec', 'Jan', 'Feb']
        else 'Spring' if x in ['Mar', 'Apr', 'May']
        else 'Summer' if x in ['Jun', 'Jul', 'Aug']
        else 'Autumn' if x in ['Sep', 'Oct', 'Nov']
        else 'Nan'
    )

    for i, k in itertools.product(['month', 'season'], ['temparature', 'pressure', 'dewpoint', 'humidity', 'sunshine', 'windspeed']):
        df = group_agg_merge(df, i, k)

    return df

NUMERIC_COLUMNS+=[
    'temp_spread',
    'temp_diff_dew',
    ]+[f'{i}_{j}_{k}' for i,j,k in itertools.product(['month', 'season'], ['average', 'std', 'skew', 'median', 'min', 'max'], ['temparature', 'pressure', 'dewpoint', 'humidity', 'sunshine', 'windspeed'])]
CATEGORIC_COLUMNS+=[
    'pressure_bins',
    'humidity_bins',
    'cloud_bins',
    'sunshine_bins',
    'winddirection_bins',
    'windspeed_bins',
    'month',
    'season',
    'max_temp_under_temp',
    'min_temp_over_temp'
    ]

train_df = add_features(train_df)
test_df = add_features(test_df)

In [8]:
print(NUMERIC_COLUMNS)
print(CATEGORIC_COLUMNS)

['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed', 'temp_spread', 'temp_diff_dew', 'month_average_temparature', 'month_average_pressure', 'month_average_dewpoint', 'month_average_humidity', 'month_average_sunshine', 'month_average_windspeed', 'month_std_temparature', 'month_std_pressure', 'month_std_dewpoint', 'month_std_humidity', 'month_std_sunshine', 'month_std_windspeed', 'month_skew_temparature', 'month_skew_pressure', 'month_skew_dewpoint', 'month_skew_humidity', 'month_skew_sunshine', 'month_skew_windspeed', 'month_median_temparature', 'month_median_pressure', 'month_median_dewpoint', 'month_median_humidity', 'month_median_sunshine', 'month_median_windspeed', 'month_min_temparature', 'month_min_pressure', 'month_min_dewpoint', 'month_min_humidity', 'month_min_sunshine', 'month_min_windspeed', 'month_max_temparature', 'month_max_pressure', 'month_max_dewpoint', 'month_max_humidity', 'month_max_sunshi

In [9]:
target = train_df[TARGET_COLUMN]
train = train_df.drop(columns=TARGET_COLUMN)
test = test_df

In [10]:
target

Unnamed: 0,rainfall
0,1
1,1
2,1
3,1
4,0
...,...
2185,1
2186,1
2187,1
2188,1


In [11]:
train

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,...,season_skew_sunshine,season_median_sunshine,season_min_sunshine,season_max_sunshine,season_average_windspeed,season_std_windspeed,season_skew_windspeed,season_median_windspeed,season_min_windspeed,season_max_windspeed
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,...,1.452287,0.3,0.0,11.2,26.444465,10.318128,0.560474,25.1,6.9,59.5
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,...,1.452287,0.3,0.0,11.2,26.444465,10.318128,0.560474,25.1,6.9,59.5
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,...,1.452287,0.3,0.0,11.2,26.444465,10.318128,0.560474,25.1,6.9,59.5
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,...,1.452287,0.3,0.0,11.2,26.444465,10.318128,0.560474,25.1,6.9,59.5
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,...,1.452287,0.3,0.0,11.2,26.444465,10.318128,0.560474,25.1,6.9,59.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,361,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,40.0,...,1.452287,0.3,0.0,11.2,26.444465,10.318128,0.560474,25.1,6.9,59.5
2186,362,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,50.0,...,1.452287,0.3,0.0,11.2,26.444465,10.318128,0.560474,25.1,6.9,59.5
2187,363,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,40.0,...,1.452287,0.3,0.0,11.2,26.444465,10.318128,0.560474,25.1,6.9,59.5
2188,364,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,40.0,...,1.452287,0.3,0.0,11.2,26.444465,10.318128,0.560474,25.1,6.9,59.5


In [None]:
test

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 95 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   day                         730 non-null    int64   
 1   pressure                    730 non-null    float64 
 2   maxtemp                     730 non-null    float64 
 3   temparature                 730 non-null    float64 
 4   mintemp                     730 non-null    float64 
 5   dewpoint                    730 non-null    float64 
 6   humidity                    730 non-null    float64 
 7   cloud                       730 non-null    float64 
 8   sunshine                    730 non-null    float64 
 9   winddirection               730 non-null    float64 
 10  windspeed                   730 non-null    float64 
 11  temp_spread                 730 non-null    float64 
 12  temp_diff_dew               730 non-null    float64 
 13  pressure_bins       

In [13]:
transformer = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), NUMERIC_COLUMNS),
        ('categories', OneHotEncoder(sparse_output=False), CATEGORIC_COLUMNS),
    ], remainder='passthrough'
)

classifier = XGBClassifier()

pipe = Pipeline(
    steps=[
        ('transform_columns', transformer),
        ('classifier', classifier)
        ]
        )


In [14]:
cv_results = cross_validate(
    pipe,
    train,
    target,
    cv=StratifiedKFold(n_splits=5),
    scoring="roc_auc",
    n_jobs=2
)

errors_tree_regressor = pd.Series(
    cv_results["test_score"]
)
errors_tree_regressor.describe()

count    5.000000
mean     0.867492
std      0.023910
min      0.831818
25%      0.860774
50%      0.865937
75%      0.888721
max      0.890208
dtype: float64

In [15]:

cv_search = GridSearchCV(
    estimator = pipe,
    param_grid={
        'classifier__n_estimators':[100, 500],
        'classifier__n_estimators':[10, 50, 100, 500],
        'classifier__max_depth':[6, 10, 50, 100],
        'classifier__max_leaves':[0, 5, 10],
        'classifier__learning_rate':[0.1, 0.3, 0.5],
        # 'classifier__subsample':[0.8, 0.9, 1],
        # 'classifier__colsample_bytree':[0.8, 0.9, 1],
        # 'classifier__criterion':['squared_error', 'friedman_mse', 'poisson'],
    },
    scoring="roc_auc",
    n_jobs=3,
)

search_results = cv_search.fit(
    train,
    target
)


In [16]:
pd.DataFrame(cv_search.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
mean_fit_time,0.160146,0.155975,0.181941,4.779607,0.880426,0.222452,0.135967,0.397293,0.063808,0.101871,...,0.429612,0.790137,0.077877,0.156825,0.267218,0.657632,0.126555,0.186188,0.263904,0.909679
std_fit_time,0.063862,0.033015,0.008259,3.942464,0.224529,0.264335,0.010004,0.030429,0.002274,0.007656,...,0.065038,0.176249,0.012229,0.061959,0.042426,0.048393,0.029399,0.021796,0.020749,0.095092
mean_score_time,0.036741,0.031843,0.031417,0.046589,0.152526,0.031167,0.032857,0.032477,0.031835,0.038798,...,0.052575,0.042631,0.035162,0.065264,0.049552,0.045882,0.062577,0.05126,0.05085,0.065489
std_score_time,0.013207,0.004925,0.007339,0.032226,0.067859,0.008735,0.004087,0.011807,0.004087,0.007182,...,0.00647,0.01144,0.004422,0.025142,0.00591,0.011685,0.013791,0.008291,0.005828,0.019945
param_classifier__learning_rate,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
param_classifier__max_depth,6,6,6,6,6,6,6,6,6,6,...,100,100,100,100,100,100,100,100,100,100
param_classifier__max_leaves,0,0,0,0,5,5,5,5,10,10,...,0,0,5,5,5,5,10,10,10,10
param_classifier__n_estimators,10,50,100,500,10,50,100,500,10,50,...,100,500,10,50,100,500,10,50,100,500
params,"{'classifier__learning_rate': 0.1, 'classifier...","{'classifier__learning_rate': 0.1, 'classifier...","{'classifier__learning_rate': 0.1, 'classifier...","{'classifier__learning_rate': 0.1, 'classifier...","{'classifier__learning_rate': 0.1, 'classifier...","{'classifier__learning_rate': 0.1, 'classifier...","{'classifier__learning_rate': 0.1, 'classifier...","{'classifier__learning_rate': 0.1, 'classifier...","{'classifier__learning_rate': 0.1, 'classifier...","{'classifier__learning_rate': 0.1, 'classifier...",...,"{'classifier__learning_rate': 0.5, 'classifier...","{'classifier__learning_rate': 0.5, 'classifier...","{'classifier__learning_rate': 0.5, 'classifier...","{'classifier__learning_rate': 0.5, 'classifier...","{'classifier__learning_rate': 0.5, 'classifier...","{'classifier__learning_rate': 0.5, 'classifier...","{'classifier__learning_rate': 0.5, 'classifier...","{'classifier__learning_rate': 0.5, 'classifier...","{'classifier__learning_rate': 0.5, 'classifier...","{'classifier__learning_rate': 0.5, 'classifier..."
split0_test_score,0.860522,0.854209,0.855696,0.842172,0.86748,0.873148,0.874327,0.864394,0.859343,0.868967,...,0.835774,0.833502,0.861069,0.857043,0.840123,0.836027,0.862837,0.839029,0.834905,0.829798


In [17]:
cv_search.best_score_

np.float64(0.8881902356902357)

In [18]:
cv_search.best_params_

{'classifier__learning_rate': 0.3,
 'classifier__max_depth': 6,
 'classifier__max_leaves': 5,
 'classifier__n_estimators': 10}

In [None]:
sub_df = pd.DataFrame(
    index=test.index,
    data={
        'rainfall':cv_search.predict(test)
    },
    index_label='id',
)
sub_df    


Unnamed: 0,rainfall
0,1
1,1
2,1
3,0
4,0
...,...
725,1
726,1
727,1
728,1


In [20]:
if KAGGLE_RUN:
    sub_df.to_csv("/kaggle/working/submission.csv")
    !head /kaggle/working/submission.csv