In [1]:
#| default_exp app_v2

#### Competition

[LINK](https://www.kaggle.com/competitions/playground-series-s3e11/overview)

#### Imports

In [2]:
#| export
from fastai.tabular.all import *
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

#### Downloading Datasets

In [3]:
#| export
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

In [4]:
#| export
comp = 'playground-series-s3e11'
path = setup_comp(comp, install='fastai')

In [5]:
# copy .gitignore template from my home directory and append project data folder to it
if not os.path.exists('.gitignore'):
    !cp ~/.gitignore .
if comp not in open('.gitignore').read():
    with open('.gitignore', 'a') as f: f.write(f'{comp}')

#### Create Dataframes

In [6]:
#| export
df_train = pd.read_csv(path/'train.csv', low_memory=False)
df_test = pd.read_csv(path/'test.csv', low_memory=False)

In [7]:
#| export
df_comb = pd.concat([df_train, df_test], ignore_index=True)

In [8]:
#| export
df_train.drop(['id'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)
df_comb.drop(['id'], axis=1, inplace=True)

In [9]:
#| export
df_train['store_sales_per_children'] = df_train['store_sales(in millions)'] / df_train['total_children']

In [10]:
list(df_train['coffee_bar'].unique())

[0.0, 1.0]

In [11]:
#| export
for column in df_train.columns:
    if (list(df_train[column].unique()) == [0.0, 1.0]):
        df_train.loc[:, column] = df_train[column].astype('bool')

In [12]:
#| export
train_idxs = np.arange(len(df_train))

In [13]:
#| export
test_idxs = np.arange(len(df_train), len(df_comb))

In [14]:
#| export
dep_var = 'cost'

In [15]:
#| export
procs = [Categorify, FillMissing, Normalize]

In [16]:
#| export
cont, cat = cont_cat_split(df_comb, max_card=1, dep_var=dep_var)

In [17]:
#| export
splits = RandomSplitter(valid_pct=0.2)(range_of(df_train))

In [18]:
#| export
df_train = df_comb.iloc[train_idxs]
df_test = df_comb.iloc[test_idxs]

In [19]:
len(df_train) + len(df_test) == len(df_comb)

True

#### K-Folds Cross-Validation

In [20]:
# n_folds = 5
# kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
# rmse_scores = []
# test_preds_all = np.zeros((len(df_test), n_folds))

In [21]:
# for fold, (train_index, valid_index) in enumerate(kf.split(df_train)):
#     train_fold, valid_fold = df_train.loc[train_index], df_train.loc[valid_index]
#     to_fold = TabularPandas(train_fold, procs, cat, cont, y_names=dep_var, splits=None)
#     valid_fold_to = TabularPandas(valid_fold, procs, cat, cont, y_names=dep_var, splits=None)
    
#     dls_fold = to_fold.dataloaders(bs=1024)
#     dls_fold.valid = valid_fold_to.dataloaders(bs=1024).train
    
#     learn_fold = tabular_learner(dls_fold, layers=[200, 100], metrics=rmse)
#     learn_fold.fit_one_cycle(10, 1e-2)
    
#     preds, targs = learn_fold.get_preds()
#     rmse_score = rmse(preds, targs).item()
#     rmse_scores.append(rmse_score)

#     test_dl = learn_fold.dls.test_dl(df_test)
#     test_preds, _ = learn_fold.get_preds(dl=test_dl)
#     test_preds_all[:, fold] = test_preds.squeeze()

    

# mean_rmse_score = np.mean(rmse_scores)
# print(f'Mean RMSE score for {n_folds}-fold cross-validation: {mean_rmse_score}')

In [22]:
def kfold_cross_val(layers, learning_rate):
    rmse_scores = []
    for train_index, valid_index in kf.split(df_train):
        train_fold, valid_fold = df_train.loc[train_index], df_train.loc[valid_index]
        to_fold = TabularPandas(train_fold, procs, cat, cont, y_names=dep_var, splits=None)
        valid_fold_to = TabularPandas(valid_fold, procs, cat, cont, y_names=dep_var, splits=None)

        dls_fold = to_fold.dataloaders(bs=1024)
        dls_fold.valid = valid_fold_to.dataloaders(bs=1024).train

        learn_fold = tabular_learner(dls_fold, layers=layers, metrics=rmse)
        learn_fold.fit_one_cycle(5, lr_max=learning_rate)

        preds, targs = learn_fold.get_preds()
        rmse_score = rmse(preds, targs).item()
        rmse_scores.append(rmse_score)

    return np.mean(rmse_scores)

In [23]:
# layer_sizes = [[200, 100], [300, 150]]
# learning_rates = [1e-2, 1e-3]

# best_rmse = float('inf')
# best_layers = None
# best_lr = None

# for layers in layer_sizes:
#     for lr in learning_rates:
#         mean_rmse = kfold_cross_val(layers, lr)
#         print(f'Layers: {layers}, Learning rate: {lr}, Mean RMSE: {mean_rmse}')
#         if mean_rmse < best_rmse:
#             best_rmse = mean_rmse
#             best_layers = layers
#             best_lr = lr

# print(f'Best layers: {best_layers}, Best learning rate: {best_lr}, Best RMSE: {best_rmse}')

In [24]:
#| export
splits = RandomSplitter(valid_pct=0.2)(range_of(df_train))

In [25]:
#| export
to_final = TabularPandas(df_train, procs, cat, cont, y_names=dep_var, splits=splits)
test_final = TabularPandas(df_test, procs, cat, cont, y_names=None, splits=None)
dls_final = to_final.dataloaders(bs=1024)

In [26]:
#| export
epochs=0

#### Train Model

In [27]:
#| export
learn_final = tabular_learner(dls_final, layers=[200, 100], y_range=(0, 150), metrics=rmse)
learn_final.fit_one_cycle(epochs, 1e-2)

In [28]:
#| export
test_dl = learn_final.dls.test_dl(df_test)
preds_final, _ = learn_final.get_preds(dl=test_dl)

#### Get Model Predictions

In [29]:
#| export
preds_final = preds_final.squeeze()

In [30]:
preds_final

tensor([72.2343, 75.0274, 74.8790,  ..., 74.2280, 75.6081, 72.0924])

#### Random Forest Model

In [31]:
#| export
xs, y = to_final.train.xs, to_final.train.y
valid_xs, valid_y = to_final.valid.xs, to_final.valid.y
test_xs = test_final.train.xs

In [32]:
#| export
def rf(xs, y, n_estimators=40, max_samples=200_000, max_features=0.5, min_samples_leaf=5, **kwargs):
    return RandomForestRegressor(n_jobs=-1, n_estimators=n_estimators, 
                                 max_samples=max_samples, max_features=max_features,
                                 min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)

In [33]:
#| export
def r_mse(pred, y):
    return round(math.sqrt(((pred-y)**2).mean()), 6)

In [34]:
#| export
def m_rmse(m, xs, y):
    return r_mse(m.predict(xs), y)

In [35]:
#| export
m = rf(xs, y, n_estimators=100)

In [36]:
rf_mse = m_rmse(m, valid_xs, valid_y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

(23.169503, 28.214902)

In [37]:
#| export
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp': m.feature_importances_}).sort_values('imp', ascending=False) 

In [38]:
fi = rf_feat_importance(m, xs)
fi

Unnamed: 0,cols,imp
0,store_sales(in millions),0.218797
5,gross_weight,0.205121
8,units_per_case,0.153397
9,store_sqft,0.09985
2,total_children,0.077763
4,avg_cars_at home(approx).1,0.063925
3,num_children_at_home,0.04957
1,unit_sales(in millions),0.035504
6,recyclable_package,0.027552
7,low_fat,0.023974


#### Gradient Boosting

In [39]:
gb_train = xgb.DMatrix(xs, label=y)
gb_valid = xgb.DMatrix(valid_xs, label=valid_y)
gb_test = xgb.DMatrix(test_xs)

Set params

In [54]:
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.01,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

Train Model

In [55]:
num_boost_round = 1000
early_stopping_rounds = 10

In [56]:
evals = [(gb_train, 'train'), (gb_valid, 'valid')]

In [57]:
booster = xgb.train(
    xgb_params,
    gb_train,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=20)


[0]	train-rmse:102.57804	valid-rmse:102.62429
[20]	train-rmse:85.57022	valid-rmse:85.61400
[40]	train-rmse:71.96139	valid-rmse:72.00263
[60]	train-rmse:61.17093	valid-rmse:61.20870
[80]	train-rmse:52.72341	valid-rmse:52.75907
[100]	train-rmse:46.21148	valid-rmse:46.24563
[120]	train-rmse:41.27238	valid-rmse:41.30456
[140]	train-rmse:37.60308	valid-rmse:37.63415
[160]	train-rmse:34.92193	valid-rmse:34.95284
[180]	train-rmse:33.00002	valid-rmse:33.03069
[200]	train-rmse:31.63609	valid-rmse:31.66610
[220]	train-rmse:30.68322	valid-rmse:30.71345
[240]	train-rmse:30.02030	valid-rmse:30.05073
[260]	train-rmse:29.55733	valid-rmse:29.58783
[280]	train-rmse:29.23719	valid-rmse:29.26893
[300]	train-rmse:29.01418	valid-rmse:29.04718
[320]	train-rmse:28.85762	valid-rmse:28.89214
[340]	train-rmse:28.74931	valid-rmse:28.78607
[360]	train-rmse:28.66877	valid-rmse:28.70790
[380]	train-rmse:28.61301	valid-rmse:28.65421
[400]	train-rmse:28.57202	valid-rmse:28.61622
[420]	train-rmse:28.54085	valid-rmse:2

In [58]:
xgb_mse = r_mse(booster.predict(gb_valid), valid_y)
xgb_mse

28.411565

In [59]:
xgb_preds = booster.predict(gb_test)

In [60]:
#| export
rf_preds = m.predict(test_xs)

In [61]:
#| export
ens_preds = (to_np(preds_final) + rf_preds + xgb_preds) / 3

In [62]:
#| export
sample_df = pd.read_csv(path/'sample_submission.csv')
sample_df['cost'] = preds_final
sample_df.to_csv('submission.csv', index=False)

In [63]:
!head submission.csv

id,cost
360336,72.23431
360337,75.027405
360338,74.87896
360339,73.194725
360340,74.127464
360341,75.40361
360342,77.273415
360343,71.768196
360344,75.90176


In [64]:
sample_df.max()

id      600559.00000
cost        81.76387
dtype: float64

In [65]:
submit = True

In [66]:
if not iskaggle and submit:
    from kaggle import api
    api.competition_submit_cli(file_name='submission.csv', message="drop id column", competition=comp)

100%|██████████| 3.70M/3.70M [00:01<00:00, 2.12MB/s]


In [None]:
import nbdev
nbdev.export.nb_export('media_campaign_cost_k_fold.ipynb', 'app_v2')
print("export successful")