In [1]:
#| default_exp app_v3

#### Competition

[LINK](https://www.kaggle.com/competitions/playground-series-s3e11/overview)

#### Imports

In [2]:
#| export
from fastai.tabular.all import *

from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

import xgboost as xgb

#### Downloading Datasets

In [3]:
#| export
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

In [4]:
#| export
comp = 'playground-series-s3e11'
path = setup_comp(comp, install='fastai')

In [5]:
# copy .gitignore template from my home directory and append project data folder to it
if not os.path.exists('.gitignore'):
    !cp ~/.gitignore .
if comp not in open('.gitignore').read():
    with open('.gitignore', 'a') as f: f.write(f'{comp}')

#### Create Dataframes

In [6]:
#| export
df_train = pd.read_csv(path/'train.csv', low_memory=False)
df_test = pd.read_csv(path/'test.csv', low_memory=False)
df_comb = pd.concat([df_train, df_test], ignore_index=True)

In [7]:
#| export
df_train.drop(['id'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)
df_comb.drop(['id'], axis=1, inplace=True)

In [8]:
#| export
df_train['store_sales_per_children'] = df_train['store_sales(in millions)'] / df_train['total_children']

In [9]:
#| export
for column in df_train.columns:
    if (list(df_train[column].unique()) == [0.0, 1.0]):
        df_train.loc[:, column] = df_train[column].astype('bool')

In [10]:
#| export
train_idxs = np.arange(len(df_train))
test_idxs = np.arange(len(df_train), len(df_comb))

In [11]:
dep_var = 'cost'
procs = [Categorify, FillMissing, Normalize]
cont, cat = cont_cat_split(df_comb, max_card=1, dep_var=dep_var)
splits = RandomSplitter(valid_pct=0.2)(range_of(df_train))

In [12]:
df_train = df_comb.iloc[train_idxs]
df_test = df_comb.iloc[test_idxs]

In [13]:
#| export
to_final = TabularPandas(df_train, procs, cat, cont, y_names=dep_var, splits=splits)
test_final = TabularPandas(df_test, procs, cat, cont, y_names=None, splits=None)
dls_final = to_final.dataloaders(bs=1024)

In [17]:
#| export
fast_train = True
epochs = 14

#### Train Model

In [18]:
#| export
if fast_train == True:
    learn_final = tabular_learner(dls_final, layers=[200, 100], y_range=(0, 150), metrics=rmse)
    learn_final.fit_one_cycle(epochs, 1e-2)
    learn_final.export('models/tab_learner.pkl')


epoch,train_loss,valid_loss,_rmse,time
0,905.591492,877.187256,29.61735,00:02
1,864.612427,851.642639,29.182919,00:02
2,851.192017,842.056091,29.018202,00:02
3,844.642944,837.926025,28.946953,00:02
4,837.624451,835.310425,28.901737,00:02
5,827.79187,828.426208,28.782396,00:02
6,821.680786,824.091553,28.706995,00:02
7,814.252319,813.154053,28.515856,00:02
8,813.817322,811.342041,28.484068,00:02
9,808.345825,804.771362,28.368492,00:02


In [19]:
learn_final = load_learner('models/tab_learner.pkl')

In [17]:
#| export
test_dl = learn_final.dls.test_dl(df_test)
preds_final, _ = learn_final.get_preds(dl=test_dl)

#### Get Model Predictions

In [18]:
#| export
preds_final = preds_final.squeeze()

#### Random Forest Model

In [19]:
#| export
xs, y = to_final.train.xs, to_final.train.y
valid_xs, valid_y = to_final.valid.xs, to_final.valid.y
test_xs = test_final.train.xs

In [20]:
#| export
def rf(xs, y, n_estimators=40, max_samples=200_000, max_features=0.5, min_samples_leaf=5, **kwargs):
    return RandomForestRegressor(n_jobs=-1, n_estimators=n_estimators, 
                                 max_samples=max_samples, max_features=max_features,
                                 min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)

In [21]:
#| export
def r_mse(pred, y):
    return round(math.sqrt(((pred-y)**2).mean()), 6)

In [22]:
#| export
def m_rmse(m, xs, y):
    return r_mse(m.predict(xs), y)

In [23]:
#| export
m = rf(xs, y, n_estimators=100)

In [24]:
rf_mse = m_rmse(m, valid_xs, valid_y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

(23.176449, 28.270338)

#### Feature Importance

In [26]:
#| export
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp': m.feature_importances_}).sort_values('imp', ascending=False) 

In [37]:
fi = rf_feat_importance(m, xs)
fi

Unnamed: 0,cols,imp
0,store_sales(in millions),0.218586
5,gross_weight,0.204598
8,units_per_case,0.15282
9,store_sqft,0.10128
2,total_children,0.077333
4,avg_cars_at home(approx).1,0.064515
3,num_children_at_home,0.04959
1,unit_sales(in millions),0.03534
6,recyclable_package,0.027543
7,low_fat,0.023863


#### Gradient Boosting

##### Feature Importance

In [43]:
train_fi = df_train.drop(columns = ['cost'])
target_fi = df_train[dep_var]

In [44]:
def plot_fi(data,ax = None,title = None):
    fi = pd.Series(data, index = train_fi.columns).sort_values(ascending = True)
    fi.plot(kind = 'barh', ax = ax)

In [None]:
x_train, x_test, y_train, y_test = train_tes

In [43]:
gb_train = xgb.DMatrix(xs, label=y)
gb_valid = xgb.DMatrix(valid_xs, label=valid_y)
gb_test = xgb.DMatrix(test_xs)

Set params

In [45]:
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.05,
    'max_depth': 6,
    'min_child_weight': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

Train Model

In [50]:
num_boost_round = 500
early_stopping_rounds = 10

In [51]:
evals = [(gb_train, 'train'), (gb_valid, 'valid')]

In [52]:
booster = xgb.train(
    xgb_params,
    gb_train,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=20)

[0]	train-rmse:102.57799	valid-rmse:102.62649
[20]	train-rmse:85.57004	valid-rmse:85.61834
[40]	train-rmse:71.96188	valid-rmse:72.00769
[60]	train-rmse:61.17031	valid-rmse:61.21340
[80]	train-rmse:52.72202	valid-rmse:52.76324
[100]	train-rmse:46.20773	valid-rmse:46.24730
[120]	train-rmse:41.27069	valid-rmse:41.31044
[140]	train-rmse:37.59939	valid-rmse:37.63962
[160]	train-rmse:34.91779	valid-rmse:34.95841
[180]	train-rmse:32.99660	valid-rmse:33.03857
[200]	train-rmse:31.63321	valid-rmse:31.67602
[220]	train-rmse:30.68050	valid-rmse:30.72464
[240]	train-rmse:30.01761	valid-rmse:30.06336
[260]	train-rmse:29.55457	valid-rmse:29.60213
[280]	train-rmse:29.23423	valid-rmse:29.28258
[300]	train-rmse:29.01029	valid-rmse:29.06081
[320]	train-rmse:28.85392	valid-rmse:28.90623
[340]	train-rmse:28.74425	valid-rmse:28.79861
[360]	train-rmse:28.66663	valid-rmse:28.72323
[380]	train-rmse:28.61079	valid-rmse:28.67059
[400]	train-rmse:28.56875	valid-rmse:28.63159
[420]	train-rmse:28.53794	valid-rmse:2

In [48]:
xgb_mse = r_mse(booster.predict(gb_valid), valid_y)
xgb_mse

28.439951

In [49]:
xgb_preds = booster.predict(gb_test)

In [60]:
#| export
rf_preds = m.predict(test_xs)

In [61]:
#| export
ens_preds = (to_np(preds_final) + rf_preds + xgb_preds) / 3

In [62]:
#| export
sample_df = pd.read_csv(path/'sample_submission.csv')
sample_df['cost'] = preds_final
sample_df.to_csv('submission.csv', index=False)

In [63]:
!head submission.csv

id,cost
360336,72.23431
360337,75.027405
360338,74.87896
360339,73.194725
360340,74.127464
360341,75.40361
360342,77.273415
360343,71.768196
360344,75.90176


In [64]:
sample_df.max()

id      600559.00000
cost        81.76387
dtype: float64

In [65]:
submit = True

In [66]:
if not iskaggle and submit:
    from kaggle import api
    api.competition_submit_cli(file_name='submission.csv', message="drop id column", competition=comp)

100%|██████████| 3.70M/3.70M [00:01<00:00, 2.12MB/s]


In [None]:
import nbdev
nbdev.export.nb_export('media_campaign_cost_boost.ipynb', 'app_v3')
print("export successful")