In [1]:
#| default_exp app_v1

#### Competition

[LINK](https://www.kaggle.com/competitions/playground-series-s3e11/overview)

#### Imports

In [2]:
#| export
from fastai.tabular.all import *
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

#### Downloading Datasets

In [3]:
#| export
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

In [4]:
#| export
comp = 'playground-series-s3e11'
path = setup_comp(comp, install='fastai')

In [5]:
# copy .gitignore template from my home directory and append project data folder to it
if not os.path.exists('.gitignore'):
    !cp ~/.gitignore .
if comp not in open('.gitignore').read():
    with open('.gitignore', 'a') as f: f.write(f'{comp}')

#### Create Dataframes

In [6]:
#| export
df_train = pd.read_csv(path/'train.csv', low_memory=False)
df_test = pd.read_csv(path/'test.csv', low_memory=False)

In [7]:
#| export
df_comb = pd.concat([df_train, df_test], ignore_index=True)

In [8]:
df_train

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
0,0,8.61,3.0,2.0,2.0,2.0,10.30,1.0,0.0,32.0,36509.0,0.0,0.0,0.0,0.0,0.0,62.09
1,1,5.00,2.0,4.0,0.0,3.0,6.66,1.0,0.0,1.0,28206.0,1.0,0.0,0.0,0.0,0.0,121.80
2,2,14.08,4.0,0.0,0.0,3.0,21.30,1.0,0.0,26.0,21215.0,1.0,0.0,0.0,0.0,0.0,83.51
3,3,4.02,3.0,5.0,0.0,0.0,14.80,0.0,1.0,36.0,21215.0,1.0,0.0,0.0,0.0,0.0,66.78
4,4,2.13,3.0,5.0,0.0,3.0,17.00,1.0,1.0,20.0,27694.0,1.0,1.0,1.0,1.0,1.0,111.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360331,360331,7.60,4.0,5.0,5.0,3.0,13.50,1.0,0.0,33.0,30268.0,0.0,0.0,0.0,0.0,0.0,133.42
360332,360332,14.44,4.0,4.0,0.0,4.0,18.80,1.0,1.0,18.0,20319.0,0.0,0.0,0.0,0.0,0.0,81.85
360333,360333,10.74,3.0,0.0,0.0,2.0,11.30,1.0,0.0,35.0,30584.0,1.0,1.0,1.0,1.0,1.0,87.07
360334,360334,11.04,3.0,1.0,0.0,3.0,10.20,0.0,1.0,14.0,30584.0,1.0,1.0,1.0,1.0,1.0,146.72


In [9]:
df_train.drop(['id'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)
df_comb.drop(['id'], axis=1, inplace=True)

In [10]:
df_train

Unnamed: 0,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
0,8.61,3.0,2.0,2.0,2.0,10.30,1.0,0.0,32.0,36509.0,0.0,0.0,0.0,0.0,0.0,62.09
1,5.00,2.0,4.0,0.0,3.0,6.66,1.0,0.0,1.0,28206.0,1.0,0.0,0.0,0.0,0.0,121.80
2,14.08,4.0,0.0,0.0,3.0,21.30,1.0,0.0,26.0,21215.0,1.0,0.0,0.0,0.0,0.0,83.51
3,4.02,3.0,5.0,0.0,0.0,14.80,0.0,1.0,36.0,21215.0,1.0,0.0,0.0,0.0,0.0,66.78
4,2.13,3.0,5.0,0.0,3.0,17.00,1.0,1.0,20.0,27694.0,1.0,1.0,1.0,1.0,1.0,111.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360331,7.60,4.0,5.0,5.0,3.0,13.50,1.0,0.0,33.0,30268.0,0.0,0.0,0.0,0.0,0.0,133.42
360332,14.44,4.0,4.0,0.0,4.0,18.80,1.0,1.0,18.0,20319.0,0.0,0.0,0.0,0.0,0.0,81.85
360333,10.74,3.0,0.0,0.0,2.0,11.30,1.0,0.0,35.0,30584.0,1.0,1.0,1.0,1.0,1.0,87.07
360334,11.04,3.0,1.0,0.0,3.0,10.20,0.0,1.0,14.0,30584.0,1.0,1.0,1.0,1.0,1.0,146.72


In [11]:
df_train['store_sales_per_children'] = df_train['store_sales(in millions)'] / df_train['total_children']

In [12]:
list(df_train['coffee_bar'].unique())

[0.0, 1.0]

In [13]:
for column in df_train.columns:
    if (list(df_train[column].unique()) == [0.0, 1.0]):
        df_train.loc[:, column] = df_train[column].astype('bool')

In [14]:
df_train.dtypes

store_sales(in millions)      float64
unit_sales(in millions)       float64
total_children                float64
num_children_at_home          float64
avg_cars_at home(approx).1    float64
gross_weight                  float64
recyclable_package            float64
low_fat                          bool
units_per_case                float64
store_sqft                    float64
coffee_bar                       bool
video_store                      bool
salad_bar                        bool
prepared_food                    bool
florist                          bool
cost                          float64
store_sales_per_children      float64
dtype: object

In [15]:
#| export
train_idxs = np.arange(len(df_train))

In [16]:
#| export
test_idxs = np.arange(len(df_train), len(df_comb))

In [17]:
#| export
dep_var = 'cost'

In [18]:
#| export
procs = [Categorify, FillMissing, Normalize]

In [19]:
#| export
cont, cat = cont_cat_split(df_comb, max_card=1, dep_var=dep_var)

In [20]:
#| export
splits = RandomSplitter(valid_pct=0.2)(range_of(df_train))

In [21]:
#| export
df_train = df_comb.iloc[train_idxs]
df_test = df_comb.iloc[test_idxs]

In [22]:
len(df_train) + len(df_test) == len(df_comb)

True

#### K-Folds Cross-Validation

In [23]:
# n_folds = 5
# kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
# rmse_scores = []
# test_preds_all = np.zeros((len(df_test), n_folds))

In [24]:
# for fold, (train_index, valid_index) in enumerate(kf.split(df_train)):
#     train_fold, valid_fold = df_train.loc[train_index], df_train.loc[valid_index]
#     to_fold = TabularPandas(train_fold, procs, cat, cont, y_names=dep_var, splits=None)
#     valid_fold_to = TabularPandas(valid_fold, procs, cat, cont, y_names=dep_var, splits=None)
    
#     dls_fold = to_fold.dataloaders(bs=1024)
#     dls_fold.valid = valid_fold_to.dataloaders(bs=1024).train
    
#     learn_fold = tabular_learner(dls_fold, layers=[200, 100], metrics=rmse)
#     learn_fold.fit_one_cycle(10, 1e-2)
    
#     preds, targs = learn_fold.get_preds()
#     rmse_score = rmse(preds, targs).item()
#     rmse_scores.append(rmse_score)

#     test_dl = learn_fold.dls.test_dl(df_test)
#     test_preds, _ = learn_fold.get_preds(dl=test_dl)
#     test_preds_all[:, fold] = test_preds.squeeze()

    

# mean_rmse_score = np.mean(rmse_scores)
# print(f'Mean RMSE score for {n_folds}-fold cross-validation: {mean_rmse_score}')

In [25]:
def kfold_cross_val(layers, learning_rate):
    rmse_scores = []
    for train_index, valid_index in kf.split(df_train):
        train_fold, valid_fold = df_train.loc[train_index], df_train.loc[valid_index]
        to_fold = TabularPandas(train_fold, procs, cat, cont, y_names=dep_var, splits=None)
        valid_fold_to = TabularPandas(valid_fold, procs, cat, cont, y_names=dep_var, splits=None)

        dls_fold = to_fold.dataloaders(bs=1024)
        dls_fold.valid = valid_fold_to.dataloaders(bs=1024).train

        learn_fold = tabular_learner(dls_fold, layers=layers, metrics=rmse)
        learn_fold.fit_one_cycle(5, lr_max=learning_rate)

        preds, targs = learn_fold.get_preds()
        rmse_score = rmse(preds, targs).item()
        rmse_scores.append(rmse_score)

    return np.mean(rmse_scores)

In [26]:
# layer_sizes = [[200, 100], [300, 150]]
# learning_rates = [1e-2, 1e-3]

# best_rmse = float('inf')
# best_layers = None
# best_lr = None

# for layers in layer_sizes:
#     for lr in learning_rates:
#         mean_rmse = kfold_cross_val(layers, lr)
#         print(f'Layers: {layers}, Learning rate: {lr}, Mean RMSE: {mean_rmse}')
#         if mean_rmse < best_rmse:
#             best_rmse = mean_rmse
#             best_layers = layers
#             best_lr = lr

# print(f'Best layers: {best_layers}, Best learning rate: {best_lr}, Best RMSE: {best_rmse}')

In [27]:
splits = RandomSplitter(valid_pct=0.2)(range_of(df_train))

In [28]:
to_final = TabularPandas(df_train, procs, cat, cont, y_names=dep_var, splits=splits)
test_final = TabularPandas(df_test, procs, cat, cont, y_names=None, splits=None)
dls_final = to_final.dataloaders(bs=1024)

In [29]:
epochs=17

#### Train Model

In [30]:
learn_final = tabular_learner(dls_final, layers=[200, 100], y_range=(0, 150), metrics=rmse)
learn_final.fit_one_cycle(epochs, 1e-2)

epoch,train_loss,valid_loss,_rmse,time
0,895.872009,874.514282,29.572191,00:04
1,859.670532,858.316162,29.297035,00:02
2,848.579407,852.127014,29.191212,00:02
3,847.807983,845.060059,29.06992,00:02
4,836.020325,833.755798,28.87483,00:02
5,831.847534,828.838684,28.789558,00:02
6,824.805115,826.227478,28.744175,00:02
7,819.01062,825.132629,28.725119,00:02
8,817.523071,815.194702,28.551613,00:02
9,813.505615,811.646057,28.489403,00:02


In [31]:
test_dl = learn_final.dls.test_dl(df_test)
preds_final, _ = learn_final.get_preds(dl=test_dl)

#### Get Model Predictions

In [32]:
preds_final = preds_final.squeeze()

In [33]:
preds_final

tensor([ 97.8138,  98.6221, 100.2354,  ..., 100.1513, 107.9970, 111.9872])

#### Random Forest Model

In [34]:
#| export
xs, y = to_final.train.xs, to_final.train.y
valid_xs, valid_y = to_final.valid.xs, to_final.valid.y
test_xs = test_final.train.xs

In [35]:
def rf(xs, y, n_estimators=40, max_samples=200_000, max_features=0.5, min_samples_leaf=5, **kwargs):
    return RandomForestRegressor(n_jobs=-1, n_estimators=n_estimators, 
                                 max_samples=max_samples, max_features=max_features,
                                 min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)

In [36]:
#| export
def r_mse(pred, y):
    return round(math.sqrt(((pred-y)**2).mean()), 6)

In [37]:
#| export
def m_rmse(m, xs, y):
    return r_mse(m.predict(xs), y)

In [38]:
m = rf(xs, y, n_estimators=100)

In [39]:
rf_mse = m_rmse(m, valid_xs, valid_y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

(23.145543, 28.29217)

In [40]:
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp': m.feature_importances_}).sort_values('imp', ascending=False) 

In [41]:
fi = rf_feat_importance(m, xs)
fi

Unnamed: 0,cols,imp
0,store_sales(in millions),0.21915
5,gross_weight,0.205314
8,units_per_case,0.153429
9,store_sqft,0.100817
2,total_children,0.076905
4,avg_cars_at home(approx).1,0.06337
3,num_children_at_home,0.050143
1,unit_sales(in millions),0.035234
6,recyclable_package,0.027278
7,low_fat,0.024077


In [46]:
#| export
rf_preds = m.predict(test_xs)

In [47]:
rf_preds

array([110.39516014,  90.86170374, 101.74165286, ...,  98.97014657,
       117.42786308, 115.32894703])

In [48]:
preds_final

tensor([ 97.8138,  98.6221, 100.2354,  ..., 100.1513, 107.9970, 111.9872])

In [49]:
#| export
ens_preds = (to_np(preds_final) + rf_preds) /2

In [51]:
#| export
sample_df = pd.read_csv(path/'sample_submission.csv')
sample_df['cost'] = preds_final
sample_df.to_csv('submission.csv', index=False)

In [52]:
!head submission.csv

id,cost
360336,97.813835
360337,98.62207
360338,100.235374
360339,102.82609
360340,82.80857
360341,104.276184
360342,114.131294
360343,91.833176
360344,91.2275


In [53]:
sample_df.max()

id      600559.000000
cost       138.794235
dtype: float64

In [54]:
submit = True

In [55]:
if not iskaggle and submit:
    from kaggle import api
    api.competition_submit_cli(file_name='submission.csv', message="drop id column", competition=comp)



100%|██████████| 3.81M/3.81M [00:01<00:00, 2.17MB/s]


In [None]:
import nbdev
nbdev.export.nb_export('media_campaign_cost.ipynb', 'app_v1')
print("export successful")