In [1]:
#| default_exp app_v1

#### Competition

[LINK](https://www.kaggle.com/competitions/playground-series-s3e11/overview)

#### Imports

In [2]:
#| export
from fastai.tabular.all import *
from sklearn.model_selection import KFold

#### Downloading Datasets

In [3]:
#| export
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

In [4]:
#| export
comp = 'playground-series-s3e11'
path = setup_comp(comp, install='fastai')

In [5]:
# copy .gitignore template from my home directory and append project data folder to it
if not os.path.exists('.gitignore'):
    !cp ~/.gitignore .
if comp not in open('.gitignore').read():
    with open('.gitignore', 'a') as f: f.write(f'{comp}')

#### Create Dataframes

In [6]:
#| export
df_train = pd.read_csv(path/'train.csv', low_memory=False)
df_test = pd.read_csv(path/'test.csv', low_memory=False)

In [7]:
#| export
df_comb = pd.concat([df_train, df_test], ignore_index=True)

In [8]:
df_train

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
0,0,8.61,3.0,2.0,2.0,2.0,10.30,1.0,0.0,32.0,36509.0,0.0,0.0,0.0,0.0,0.0,62.09
1,1,5.00,2.0,4.0,0.0,3.0,6.66,1.0,0.0,1.0,28206.0,1.0,0.0,0.0,0.0,0.0,121.80
2,2,14.08,4.0,0.0,0.0,3.0,21.30,1.0,0.0,26.0,21215.0,1.0,0.0,0.0,0.0,0.0,83.51
3,3,4.02,3.0,5.0,0.0,0.0,14.80,0.0,1.0,36.0,21215.0,1.0,0.0,0.0,0.0,0.0,66.78
4,4,2.13,3.0,5.0,0.0,3.0,17.00,1.0,1.0,20.0,27694.0,1.0,1.0,1.0,1.0,1.0,111.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360331,360331,7.60,4.0,5.0,5.0,3.0,13.50,1.0,0.0,33.0,30268.0,0.0,0.0,0.0,0.0,0.0,133.42
360332,360332,14.44,4.0,4.0,0.0,4.0,18.80,1.0,1.0,18.0,20319.0,0.0,0.0,0.0,0.0,0.0,81.85
360333,360333,10.74,3.0,0.0,0.0,2.0,11.30,1.0,0.0,35.0,30584.0,1.0,1.0,1.0,1.0,1.0,87.07
360334,360334,11.04,3.0,1.0,0.0,3.0,10.20,0.0,1.0,14.0,30584.0,1.0,1.0,1.0,1.0,1.0,146.72


In [9]:
df_train['store_sales_per_children'] = df_train['store_sales(in millions)'] / df_train['total_children']

In [10]:
list(df_train['coffee_bar'].unique())

[0.0, 1.0]

In [11]:
for column in df_train.columns:
    if (list(df_train[column].unique()) == [0.0, 1.0]):
        df_train.loc[:, column] = df_train[column].astype('bool')

In [12]:
df_train.dtypes

id                              int64
store_sales(in millions)      float64
unit_sales(in millions)       float64
total_children                float64
num_children_at_home          float64
avg_cars_at home(approx).1    float64
gross_weight                  float64
recyclable_package            float64
low_fat                          bool
units_per_case                float64
store_sqft                    float64
coffee_bar                       bool
video_store                      bool
salad_bar                        bool
prepared_food                    bool
florist                          bool
cost                          float64
store_sales_per_children      float64
dtype: object

In [13]:
#| export
train_idxs = np.arange(len(df_train))

In [14]:
#| export
test_idxs = np.arange(len(df_train), len(df_comb))

In [15]:
#| export
dep_var = 'cost'

In [16]:
#| export
procs = [Categorify, FillMissing, Normalize]

In [17]:
#| export
cont, cat = cont_cat_split(df_comb, max_card=1, dep_var=dep_var)

In [18]:
#| export
splits = RandomSplitter(valid_pct=0.2)(range_of(df_train))

In [19]:
#| export
df_train = df_comb.iloc[train_idxs]
df_test = df_comb.iloc[test_idxs]

In [20]:
len(df_train) + len(df_test) == len(df_comb)

True

#### K-Folds Cross-Validation

In [21]:
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
rmse_scores = []
test_preds_all = np.zeros((len(df_test), n_folds))

In [22]:
# for fold, (train_index, valid_index) in enumerate(kf.split(df_train)):
#     train_fold, valid_fold = df_train.loc[train_index], df_train.loc[valid_index]
#     to_fold = TabularPandas(train_fold, procs, cat, cont, y_names=dep_var, splits=None)
#     valid_fold_to = TabularPandas(valid_fold, procs, cat, cont, y_names=dep_var, splits=None)
    
#     dls_fold = to_fold.dataloaders(bs=1024)
#     dls_fold.valid = valid_fold_to.dataloaders(bs=1024).train
    
#     learn_fold = tabular_learner(dls_fold, layers=[200, 100], metrics=rmse)
#     learn_fold.fit_one_cycle(10, 1e-2)
    
#     preds, targs = learn_fold.get_preds()
#     rmse_score = rmse(preds, targs).item()
#     rmse_scores.append(rmse_score)

#     test_dl = learn_fold.dls.test_dl(df_test)
#     test_preds, _ = learn_fold.get_preds(dl=test_dl)
#     test_preds_all[:, fold] = test_preds.squeeze()

    

# mean_rmse_score = np.mean(rmse_scores)
# print(f'Mean RMSE score for {n_folds}-fold cross-validation: {mean_rmse_score}')

In [23]:
def kfold_cross_val(layers, learning_rate):
    rmse_scores = []
    for train_index, valid_index in kf.split(df_train):
        train_fold, valid_fold = df_train.loc[train_index], df_train.loc[valid_index]
        to_fold = TabularPandas(train_fold, procs, cat, cont, y_names=dep_var, splits=None)
        valid_fold_to = TabularPandas(valid_fold, procs, cat, cont, y_names=dep_var, splits=None)

        dls_fold = to_fold.dataloaders(bs=1024)
        dls_fold.valid = valid_fold_to.dataloaders(bs=1024).train

        learn_fold = tabular_learner(dls_fold, layers=layers, metrics=rmse)
        learn_fold.fit_one_cycle(5, lr_max=learning_rate)

        preds, targs = learn_fold.get_preds()
        rmse_score = rmse(preds, targs).item()
        rmse_scores.append(rmse_score)

    return np.mean(rmse_scores)

In [24]:
# layer_sizes = [[200, 100], [300, 150]]
# learning_rates = [1e-2, 1e-3]

# best_rmse = float('inf')
# best_layers = None
# best_lr = None

# for layers in layer_sizes:
#     for lr in learning_rates:
#         mean_rmse = kfold_cross_val(layers, lr)
#         print(f'Layers: {layers}, Learning rate: {lr}, Mean RMSE: {mean_rmse}')
#         if mean_rmse < best_rmse:
#             best_rmse = mean_rmse
#             best_layers = layers
#             best_lr = lr

# print(f'Best layers: {best_layers}, Best learning rate: {best_lr}, Best RMSE: {best_rmse}')

In [25]:
splits = RandomSplitter(valid_pct=0.2)(range_of(df_train))

In [26]:
to_final = TabularPandas(df_train, procs, cat, cont, y_names=dep_var, splits=splits)
dls_final = to_final.dataloaders(bs=1024)

In [27]:
to_final.cat_names

(#0) []

In [28]:
to_final

              id  store_sales(in millions)  unit_sales(in millions)  \
161832 -0.175908                  0.769376                -0.056522   
74316  -1.017233                  0.488066                -0.056522   
7330   -1.661196                  0.237004                -1.332241   
215927  0.344129                 -1.072753                -0.056522   
43634  -1.312191                  2.363468                 2.494917   
...          ...                       ...                      ...   
40330  -1.343954                  0.959941                -0.056522   
212198  0.308280                 -0.256046                -0.056522   
139327 -0.392257                  2.293897                 1.219198   
118234 -0.595032                 -1.009231                 1.219198   
86109  -0.903863                  1.259401                 2.494917   

        total_children  num_children_at_home  avg_cars_at home(approx).1  \
161832        1.036692             -0.567229                   -1.10808

In [29]:
to_final.train.y.max()

149.75

In [30]:
to_final.train.y.min()

50.79

In [31]:
epochs=17

In [32]:
learn_final = tabular_learner(dls_final, layers=[200, 100], y_range=(0, 150), metrics=rmse)
learn_final.fit_one_cycle(epochs, 1e-2)

epoch,train_loss,valid_loss,_rmse,time
0,904.007141,888.531494,29.808247,00:03
1,864.565918,860.694214,29.337591,00:02
2,850.851807,846.404968,29.093037,00:02
3,841.301941,840.293701,28.98782,00:03
4,836.917542,837.097168,28.932632,00:05
5,832.389771,829.908508,28.80813,00:02
6,830.736694,828.787537,28.788673,00:02
7,827.740662,821.924866,28.669233,00:02
8,822.458679,819.691162,28.630251,00:02
9,816.674622,816.230774,28.569754,00:02


In [33]:
test_dl = learn_final.dls.test_dl(df_test)
preds_final, _ = learn_final.get_preds(dl=test_dl)

In [None]:
# test_preds_avg = np.mean(test_preds_all, axis=1)

In [34]:
preds_final = preds_final.squeeze()


In [35]:
preds_final

tensor([ 97.0685, 100.3434, 100.7119,  ..., 100.8236, 107.4522, 107.0247])

In [36]:
#| export
sample_df = pd.read_csv(path/'sample_submission.csv')
sample_df['cost'] = preds_final
sample_df.to_csv('submission.csv', index=False)

In [37]:
!head submission.csv

id,cost
360336,97.06852
360337,100.343445
360338,100.71191
360339,99.094284
360340,85.61225
360341,103.97306
360342,110.84988
360343,94.05224
360344,90.5632


In [38]:
sample_df.max()

id      600559.000000
cost       142.999832
dtype: float64

In [39]:
submit = True

In [40]:
if not iskaggle and submit:
    from kaggle import api
    api.competition_submit_cli(file_name='submission.csv', message="baseline model", competition=comp)



100%|██████████| 3.81M/3.81M [00:01<00:00, 2.02MB/s]


In [None]:
import nbdev
nbdev.export.nb_export('media_campaign_cost.ipynb', 'app_v1')
print("export successful")