In [1]:
#| default_exp nn_model

In [2]:
#| export
from fastai.tabular.all import *

from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

import xgboost as xgb

import seaborn as sns

import optuna

import json

In [3]:
#| export
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

In [4]:
#| export
# Kaggle competition slug; reused later when submitting and when listing submissions.
comp = 'playground-series-s3e11'
# `path` is the location the competition CSV files are read from in the cells below.
path = setup_comp(comp, install='fastai')

In [5]:
#| export
# Load the competition's train and test tables with identical read options.
df_train, df_test = (
    pd.read_csv(path/csv_name, low_memory=False)
    for csv_name in ('train.csv', 'test.csv')
)
# Stack train on top of test and renumber the rows with a fresh index.
df_comb = pd.concat([df_train, df_test], ignore_index=True)

In [6]:
#| export
# Remove the `id` column in place from each of the three frames.
df_train.drop(columns='id', inplace=True)
df_test.drop(columns='id', inplace=True)
df_comb.drop(columns='id', inplace=True)

In [7]:
#| export
# Engineered feature: store sales divided by the number of children.
# NOTE(review): rows where total_children is 0 would yield inf/NaN here — confirm how
# those rows should be handled.
# NOTE(review): the column is added to `df_train` only; `df_train` is later re-sliced
# from `df_comb`, which never receives it, so this feature does not appear to reach
# the model — verify the intent.
df_train['store_sales_per_children'] = df_train['store_sales(in millions)'] / df_train['total_children']

In [8]:
#| export
# Cast every column whose only values are 0 and 1 to bool.
# NOTE(review): `df_train` is rebuilt from `df_comb` a few cells below and `df_comb`
# is not modified here, so confirm whether this cast should target `df_comb` instead.
for column in df_train.columns:
    # Compare as sets: `unique()` returns values in order of first appearance, so an
    # ordered list comparison would miss columns whose first value happens to be 1.
    if set(df_train[column].unique()) == {0.0, 1.0}:
        # Plain column assignment replaces the column so the bool dtype sticks;
        # `.loc[:, column] = ...` may write into the existing float column instead.
        df_train[column] = df_train[column].astype('bool')

In [9]:
#| export
# Row positions of the train and test portions inside `df_comb` (train rows come first).
train_idxs = np.arange(start=0, stop=len(df_train))
test_idxs = np.arange(start=len(df_train), stop=len(df_comb))

In [10]:
#| export
# Target column to predict.
dep_var = 'cost'
# Preprocessing steps handed to TabularPandas.
procs = [Categorify, FillMissing, Normalize]
# Continuous / categorical column names are derived from the combined frame, excluding the target.
cont, cat = cont_cat_split(df_comb, max_card=1, dep_var=dep_var)
# A fixed seed keeps the 80/20 train/validation split identical across kernel restarts,
# so Optuna trials and the final training run are evaluated on the same rows.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df_train))

In [11]:
#| export
# Re-slice the train/test frames out of the combined frame by row position.
# This rebinds `df_train`/`df_test`, so columns that were added only to the earlier
# `df_train` are not present on the new frames.
df_train, df_test = df_comb.iloc[train_idxs], df_comb.iloc[test_idxs]

In [12]:
#| export
# Training data: preprocessed with `procs` and split into train/validation by `splits`.
to_final = TabularPandas(
    df_train, procs=procs, cat_names=cat, cont_names=cont,
    y_names=dep_var, splits=splits,
)
# Test data: same column roles, no target and no split.
test_final = TabularPandas(
    df_test, procs=procs, cat_names=cat, cont_names=cont,
    y_names=None, splits=None,
)
# DataLoaders used by both the Optuna trials and the final model.
dls_final = to_final.dataloaders(bs=1024)

#### Optimizing NN Parameters with Optuna

In [13]:
#| export
def nn_trial(trial):
    """Optuna objective: train a tabular network with sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample `lr`, `wd`, `n_layers` and `hidden_dim`.

    Returns:
        The validation loss recorded for the last training epoch.
    """
    # Learning rate and weight decay span several orders of magnitude, so sample them
    # on a log scale; linear sampling would almost never try the small values.
    lr = trial.suggest_float('lr', 1e-5, 1e-1, log=True)
    wd = trial.suggest_float('wd', 1e-6, 1e-1, log=True)
    n_layers = trial.suggest_int('n_layers', 1, 3)
    hidden_dim = trial.suggest_int('hidden_dim', 100, 1000)

    # Every hidden layer has the same width.
    layer_sizes = [hidden_dim] * n_layers

    learn = tabular_learner(dls_final, layers=layer_sizes, metrics=rmse, wd=wd)
    learn.fit_one_cycle(10, lr)

    # Each row of recorder.values is [train_loss, valid_loss, *metrics]; index 1 is the
    # validation loss (index 0 is the training loss and would reward overfitting).
    return learn.recorder.values[-1][1]


In [14]:
#| export
# Set to True to rerun the Optuna search and overwrite the saved parameter file;
# when False, the previously saved JSON is loaded as-is.
nn_study = False

In [15]:
#| export
# Run the hyperparameter search only when explicitly enabled, then persist the best parameters.
if nn_study:
    study = optuna.create_study(direction='minimize')
    study.optimize(nn_trial, n_trials=50)
    params_path = Path('./training_params/nn_params.json')
    # Create the output directory first so the write does not fail on a fresh checkout.
    params_path.parent.mkdir(parents=True, exist_ok=True)
    with open(params_path, 'w') as fp:
        json.dump(study.best_params, fp)

In [16]:
#| export
# Read back the best hyperparameters saved by the Optuna study.
with open('./training_params/nn_params.json', mode='r') as params_file:
    nn_best_params = json.loads(params_file.read())

In [17]:
#| export
# Take the learning rate out of the dict; the remaining entries are later passed
# to the learner as keyword arguments. Re-running this cell raises KeyError
# because 'lr' has already been removed.
nn_lr = nn_best_params.pop('lr')
nn_lr

0.028868944455163

In [18]:
#| export
# Replace the sampled `n_layers` / `hidden_dim` pair with the `layers` list the learner
# expects: `n_layers` hidden layers, each `hidden_dim` wide.
nn_best_params['layers'] = [nn_best_params['hidden_dim']] * nn_best_params.pop('n_layers')
# Last expression: removes the now-redundant key and displays the hidden width.
nn_best_params.pop('hidden_dim')

722

### Neural Network Model

#### Train Model

In [19]:
#| export
# Set to True to retrain and re-export the model; when False, the previously
# exported learner is loaded from disk instead.
nn_train = False
# Number of epochs passed to fit_one_cycle for the final model.
epochs = 17

In [20]:
#| export
# Train the final model with the tuned hyperparameters and export it, only when enabled.
if nn_train:
    # y_range bounds the network's predictions to the given interval.
    learn_final = tabular_learner(dls_final, **nn_best_params, y_range=(0, 150), metrics=rmse)
    learn_final.fit_one_cycle(epochs, nn_lr)
    # export() writes relative to the learner's path; make sure the target directory
    # exists so the export does not fail on a fresh checkout.
    (learn_final.path/'models').mkdir(parents=True, exist_ok=True)
    learn_final.export('models/tab_learner.pkl')


In [21]:
#| export
# Load the exported learner from disk. This runs regardless of `nn_train`, so the
# file must already exist when training is skipped.
# NOTE(review): load_learner unpickles the file — only load exports you trust.
learn_final = load_learner('models/tab_learner.pkl')

#### Get NN Model Predictions

In [22]:
#| export
# Build a test DataLoader with the learner's own preprocessing, then keep only the
# predictions (first element of the returned pair).
test_dl = learn_final.dls.test_dl(df_test)
nn_preds = learn_final.get_preds(dl=test_dl)[0]

In [23]:
#| export
# Flatten the prediction tensor to a NumPy array and persist it as CSV.
nn_preds = nn_preds.squeeze().numpy()
# Create the output directory first so savetxt does not fail on a fresh checkout.
Path('./predictions').mkdir(parents=True, exist_ok=True)
np.savetxt('./predictions/nn_preds.csv', nn_preds, delimiter=',')

In [24]:
#| export
# Reload the predictions from the CSV written in the previous cell, so later cells
# work from the on-disk values.
nn_preds = np.loadtxt('./predictions/nn_preds.csv', delimiter=',')

In [25]:
# Make sure the output folder exists before writing the submission.
Path('./nn_submissions').mkdir(parents=True, exist_ok=True)
# Start from Kaggle's sample submission and overwrite its `cost` column with our predictions.
sample_df = pd.read_csv(path/'sample_submission.csv').assign(cost=nn_preds)
sample_df.to_csv('./nn_submissions/submission.csv', index=False)

In [26]:
# Compare the new submission against a stored benchmark submission:
# mean absolute difference between the two `cost` columns.
benchmark_df = pd.read_csv('benchmark_submission.csv')
new_sub_nums = sample_df['cost'].to_numpy()
benchmark_nums = benchmark_df['cost'].to_numpy()
diffs = np.abs(new_sub_nums - benchmark_nums).mean()

In [27]:
# Display the mean absolute difference between the new and benchmark submissions.
diffs

5.484411156291848

In [28]:
# Safety switch: set to True to upload the submission file to Kaggle in the next cell.
submit = False

In [29]:
# Upload the submission only when explicitly enabled and not running on Kaggle itself.
if submit and not iskaggle:
    from kaggle import api
    api.competition_submit_cli(
        file_name='./nn_submissions/submission.csv',
        message="nn_model",
        competition=comp,
    )



100%|██████████| 5.78M/5.78M [00:02<00:00, 2.24MB/s]


In [34]:
# Submission description used to find this model's entries in the Kaggle submission history.
current_model = 'nn_model'

In [35]:
# Record the public score of this model's most recent Kaggle submission in model_scores.csv.

# Imported here as well: the submission cell only imports `api` when `submit` is True,
# so on a fresh kernel with `submit = False` the name would otherwise be undefined.
from kaggle import api

submission_history = api.competitions_submissions_list(id=comp)
submission_recent = [x for x in submission_history if x['description'] == current_model]
# `default=None` avoids a ValueError when no submission matches `current_model`.
most_recent_submission = max(submission_recent, key=lambda x: x['date'], default=None)

# Always define `current_score` so later cells never see a stale or missing value.
current_score = None
if most_recent_submission is not None and most_recent_submission['publicScore'] is not None:
    current_model = most_recent_submission['description']
    # NOTE(review): the score appears to arrive as a string (e.g. '0.29889'); confirm
    # whether it should be converted to float before being stored.
    current_score = most_recent_submission['publicScore']

    scores_path = Path('./model_scores.csv')
    # Start from the existing score table, or an empty one with the expected columns.
    if scores_path.exists():
        model_score_df = pd.read_csv(scores_path)
    else:
        model_score_df = pd.DataFrame(columns=["model", "current_score"])

    if current_model in model_score_df['model'].values:
        # Update the existing row for this model.
        model_score_df.loc[model_score_df['model'] == current_model, 'current_score'] = current_score
    else:
        # First score for this model: append a new row.
        new_row = pd.DataFrame({"model": [current_model], "current_score": [current_score]})
        model_score_df = pd.concat([model_score_df, new_row], ignore_index=True)

    model_score_df.to_csv(scores_path, index=False)

In [36]:
# Display the public score retrieved for the most recent matching submission.
current_score

'0.29889'

In [None]:
# Run nbdev's notebook export on this notebook file.
# NOTE(review): the second argument is presumably the output location for the
# generated module — confirm against the installed nbdev version.
import nbdev
nbdev.export.nb_export('nn_model.ipynb', 'nn_model')
print("export successful")