In [1]:
#| defualt_exp rf_model

In [2]:
#| export


from fastai.tabular.all import *

from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance


import xgboost as xgb

import seaborn as sns

import optuna

import json

In [3]:
#| export
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

In [4]:
#| export
comp = 'playground-series-s3e11'
path = setup_comp(comp, install='fastai')

In [5]:
#| export
df_train = pd.read_csv(path/'train.csv', low_memory=False)
df_test = pd.read_csv(path/'test.csv', low_memory=False)
df_comb = pd.concat([df_train, df_test], ignore_index=True)

In [6]:
#| export
df_train.drop(['id'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)
df_comb.drop(['id'], axis=1, inplace=True)

In [7]:
#| export
df_train['store_sales_per_children'] = df_train['store_sales(in millions)'] / df_train['total_children']

In [8]:
#| export
for column in df_train.columns:
    if (list(df_train[column].unique()) == [0.0, 1.0]):
        df_train.loc[:, column] = df_train[column].astype('bool')

In [9]:
#| export
train_idxs = np.arange(len(df_train))
test_idxs = np.arange(len(df_train), len(df_comb))

In [10]:
#| export
dep_var = 'cost'
procs = [Categorify, FillMissing, Normalize]
cont, cat = cont_cat_split(df_comb, max_card=1, dep_var=dep_var)
splits = RandomSplitter(valid_pct=0.2)(range_of(df_train))

In [11]:
#| export
df_train = df_comb.iloc[train_idxs]
df_test = df_comb.iloc[test_idxs]

In [12]:
#| export
to_final = TabularPandas(df_train, procs, cat, cont, y_names=dep_var, splits=splits)
test_final = TabularPandas(df_test, procs, cat, cont, y_names=None, splits=None)
dls_final = to_final.dataloaders(bs=1024)

### Random Forest Model

In [13]:
#| export
xs, y = to_final.train.xs, to_final.train.y
valid_xs, valid_y = to_final.valid.xs, to_final.valid.y
test_xs = test_final.train.xs

In [14]:
#| export
def rf(xs, y, n_estimators=40, max_samples=200_000, max_features=0.5, min_samples_leaf=5, **kwargs):
    return RandomForestRegressor(n_jobs=-1, n_estimators=n_estimators, 
                                 max_samples=max_samples, max_features=max_features,
                                 min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)

#### RF Optuna Optimization

In [15]:
rf_study = False

In [16]:
def rf_param(trial, xs, y):
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_samples = trial.suggest_int('max_samples', 100, 200_000)
    max_features = trial.suggest_float('max_features', 0.1, 1.0)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    model = rf(xs, y, n_estimators=n_estimators, max_samples=max_samples, 
               max_features=max_features, min_samples_leaf=min_samples_leaf)
    
    score = cross_val_score(model, xs, y, cv=5, scoring='neg_mean_squared_error')
    return math.sqrt(-1 * score.mean())

In [17]:
if rf_study == True:
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: rf_param(trial, xs, y), n_trials=50)
    with open('./training_params/rf_params.json', 'w') as fp:
        json.dump(study.best_params, fp) 

In [18]:
with open('./training_params/rf_params.json', 'r') as fp:
    rf_best_params = json.load(fp)

In [19]:
#| export
def r_mse(pred, y):
    return round(math.sqrt(((pred-y)**2).mean()), 6)

In [20]:
#| export
def m_rmse(m, xs, y):
    return r_mse(m.predict(xs), y)

In [21]:
#| export
m = rf(xs, y, **rf_best_params)

In [22]:
rf_mse = m_rmse(m, valid_xs, valid_y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

(26.484396, 28.17077)

In [23]:
#| export
rf_preds = m.predict(test_xs)
np.savetxt('./predictions/rf_preds.csv', rf_preds, delimiter=',')

In [24]:
rf_preds = np.loadtxt('./predictions/rf_preds.csv', delimiter=',')

#### RF Feature Importance

In [25]:
#| export
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp': m.feature_importances_}).sort_values('imp', ascending=False) 

In [26]:
fi = rf_feat_importance(m, xs)
fi

Unnamed: 0,cols,imp
9,store_sqft,0.179939
0,store_sales(in millions),0.162263
5,gross_weight,0.149502
2,total_children,0.1096
8,units_per_case,0.105428
4,avg_cars_at home(approx).1,0.08791
3,num_children_at_home,0.072675
14,florist,0.040213
1,unit_sales(in millions),0.021502
6,recyclable_package,0.019755


In [27]:
rf_preds

array([102.09342713,  95.00089652,  96.44385451, ..., 100.73261883,
       113.27658028, 115.78389046])

In [28]:
#| export
np.savetxt('./predictions/rf_preds.csv', rf_preds, delimiter=',')

In [29]:
#| export
rf_preds = np.loadtxt('./predictions/rf_preds.csv', delimiter=',')

In [30]:
Path('./rf_submissions').mkdir(exist_ok=True, parents=True)
sample_df = pd.read_csv(path/'sample_submission.csv')
sample_df['cost'] = rf_preds
sample_df.to_csv('./rf_submissions/submission.csv', index=False)

In [31]:
new_sub_nums = sample_df['cost'].to_numpy()
benchmark_df = pd.read_csv('benchmark_submission.csv')
benchmark_nums = benchmark_df['cost'].to_numpy()
diffs = np.mean(np.abs(new_sub_nums - benchmark_nums))

In [32]:
diffs

4.943032977030909

In [33]:
submit = True

In [34]:
if not iskaggle and submit:
    from kaggle import api
    api.competition_submit_cli(file_name='./rf_submissions/submission.csv', message="rf_model", competition=comp)



100%|██████████| 5.81M/5.81M [00:02<00:00, 2.27MB/s]


In [None]:
current_model

In [49]:
submission_history = api.competitions_submissions_list(id=comp)

In [50]:
submission_recent = [x for x in submission_history if x['description'] == 'rf_model']

In [51]:
most_recent_submission = max(submission_recent, key=lambda x: x['date'])
if most_recent_submission['publicScore'] is not None:
    current_model = most_recent_submission['description']
    current_score = most_recent_submission['publicScore']

    if not os.path.exists('model_scores.csv'):
        with open('model_scores.csv', "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["model", "current_score"])

    model_score_df = pd.read_csv('./model_scores.csv')

    if current_model in model_score_df['model'].values:
        model_score_df.loc[model_score_df['model'] == current_model, 'current_score'] = current_score
    else:
        new_row = pd.DataFrame({"model": [current_model], "current_score": [current_score]})
        model_score_df = pd.concat([model_score_df, new_row], ignore_index=True)
        
    pd.DataFrame(model_score_df).to_csv('./model_scores.csv', index=False)

In [94]:
current_score

'0.30094'

In [None]:
import nbdev
nbdev.export.nb_export('nn_model.ipynb', 'nn_model')
print("export successful")