https://www.kaggle.com/truocpham/feature-engineering-and-lightgbm-starter 

 https://www.kaggle.com/nikhilikhar/elo-fastai-pytorch 

 # Load packages

In [None]:
import os
import gc

from fastai import *
from fastai.tabular import *

# Load datasets

In [None]:
%%time
# Read the competition tables. The train and test files are loaded with the
# same option: "first_active_month" is parsed into datetimes on load.
sample_submission = pd.read_csv('../input/sample_submission.csv')
train_df, test_df = (
    pd.read_csv(csv_path, parse_dates=["first_active_month"])
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Preview the first training rows, transposed so each column reads as a row.
train_df.head().T

In [None]:
# Preview the first test rows, transposed so each column reads as a row.
test_df.head().T

In [None]:
%%time
# Only the new-merchant transactions are loaded here; the merchants and
# historical-transactions reads below are disabled.
# merchants = pd.read_csv('../input/merchants.csv')
# historical_transactions = pd.read_csv('../input/historical_transactions.csv')
new_merchant_transactions = pd.read_csv('../input/new_merchant_transactions.csv')

In [None]:
# Columns with fewer than 7 unique values, grouped per source table.
# NOTE(review): the m_/h_/nm_ prefixes presumably stand for the merchants,
# historical_transactions and new_merchant_transactions tables -- confirm.
m_cat_cols = [
    'category_1',
    'most_recent_sales_range',
    'most_recent_purchases_range',
    'active_months_lag3',
    'active_months_lag6',
    'category_4',
    'category_2',
]
h_cat_cols = [
    'authorized_flag',
    'category_1',
    'category_3',
    'category_2',
]
nm_cat_cols = [
    'authorized_flag',
    'category_1',
    'category_3',
    'month_lag',
    'category_2',
]

In [None]:
# Preview the first new-merchant transaction rows, transposed.
new_merchant_transactions.head().T

In [None]:
def missing_impute(df):
    """Fill missing values in place and return the same frame.

    Columns of dtype ``object`` receive the placeholder ``"other"``;
    ``int64`` and ``float64`` columns receive their own mean. Columns of
    any other dtype are left untouched.
    """
    for name in df.columns:
        column = df[name]
        if column.dtype == "object":
            filler = "other"
        elif column.dtype in ("int64", "float64"):
            filler = column.mean()
        else:
            # Not an imputable dtype: leave the column as it is.
            continue
        df[name] = column.fillna(filler)
    return df
def _week_of_year(dates):
    """Return the week-of-year for a ``.dt`` accessor across pandas versions."""
    if hasattr(dates, "isocalendar"):
        # `.dt.week` / `.dt.weekofyear` no longer exist in pandas 2.0;
        # isocalendar() is the supported replacement.
        week = dates.isocalendar().week
        # isocalendar() yields a nullable unsigned column; convert it back to
        # a plain numpy dtype (float only when missing dates require NaN).
        return week.astype("float64" if week.isna().any() else "int64")
    # Older pandas without isocalendar(): use the legacy accessor.
    return dates.week


def datetime_extract(df, dt_col='first_active_month'):
    """Add calendar-part columns derived from a datetime column, in place.

    Args:
        df: DataFrame holding ``dt_col``; it is modified and returned.
        dt_col: name of a datetime-typed column (``.dt`` must be available).

    Returns:
        The same ``df`` with these columns added: ``day``, ``dayofweek``,
        ``dayofyear``, ``days_in_month``, ``daysinmonth``, ``month``,
        ``week``, ``weekday`` and ``weekofyear``. Several are aliases of one
        another (``days_in_month``/``daysinmonth``, ``dayofweek``/``weekday``,
        ``week``/``weekofyear``); all are kept so existing column names stay
        available. The year is not extracted.
    """
    dates = df[dt_col].dt
    week = _week_of_year(dates)

    df['day'] = dates.day
    df['dayofweek'] = dates.dayofweek
    df['dayofyear'] = dates.dayofyear
    df['days_in_month'] = dates.days_in_month
    df['daysinmonth'] = dates.daysinmonth
    df['month'] = dates.month
    df['week'] = week
    df['weekday'] = dates.weekday
    df['weekofyear'] = week

    return df


In [None]:
%%time
# Impute missing values in the card-level tables only; the transaction table
# is not imputed.
for df in (train_df, test_df):
    missing_impute(df)

# Expand "first_active_month" into date-part columns, keeping the original
# column (drop=False).
for df in (train_df, test_df):
    add_datepart(df, "first_active_month", drop=False)

In [None]:
# One-hot encode the three feature_* columns. Each prefix keeps its trailing
# underscore, so the generated columns are named like 'f1__1', 'f2__3', ...
ohe_spec = (('feature_1', 'f1_'), ('feature_2', 'f2_'), ('feature_3', 'f3_'))
train_dummies = [pd.get_dummies(train_df[col], prefix=pfx) for col, pfx in ohe_spec]
test_dummies = [pd.get_dummies(test_df[col], prefix=pfx) for col, pfx in ohe_spec]

# Append the indicator columns to the right of the existing columns.
train_df = pd.concat([train_df, *train_dummies], axis=1, sort=False)
test_df = pd.concat([test_df, *test_dummies], axis=1, sort=False)

# Drop the temporaries and collect garbage.
del ohe_spec, train_dummies, test_dummies
gc.collect()

In [None]:
# Positional indices of the last 10% of the training rows.
# NOTE(review): the DataBunch below is split with random_split_by_pct(), so
# nothing visible here consumes val_idx -- confirm whether it is still needed.
lt = train_df.shape[0]
val_idx = range(int(0.90 * lt), lt)

In [None]:
# Preview the training rows again, now including the engineered columns.
train_df.head().T

In [None]:
path = Path('../input')

# Categorical inputs: the raw feature_* columns plus their one-hot indicator
# columns (named by the 'f1_' / 'f2_' / 'f3_' get_dummies prefixes).
cat_names = [
    'feature_1', 'feature_2', 'feature_3',
    'f1__1', 'f1__2', 'f1__3', 'f1__4', 'f1__5',
    'f2__1', 'f2__2', 'f2__3',
    'f3__0', 'f3__1',
    # Transaction-level columns, currently disabled:
    # 'city_id', 'category_1', 'category_3', 'merchant_category_id',
    # 'month_lag', 'category_2', 'state_id', 'subsector_id'
]
# No continuous inputs are used ('purchase_amount' is not a column of train_df
# as built above, so it stays disabled).
cont_names = []
dep_var = 'target'
procs = [FillMissing, Categorify, Normalize]

# Release any DataBunch / Learner left over from a previous run of this cell.
# Each name is removed independently, so a missing `data` no longer prevents
# `learn` from being dropped or the collection from running.
globals().pop('data', None)
globals().pop('learn', None)
gc.collect()

# Random train/validation split, float regression labels without a log
# transform (log=False), and test_df attached as the unlabeled test set.
data = (TabularList.from_df(train_df, path=path, cat_names=cat_names,
                            cont_names=cont_names, procs=procs)
    .random_split_by_pct()
    .label_from_df(cols=dep_var, label_cls=FloatList, log=False)
    .add_test(ItemList.from_df(test_df, path))
    .databunch())


In [None]:
# Number of test rows.
len(test_df)

In [None]:
def rmse(pred:FloatTensor, targ:FloatTensor):
    """Root-mean-squared error between `pred` and `targ`.

    Both tensors are flattened before subtracting. Squeezing only `pred`
    would let an (N, 1) target broadcast against an (N,) prediction into an
    (N, N) difference and silently return a wrong value.

    Args:
        pred: predictions; any shape with the same element count as `targ`.
        targ: targets; any shape with the same element count as `pred`.

    Returns:
        A zero-dimensional tensor holding the RMSE.

    Raises:
        AssertionError: if the element counts differ.
    """
    assert pred.numel() == targ.numel(), "Expected same numbers of elements in pred & targ"
    diff = targ.reshape(-1) - pred.reshape(-1)
    return torch.sqrt((diff**2).mean())

# Output range handed to the learner: the observed target extremes widened by
# 20%, always including 0. A fixed lower bound of 0 would make every negative
# target value unreachable; for a non-negative target this is still
# [0, 1.2 * max].
min_y = np.min(train_df[dep_var])
max_y = (np.max(train_df[dep_var])*1.2)
y_range = torch.tensor([min(0, min_y*1.2), max(0, max_y)], device=defaults.device)
y_range

In [None]:
# Tabular model with two hidden layers (1000 and 500 units), light dropout on
# each layer and on the embeddings, outputs constrained to y_range, and RMSE
# reported as the metric.
learn = tabular_learner(
    data,
    layers=[1000, 500],
    ps=[0.001, 0.01],
    emb_drop=0.04,
    model_dir='..',
    y_range=y_range,
    metrics=rmse,
)

In [None]:
# Display the network architecture.
learn.model

In [None]:
# Run the learning-rate finder; its results are plotted in the next cell.
learn.lr_find()

In [None]:
# Plot the losses recorded by the learning-rate finder.
# NOTE(review): skip_end is negative here -- confirm it trims the end of the
# curve as intended.
learn.recorder.plot(skip_start=0, skip_end=-10)

In [None]:
# Train for one epoch with the one-cycle policy at a maximum learning rate of 3e-2.
learn.fit_one_cycle(1, 3e-2,)

In [None]:
# Train for five more epochs at the same maximum learning rate, now with
# weight decay 0.2.
learn.fit_one_cycle(5, 3e-2, wd=0.2)

In [None]:
# Predict on the test set attached to `data`; the second returned value is
# discarded.
preds, _ = learn.get_preds(ds_type=DatasetType.Test)

In [None]:
from datetime import datetime

# The labels were created with log=False, so the predictions are already on
# the target's own scale. They are written out as-is; applying np.exp here
# would exponentiate values that were never in log space.
ids = test_df['card_id']
vals = preds.numpy()
sub = pd.DataFrame({'card_id': ids, 'target': vals[:, 0]})

# One submission file per calendar day, e.g. submission-2019-01-31.csv.
sub.to_csv(f"submission-{datetime.now().date()}.csv", index=False, header=True)
sub.head()
