# Structured and time series data

In [None]:
import os

from fastai.structured import *
from fastai.column_data import *
from sklearn.preprocessing import OneHotEncoder

# Summarise printed arrays once they hold more than 50 elements, showing 20
# items at each edge of the summary.
np.set_printoptions(threshold=50, edgeitems=20)

# Directory the cleaned feather files are read from; also handed to
# ColumnarModelData.from_data_frame as its first argument.
input_dir = 'data/input/'

# Fraction of rows kept for training; 1 - train_ratio is passed to
# get_cv_idxs as the validation percentage.
train_ratio = 0.99
# Batch size passed to ColumnarModelData.from_data_frame (bs=...).
batch_size = 128
# Divisor applied to a categorical variable's level count when deriving the
# width of its embedding.
embedding_divisor = 1.5
# Upper bound on the width of any single embedding.
embedding_size_max = 50

## Massage input data

In [None]:
# Read the cleaned train and test frames (in that order) from input_dir.
data_train, data_test = (
    pd.read_feather(os.path.join(input_dir, file_name))
    for file_name in ('train_clean', 'test_clean')
)

In [None]:
# Column holding the label to predict, and columns that are never features.
response_var = 'outcome'
ignore_vars = ['id']

# Every category-typed column is a categorical feature, except the ignored
# columns and the response itself.
cat_vars = [
    col
    for col in data_train.columns[data_train.dtypes == 'category'].tolist()
    if col not in ignore_vars + [response_var]
]

In [None]:
# df_train = df_train.set_index("id")
# df_test = df_test.set_index("id")

In [None]:
# Preprocess the training frame with scaling enabled, leaving the ignored
# columns out. The NA dictionary and mapper returned here are passed back in
# when the test frame is processed.
# NOTE(review): presumably df_train is the numeric feature frame and y the raw
# 'outcome' values — confirm against the installed fastai proc_df.
df_train, y, nas, mapper = proc_df(data_train, response_var, do_scale=True, skip_flds=ignore_vars)

In [None]:
# Integer code for each outcome label; this order is also the column order of
# the one-hot targets built below.
mapping = {'FE': 0, 'UE': 1, 'W': 2}

# Index the dict directly so an unexpected label raises KeyError here instead
# of turning into None and failing obscurely during the array lookup.
y_codes = [mapping[item] for item in y]

# One-hot encode by selecting rows of an (n_classes x n_classes) identity
# matrix. The identity only needs one row per class, not one per sample.
response_y = np.eye(len(mapping))[y_codes]
# response_y = np.array([1 if item == 'W' else 0 for item in y])

# response_y = np.array(y_codes)
response_y = response_y.astype('float32')
# response_y = response_y.astype('int64')
# idxs = np.where(response_y > 0)[1]
# response_y = torch.LongTensor(idxs)
response_y = response_y.squeeze()
response_y

In [None]:
# Preprocess the test frame, passing in the mapper and NA dictionary obtained
# from the training frame; the returned response values are discarded.
# NOTE(review): nas and mapper are rebound to this call's results — confirm
# nothing later needs the training-time objects.
# NOTE(review): passing response_var presumably requires data_test to carry an
# 'outcome' column — confirm.
df_test, _, nas, mapper = proc_df(data_test, response_var, do_scale=True, skip_flds=ignore_vars,
                                  mapper=mapper, na_dict=nas)

#### Split into validation set

In [None]:
# Number of training rows available for the train/validation split.
samp_size = df_train.shape[0]

# Row indices reserved for validation: the share not covered by train_ratio.
val_idx = get_cv_idxs(samp_size, val_pct=1 - train_ratio)

## Set up the deep learning

In [None]:
def multi_logloss(preds, targs, epsilon=1e-15):
    """Return the log-loss of ``preds`` against ``targs``.

    Three-dimensional targets are reduced to two dimensions by keeping index 0
    of the middle axis before scoring. ``epsilon`` is accepted but not used by
    the computation.
    """
    targets = targs[:, 0, :] if targs.ndim == 3 else targs
    return metrics.log_loss(targets, preds)

Create a `ColumnarModelData` object directly from the data frame.

In [None]:
# Build the model data from the processed training frame: val_idx marks the
# validation rows, response_y holds the one-hot targets, cat_vars names the
# categorical fields and df_test is attached as the test set.
model_data = ColumnarModelData.from_data_frame(input_dir, 
   val_idx, df_train, response_y, cat_flds=cat_vars, bs=batch_size, test_df=df_test)

Some categorical variables have a lot more levels than others. Store, in particular, has over a thousand!

In [None]:
# Pair each categorical column with its number of categories plus one.
# NOTE(review): the extra slot is presumably reserved for missing values — confirm.
cat_sz = [
    (col, 1 + len(data_train[col].cat.categories))
    for col in cat_vars
]
cat_sz

We use the *cardinality* of each variable (that is, its number of unique values) to decide how large to make its *embeddings*. Each level will be associated with a vector with length defined as below.

In [None]:
# (table size, embedding width) per categorical variable: the width scales with
# the table size through embedding_divisor and is capped at embedding_size_max.
emb_szs = [
    (n_levels, min(embedding_size_max, int((n_levels + 1) / embedding_divisor)))
    for _, n_levels in cat_sz
]
emb_szs

#### Build the learner

In [None]:
def binary_cross_entropy(input, target, weight=None, size_average=True):
    """Binary cross-entropy of ``input`` against ``target`` with axis 1 of the target squeezed out.

    ``weight`` and ``size_average`` are forwarded unchanged to
    ``F.binary_cross_entropy``.
    """
    flat_target = target.squeeze(1)
    return F.binary_cross_entropy(
        input,
        flat_target,
        weight=weight,
        size_average=size_average,
    )

def cross_entropy(input, target, weight=None, size_average=True):
    """Cross-entropy of ``input`` against ``target`` with axis 1 of the target squeezed out.

    ``weight`` and ``size_average`` are forwarded unchanged to
    ``F.cross_entropy``.
    """
    flat_target = target.squeeze(1)
    return F.cross_entropy(
        input,
        flat_target,
        weight=weight,
        size_average=size_average,
    )


class StructuredClassifyLearner(Learner):
    """Learner whose training criterion is this file's ``binary_cross_entropy``."""

    def __init__(self, data, models, **kwargs):
        """Forward all arguments to ``Learner`` and then install the loss."""
        super().__init__(data, models, **kwargs)
        
        # Assigned after the base constructor, so this replaces whatever
        # criterion the base class set up.
        self.crit = binary_cross_entropy
        # self.crit = cross_entropy


class MixedInputClassifyModel(nn.Module):
    """Feed-forward classifier over embedded categorical and continuous inputs.

    Every categorical column is looked up in its own embedding table. The
    embeddings are concatenated, passed through dropout, joined with the
    batch-normalised continuous columns and fed through a stack of
    Linear -> ReLU -> (optional BatchNorm) -> Dropout layers. A final linear
    layer followed by a sigmoid produces the outputs.

    Args:
        emb_szs: (num_embeddings, embedding_dim) pairs, one per categorical
            column, in the column order of ``x_cat``.
        n_cont: number of continuous input columns (may be 0).
        emb_drop: dropout probability applied to the concatenated embeddings.
        out_sz: number of output units.
        szs: widths of the hidden layers.
        drops: dropout probability for each hidden layer; needs at least one
            entry per entry of ``szs``.
        use_bn: apply batch normalisation after each hidden activation.

    Raises:
        ValueError: if ``drops`` has fewer entries than ``szs``.
    """

    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops, use_bn=False):
        super().__init__()
        # forward() zips layers with dropouts, and zip() stops at the shortest
        # sequence: too few dropout rates would silently skip trailing layers.
        if len(drops) < len(szs):
            raise ValueError(
                'drops must provide a dropout rate for every hidden layer: '
                'got %d rates for %d layers' % (len(drops), len(szs)))

        # One embedding table per categorical column, then initialise weights.
        self.embs = nn.ModuleList([nn.Embedding(m, d) for m, d in emb_szs])
        for emb in self.embs: emb_init(emb)
        # Total width contributed by the embeddings to the first layer's input.
        n_emb = sum(e.embedding_dim for e in self.embs)
        # Remembered so forward() can skip an input group that is absent.
        self.n_emb, self.n_cont = n_emb, n_cont

        # Prepend the input width so consecutive pairs describe each layer.
        szs = [n_emb + n_cont] + list(szs)
        self.lins = nn.ModuleList([
            nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        # Batch normalisation for the hidden-layer activations.
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in szs[1:]])
        for o in self.lins: kaiming_normal(o.weight.data)
        # Output layer is kept separate so no dropout is applied after it.
        self.outp = nn.Linear(szs[-1], out_sz)
        kaiming_normal(self.outp.weight.data)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        # Batch normalisation for the continuous inputs.
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn = use_bn

    def forward(self, x_cat, x_cont):
        """Return sigmoid activations for a batch.

        ``x_cat`` is indexed as ``x_cat[:, i]`` for the i-th embedding, and
        ``x_cont`` holds the continuous columns. ``x_cont`` is ignored when the
        model was built with ``n_cont == 0``, and ``x_cat`` is ignored when
        there are no embeddings.
        """
        if self.n_emb != 0:
            # Look up each categorical column and lay the vectors side by side.
            x = torch.cat([emb(x_cat[:, i]) for i, emb in enumerate(self.embs)], 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2
        for l, d, b in zip(self.lins, self.drops, self.bns):
            x = F.relu(l(x))
            if self.use_bn: x = b(x)
            x = d(x)
        x = self.outp(x)

        # Squash each output unit independently into (0, 1).
        x = F.sigmoid(x)
        return x

In [None]:
# Instantiate the network. Every column of df_train that is not a categorical
# variable is counted as continuous, and there is one output unit per entry of
# `mapping`. Three hidden layers (1000, 500, 250) each get their own dropout
# rate, with batch normalisation enabled.
model = MixedInputClassifyModel(emb_szs = emb_szs, 
                                n_cont = len(df_train.columns) - len(cat_vars), 
                                emb_drop = 0.25, 
                                out_sz = len(mapping), 
                                szs = [1000, 500, 250], 
                                drops = [0.02, 0.04, 0.08],
                                use_bn = True)

# Wrap the network in BasicModel and train it with Adam through the learner
# class defined in this notebook.
learner = StructuredClassifyLearner(model_data, BasicModel(model),
                                    opt_fn=optim.Adam)

In [None]:
# Run the learning-rate finder and plot what its scheduler recorded.
# NOTE(review): the two arguments are presumably the start and end learning
# rates of the sweep — confirm against fastai's Learner.lr_find.
learner.lr_find(1e-5, 10)
learner.sched.plot()

### Fit model

In [None]:
# Learning rate used for training.
lr = 5e-3
# Train while reporting multi_logloss as the metric.
# NOTE(review): presumably 3 cycles, the first one epoch long and each later
# cycle twice the previous length — confirm against fastai's Learner.fit.
learner.fit(lr, 3, metrics=[multi_logloss], cycle_len=1, cycle_mult=2)

In [None]:
# learner.save('val0')
# learner.load('val0')

In [None]:
# Score the network with the same metric that was reported during training.
# NOTE(review): predict_with_targs presumably returns predictions and targets
# for the validation set — confirm.
preds, targs = learner.predict_with_targs()
multi_logloss(preds, targs)

## Traditional ML

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from fastai.imports import *

def plot_fi(fi):
    """Draw ``fi`` as a horizontal bar chart of 'imp' against 'cols' and return the plot result."""
    chart = fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)
    return chart

In [None]:
# Split the feature matrix and the three-class labels with the same validation
# indices used for the network; each result is unpacked as (validation, training).
((val, trn), (y_val, y_trn)) = split_by_idx(val_idx, df_train.values, y)

In [None]:
# Random forest baseline on the three-class problem.
# NOTE(review): oob_score=True is requested but the out-of-bag score is not
# read anywhere in this notebook.
rf_learner = RandomForestClassifier(n_estimators=1000, max_features=0.4,
                                    min_samples_leaf=3,
                                    n_jobs=-1, oob_score=True, criterion='entropy')
rf_learner.fit(trn, y_trn)
preds = rf_learner.predict_proba(val)
# Displayed tuple: training accuracy, validation accuracy, validation log-loss.
rf_learner.score(trn, y_trn), rf_learner.score(val, y_val), metrics.log_loss(y_val, preds)

In [None]:
# Feature importances of the random forest, labelled with df_train's columns.
plot_fi(rf_feat_importance(rf_learner, df_train))

In [None]:
# Gradient boosting on the three-class problem; each tree is fit on half of
# the training rows (subsample=0.5).
gbm_learner = GradientBoostingClassifier(n_estimators=2000, max_depth=3,
                                         min_samples_leaf=3, subsample = 0.5,
                                         learning_rate=0.01)
gbm_learner.fit(trn, y_trn)
preds = gbm_learner.predict_proba(val)
# Displayed tuple: training accuracy, validation accuracy, validation log-loss.
gbm_learner.score(trn, y_trn), gbm_learner.score(val, y_val), metrics.log_loss(y_val, preds)

In [None]:
# Feature importances of the three-class gradient boosting model.
plot_fi(rf_feat_importance(gbm_learner, df_train))

In [None]:
# One-vs-rest relabelling of the outcome: 'W' against everything else ('E'),
# and 'UE' against everything else ('tough').
y_winner = np.array(['E' if label != 'W' else label for label in y])
y_unforced = np.array(['tough' if label != 'UE' else label for label in y])

In [None]:
# Same rows as before, but with the binary winner / not-winner labels.
((val, trn), (y_val, y_trn)) = split_by_idx(val_idx, df_train.values, y_winner)

# Gradient boosting model dedicated to separating 'W' from the other outcomes.
gbm_winner_learner = GradientBoostingClassifier(n_estimators=2000, max_depth=4,
                                                min_samples_leaf=2, subsample = 0.8,
                                                learning_rate=0.01)
gbm_winner_learner.fit(trn, y_trn)
preds = gbm_winner_learner.predict_proba(val)
# Displayed tuple: training accuracy, validation accuracy, validation log-loss.
gbm_winner_learner.score(trn, y_trn), gbm_winner_learner.score(val, y_val), metrics.log_loss(y_val, preds)

In [None]:
# Feature importances of the winner / not-winner model.
plot_fi(rf_feat_importance(gbm_winner_learner, df_train))

In [None]:
# Same rows again, now with the binary unforced-error / other labels.
((val, trn), (y_val, y_trn)) = split_by_idx(val_idx, df_train.values, y_unforced)

# Gradient boosting model dedicated to separating 'UE' from the other outcomes.
gbm_unforced_learner = GradientBoostingClassifier(n_estimators=1000, max_depth=4,
                                                  min_samples_leaf=3, subsample = 0.6,
                                                  learning_rate=0.01)
gbm_unforced_learner.fit(trn, y_trn)
preds = gbm_unforced_learner.predict_proba(val)
# Displayed tuple: training accuracy, validation accuracy, validation log-loss.
gbm_unforced_learner.score(trn, y_trn), gbm_unforced_learner.score(val, y_val), metrics.log_loss(y_val, preds)

In [None]:
# Feature importances of the unforced-error / other model.
plot_fi(rf_feat_importance(gbm_unforced_learner, df_train))

## Ensemble averaging

In [None]:
# Network predictions for the training, validation and test loaders.
# NOTE(review): if the training loader shuffles its rows, nn_trn_preds will not
# line up row-for-row with `trn` — confirm before trusting the stacked features.
nn_trn_preds = learner.predict_dl(model_data.trn_dl)
nn_val_preds = learner.predict_dl(model_data.val_dl)
nn_test_preds = learner.predict_dl(model_data.test_dl)

# Fitted scikit-learn models whose class probabilities feed the stacker.
learners = {
    'rf': rf_learner,
    'gbm': gbm_learner,
    'winner': gbm_winner_learner,
    'unforced': gbm_unforced_learner,
}

# learners = {'gbm': gbm_learner}

# For each data split: one probability array per scikit-learn model (in dict
# order), followed by the network's predictions.
trn_preds = [np.array(fitted.predict_proba(trn)) for fitted in learners.values()] + [nn_trn_preds]
val_preds = [np.array(fitted.predict_proba(val)) for fitted in learners.values()] + [nn_val_preds]
test_preds = [np.array(fitted.predict_proba(df_test)) for fitted in learners.values()] + [nn_test_preds]

In [None]:
# Lay every model's probability columns side by side, giving one feature
# matrix per data split for the stacking model.
trn_preds, val_preds, test_preds = (
    np.column_stack(parts) for parts in (trn_preds, val_preds, test_preds)
)

In [None]:
# Restore the three-class labels: y_val / y_trn were last bound to the binary
# unforced-error labels.
y_val, y_trn = split_by_idx(val_idx, y)[0]

# Second-level model: an L2-regularised linear classifier with logistic loss,
# trained by SGD on the base models' predicted probabilities.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
stacking_model = SGDClassifier(loss='log', penalty='l2', alpha=0.007)

stacking_model.fit(trn_preds, y_trn)
preds = stacking_model.predict_proba(val_preds)

In [None]:
# Displayed tuple: validation log-loss, then training log-loss, of the stacker.
metrics.log_loss(y_val, preds), metrics.log_loss(y_trn, stacking_model.predict_proba(trn_preds))

## Test output

In [None]:
pred_test_nn = learner.predict(True)

data_out = pd.DataFrame(pred_test_nn, columns = ['FE', 'UE', 'W'])
data_out['submission_id'] = data_test['id'].astype('str') + '_' + data_test['gender'].astype('str')
data_out['train'] = 0
data_out = data_out[['submission_id', 'train', 'UE', 'FE', 'W']]
data_out.index = data_out['submission_id']
submission_format = pd.read_csv('data/AUS_SubmissionFormat.csv')
data_out = data_out.loc[submission_format['submission_id']].reset_index(drop = True)
data_out.head()

data_out.to_csv('submissions/submission_nn.csv', index = False)
data_out.head()

In [None]:
pred_test_stack = stacking_model.predict_proba(test_preds)

data_out = pd.DataFrame(pred_test_stack, columns = ['FE', 'UE', 'W'])
data_out['submission_id'] = data_test['id'].astype('str') + '_' + data_test['gender'].astype('str')
data_out['train'] = 0
data_out = data_out[['submission_id', 'train', 'UE', 'FE', 'W']]
data_out.index = data_out['submission_id']
submission_format = pd.read_csv('data/AUS_SubmissionFormat.csv')
data_out = data_out.loc[submission_format['submission_id']].reset_index(drop = True)
data_out.head()

data_out.to_csv('submissions/submission_stack.csv', index = False)
data_out.head()

In [None]:
pred_test_gbm = gbm_learner.predict_proba(df_test)

data_out = pd.DataFrame(pred_test_gbm, columns = ['FE', 'UE', 'W'])
data_out['submission_id'] = data_test['id'].astype('str') + '_' + data_test['gender'].astype('str')
data_out['train'] = 0
data_out = data_out[['submission_id', 'train', 'UE', 'FE', 'W']]
data_out.index = data_out['submission_id']
submission_format = pd.read_csv('data/AUS_SubmissionFormat.csv')
data_out = data_out.loc[submission_format['submission_id']].reset_index(drop = True)
data_out.head()

data_out.to_csv('submissions/submission_gbm.csv', index = False)
data_out.head()

In [None]:
pred_test_average = (pred_test_nn + pred_test_gbm) / 2

In [None]:
data_out = pd.DataFrame(pred_test_average, columns = ['FE', 'UE', 'W'])
data_out['submission_id'] = data_test['id'].astype('str') + '_' + data_test['gender'].astype('str')
data_out['train'] = 0
data_out = data_out[['submission_id', 'train', 'UE', 'FE', 'W']]
data_out.index = data_out['submission_id']
submission_format = pd.read_csv('data/AUS_SubmissionFormat.csv')
data_out = data_out.loc[submission_format['submission_id']].reset_index(drop = True)
data_out.head()

data_out.to_csv('submissions/submission_average.csv', index = False)
data_out.head()