### TabularLearner with part 1

In this notebook data generated from this [kernel](https://www.kaggle.com/keremt/fastai-feature-engineering-part1-6160-feats/) is used during modeling. Data can also be found as a Kaggle [dataset](https://www.kaggle.com/keremt/dsbowl2019-feng-part1). This notebook is part 1 of series of notebooks that will model data from corresponding feature engineering kernels as we keep adding hopefully some creative features :)

**Important Note:** Feature generation for test data will happen online since private test set is not publicly available for precomputation!

This notebook will give a LB around: 0.506 (score can vary but it's solely for baseline purposes)

**To see how features are generated in more detail:** [Feature Engineering Part 1 Notebook](https://www.kaggle.com/keremt/fastai-feature-engineering-part1-6160-feats/)

### Imports

In [1]:
from fastai.core import *
Path.read_csv = lambda o: pd.read_csv(o)
input_path = Path("../data-science-bowl-2019")
pd.options.display.max_columns=200
pd.options.display.max_rows=200
input_path.ls()

[PosixPath('../data-science-bowl-2019/sample_submission.csv'),
 PosixPath('../data-science-bowl-2019/specs.csv'),
 PosixPath('../data-science-bowl-2019/dsbowl2019-feng-part1'),
 PosixPath('../data-science-bowl-2019/train_labels.csv'),
 PosixPath('../data-science-bowl-2019/train.csv'),
 PosixPath('../data-science-bowl-2019/test.csv'),
 PosixPath('../data-science-bowl-2019/working')]

### Read Data

In [2]:
train_with_features_part1 = pd.read_csv("../data-science-bowl-2019/dsbowl2019-feng-part1/train_with_features_part1.csv")

In [3]:
sample_subdf = (input_path/'sample_submission.csv').read_csv()
# specs_df = (input_path/"specs.csv").read_csv()
# train_labels_df = (input_path/"train_labels.csv").read_csv()
# train_df = (input_path/"train.csv").read_csv()
test_df = (input_path/"test.csv").read_csv()

In [4]:
sample_subdf.shape, test_df.shape, train_with_features_part1.shape

((1000, 2), (1156414, 11), (17690, 6191))

### Features (part1)

Basically here we redefine the feature generation code for test.

In [5]:
from fastai.tabular import *
import types

stats = ["median","mean","sum","min","max"]
UNIQUE_COL_VALS = pickle.load(open(input_path/"dsbowl2019-feng-part1/UNIQUE_COL_VALS.pkl", "rb"))

In [6]:
for k in UNIQUE_COL_VALS.__dict__.keys():
    print(k, len(UNIQUE_COL_VALS.__dict__[k]))

event_ids 379
media_types 4
titles 44
worlds 4
event_codes 42


In [7]:
def array_output(f):
    def inner(*args, **kwargs): return array(listify(f(*args, **kwargs))).flatten()
    return inner

feature_funcs = []

@array_output
def time_elapsed_since_hist_begin(df):
    "total time passed until assessment begin"
    return df['timestampElapsed'].max() - df['timestampElapsed'].min()

feature_funcs.append(time_elapsed_since_hist_begin)

@array_output
def time_elapsed_since_each(df, types, dfcol):
    "time since last occurence of each types, if type not seen then time since history begin"
    types = UNIQUE_COL_VALS.__dict__[types]
    last_elapsed = df['timestampElapsed'].max()
    _d = dict(df.iloc[:-1].groupby(dfcol)['timestampElapsed'].max())
    return [last_elapsed - _d[t] if t in _d else time_elapsed_since_hist_begin(df)[0] for t in types]

feature_funcs.append(partial(time_elapsed_since_each, types="media_types", dfcol="type"))
feature_funcs.append(partial(time_elapsed_since_each, types="titles", dfcol="title"))
feature_funcs.append(partial(time_elapsed_since_each, types="event_ids", dfcol="event_id"))
feature_funcs.append(partial(time_elapsed_since_each, types="worlds", dfcol="world"))
feature_funcs.append(partial(time_elapsed_since_each, types="event_codes", dfcol="event_code"))

@array_output
def countfreqhist(df, types, dfcol, freq=False):
    "count or freq of types until assessment begin"
    types = UNIQUE_COL_VALS.__dict__[types]
    _d = dict(df[dfcol].value_counts(normalize=(True if freq else False)))
    return [_d[t] if t in _d else 0 for t in types]

feature_funcs.append(partial(countfreqhist, types="media_types", dfcol="type", freq=False))
feature_funcs.append(partial(countfreqhist, types="media_types", dfcol="type", freq=True))

feature_funcs.append(partial(countfreqhist, types="titles", dfcol="title", freq=False))
feature_funcs.append(partial(countfreqhist, types="titles", dfcol="title", freq=True))

feature_funcs.append(partial(countfreqhist, types="event_ids", dfcol="event_id", freq=False))
feature_funcs.append(partial(countfreqhist, types="event_ids", dfcol="event_id", freq=True))

feature_funcs.append(partial(countfreqhist, types="worlds", dfcol="world", freq=False))
feature_funcs.append(partial(countfreqhist, types="worlds", dfcol="world", freq=True))

feature_funcs.append(partial(countfreqhist, types="event_codes", dfcol="event_code", freq=False))
feature_funcs.append(partial(countfreqhist, types="event_codes", dfcol="event_code", freq=True))

@array_output
def overall_event_count_stats(df):
    "overall event count stats until assessment begin"
    return df['event_count'].agg(stats)
feature_funcs.append(overall_event_count_stats)

@array_output
def event_count_stats_each(df, types, dfcol):
    "event count stats per media types until assessment begin, all zeros if media type missing for user"
    types = UNIQUE_COL_VALS.__dict__[types]
    _stats_df = df.groupby(dfcol)['event_count'].agg(stats)
    _d = dict(zip(_stats_df.reset_index()[dfcol].values, _stats_df.values))
    return [_d[t] if t in _d else np.zeros(len(stats)) for t in types]
feature_funcs.append(partial(event_count_stats_each, types="media_types", dfcol="type"))
feature_funcs.append(partial(event_count_stats_each, types="titles", dfcol="title"))
feature_funcs.append(partial(event_count_stats_each, types="event_ids", dfcol="event_id"))
feature_funcs.append(partial(event_count_stats_each, types="worlds", dfcol="world"))
feature_funcs.append(partial(event_count_stats_each, types="event_codes", dfcol="event_code"))

@array_output
def overall_session_game_time_stats(df):
    "overall session game time stats until assessment begin"
    return df['game_time'].agg(stats)
feature_funcs.append(overall_session_game_time_stats)

@array_output
def session_game_time_stats_each(df, types, dfcol):
    "session game time stats per media types until assessment begin, all zeros if missing for user"
    types = UNIQUE_COL_VALS.__dict__[types]
    _stats_df = df.groupby(dfcol)['game_time'].agg(stats)
    _d = dict(zip(_stats_df.reset_index()[dfcol].values, _stats_df.values))
    return [_d[t] if t in _d else np.zeros(len(stats)) for t in types]
feature_funcs.append(partial(session_game_time_stats_each, types="media_types", dfcol="type"))
feature_funcs.append(partial(session_game_time_stats_each, types="titles", dfcol="title"))
feature_funcs.append(partial(session_game_time_stats_each, types="event_ids", dfcol="event_id"))
feature_funcs.append(partial(session_game_time_stats_each, types="worlds", dfcol="world"))
feature_funcs.append(partial(session_game_time_stats_each, types="event_codes", dfcol="event_code"))

len(feature_funcs)

28

### Test Feature Engineering

Test set in LB and Private LB is different than what is publicly shared. So feature engineering and inference for test set should be done online.

In [8]:
def get_sorted_user_df(df, ins_id):
    "extract sorted data for a given installation id and add datetime features"
    _df = df[df.installation_id == ins_id].sort_values("timestamp").reset_index(drop=True)
    add_datepart(_df, "timestamp", time=True)
    return _df

def get_test_assessment_start_idxs(df): 
    return list(df.sort_values("timestamp")
                  .query("type == 'Assessment' & event_code == 2000")
                  .groupby("installation_id").tail(1).index)

def get_test_feats_row(idx, i):
    "get all faeatures by an installation start idx"
    ins_id = test_df.loc[idx, "installation_id"]
    _df = get_sorted_user_df(test_df, ins_id)
    assessment_row = _df.iloc[-1]
    row_feats = np.concatenate([f(_df) for f in feature_funcs])
    feat_row = pd.Series(row_feats, index=[f"static_feat{i}"for i in range(len(row_feats))])
    row = pd.concat([assessment_row, feat_row])
    return row

In [9]:
# Feature Engineering part 1
start_idxs = get_test_assessment_start_idxs(test_df)
res = parallel(get_test_feats_row, start_idxs)
test_with_features_df = pd.concat(res,1).T

In [10]:
test_with_features_part1 = test_with_features_df

In [11]:
# check to see train and test have same features
num_test_feats = [c for c in test_with_features_df.columns if c.startswith("static")]
num_train_feats = [c for c in train_with_features_part1.columns if c.startswith("static")]
assert num_train_feats == num_test_feats

## Pseudolabeling

In [12]:
pseudolabels = pd.read_csv('for_pseudolabeling.csv', index_col=0)

In [13]:
train_with_features_part1.drop(columns=['Unnamed: 0'], inplace=True)


In [14]:
test_with_features_part1_pseudo = test_with_features_part1.copy()
test_with_features_part1_pseudo['accuracy_group'] = np.mean(pseudolabels, axis=1)
test_with_features_part1_pseudo['accuracy'] = np.zeros_like(pseudolabels['lgb']) * np.nan
test_with_features_part1_pseudo['num_correct'] = np.zeros_like(pseudolabels['lgb']) * np.nan
test_with_features_part1_pseudo['num_incorrect'] = np.zeros_like(pseudolabels['lgb']) * np.nan

In [15]:
train_with_features_part1 = pd.concat([train_with_features_part1,
                                       test_with_features_part1_pseudo],
                                      axis=0,
                                      sort=True).reset_index()

In [16]:
train_with_features_part1.drop(columns=['index'], inplace=True)

### TabularLearner Model

Here we use a single validation but in later stages once we finalize features we should use cross-validation. We don't over optimize the model or do any hyperparameter search since the whole purpose is to get a baseline and build on top of it in upcoming parts.

In [17]:
from fastai.tabular import *

In [18]:
train_with_features_part1.shape, test_with_features_part1.shape

((18690, 6190), (1000, 6186))

In [19]:
# create validation set - split by installation_id
np.random.seed(42)
valid_ids = (np.random.choice(train_with_features_part1.installation_id.unique(),
                              int(len(train_with_features_part1)*0.05)))
valid_idx = (train_with_features_part1[train_with_features_part1.installation_id.isin(valid_ids)].index); valid_idx

Int64Index([    3,     4,     5,     6,     7,     8,     9,    10,    11,
               12,
            ...
            18641, 18645, 18647, 18656, 18659, 18668, 18674, 18687, 18688,
            18689],
           dtype='int64', length=3488)

In [20]:
# get data
cat_names = ['title','world']
cont_names = [c for c in train_with_features_part1.columns if c.startswith("static_")]

In [24]:
# from torch.utils.data import TensorDataset, DataLoader

# trainset = TensorDataset(torch.from_numpy(train_numpy[train_idxs, :]))
# train_loader = DataLoader(trainset, batch_size=256, shuffle=True, num_workers=-1)

# validset = TensorDataset(torch.from_numpy(train_numpy[valid_idx, :]))
# valid_loader = DataLoader(validset, batch_size=256, shuffle=False, num_workers=-1)

# testset = TensorDataset(torch.from_numpy(test_numpy))
# test_loader = DataLoader(testset, batch_size=256, shuffle=False, num_workers=-1)


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.

In [36]:
from multiprocessing import  Pool


def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores, axis=1)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split), axis=1)
    pool.close()
    pool.join()
    return df

In [37]:
def change_df_type(df, dtype=float):
    for col in df.columns:
        df[col] = df[col].astype(dtype)
    return df

In [40]:
%%time


train_with_features_part1_type = parallelize_dataframe(train_with_features_part1[cont_names], change_df_type, 16)

CPU times: user 44.4 s, sys: 31.4 s, total: 1min 15s
Wall time: 4min 17s


In [50]:
columns_to_add = list(set(train_with_features_part1.columns) - set(train_with_features_part1_type.columns))

In [51]:
for col in columns_to_add:
    train_with_features_part1_type[col] = train_with_features_part1[col]

In [53]:
procs = [FillMissing, Categorify, Normalize]
data = TabularDataBunch.from_df(path=".", df=train_with_features_part1_type, dep_var="accuracy_group", 
                                valid_idx=valid_idx, procs=procs, cat_names=cat_names, cont_names=cont_names)

data.add_test(TabularList.from_df(test_with_features_part1, cat_names=cat_names, cont_names=cont_names));

In [74]:
# fit
from fastai.callbacks import EarlyStoppingCallback, SaveModelCallback
learner = tabular_learner(data, [512, 256, 128, 256, 128, 64], y_range=(0., 3), ps=0.6, emb_drop=0)
learner.fit_one_cycle(100, 3e-3, callbacks=[EarlyStoppingCallback(learner,
                                                                  monitor='valid_loss',
                                                                  patience=10)])

epoch,train_loss,valid_loss,time
0,1.889248,1.606252,00:06
1,1.758725,1.562617,00:06
2,1.708034,1.538743,00:06
3,1.621285,1.531267,00:06
4,1.587671,1.521891,00:06
5,1.547718,1.536903,00:06
6,1.514013,1.525254,00:06
7,1.45865,1.523725,00:06
8,1.372522,1.463661,00:06
9,1.287194,1.316368,00:06


Epoch 36: early stopping


### Check Validation Score

Again, we don't search for optimal coefficients since main purpose is to create a baseline.

In [75]:
from sklearn.metrics import cohen_kappa_score

In [76]:
def qwk(a1, a2):
    """
    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1:
    :param a2:
    :param max_rat:
    :return:
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e


In [77]:
# from functools import partial
# import scipy as sp
# class OptimizedRounder(object):
#     """
#     An optimizer for rounding thresholds
#     to maximize Quadratic Weighted Kappa (QWK) score
#     # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
#     """
#     def __init__(self):
#         self.coef_ = 0

#     def _kappa_loss(self, coef, X, y):
#         """
#         Get loss according to
#         using current coefficients
        
#         :param coef: A list of coefficients that will be used for rounding
#         :param X: The raw predictions
#         :param y: The ground truth labels
#         """
#         X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1/4, 2/4, 3/4])

#         return -qwk(y, X_p)

#     def fit(self, X, y):
#         """
#         Optimize rounding thresholds
        
#         :param X: The raw predictions
#         :param y: The ground truth labels
#         """
#         loss_partial = partial(self._kappa_loss, X=X, y=y)
#         initial_coef = [0.5/4, 1.5/4, 2.5/4]
#         self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

#     def predict(self, X, coef):
#         """
#         Make predictions with specified thresholds
        
#         :param X: The raw predictions
#         :param coef: A list of coefficients that will be used for rounding
#         """
#         return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1/4, 2/4, 3/4])


#     def coefficients(self):
#         """
#         Return the optimized coefficients
#         """
#         return self.coef_['x']

from functools import partial
import scipy as sp
class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients
        
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

        return -qwk(y, X_p)

    def fit(self, X, y):
        """
        Optimize rounding thresholds
        
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds
        
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

In [78]:
from scipy.special import softmax

# get valid preds
preds, targs = learner.get_preds()

In [79]:
optR = OptimizedRounder()
optR.fit(preds.reshape(-1,), targs)
coefs = optR.coefficients()

In [80]:
coefs

array([0.630145, 1.448402, 2.291683])

In [81]:
def soft2hard(o):
    if o < coefs[0]: return 0
    elif o < coefs[1]: return 1
    elif o < coefs[2]: return 2
    else: return 3

In [82]:
# get accuracy_group for preds and targs
_preds = array([soft2hard(o.item()) for o in preds])
_targs = array(train_with_features_part1.iloc[valid_idx]['accuracy_group'].values)

In [83]:
_targs = array([soft2hard(targ) for targ in _targs])

In [84]:
# see validation score
cohen_kappa_score(_targs, _preds, weights="quadratic")

0.4998358824055993

0.5223101490886595

### Submit

In [85]:
# get test preds
preds,targs=learner.get_preds(DatasetType.Test)
_preds = array([soft2hard(o.item()) for o in preds])

In [86]:
Counter(_preds)

Counter({3: 472, 2: 311, 1: 130, 0: 87})

In [87]:
# get installation ids for test set
test_ids = test_with_features_part1['installation_id'].values; len(test_ids)

1000

In [88]:
# generate installation_id : pred dict
test_preds_dict = dict(zip(test_ids, _preds)); len(test_preds_dict)

1000

In [89]:
# create submission
sample_subdf['accuracy_group'] = sample_subdf.installation_id.map(test_preds_dict)
sample_subdf['accuracy_group'] = sample_subdf['accuracy_group'].fillna(3)
sample_subdf.to_csv("submission.csv", index=False)

### end