In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns', None)
import datetime
import catboost
from catboost import CatBoostClassifier,Pool
import time
from tqdm import tqdm_notebook as tqdm
import os
import random
from bayes_opt import BayesianOptimization
import itertools
from itertools import chain 
import json
import pprint
import gc
import seaborn as sns
from typing import Any
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import cohen_kappa_score
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.base import BaseEstimator, TransformerMixin
import lightgbm as lgb
import xgboost as xgb
import copy
from functools import partial
import scipy as sp

In [2]:
def spec(value, *args):
    """Print where an event id sits in ``specs`` plus selected spec fields.

    Reads the notebook-level ``specs`` and ``train`` frames.

    Parameters
    ----------
    value :
        The ``event_id`` to look up.
    *args : str
        Field names to show. ``'info'`` prints the raw cell, ``'args'``
        pretty-prints the cell parsed as JSON; anything else prints
        ``'Nothing'``.
    """
    # Index label of the last specs row matching this event id.
    i = specs[specs['event_id'] == value].index.values[-1]
    print('Index :', i)
    print('Event_code :', train[train['event_id'] == value]['event_code'].unique()[-1])
    for arg in args:
        if arg == 'info':
            print(specs[arg][i])
        elif arg == 'args':
            # pprint.pprint() already writes to stdout and returns None, so it
            # must not be wrapped in print() (that emitted a stray "None").
            pprint.pprint(json.loads(specs[arg][i]))
        else:
            print('Nothing')

In [3]:
def event(value):
    """Pretty-print the JSON ``event_data`` of the last ``train`` row with this event id.

    Reads the notebook-level ``train`` frame.
    """
    i = train[train['event_id'] == value].index.values[-1]
    # pprint.pprint() prints by itself and returns None; wrapping it in print()
    # produced an extra "None" line.
    pprint.pprint(json.loads(train['event_data'][i]))

In [4]:
def reset(df):
    """Return a copy of ``df`` with a fresh 0..n-1 index, discarding the old one.

    ``drop=True`` is used instead of materialising the old index as a column
    and deleting ``'index'`` afterwards: that two-step form raised ``KeyError``
    when the index had a name and silently deleted a genuine column called
    ``'index'`` if the frame had one.
    """
    return df.reset_index(drop=True)

In [5]:
def get_features(df):
    """Accumulate play history per installation and snapshot it at each Assessment.

    Sessions are visited in the order they appear in ``df`` (``sort=False``).
    Clip and Game sessions update running counters. When an Assessment session
    is reached, the counters are stored under that session's ``game_session``
    id and then reset, so each snapshot covers the activity since the previous
    assessment of the same installation.

    Parameters
    ----------
    df : pd.DataFrame
        Event log. The code reads the columns ``installation_id``,
        ``game_session``, ``type``, ``event_code``, ``event_data`` and
        ``game_time``.

    Returns
    -------
    total_features : list of str
        ``clip_features + game_features + activity_features``. The name
        ``'4070_count'`` appears twice (game list and activity list).
    total_data : dict
        ``{game_session: {feature: value}}`` for every Assessment session.
        Key order is ``num_clip_watched``, ``4070_count``, then the remaining
        game features; the game ``4070_count`` value overwrites the activity one.
    """
    clip_features = ['num_clip_watched']
    game_features = ['4070_count','num_unique_games','total_game_actions','total_duration_spend_for_game','total_game_attempts','total_correct_attempts','last_game_played_accuracy','gaming_accuracy']
    activity_features = ['4070_count']
    total_features = clip_features + game_features + activity_features
    total_data = {}

    for _, install_id in tqdm(df.groupby('installation_id', sort=False)):

        clip_data = {eve: 0 for eve in clip_features}
        game_data = {eve: 0 for eve in game_features}
        activity_data = {eve: 0 for eve in activity_features}

        for j, session in install_id.groupby('game_session', sort=False):

            session_type = session['type'].iloc[0]

            if session_type == 'Clip':
                clip_data['num_clip_watched'] += 1

            elif session_type == 'Game':
                attempts = session[session['event_code'] == 4020]

                # NOTE(review): counts Game sessions, not distinct game titles.
                game_data['num_unique_games'] += 1
                game_data['total_game_actions'] += session['event_code'].count()
                game_data['total_duration_spend_for_game'] += int(session['game_time'].iloc[-1]/1000)
                game_data['total_game_attempts'] += attempts['event_code'].count()
                game_data['total_correct_attempts'] += attempts['event_data'].map(lambda x:1 if(str(x).find('"correct":true') >= 0) else 0).sum()

                # NOTE(review): both accuracy features are derived from the
                # running totals, not from this session alone. 'gaming_accuracy'
                # is the sum of that running ratio over the Game sessions, and
                # 'last_game_played_accuracy' is its latest value. Kept as-is
                # because downstream features depend on it; confirm the intent.
                if game_data['total_game_attempts'] > 0:
                    running_accuracy = round(game_data['total_correct_attempts'] / game_data['total_game_attempts'], 3)
                else:
                    running_accuracy = 0
                game_data['gaming_accuracy'] += running_accuracy
                game_data['last_game_played_accuracy'] = running_accuracy
                game_data['4070_count'] += session[session['event_code'] == 4070]['event_code'].count()

            elif session_type == 'Activity':
                # Count one column so the value is a scalar; DataFrame.count()
                # would store a whole Series of per-column counts here.
                activity_data['4070_count'] = session[session['event_code'] == 4070]['event_code'].count()

            elif session_type == 'Assessment':
                # Snapshot in a fixed order; the caller maps values to column
                # names by position, so the update order below matters.
                total_data[j] = {}
                total_data[j].update(clip_data)
                total_data[j].update(activity_data)
                total_data[j].update(game_data)

                # Start a fresh accumulation window for the next assessment.
                clip_data = {eve: 0 for eve in clip_features}
                game_data = {eve: 0 for eve in game_features}
                activity_data = {eve: 0 for eve in activity_features}
    return total_features, total_data

In [6]:
def get_validation_data(df, num_samples, random_state):
    """Sample one final-attempt row per assessment session.

    Rows are kept when ``type == 'Assessment'`` and the event code is 4110 for
    'Bird Measurer (Assessment)' or 4100 for every other title. Only the last
    such row per ``game_session`` is retained before sampling.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``type``, ``event_code``, ``title`` and ``game_session``.
    num_samples : float or int
        A float is used as the sampling fraction, anything else as a row count.
    random_state :
        Passed to ``DataFrame.sample``.

    Returns
    -------
    pd.DataFrame
        The sampled rows (their shape is also printed).
    """
    is_bird = df['title'] == 'Bird Measurer (Assessment)'
    is_attempt = ((df['event_code'] == 4100) & ~is_bird) | ((df['event_code'] == 4110) & is_bird)

    # Build a new frame rather than de-duplicating the filtered slice in place.
    validation_data = df[(df['type'] == 'Assessment') & is_attempt].drop_duplicates(
        subset='game_session', keep='last')

    if isinstance(num_samples, float):
        validation_data = validation_data.sample(frac=num_samples, random_state=random_state)
    else:
        validation_data = validation_data.sample(n=num_samples, random_state=random_state)
    print(validation_data.shape)
    return validation_data

In [7]:
def seed_everything(seed=0):
    """Seed Python's ``random``, NumPy's global RNG and set ``PYTHONHASHSEED``."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [8]:
def Rand(start, end, num):
    """Return ``num`` integers drawn with ``random.randint(start, end)`` (both ends inclusive)."""
    return [random.randint(start, end) for _ in range(num)]

In [9]:
def qwk(a1, a2, max_rat=3):
    """Quadratic weighted kappa between two integer rating vectors.

    Parameters
    ----------
    a1, a2 : array-like
        Ratings; cast to ``int`` and flattened. Must have the same length and
        lie in ``0..max_rat``.
    max_rat : int, default 3
        Largest possible rating.

    Returns
    -------
    numpy.float64
        ``1 - observed / expected`` quadratic disagreement. ``nan`` when the
        expected disagreement is zero (empty input, or both vectors constant
        at the same value).

    Raises
    ------
    ValueError
        If the lengths differ or a rating falls outside ``0..max_rat``.
        Previously a negative rating was silently counted from the end of the
        histogram and surplus entries of ``a2`` were ignored.
    """
    a1 = np.asarray(a1, dtype=int).ravel()
    a2 = np.asarray(a2, dtype=int).ravel()
    n = a1.shape[0]
    if a2.shape[0] != n:
        raise ValueError(f'qwk: length mismatch ({n} vs {a2.shape[0]})')
    if n and (min(a1.min(), a2.min()) < 0 or max(a1.max(), a2.max()) > max_rat):
        raise ValueError(f'qwk: ratings must lie in 0..{max_rat}')

    # Rating histograms (floats, as in the loop-based version).
    hist1 = np.bincount(a1, minlength=max_rat + 1).astype(float)
    hist2 = np.bincount(a2, minlength=max_rat + 1).astype(float)

    # Observed squared disagreement.
    o = np.sum((a1 - a2) ** 2)

    # Expected squared disagreement under independence of the two histograms.
    ratings = np.arange(max_rat + 1)
    weights = (ratings[:, None] - ratings[None, :]) ** 2
    e = hist1 @ weights @ hist2 / n

    return 1 - o / e

In [10]:
def eval_qwk_lgb(y_true, y_pred):
    """Kappa metric for flat multi-class scores.

    ``y_pred`` is reshaped to ``(n_classes, -1)`` where ``n_classes`` is the
    number of distinct values in ``y_true``; the arg-max over the class axis is
    scored with ``qwk``. Returns ``('cappa', score, True)``.
    """
    n_classes = len(np.unique(y_true))
    predicted_class = y_pred.reshape(n_classes, -1).argmax(axis=0)
    return 'cappa', qwk(y_true, predicted_class), True

In [11]:
def nelder_mead(y_true, y_pred):
    """Fit rounding thresholds for continuous predictions with ``OptimizedRounder``.

    Parameters
    ----------
    y_true : array-like
        True integer labels.
    y_pred : array-like
        Continuous model outputs.

    Returns
    -------
    list
        The three coefficients found by the optimiser (not necessarily sorted).
    """
    optimizer = OptimizedRounder()
    # OptimizedRounder.fit(X, y) bins X and scores it against y, so the
    # predictions go first. The previous call passed the labels as X, which
    # fitted the thresholds on y_true.
    optimizer.fit(np.asarray(y_pred).reshape(-1), np.asarray(y_true).reshape(-1))
    return list(optimizer.coefficients())

In [12]:
def eval_qwk_lgb_regr(y_true, y_pred):
    """Kappa metric for regression outputs using thresholds fitted on the fly.

    NOTE: a later cell defines another ``eval_qwk_lgb_regr`` with fixed
    thresholds; when both cells are run, that definition replaces this one.

    The signature stays at exactly two parameters on purpose. NOTE(review):
    LightGBM's sklearn API is understood to pick the call form from the
    parameter count; confirm before adding parameters.

    Returns ``('cappa', score, True)``.
    """
    # Sorted so the bin edges are increasing, as np.digitize requires.
    thresholds = np.sort(nelder_mead(y_true, y_pred))

    # right=True gives: x <= t0 -> 0, t0 < x <= t1 -> 1, t1 < x <= t2 -> 2, x > t2 -> 3.
    # Binning into a new array avoids two problems of overwriting y_pred in
    # place: already-assigned labels (e.g. 2) being matched again by a later
    # rule when a threshold falls below them, and the caller's prediction
    # array being modified.
    labels = np.digitize(y_pred, thresholds, right=True)

    return 'cappa', qwk(y_true, labels), True


In [13]:
def eval_qwk_lgb_regr(y_true, y_pred):
    """Kappa metric for regression outputs using fixed rounding thresholds.

    NOTE: this redefines the ``eval_qwk_lgb_regr`` from the previous cell; once
    this cell runs, the threshold-fitting variant is no longer reachable by name.

    The signature stays at exactly two parameters on purpose. NOTE(review):
    LightGBM's sklearn API is understood to pick the call form from the
    parameter count; confirm before adding parameters.

    Returns ``('cappa', score, True)``.
    """
    # Upper edges (inclusive) of classes 0, 1 and 2; above the last is class 3.
    thresholds = (1.12232214, 1.73925866, 2.22506454)

    # Bin into a fresh array instead of overwriting y_pred, so the caller's
    # prediction array is left untouched.
    labels = np.digitize(y_pred, thresholds, right=True)

    return 'cappa', qwk(y_true, labels), True


In [14]:
class LGBWrapper_regr(object):
    """Thin wrapper around ``lgb.LGBMRegressor`` with train/valid/holdout eval sets."""

    def __init__(self):
        # NOTE(review): reads the notebook-level ``params`` dict, so that cell
        # must have been run before the wrapper is instantiated.
        self.model = lgb.LGBMRegressor(**params)

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None):
        """Fit the regressor and record ``best_score_`` / ``feature_importances_``.

        Parameters
        ----------
        X_train, y_train :
            Training data; always the first eval set (named 'train').
        X_valid, y_valid, X_holdout, y_holdout : optional
            Extra eval sets named 'valid' and 'holdout'.
        params : dict
            Must contain 'objective', 'verbose' and 'early_stopping_rounds'.
            May contain 'cat_cols'. The whole dict is also applied to the model
            through ``set_params``.

        Raises
        ------
        ValueError
            If ``params`` is None.
        """
        if params is None:
            raise ValueError("LGBWrapper_regr.fit requires a 'params' dict")

        if params['objective'] == 'regression':
            eval_metric = eval_qwk_lgb_regr
        else:
            eval_metric = 'auc'

        eval_set = [(X_train, y_train)]
        eval_names = ['train']

        self.model = self.model.set_params(**params)

        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
            eval_names.append('valid')

        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))
            eval_names.append('holdout')

        # Only pass categorical columns that are actually present. The filtered
        # list used to be computed and then ignored in favour of the raw
        # params['cat_cols'].
        cat_cols = [col for col in params.get('cat_cols', []) if col in X_train.columns]
        categorical_columns = cat_cols if len(cat_cols) > 0 else 'auto'

        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_names=eval_names, eval_metric=eval_metric,
                       verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds'],
                       categorical_feature=categorical_columns)

        self.best_score_ = self.model.best_score_
        self.feature_importances_ = self.model.feature_importances_

    def predict(self, X_test):
        """Predict with the best iteration found during fitting."""
        return self.model.predict(X_test, num_iteration=self.model.best_iteration_)

In [15]:
class RegressorModel(object):
    """Cross-validated trainer that fits one deep copy of ``model_wrapper`` per fold.

    After ``fit`` the instance holds the fitted fold models (``models``), the
    out-of-fold predictions (``oof``), per-fold metadata (``folds_dict``),
    stacked feature importances and mean scores per eval set (``scores``).
    """

    def __init__(self, columns: list = None, model_wrapper=None):
        """
        Parameters
        ----------
        columns : list, optional
            Columns of X to use; defaults to all columns of X at fit time.
        model_wrapper :
            Template model, deep-copied for every fold. ``fit`` calls its
            ``fit(X_train, y_train, X_valid, y_valid, X_holdout, y_holdout, params=...)``
            and ``predict(X)`` and reads ``best_score_`` and
            ``feature_importances_``; ``plot_metric`` reads
            ``model.evals_result_``.
        """
        self.columns = columns
        self.model_wrapper = model_wrapper
        self.result_dict = {}
        self.train_one_fold = False
        self.preprocesser = None

    def fit(self, X: pd.DataFrame, y,
            X_holdout: pd.DataFrame = None, y_holdout=None,
            folds=None,
            params: dict = None,
            eval_metric='rmse',
            cols_to_drop: list = None,
            preprocesser=None,
            transformers: dict = None,
            adversarial: bool = False,
            plot: bool = True):
        """Train one model per fold and collect out-of-fold predictions.

        Parameters
        ----------
        X, y :
            Training frame and target. ``X`` must contain 'installation_id',
            which is passed to ``folds.split`` as the groups argument.
        X_holdout, y_holdout : optional
            Extra evaluation set handed to every fold model.
        folds : splitter, optional
            Object with ``split(X, y, groups)``. When omitted, the instance
            switches to "train one fold" mode: a single model is fitted on all
            of X without a validation set.
        params : dict
            Passed to the model wrapper; ``params['verbose']`` also controls
            fold logging here.
        eval_metric : str
            Key looked up in the wrapper's ``best_score_`` / ``evals_result_``.
        cols_to_drop : list, optional
            Columns removed after the transformers have run.
        preprocesser : optional
            Fitted once on all of X and applied before splitting.
        transformers : dict, optional
            ``{name: transformer}`` fitted per fold on the training part.
        adversarial : bool
            Replace the target by a train-vs-valid (or holdout) indicator.
        plot : bool
            Draw importance / metric / error / prediction plots at the end.
        """
        if folds is None:
            # Without shuffle=True a random_state has no effect (and recent
            # scikit-learn rejects the combination), so it is omitted.
            folds = KFold(n_splits=3)
            self.train_one_fold = True

        if transformers is None:
            # The default used to crash in the dict comprehension below.
            transformers = {}

        self.columns = X.columns if self.columns is None else self.columns
        self.feature_importances = pd.DataFrame(columns=['feature', 'importance'])
        self.trained_transformers = {k: [] for k in transformers}
        self.transformers = transformers
        self.models = []
        self.folds_dict = {}
        self.eval_metric = eval_metric
        n_target = 1
        self.oof = np.zeros((len(X), n_target))
        self.n_target = n_target

        X = X[self.columns]
        if X_holdout is not None:
            X_holdout = X_holdout[self.columns]

        if preprocesser is not None:
            self.preprocesser = preprocesser
            self.preprocesser.fit(X, y)
            X = self.preprocesser.transform(X, y)
            self.columns = X.columns.tolist()
            if X_holdout is not None:
                X_holdout = self.preprocesser.transform(X_holdout)

        for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y, X['installation_id'])):

            if X_holdout is not None:
                X_hold = X_holdout.copy()
            else:
                X_hold = None
            self.folds_dict[fold_n] = {}
            if params['verbose']:
                print(f'Fold {fold_n + 1} started at {time.ctime()}')

            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            if self.train_one_fold:
                # Train on everything. X is already restricted to self.columns;
                # the attribute ``original_columns`` read here before was never set.
                X_train = X[self.columns]
                y_train = y
                X_valid = None
                y_valid = None

            datasets = {'X_train': X_train, 'X_valid': X_valid, 'X_holdout': X_hold, 'y_train': y_train}
            X_train, X_valid, X_hold = self.transform_(datasets, cols_to_drop)

            self.folds_dict[fold_n]['columns'] = X_train.columns.tolist()
            if adversarial:
                X_new1 = X_train.copy()
                if X_valid is not None:
                    X_new2 = X_valid.copy()
                elif X_holdout is not None:
                    X_new2 = X_holdout.copy()
                else:
                    raise ValueError('adversarial=True needs a validation fold or X_holdout')
                X_new = pd.concat([X_new1, X_new2], axis=0)
                y_new = np.hstack((np.zeros((X_new1.shape[0])), np.ones((X_new2.shape[0]))))
                X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new)

            self.X_train = X_train
            self.y_train = y_train
            self.X_valid = X_valid
            self.y_valid = y_valid
            self.X_holdout = X_holdout
            self.y_holdout = y_holdout

            model = copy.deepcopy(self.model_wrapper)
            model.fit(X_train, y_train, X_valid, y_valid, X_hold, y_holdout, params=params)

            self.folds_dict[fold_n]['scores'] = model.best_score_
            if self.oof.shape[0] != len(X):
                self.oof = np.zeros((X.shape[0], self.oof.shape[1]))
            # No out-of-fold prediction exists without a validation part.
            if not adversarial and X_valid is not None:
                self.oof[valid_index] = model.predict(X_valid).reshape(-1, n_target)
            fold_importance = pd.DataFrame(list(zip(X_train.columns, model.feature_importances_)),
                                           columns=['feature', 'importance'])
            # pd.concat replaces DataFrame.append, which newer pandas no longer has.
            self.feature_importances = pd.concat([self.feature_importances, fold_importance])
            self.models.append(model)

            if self.train_one_fold:
                # Every further iteration would refit on the same full data.
                break

        self.feature_importances['importance'] = self.feature_importances['importance'].astype(int)

        self.calc_scores_()
        if plot:
            fig, ax = plt.subplots(figsize=(16, 12))
            plt.subplot(2, 2, 1)
            self.plot_feature_importance(top_n=20)
            plt.subplot(2, 2, 2)
            self.plot_metric()
            plt.subplot(2, 2, 3)
            plt.hist(y.values.reshape(-1, 1) - self.oof)
            plt.title('Distribution of errors')
            plt.subplot(2, 2, 4)
            plt.hist(self.oof)
            plt.title('Distribution of oof predictions');

    def transform_(self, datasets, cols_to_drop):
        """Fit each transformer on the training part, apply it to all parts, then drop columns.

        ``datasets`` is a dict with the keys 'X_train', 'X_valid', 'X_holdout'
        and 'y_train'; it is updated in place. The fitted transformer object is
        appended to ``trained_transformers[name]``.

        Returns the transformed ``(X_train, X_valid, X_holdout)``.
        """
        for name, transformer in self.transformers.items():
            transformer.fit(datasets['X_train'], datasets['y_train'])
            datasets['X_train'] = transformer.transform(datasets['X_train'])
            if datasets['X_valid'] is not None:
                datasets['X_valid'] = transformer.transform(datasets['X_valid'])
            if datasets['X_holdout'] is not None:
                datasets['X_holdout'] = transformer.transform(datasets['X_holdout'])
            self.trained_transformers[name].append(transformer)
        if cols_to_drop is not None:
            # Ignore requested columns that are not present.
            cols_to_drop = [col for col in cols_to_drop if col in datasets['X_train'].columns]

            datasets['X_train'] = datasets['X_train'].drop(cols_to_drop, axis=1)
            if datasets['X_valid'] is not None:
                datasets['X_valid'] = datasets['X_valid'].drop(cols_to_drop, axis=1)
            if datasets['X_holdout'] is not None:
                datasets['X_holdout'] = datasets['X_holdout'].drop(cols_to_drop, axis=1)
        # Remembered so predict() can drop the same columns.
        self.cols_to_drop = cols_to_drop

        return datasets['X_train'], datasets['X_valid'], datasets['X_holdout']

    def calc_scores_(self):
        """Print and store the mean ``eval_metric`` per eval set across folds.

        The eval-set names are taken from the first fold's ``best_score_``.
        """
        datasets = [k for k, v in [v['scores'] for k, v in self.folds_dict.items()][0].items() if len(v) > 0]
        self.scores = {}
        for d in datasets:
            scores = [v['scores'][d][self.eval_metric] for k, v in self.folds_dict.items()]
            print(f"CV mean score on {d}: {np.mean(scores):.4f} +/- {np.std(scores):.4f} std.")
            self.scores[d] = np.mean(scores)

    def predict(self, X_test, averaging: str = 'usual'):
        """Average the fold models' predictions for ``X_test``.

        Parameters
        ----------
        X_test : pd.DataFrame
        averaging : {'usual', 'rank'}
            'usual' averages raw predictions, 'rank' averages their ranks.

        Returns
        -------
        np.ndarray of shape (n_rows, n_target)
        """
        full_prediction = np.zeros((X_test.shape[0], self.oof.shape[1]))
        if self.preprocesser is not None:
            X_test = self.preprocesser.transform(X_test)
        for i in range(len(self.models)):
            X_t = X_test.copy()
            for name, transformers in self.trained_transformers.items():
                X_t = transformers[i].transform(X_t)

            if self.cols_to_drop is not None:
                cols_to_drop = [col for col in self.cols_to_drop if col in X_t.columns]
                X_t = X_t.drop(cols_to_drop, axis=1)
            y_pred = self.models[i].predict(X_t[self.folds_dict[i]['columns']]).reshape(-1, full_prediction.shape[1])

            # if case transformation changes the number of the rows
            if full_prediction.shape[0] != len(y_pred):
                full_prediction = np.zeros((y_pred.shape[0], self.oof.shape[1]))

            if averaging == 'usual':
                full_prediction += y_pred
            elif averaging == 'rank':
                # y_pred is 2-D; a Series needs 1-D data, so rank the flattened
                # values and restore the column shape afterwards.
                ranks = pd.Series(y_pred.ravel()).rank().values
                full_prediction += ranks.reshape(-1, full_prediction.shape[1])

        return full_prediction / len(self.models)

    def plot_feature_importance(self, drop_null_importance: bool = True, top_n: int = 10):
        """Horizontal bar plot of the ``top_n`` features by mean importance."""
        top_feats = self.get_top_features(drop_null_importance, top_n)
        # Copy so the dtype change below does not write into a slice of the stored frame.
        feature_importances = self.feature_importances.loc[self.feature_importances['feature'].isin(top_feats)].copy()
        feature_importances['feature'] = feature_importances['feature'].astype(str)
        top_feats = [str(i) for i in top_feats]
        sns.barplot(data=feature_importances, x='importance', y='feature', orient='h', order=top_feats)
        plt.xticks(color='white')
        plt.yticks(color='white')
        plt.title('Feature importances')

    def get_top_features(self, drop_null_importance: bool = True, top_n: int = 10):
        """Return up to ``top_n`` feature names ordered by mean importance over folds."""
        grouped_feats = self.feature_importances.groupby(['feature'])['importance'].mean()
        if drop_null_importance:
            grouped_feats = grouped_feats[grouped_feats != 0]
        return list(grouped_feats.sort_values(ascending=False).index)[:top_n]

    def plot_metric(self):
        """Line plot of ``eval_metric`` per iteration for every eval set, pooled over folds."""
        per_model = []
        for model in self.models:
            evals_result = pd.DataFrame()
            for k in model.model.evals_result_.keys():
                evals_result[k] = model.model.evals_result_[k][self.eval_metric]
            evals_result = evals_result.reset_index().rename(columns={'index': 'iteration'})
            per_model.append(evals_result)

        # One concat instead of repeated DataFrame.append calls.
        full_evals_results = pd.concat(per_model)

        full_evals_results = full_evals_results.melt(id_vars=['iteration']).rename(columns={'value': self.eval_metric,
                                                                                            'variable': 'dataset'})
        sns.lineplot(data=full_evals_results, x='iteration', y=self.eval_metric, hue='dataset')
        plt.xticks(color='white')
        plt.yticks(color='white')
        plt.title('Training progress')


In [16]:
def add_datepart(df: pd.DataFrame):
    """Add hour, month, minute and ISO-week columns derived from ``df['timestamp']``.

    ``df`` is modified in place and also returned. ``timestamp`` must already
    be a datetime column.

    The annotation is the ``pd.DataFrame`` class; it used to be a
    ``pd.DataFrame()`` call, which built a throw-away frame when the function
    was defined.
    """
    stamps = df['timestamp'].dt
    df['timestampHour'] = stamps.hour
    df['timestampMonth'] = stamps.month
    df['timestampMinute'] = stamps.minute

    if hasattr(stamps, 'isocalendar'):
        # `.dt.week` is gone from pandas 2; isocalendar() is its replacement.
        # Cast back to int64 (float64 when values are missing) so dtype-based
        # column selection sees the same dtypes as with the legacy accessor.
        week = stamps.isocalendar().week
        week = week.astype('float64') if week.isna().any() else week.astype('int64')
    else:
        # Older pandas without isocalendar().
        week = stamps.week
    df['timestampWeek'] = week

    return df

In [17]:
class MainTransformer(BaseEstimator, TransformerMixin):
    """Optional feature interactions and cyclical encoding of the calendar columns."""

    # Length of one full cycle for each calendar column. The encoding used to
    # divide every column by 23, which is not the period of any of them.
    # NOTE(review): ISO weeks can reach 53; 52 is used as the nominal year length.
    _CYCLE_LENGTHS = {
        'timestampHour': 24.0,
        'timestampMonth': 12.0,
        'timestampWeek': 52.0,
        'timestampMinute': 60.0,
    }

    def __init__(self, convert_cyclical: bool = False, create_interactions: bool = False, n_interactions: int = 20):
        """
        Parameters
        ----------
        convert_cyclical : bool
            Replace the timestamp* columns by the sine of their phase.
        create_interactions : bool
            Add pairwise products of randomly chosen columns.
        n_interactions : int
            Number of columns drawn for each side of the product.
        """
        self.convert_cyclical = convert_cyclical
        self.create_interactions = create_interactions
        self.feats_for_interaction = None
        self.n_interactions = n_interactions

    def fit(self, X, y=None):
        """Choose the columns used for interactions (only if enabled). Returns self."""
        if self.create_interactions:
            # Candidates are columns whose name contains one of these substrings.
            self.feats_for_interaction = [col for col in X.columns if 'sum' in col
                                          or 'mean' in col or 'max' in col or 'std' in col
                                          or 'attempt' in col]
            # Drawn with replacement from NumPy's global RNG.
            self.feats_for_interaction1 = np.random.choice(self.feats_for_interaction, self.n_interactions)
            self.feats_for_interaction2 = np.random.choice(self.feats_for_interaction, self.n_interactions)

        return self

    def transform(self, X, y=None):
        """Return a deep copy of ``X`` with the enabled transformations applied."""
        data = copy.deepcopy(X)
        if self.create_interactions:
            for col1 in self.feats_for_interaction1:
                for col2 in self.feats_for_interaction2:
                    data[f'{col1}_int_{col2}'] = data[col1] * data[col2]

        if self.convert_cyclical:
            for col, period in self._CYCLE_LENGTHS.items():
                data[col] = np.sin(2 * np.pi * data[col] / period)
        return data

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on a deep copy of ``X`` and return the transformed copy (``y`` is not used)."""
        data = copy.deepcopy(X)
        self.fit(data)
        return self.transform(data)


In [18]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Pass-through transformer: stores two column lists and returns deep copies of its input."""

    def __init__(self, main_cat_features: list = None, num_cols: list = None):
        self.main_cat_features = main_cat_features
        self.num_cols = num_cols

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a deep copy of ``X``."""
        return copy.deepcopy(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on a deep copy of ``X`` and return its transform (``y`` is not used)."""
        snapshot = copy.deepcopy(X)
        return self.fit(snapshot).transform(snapshot)

In [19]:
class OptimizedRounder(object):
    """Find three cut points that turn continuous scores into classes 0..3 with maximal kappa."""

    def __init__(self):
        # Placeholder until fit() stores the optimiser result.
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """Negative kappa of ``X`` binned at the sorted ``coef`` against the labels ``y``."""
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

        return -qwk(y, X_p)

    def fit(self, X, y):
        """Optimise the cut points with Nelder-Mead.

        Parameters
        ----------
        X : array-like
            Continuous scores to be binned.
        y : array-like
            True integer labels.
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """Bin ``X`` at the sorted ``coef``.

        ``coef`` defaults to the fitted coefficients, so callers no longer
        have to pass ``coefficients()`` back in by hand.
        """
        if coef is None:
            coef = self.coefficients()
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        """Return the optimiser's solution vector.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called yet.
        """
        try:
            return self.coef_['x']
        except TypeError:
            # coef_ is still the integer placeholder from __init__.
            raise RuntimeError('OptimizedRounder.fit must be called before coefficients()') from None

In [20]:
def convert_dict_to_list(d):
    """Replace every inner dict of ``d`` by the list of its values (in place) and return ``d``."""
    for key in d:
        d[key] = list(d[key].values())
    return d

In [21]:
def lgbRegression(x_train, y_train, x_val, y_val,cols_to_drop,params, **kwargs):
    """Fit a LightGBM regressor and return its predictions on the validation frame.

    Parameters
    ----------
    x_train, y_train, x_val, y_val :
        Training and validation data; both are used as eval sets.
    cols_to_drop : list
        Columns removed from both frames before fitting (may be empty).
    params : dict
        Used to construct the regressor; 'verbose' and
        'early_stopping_rounds' are also passed to ``fit``.
    **kwargs :
        Forwarded to ``fit``. NOTE(review): if these are meant to be model
        hyper-parameters they belong in the constructor instead; confirm
        against the caller.

    Returns
    -------
    np.ndarray
        1-D predictions for ``x_val``.
    """
    if len(cols_to_drop)>0:
        x_train = x_train.drop(columns=cols_to_drop)
        x_val = x_val.drop(columns=cols_to_drop)

    eval_metric = eval_qwk_lgb_regr
    eval_set = [(x_train,y_train),(x_val,y_val)]
    eval_names = ['train','valid']

    # fit() needs an instance. It used to be called on the LGBMRegressor class
    # itself, which fails because no `self` is supplied. The model is built
    # from `params`, matching LGBWrapper_regr.
    model = lgb.LGBMRegressor(**params)
    model.fit(X=x_train, y=y_train,
              eval_set=eval_set, eval_names=eval_names, eval_metric=eval_metric,
              verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds'],
              categorical_feature=['title'],**kwargs)

    pred_val = model.predict(x_val)
    oof = pred_val.reshape(len(x_val))

    return oof

**Reading Data**

In [72]:
# Load the raw test and train event logs and print how many seconds the two reads took.
start_time = time.time()
test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')
print(time.time() - start_time)

7.873899936676025


In [73]:
# Parse the timestamp column to datetimes; add_datepart relies on the .dt accessor.
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

# Adds timestampHour / timestampMonth / timestampMinute / timestampWeek to both frames.
train = add_datepart(train)
test = add_datepart(test)

In [24]:
# Load the label table, the event specs and the sample submission file.
train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')
submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')

In [25]:
# Keep only installations that have at least one row of type 'Assessment'.
assessment_id = list(train[train['type'] == 'Assessment']['installation_id'].unique())
train = train.loc[train['installation_id'].isin(assessment_id)]

In [26]:
# Renumber the rows 0..n-1 after the filtering above; the old index is discarded.
train = train.reset_index(drop=True)

In [27]:
# Every event code that occurs in either the train or the test log.
list_of_event_code = list(
    set(train['event_code'].unique()) | set(test['event_code'].unique())
)

In [28]:
# Three views of the log, each pairing Assessment rows with one other session
# type, re-indexed from 0.
train_game = train[train['type'].isin(['Game', 'Assessment'])].reset_index(drop=True)

train_activity = train[train['type'].isin(['Assessment', 'Activity'])].reset_index(drop=True)

train_clip = train[train['type'].isin(['Assessment', 'Clip'])].reset_index(drop=True)

**Train**

In [29]:
# Build the per-assessment feature dicts for every training installation.
features, train_dict = get_features(train)
# get_features lists '4070_count' twice (game list + activity list); remove the trailing duplicate.
features.pop()

HBox(children=(IntProgress(value=0, max=4242), HTML(value='')))




In [30]:
# Positional column labels (0, 1, ...) -> feature names; 'index' holds the session id.
column_mapper = {'index': 'game_session', **dict(enumerate(features))}
column_mapper

{'index': 'game_session',
 0: 'num_clip_watched',
 1: '4070_count',
 2: 'num_unique_games',
 3: 'total_game_actions',
 4: 'total_duration_spend_for_game',
 5: 'total_game_attempts',
 6: 'total_correct_attempts',
 7: 'last_game_played_accuracy',
 8: 'gaming_accuracy',
 9: '4070_count'}

In [31]:
# Flatten each session's feature dict to a plain list of values (insertion order).
for session_id, feature_values in tqdm(train_dict.items()):
    train_dict[session_id] = list(feature_values.values())

HBox(children=(IntProgress(value=0, max=21239), HTML(value='')))




In [32]:
# One row per assessment session: the first event of each, re-indexed from 0.
train = (
    train[train['type'] == 'Assessment']
    .drop_duplicates(subset='game_session', keep='first')
    .reset_index(drop=True)
)

In [33]:
# Sessions become rows, feature positions become columns, then columns get their names.
train_merge = (
    pd.DataFrame.from_dict(train_dict)
    .T
    .reset_index(drop=False)
    .rename(columns=column_mapper)
)

In [34]:
# Attach the features by session id; sessions missing from train_merge get NaN (left join).
train = train.merge(train_merge,on='game_session',how='left')

In [35]:
# Names of the int64/float64 columns; these feed the per-installation diff below.
int_float_cols = list(train.select_dtypes(['int64','float64']).columns)

In [36]:
#new_train = train.copy()
#AMean
#new_train = pd.merge(train,train.groupby(['installation_id']).mean(),how='left',on='installation_id',suffixes=('','_Amean'))

#deMean
#new_train_data = train.groupby('installation_id')[int_float_cols].apply(lambda x:x-x.mean())
#new_train_data['installation_id'] = train['installation_id']
#new_train = pd.merge(new_train,new_train_data,how='left',on='installation_id',suffixes=('','_deMean'))

In [37]:
#lag Feature
#del new_train_data
# NOTE(review): with periods=-1, diff() computes row[i] - row[i+1] inside each
# installation, i.e. the difference to the NEXT assessment. The last row of
# every installation is NaN. If a backward-looking lag was intended, periods=1
# is needed here and in the matching test cell; change both together.
new_train_data = train.groupby('installation_id',sort=False)[int_float_cols].diff(axis=0,periods=-1)

In [38]:
# Re-attach the identifier columns to the diff frame (assignment aligns on train's index).
new_train_data['installation_id'] = train['installation_id']
new_train_data['game_session'] = train['game_session']

In [39]:
# Join the diff columns back by session id; clashing names from the right side get '_lag_diff'.
train = pd.merge(train,new_train_data,on='game_session',how='left',suffixes=('','_lag_diff'))
# installation_id arrived a second time through the merge; drop the suffixed copy.
train = train.drop(columns=['installation_id_lag_diff'],axis=1)

In [40]:
# Each feature plus its value on the previous row of the same installation.
# shift(1) is NaN on an installation's first row, so the sum is NaN there.
for c in tqdm(features):
    train[f'{c}_sum_lag'] = train[f'{c}'] + train.groupby('installation_id')[f'{c}'].shift(1)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [41]:
# Mean of total_duration_spend_for_game per installation, broadcast to each of its rows.
train_mean_duration = dict(train.groupby('installation_id')['total_duration_spend_for_game'].mean())
train['mean_duration_spend'] = train['installation_id'].map(train_mean_duration)

In [55]:
# Title of the previous row within the same installation (NaN on the first row).
train['title_prev'] = train.groupby('installation_id')['title'].shift(1)

**Validation Part**

In [None]:
#val_data = train.loc[train['installation_id'].isin((train.iloc[1800:2400,:]['installation_id'].unique().tolist()))]
#train = train.loc[~train['installation_id'].isin(val_data['installation_id'].unique().tolist())]
#val_data = val_data.drop_duplicates(subset='installation_id',keep='last').reset_index(drop=False).drop(columns=['index'],axis=1)
#val_data.shape

**Test**

In [74]:
# Build the per-assessment feature dicts for every test installation.
features, test_dict = get_features(test)
# get_features lists '4070_count' twice (game list + activity list); remove the trailing duplicate.
features.pop()

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [78]:
# Explicit feature-name list (the clip feature followed by the game features).
# It overrides the list returned by get_features and is used below to build
# column_mapper and the *_sum_lag columns for the test frame.
features = ['num_clip_watched',
 '4070_count',
 'num_unique_games',
 'total_game_actions',
 'total_duration_spend_for_game',
 'total_game_attempts',
 'total_correct_attempts',
 'last_game_played_accuracy',
 'gaming_accuracy']

In [79]:
# Positional column labels (0, 1, ...) -> feature names; 'index' holds the session id.
column_mapper = {'index': 'game_session', **dict(enumerate(features))}
column_mapper

{'index': 'game_session',
 0: 'num_clip_watched',
 1: '4070_count',
 2: 'num_unique_games',
 3: 'total_game_actions',
 4: 'total_duration_spend_for_game',
 5: 'total_game_attempts',
 6: 'total_correct_attempts',
 7: 'last_game_played_accuracy',
 8: 'gaming_accuracy'}

In [80]:
# Flatten each session's feature dict to a plain list of values (insertion order).
for session_id, feature_values in tqdm(test_dict.items()):
    test_dict[session_id] = list(feature_values.values())

HBox(children=(IntProgress(value=0, max=3347), HTML(value='')))




In [82]:
# One row per assessment session: the first event of each, re-indexed from 0.
test = (
    test[test['type'] == 'Assessment']
    .drop_duplicates(subset='game_session', keep='first')
    .reset_index(drop=True)
)

In [83]:
# Sessions become rows, feature positions become columns, then columns get their names.
test_merge = (
    pd.DataFrame.from_dict(test_dict)
    .T
    .reset_index(drop=False)
    .rename(columns=column_mapper)
)

In [84]:
# Attach the features by session id; sessions missing from test_merge get NaN (left join).
test = test.merge(test_merge,on='game_session',how='left')

In [85]:
# Names of the int64/float64 columns of the test frame; these feed the diff below.
int_float_cols = list(test.select_dtypes(['int64','float64']).columns)

In [62]:
#new_test = test.copy()
#AMean
#new_test = pd.merge(test,test.groupby(['installation_id']).mean(),how='left',on='installation_id',suffixes=('','_Amean'))

In [63]:
#deMean
#new_test_data = test.groupby('installation_id')[int_float_cols].apply(lambda x:x-x.mean())
#new_test_data['installation_id'] = test['installation_id']
#new_test = pd.merge(new_test,new_test_data,how='left',on='installation_id',suffixes=('','_deMean'))

In [86]:
#lag Feature
# NOTE(review): with periods=-1, diff() computes row[i] - row[i+1] inside each
# installation, i.e. the difference to the NEXT assessment. The last row of
# every installation is NaN. If a backward-looking lag was intended, periods=1
# is needed here and in the matching train cell; change both together.
new_test = test.groupby('installation_id')[int_float_cols].diff(axis=0,periods=-1)

In [87]:
# Re-attach the identifier columns to the diff frame (assignment aligns on test's index).
new_test['installation_id'] = test['installation_id']
new_test['game_session'] = test['game_session']

In [88]:
# Join the diff columns back by session id; clashing names from the right side get '_lag_diff'.
test = pd.merge(test,new_test,on='game_session',how='left',suffixes=('','_lag_diff'))
# installation_id arrived a second time through the merge; drop the suffixed copy.
test = test.drop(columns=['installation_id_lag_diff'],axis=1)

In [89]:
# Each feature plus its value on the previous row of the same installation.
# shift(1) is NaN on an installation's first row, so the sum is NaN there.
for c in tqdm(features):
    test[f'{c}_sum_lag'] = test[f'{c}'] + test.groupby('installation_id')[f'{c}'].shift(1)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [90]:
# Mean of total_duration_spend_for_game per installation, broadcast to each of its rows.
test_mean_duration = dict(test.groupby('installation_id')['total_duration_spend_for_game'].mean())
test['mean_duration_spend'] = test['installation_id'].map(test_mean_duration)

In [91]:
# Title of the previous row within the same installation (NaN on the first row).
test['title_prev'] = test.groupby('installation_id')['title'].shift(1)

**Preprocessing**

In [None]:
#cols_for_previous_value = ['title','timestampHour','num_clip_watched','num_unique_games','total_game_actions','total_duration_spend_for_game','total_game_attempts',
#                           'total_correct_attempts','last_game_played_accuracy','gaming_accuracy']

In [None]:
#for c in tqdm(cols_for_previous_value):
#    train[f'{c}_prev'] = train.groupby('installation_id')[c].shift(1)

In [None]:
#for c in tqdm(cols_for_previous_value):
#    test[f'{c}_prev'] = test.groupby('installation_id')[c].shift(1)

In [93]:
# All train columns whose name starts with 'event' or 'game_time', in that order.
cols_to_drop_1 = [
    col
    for prefix in ('event', 'game_time')
    for col in train.columns[train.columns.str.startswith(prefix)]
]
cols_to_drop_1

['event_id',
 'event_data',
 'event_count',
 'event_code',
 'event_count_lag_diff',
 'event_code_lag_diff',
 'game_time',
 'game_time_lag_diff']

In [94]:
# Number of assessment sessions with features vs. number of sessions with labels.
print(train['game_session'].nunique())
print(train_labels['game_session'].nunique())

21239
17690


In [95]:
# Keep only the sessions that appear in train_labels.
train = train.loc[train['game_session'].isin(train_labels['game_session'].unique().tolist())]

In [96]:
# Look up the accuracy_group label of every session by its game_session id.
accuracy_mapper = dict(zip(train_labels['game_session'],train_labels['accuracy_group']))
train['accuracy_group'] = train['game_session'].map(accuracy_mapper)

In [97]:
# Sort the union of titles so each title gets the same integer code on every run.
# Iteration order of a set of strings changes between interpreter sessions unless
# PYTHONHASHSEED is fixed at start-up, which made the codes irreproducible.
list_of_user_activities = sorted(
    set(train['title'].value_counts().index).union(set(test['title'].value_counts().index)),
    key=str,
)
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

# Titles missing from the map (e.g. NaN in title_prev) stay NaN.
train['title'] = train['title'].map(activities_map)
train['title_prev'] = train['title_prev'].map(activities_map)
test['title'] = test['title'].map(activities_map)
test['title_prev'] = test['title_prev'].map(activities_map)
train_labels['title'] = train_labels['title'].map(activities_map)

In [99]:
# Sorted for the same reason as the title codes: set iteration order is not
# stable across interpreter sessions, so the integer codes would differ per run.
world = sorted(
    set(train['world'].unique().tolist()).union(set(test['world'].unique().tolist())),
    key=str,
)
world = dict(zip(world,np.arange(len(world))))

train['world'] = train['world'].map(world)
test['world'] = test['world'].map(world)

In [None]:
#instal_id_mapper = list(set(train['installation_id'].unique().tolist()).union(set(test['installation_id'].unique().tolist())))
#instal_id_mapper = dict(zip(instal_id_mapper,np.arange(len(instal_id_mapper))))

#for df in tqdm([train,test]):
#    df['installation_id'] = df['installation_id'].map(instal_id_mapper)

In [100]:
# Cross-validation setup and target
n_fold = 5
folds = GroupKFold(n_splits=n_fold)
y = train['accuracy_group']

# Columns passed to the model's fit call as `cols_to_drop`, merged with the
# 'event*' / 'game_time*' columns collected in cols_to_drop_1
cols_to_drop = [
    'game_session', 'installation_id', 'timestamp', 'accuracy_group',
    'event_id', 'event_data', 'event_count', 'event_code', 'game_time', 'type',
]
cols_to_drop = list(set(cols_to_drop) | set(cols_to_drop_1))

In [None]:
#x_train = train.drop(columns='accuracy_group',axis=1)
#y_train = train['accuracy_group']
#x_val = val_data.drop(columns=['accuracy_group'],axis=1)
#y_val = val_data['accuracy_group']

In [None]:
# Columns treated as categorical
cat_features = ['title','title_prev']

In [None]:
# Model parameters handed to the regressor's fit call
params = {
    'n_estimators': 200,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'subsample': 0.75,
    'subsample_freq': 1,
    'learning_rate': 0.04,
    'feature_fraction': 0.9,
    'max_depth': 15,
    'lambda_l1': 0.87,
    'lambda_l2': 0.96,
    'verbose_eval': 50,
    'verbose': 100,
    'early_stopping_rounds': 100,
    'eval_metric': 'cappa',
}

In [None]:
# The seed is still drawn at random, but it is now kept in SEED and printed so
# that a given run can be reproduced by passing the same value again.
SEED = int(np.random.randint(1,1000))
print('Seed :', SEED)
seed_everything(SEED)

In [None]:
# Pearson correlation heatmap over the int64/float64 columns of train
plt.figure(figsize=(40,30))
cols = train.select_dtypes(include=['int64','float64']).columns
corr = train[cols].corr(method = 'pearson')
#corr = corr[corr>=0.75]
sns.heatmap(corr,annot=True)

**Training Part**

1) Using Object Oriented LightGBM Approach with Nelder-Mead Local Minima Optimizations

In [None]:
# Cast the cat_features columns to the 'category' dtype in both frames
train[cat_features] = train[cat_features].astype('category')
test[cat_features] = test[cat_features].astype('category')

In [None]:
# Build the transformers and fit the LGBWrapper_regr-backed model on train
mt = MainTransformer()
ft = FeatureTransformer()
transformers = {'ft': ft}
regressor_model1 = RegressorModel(model_wrapper=LGBWrapper_regr())
regressor_model1.fit(
    X=train,
    y=y,
    folds=folds,
    params=params,
    preprocesser=mt,
    transformers=transformers,
    eval_metric='cappa',
    cols_to_drop=cols_to_drop,
)

In [None]:
# Feature importances in ascending order with a fresh 0..n-1 index.
# reset_index(drop=True) discards the old index directly; the previous
# reset-then-drop-'index' form failed when the index had a name and would
# have removed a real column called 'index'.
feature_importance = regressor_model1.feature_importances
feature_importance = feature_importance.sort_values('importance').reset_index(drop=True)
feature_importance

In [None]:
# Predictions on the training rows themselves (in-sample)
pr1 = regressor_model1.predict(train)

# Fit the rounding thresholds on those in-sample predictions against y
optR = OptimizedRounder()
optR.fit(pr1.reshape(-1,), y)
coefficients = optR.coefficients()
coefficients_1 = list(coefficients)
print(coefficients_1)

In [None]:
#val_pred = regressor_model1.predict(val_data)
#val_opt = optR.predict(val_pred.reshape(-1,),coefficients)
#qwk(y_val,val_opt)

In [None]:
# Apply the fitted thresholds to the train predictions and score them with qwk
opt_preds = optR.predict(pr1.reshape(-1, ), coefficients)
qwk(y, opt_preds)

In [None]:
# Turn the raw test predictions into accuracy groups 0..3 using the fitted
# thresholds. Every mask is computed on the untouched raw predictions: when the
# masks were recomputed on the array being overwritten, a label that had just
# been written (e.g. 2) could be re-binned by a later threshold lying below it
# (e.g. coefficients[2] < 2 turned every 2 into a 3).
raw_pr1 = regressor_model1.predict(test)
pr1 = raw_pr1.copy()
pr1[raw_pr1 <= coefficients[0]] = 0
pr1[np.logical_and(raw_pr1 > coefficients[0], raw_pr1 <= coefficients[1])] = 1
pr1[np.logical_and(raw_pr1 > coefficients[1], raw_pr1 <= coefficients[2])] = 2
pr1[raw_pr1 > coefficients[2]] = 3

2) Training with Functional LightGBM Regression model

**Submitting the Prediction**

In [None]:
# Attach the predicted group to every test row, then keep the last row of each
# installation: that row's prediction is the one reported for the installation.
test['accuracy_group'] = pr1
# reset_index(drop=True) replaces the reset-then-drop-'index' round trip, which
# breaks on a named index and would delete a real column called 'index'.
new_test = test.drop_duplicates(subset='installation_id',keep='last').reset_index(drop=True)
test_prediction_dict = dict(zip(new_test['installation_id'],new_test['accuracy_group']))

In [None]:
# Fill the submission from the per-installation predictions
submission['accuracy_group'] = submission['installation_id'].map(test_prediction_dict)
# astype(int) raises if map() left NaN for an installation_id missing from the dict
submission['accuracy_group'] = submission['accuracy_group'].astype(int)
submission.head()

In [None]:
# Share of each predicted accuracy group in the submission
submission['accuracy_group'].value_counts(normalize=True)

In [None]:
# Write the submission file without the index column
submission.to_csv('submission.csv',index=False)

**Garbage**

In [None]:
def new():
    """Return the first five rows of the first installation found in ``train``.

    Groups are taken in order of first appearance in the global ``train``
    frame, and ``installation_id`` is moved to the last column. When ``train``
    has no groups, an empty frame with the same column layout is returned
    (previously this case ended in an UnboundLocalError).
    """
    cols = train.columns.tolist()
    cols.remove('installation_id')
    for install_id, data in train.groupby('installation_id', sort=False):
        # assign() builds a new frame rather than writing into a slice of `data`
        return data[cols].assign(installation_id=install_id).head()
    return train[cols + ['installation_id']].head()

In [None]:
def get_features(df: pd.DataFrame, which):
    """Build one feature dict per assessment session from the preceding events.

    Rows are processed per ``installation_id`` in their existing order. Each
    installation's rows are cut into consecutive windows, every window ending
    on the last row of an 'Assessment' game session; rows after the final
    assessment are not used.

    Parameters
    ----------
    df : pd.DataFrame
        Event log. The code reads the columns ``installation_id``, ``type``,
        ``game_session``, ``event_id``, ``event_code``, ``title``,
        ``game_time`` and ``event_data``.
    which : str
        Activity type to describe; 'Game' and 'Activity' add extra features.

    Returns
    -------
    dict
        ``{assessment game_session: {feature name: value}}``. Besides the
        named features, each inner dict holds one row count per code in the
        global ``list_of_event_code``.
    """
    new_df_dict = {}
    df = df.reset_index(drop=True)

    for install_id, data in tqdm(df.groupby('installation_id', sort=False)):
        print(install_id)
        new_data = data.reset_index(drop=True)

        # Position (within this installation) of the last row of each assessment session
        is_assessment = new_data['type'] == 'Assessment'
        a2 = new_data[is_assessment].drop_duplicates(subset='game_session', keep='last').index.tolist()

        # Flat list of window bounds: [start0, end0, start1, end1, ...]
        a = [0]
        print(a2)
        for last_row in a2:
            a.append(last_row)
            a.append(last_row + 1)
        a.pop()  # drop the dangling start after the final assessment (or the lone 0)
        print(a)

        for w in range(0, len(a), 2):
            print(a[w], 'to', a[w + 1])
            new_df = data.iloc[a[w]:a[w + 1] + 1, :]
            g_session = new_df[new_df['type'] == 'Assessment']['game_session'].unique()[-1]
            print(g_session)

            features = {}
            features['num_game_session'] = new_df['game_session'].nunique()
            features['num_event_id'] = new_df['event_id'].nunique()
            features['num_actions_before_assessment'] = len(new_df['event_code'])
            features[f'num_unique_{which}'] = new_df[new_df['type'] == which]['title'].nunique()

            if which == 'Game':
                game_rows = new_df[new_df['type'] == 'Game']
                attempts = new_df[new_df['event_code'] == 4020]['event_code'].count()
                features['game_attempts'] = attempts
                features['mean_game_time'] = game_rows['game_time'].mean()
                features['max_game_time'] = game_rows['game_time'].max()
                features['min_game_time'] = game_rows['game_time'].min()
                features['std_game_time'] = game_rows['game_time'].std()
                # str.find() returns -1 (truthy) when the marker is absent, so the
                # old `if str(x).find(...)` counted misses as hits; test membership.
                correct = (
                    game_rows[game_rows['event_code'] == 4020]['event_data']
                    .map(lambda x: 1 if '"correct":true' in str(x) else 0)
                    .sum()
                )
                # No attempts in the window -> accuracy 0 instead of a NaN/inf division
                features['gaming_accuracy'] = correct / attempts if attempts > 0 else 0
            elif which == 'Activity':
                features['Activity actions'] = new_df[new_df['type'] == 'Activity']['event_id'].count()

            # One row count per known event code (0 when the code does not occur)
            for code in {eve: 0 for eve in list_of_event_code}:
                features[code] = new_df[new_df['event_code'] == code]['event_code'].count()

            new_df_dict[g_session] = features
            #break
    return new_df_dict

In [None]:
def get_features(df: pd.DataFrame, which):
    """Prototype: print the features of the first pre-assessment window of ``df``.

    Window bounds are derived from the last row of every 'Assessment' game
    session, but the loop stops after the first window. The feature values are
    stored at the top level of the dict (next to an empty entry keyed by the
    installation id), merged over a zero count per event code found in the
    global ``train``, and printed. Nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Event log; reads ``type``, ``game_session``, ``installation_id``,
        ``event_id``, ``event_code``, ``title``, ``game_time``, ``event_data``.
    which : str
        Activity type to describe; 'Game' and 'Activity' add extra features.
    """
    new_df_dict = {}
    event_code_count_dict = {code: 0 for code in train['event_code'].unique().tolist()}

    df = df.reset_index(drop=True)

    # Position of the last row of each assessment session
    a2 = df[df['type'] == 'Assessment'].drop_duplicates(subset = 'game_session',keep = 'last').index.tolist()

    # Flat list of window bounds: [start0, end0, start1, end1, ...]
    a = [0]
    for last_row in a2:
        a.append(last_row)
        a.append(last_row + 1)
    a.pop()

    for i in tqdm(range(len(a))):
        # NOTE(review): this slice stops one row before a[i+1], i.e. it leaves out
        # the assessment's last row — confirm that is intended.
        new_df = df.iloc[a[i]:a[i+1],:]
        ids  = new_df['installation_id'].unique().tolist()[-1]
        new_df_dict[ids] = {}
        new_df_dict['num_game_session'] = new_df['game_session'].nunique()
        new_df_dict['num_event_id'] = new_df['event_id'].nunique()
        new_df_dict['num_actions_before_assessment'] = len(new_df['event_code'])
        new_df_dict[f'num_unique_{which}'] = new_df[new_df['type'] == which]['title'].nunique()
        if which == 'Game':
            game_rows = new_df[new_df['type'] == 'Game']
            attempts = new_df[new_df['event_code'] == 4020]['event_code'].count()
            new_df_dict['game_attempts'] = attempts
            new_df_dict['mean_game_time'] = game_rows['game_time'].mean()
            new_df_dict['max_game_time'] = game_rows['game_time'].max()
            new_df_dict['min_game_time'] = game_rows['game_time'].min()
            new_df_dict['std_game_time'] = game_rows['game_time'].std()
            # str.find() returns -1 (truthy) when the marker is absent, so the old
            # `if str(x).find(...)` counted misses as hits; test membership instead.
            correct = (
                game_rows[game_rows['event_code'] == 4020]['event_data']
                .map(lambda x: 1 if '"correct":true' in str(x) else 0)
                .sum()
            )
            # No attempts in the window -> accuracy 0 instead of a NaN/inf division
            new_df_dict['gaming_accuracy'] = correct / attempts if attempts > 0 else 0
        elif which == 'Activity':
            new_df_dict['Activity actions'] = new_df[new_df['type'] == 'Activity']['event_id'].count()

        event_code_count_dict.update(new_df_dict)
        break
    print(event_code_count_dict)

In [None]:
def get_features(df: pd.DataFrame, which):
    """Prototype: features of the first pre-assessment window of ``df``.

    Window bounds are derived from the last row of every 'Assessment' game
    session; the loop stops after the first window.

    Parameters
    ----------
    df : pd.DataFrame
        Event log; reads ``type``, ``game_session``, ``installation_id``,
        ``event_id``, ``event_code``, ``title``, ``game_time``, ``event_data``.
    which : str
        Activity type to describe; 'Game' and 'Activity' add extra features.

    Returns
    -------
    dict
        ``{installation_id: {game_session: {feature name: value}}}`` for the
        first window, including one row count per code in the global
        ``list_of_event_code``; empty when ``df`` has no assessment rows.
    """
    new_df_dict = {}

    df = df.reset_index(drop=True)

    # Position of the last row of each assessment session
    a2 = df[df['type'] == 'Assessment'].drop_duplicates(subset = 'game_session',keep = 'last').index.tolist()

    # Flat list of window bounds: [start0, end0, start1, end1, ...]
    a = [0]
    for last_row in a2:
        a.append(last_row)
        a.append(last_row + 1)
    a.pop()

    # Without any assessment there is no window; indexing `a` below would fail
    if len(a) < 2:
        return new_df_dict

    for i in tqdm(range(10)):
        # NOTE(review): this slice stops one row before a[i+1], i.e. it leaves out
        # the assessment's last row — confirm that is intended.
        new_df = df.iloc[a[i]:a[i+1],:]
        ids  = new_df['installation_id'].unique().tolist()[-1]
        g_session = new_df[new_df['type'] == 'Assessment']['game_session'].unique()[-1]
        print(ids,g_session)

        features = {}
        features['num_game_session'] = new_df['game_session'].nunique()
        features['num_event_id'] = new_df['event_id'].nunique()
        features['num_actions_before_assessment'] = len(new_df['event_code'])
        features[f'num_unique_{which}'] = new_df[new_df['type'] == which]['title'].nunique()
        if which == 'Game':
            game_rows = new_df[new_df['type'] == 'Game']
            attempts = new_df[new_df['event_code'] == 4020]['event_code'].count()
            features['game_attempts'] = attempts
            features['mean_game_time'] = game_rows['game_time'].mean()
            features['max_game_time'] = game_rows['game_time'].max()
            features['min_game_time'] = game_rows['game_time'].min()
            features['std_game_time'] = game_rows['game_time'].std()
            # str.find() returns -1 (truthy) when the marker is absent, so the old
            # `if str(x).find(...)` counted misses as hits; test membership instead.
            correct = (
                game_rows[game_rows['event_code'] == 4020]['event_data']
                .map(lambda x: 1 if '"correct":true' in str(x) else 0)
                .sum()
            )
            # No attempts in the window -> accuracy 0 instead of a NaN/inf division
            features['gaming_accuracy'] = correct / attempts if attempts > 0 else 0
        elif which == 'Activity':
            features['Activity actions'] = new_df[new_df['type'] == 'Activity']['event_id'].count()

        # One row count per known event code (0 when the code does not occur)
        for code in {eve: 0 for eve in list_of_event_code}:
            features[code] = new_df[new_df['event_code'] == code]['event_code'].count()

        new_df_dict[ids] = {g_session: features}
        break
    return new_df_dict

In [None]:
def g5():
    """Run ``get_features(..., 'Game')`` per installation and merge the results.

    For every ``installation_id`` in the global ``train`` frame, its 'Game'
    and 'Assessment' rows are passed to ``get_features`` and the returned dict
    is merged into one accumulator.

    Returns
    -------
    dict
        The merged feature dict (it used to be built and then discarded).
    """
    train_column_values = {}
    # The type filter is the same for every installation, so build it once
    is_game_or_assessment = train['type'].isin(['Game', 'Assessment'])
    for install_id in tqdm(train['installation_id'].unique().tolist()):
        df = train[(train['installation_id'] == install_id) & is_game_or_assessment]
        train_column_values.update(get_features(df, 'Game'))
    return train_column_values

In [None]:
def g4():
    """Compute whole-history activity features for every installation in ``train``.

    Returns
    -------
    dict
        ``{installation_id: features}`` where ``features`` holds
        ``num_Game`` / ``num_Activity`` / ``num_Clip`` (distinct titles per
        type), ``total_num_actions`` (rows of those three types),
        ``total_game_attempts`` (Game rows with event_code 4020),
        ``total_correct_game_attempts`` (those whose event_data contains
        ``"correct":true``) and ``total_accuracy`` (0 when there are no
        attempts). The dict used to be built and then discarded.
    """
    per_install_id_features = {}
    for install_id, data in tqdm(train.groupby('installation_id', sort=False)):
        features = {}
        for kind in ['Game', 'Activity', 'Clip']:
            features[f'num_{kind}'] = data[data['type'] == kind]['title'].nunique()
        features['total_num_actions'] = data[data['type'].isin(['Game', 'Activity', 'Clip'])]['event_id'].count()

        # Counted straight from the selected rows; no helper column is written
        # into the group, which was a write into a slice of `train`.
        attempt_rows = data[(data['type'] == 'Game') & (data['event_code'] == 4020)]
        attempts = attempt_rows['event_code'].count()
        correct = attempt_rows['event_data'].map(lambda x: 1 if '"correct":true' in str(x) else 0).sum()

        features['total_game_attempts'] = attempts
        features['total_correct_game_attempts'] = correct
        features['total_accuracy'] = correct / attempts if attempts > 0 else 0
        per_install_id_features[install_id] = features

    return per_install_id_features

In [None]:
def g3():
    """Accumulate clip/game counters per installation of ``test`` up to each assessment.

    Sessions are visited in order of appearance. Clip and Game sessions update
    running counters; when an Assessment session is reached, a snapshot of the
    counters is stored under that session's id and the counters restart from
    zero. Activity sessions are currently ignored.

    Returns
    -------
    dict
        ``{assessment game_session: {feature name: value}}`` (it used to be
        built and then discarded).
    """
    clip_features = ['num_clip_watched']
    game_features = ['num_unique_games','total_game_actions','total_duration_spend_for_game','total_game_attempts','total_correct_attempts','last_game_played_accuracy','gaming_accuracy']
    total_data = {}

    for _, install_events in tqdm(test.groupby('installation_id',sort=False)):

        clip_data = {name: 0 for name in clip_features}
        game_data = {name: 0 for name in game_features}
        activity_data = {}

        for session_id, session in install_events.groupby('game_session',sort=False):

            session_type = session['type'].iloc[0]

            if session_type == 'Clip':
                clip_data['num_clip_watched'] += 1

            elif session_type == 'Game':
                attempt_rows = session[session['event_code'] == 4020]
                # NOTE(review): incremented once per Game session, not per distinct title
                game_data['num_unique_games'] += 1
                game_data['total_game_actions'] += session['event_code'].count()
                game_data['total_duration_spend_for_game'] += int(session['game_time'].iloc[-1]/1000)
                game_data['total_game_attempts'] += attempt_rows['event_code'].count()
                game_data['total_correct_attempts'] += attempt_rows['event_data'].map(lambda x: 1 if '"correct":true' in str(x) else 0).sum()

                # Guarded once for both features: with zero attempts so far the
                # unguarded division produced NaN, and NaN then stuck to the
                # running 'gaming_accuracy' sum for the rest of the window.
                if game_data['total_game_attempts'] > 0:
                    running_accuracy = np.round(game_data['total_correct_attempts'] / game_data['total_game_attempts'], 3)
                else:
                    running_accuracy = 0
                # NOTE(review): both values come from the cumulative totals, not from
                # the last game alone — confirm this is the intended definition.
                game_data['gaming_accuracy'] += running_accuracy
                game_data['last_game_played_accuracy'] = running_accuracy

            elif session_type == 'Activity':
                pass

            elif session_type == 'Assessment':
                # Snapshot the counters gathered since the previous assessment
                total_data[session_id] = {}
                total_data[session_id].update(clip_data)
                total_data[session_id].update(activity_data)
                total_data[session_id].update(game_data)

                clip_data = {name: 0 for name in clip_features}
                game_data = {name: 0 for name in game_features}
                activity_data = {}
        #break

    return total_data

In [None]:
def g2():
    """Derive one labelled row per assessment session from the global ``train``.

    Only assessment attempt events are kept: event_code 4110 for
    'Bird Measurer (Assessment)' and 4100 for every other title. Per session
    the attempts and the attempts whose event_data contains ``"correct":true``
    are counted, and the accuracy group is set from their ratio:
    1.0 -> 3, 0.5 -> 2, between 0 and 0.5 -> 1, anything else -> 0.

    The global ``train`` is left untouched. The previous version rebound the
    name ``train`` inside the function, which made it a local variable and
    raised UnboundLocalError on the first line.

    Returns
    -------
    pd.DataFrame
        The last attempt row of every session with the added columns
        ``total_attempt``, ``num_correct``, ``accuracy`` and ``accuracy_group``.
    """
    is_bird = train['title'] == 'Bird Measurer (Assessment)'
    is_attempt = ((train['event_code'] == 4100) & ~is_bird) | ((train['event_code'] == 4110) & is_bird)
    labels = train[(train['type'] == 'Assessment') & is_attempt].copy()

    # Attempts per session
    session_count = labels['game_session'].value_counts().to_dict()
    labels['assessment_attempt_count'] = labels['game_session'].map(session_count)

    # Correct attempts per session
    is_correct = labels['event_data'].map(lambda x: 1 if '"correct":true' in x else 0)
    correct_attempt = dict(is_correct.groupby(labels['game_session'], sort=False).sum())
    labels['contains_true_assessment_count'] = labels['game_session'].map(correct_attempt)

    labels['accumulated_accuracy'] = np.where(
        labels['contains_true_assessment_count'] == 0,
        0,
        labels['contains_true_assessment_count'] / labels['assessment_attempt_count'],
    )

    accuracy = labels['accumulated_accuracy']
    # NOTE(review): a ratio strictly between 0.5 and 1 falls through to group 0 — confirm.
    labels['accuracy_group'] = np.select(
        [accuracy == 1, accuracy == 0.5, (accuracy < 0.5) & (accuracy > 0)],
        [3, 2, 1],
        default=0,
    )

    labels = labels.rename(columns={'contains_true_assessment_count': 'num_correct',
                                    'accumulated_accuracy': 'accuracy',
                                    'assessment_attempt_count': 'total_attempt'})
    labels = labels.drop_duplicates(subset='game_session', keep='last').reset_index(drop=True)
    return labels

In [None]:
def g1():
    """Derive one labelled row per assessment session from the global ``test``.

    Assessment rows are kept when they are the first event of a session
    (``event_count == 1``) or an attempt event: event_code 4110 for
    'Bird Measurer (Assessment)' and 4100 for every other title. Per session
    the kept rows and the rows whose event_data contains ``"correct":true``
    are counted, and the accuracy group is set from their ratio:
    1.0 -> 3, 0.5 -> 2, between 0 and 0.5 -> 1, anything else -> 0.

    The global ``test`` is left untouched. The previous version rebound the
    name ``test`` inside the function, which made it a local variable and
    raised UnboundLocalError on the first line.

    Returns
    -------
    pd.DataFrame
        The last kept row of every session with the added columns
        ``total_attempt``, ``num_correct``, ``accuracy`` and ``accuracy_group``.
    """
    is_bird = test['title'] == 'Bird Measurer (Assessment)'
    is_attempt = ((test['event_code'] == 4100) & ~is_bird) | ((test['event_code'] == 4110) & is_bird)
    labels = test[(test['type'] == 'Assessment') & ((test['event_count'] == 1) | is_attempt)].copy()

    # Kept rows per session.
    # NOTE(review): the event_count == 1 row is counted here too, so
    # total_attempt includes the session-start event — confirm this is intended.
    session_count = labels['game_session'].value_counts().to_dict()
    labels['assessment_attempt_count'] = labels['game_session'].map(session_count)

    # Correct attempts per session
    is_correct = labels['event_data'].map(lambda x: 1 if '"correct":true' in x else 0)
    correct_attempt = dict(is_correct.groupby(labels['game_session'], sort=False).sum())
    labels['contains_true_assessment_count'] = labels['game_session'].map(correct_attempt)

    labels['accumulated_accuracy'] = np.where(
        labels['contains_true_assessment_count'] == 0,
        0,
        labels['contains_true_assessment_count'] / labels['assessment_attempt_count'],
    )

    accuracy = labels['accumulated_accuracy']
    # NOTE(review): a ratio strictly between 0.5 and 1 falls through to group 0 — confirm.
    labels['accuracy_group'] = np.select(
        [accuracy == 1, accuracy == 0.5, (accuracy < 0.5) & (accuracy > 0)],
        [3, 2, 1],
        default=0,
    )

    labels = labels.rename(columns={'contains_true_assessment_count': 'num_correct',
                                    'accumulated_accuracy': 'accuracy',
                                    'assessment_attempt_count': 'total_attempt'})
    labels = labels.drop_duplicates(subset='game_session', keep='last').reset_index(drop=True)
    return labels

In [None]:
def g8():
    """5-fold cross-validate ``make_classifier()`` on the globals ``x`` and ``y``.

    Each fold trains a fresh classifier, predicts its held-out rows into an
    out-of-fold array and prints that fold's QWK; the QWK over the complete
    out-of-fold array is printed at the end.

    Returns
    -------
    numpy.ndarray
        The out-of-fold predictions, aligned with the rows of ``x``.
    """
    oof = np.zeros(len(x))
    NFOLDS = 5
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)

    for fold, (trn_idx, test_idx) in enumerate(folds.split(x, y)):

        print(f'Training on fold {fold+1}')
        # KFold yields row positions, so select with .iloc; .loc would look the
        # numbers up as index labels and only works on a default 0..n-1 index.
        x_trn, y_trn = x.iloc[trn_idx], y.iloc[trn_idx]
        x_val, y_val = x.iloc[test_idx], y.iloc[test_idx]

        clf = make_classifier()
        clf.fit(x_trn, y_trn, eval_set=(x_val, y_val),
                use_best_model=True, verbose=500)

        oof[test_idx] = clf.predict(x_val).reshape(len(test_idx))
        # Score the held-out rows only: the rest of `oof` is still zero until
        # the remaining folds have run, which distorted the running score.
        print(f'Fold {fold+1} QWK:', qwk(y_val, oof[test_idx]))

    print('-' * 30)
    print('OOF QWK:', qwk(y, oof))
    print('-' * 30)
    return oof

In [None]:
# Detached copies of three model-class methods, written as plain functions so
# that the cell parses (the mixed indentation raised an IndentationError).
# `self` is any object that provides the attributes each function reads.

def plot_feature_importance(self, drop_null_importance: bool = True, top_n: int = 10):
    """Draw a horizontal bar plot of the ``top_n`` most important features.

    Reads ``self.feature_importances`` (columns ``feature`` and ``importance``)
    and calls ``self.get_top_features`` to choose and order the features.
    """
    top_feats = self.get_top_features(drop_null_importance, top_n)
    # .copy() so that the dtype change below does not write into a slice
    feature_importances = self.feature_importances.loc[self.feature_importances['feature'].isin(top_feats)].copy()
    feature_importances['feature'] = feature_importances['feature'].astype(str)
    top_feats = [str(i) for i in top_feats]
    sns.barplot(data=feature_importances, x='importance', y='feature', orient='h', order=top_feats)
    plt.title('Feature importances')


def get_top_features(self, drop_null_importance: bool = True, top_n: int = 10):
    """Return up to ``top_n`` feature names ordered by mean importance, highest first.

    Importances in ``self.feature_importances`` are averaged per feature; when
    ``drop_null_importance`` is true, features whose mean is exactly 0 are
    left out.
    """
    grouped_feats = self.feature_importances.groupby(['feature'])['importance'].mean()
    if drop_null_importance:
        grouped_feats = grouped_feats[grouped_feats != 0]
    return list(grouped_feats.sort_values(ascending=False).index)[:top_n]


def plot_metric(self):
    """Plot ``self.eval_metric`` per iteration for every model in ``self.models``.

    Each model's ``model.evals_result_`` supplies one curve per dataset key.
    """
    per_model_results = []
    for model in self.models:
        evals_result = pd.DataFrame()
        for k in model.model.evals_result_.keys():
            evals_result[k] = model.model.evals_result_[k][self.eval_metric]
        evals_result = evals_result.reset_index().rename(columns={'index': 'iteration'})
        per_model_results.append(evals_result)

    # DataFrame.append was removed in pandas 2.0; collect the frames and
    # concatenate them once instead.
    if per_model_results:
        full_evals_results = pd.concat(per_model_results)
    else:
        full_evals_results = pd.DataFrame(columns=['iteration'])

    full_evals_results = full_evals_results.melt(id_vars=['iteration']).rename(columns={'value': self.eval_metric,
                                                                                        'variable': 'dataset'})
    sns.lineplot(data=full_evals_results, x='iteration', y=self.eval_metric, hue='dataset')
    plt.title('Training progress')
