In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns', None)
import datetime
import catboost
from catboost import CatBoostClassifier,Pool
from time import time
from tqdm import tqdm_notebook as tqdm
import os
import random
import json
import pprint
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold,GroupKFold
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import cohen_kappa_score

In [2]:
def qwk(act,pred,n=4,hist_range=(0,3)):
    
    O = confusion_matrix(act,pred)
    O = np.divide(O,np.sum(O))
    
    W = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            W[i][j] = ((i-j)**2)/((n-1)**2)
            
    act_hist = np.histogram(act,bins=n,range=hist_range)[0]
    prd_hist = np.histogram(pred,bins=n,range=hist_range)[0]
    
    E = np.outer(act_hist,prd_hist)
    E = np.divide(E,np.sum(E))
    
    num = np.sum(np.multiply(W,O))
    den = np.sum(np.multiply(W,E))
        
    return 1-np.divide(num,den)

In [3]:
def spec(value,*args):
    i= specs[specs['event_id'] == value].index.values[-1]
    print('Index :',i)
    print('Event_code :',train[train['event_id'] == value]['event_code'].unique()[-1])
    for arg in args:
        if(arg == 'info'):
         print(specs[arg][i])
        elif(arg == 'args'):
         print(pprint.pprint(json.loads(specs[arg][i])))
        else:
         print('Nothing')

In [4]:
def event(value):
    i = train[train['event_id'] == value].index.values[-1]
    print(pprint.pprint(json.loads(train['event_data'][i])))

In [5]:
def get_game_feature(df,data):
    g0_d = dict(df[df['type'] == 'Game'].groupby('installation_id')['type'].value_counts())
    g0 = {}
    for (k,i),v in g0_d.items():
        g0[k] = v
    del g0_d
    df['G0'] = df['installation_id'].map(g0)
    df['G0'].fillna(0,inplace=True)

    g1_d = dict(df[(df['event_code'] == 4020) & (df['type'] == 'Game')].groupby('installation_id')['event_code'].value_counts())
    g1 = {}
    for (k,i),v in g1_d.items():
        g1[k] = v
    del g1_d
    df['G1'] = df['installation_id'].map(g1)
    df['G1'].fillna(0,inplace=True)
    del g1
    
    df['contains_true'] = df[(df['event_code'] == 4020) & (df['type'] == 'Game')]['event_data'].map(lambda x:1 if(str(x).find('"correct":true')>= 0) else 0)
    g2 = dict(df[df['event_code'] == 4020].groupby('installation_id')['contains_true'].sum())
    df['G2'] = df['installation_id'].map(g2)
    df['G2'].fillna(0,inplace=True)
    for c in ['contains_true']:
        df.pop(c)
        
    df['G3'] = df['G2'] / df['G1']
    df['G3'] = df['G3'].map(lambda x:3 if(x == 1.0) else(2 if((x >= 0.5) & (x <1.0)) else (1 if((x>0.0) &  (x<0.5)) else 0)))
    df['G3'].fillna(0,inplace=True)
    
    i = 4
    for d in tqdm(data):
        g_attempt_d = dict(df[(df['type'] == 'Game') & (df['event_code'] == 4020) & (df['title'] == d)].groupby('installation_id')['title'].value_counts())
        g_attempt = {}
        for (k,z),v in g_attempt_d.items():
            g_attempt[k] = v

        df[f'G{i}_attempt'] = df['installation_id'].map(g_attempt)
        df[f'G{i}_attempt'].fillna(0,inplace=True)

        df['contains_true'] = df[(df['type'] == 'Game') & (df['event_code'] == 4020) & (df['title'] == d)]['event_data'].map(lambda x:1 if(str(x).find('"correct":true')>=0) else 0)
        g = dict(df.groupby('installation_id')['contains_true'].sum())
        df[f'G{i}_correct'] = df['installation_id'].map(g)
        df[f'G{i}_correct'].fillna(0,inplace=True)

        df[f'G{i}_accuracy'] = df[f'G{i}_correct'] / df[f'G{i}_attempt']
        df[f'G{i}_accuracy'] = df[f'G{i}_accuracy'].map(lambda x:3 if(x == 1.0) else(2 if((x >= 0.5) & (x <1.0)) else (1 if((x>0.0) &  (x<0.5)) else 0)))
        df[f'G{i}_accuracy'].fillna(0,inplace=True)

        for c in ['contains_true',f'G{i}_attempt']:
            df.pop(c)
            
        i = i + 1

In [7]:
def get_attempted_assessment_results(df,data):
    if(data == 'train'):
        df = df[(df['type'] == 'Assessment') & (((df['event_code'] == 4100) & (df['title'] != 'Bird Measurer (Assessment)')) | ((df['event_code'] == 4110) & (df['title'] == 'Bird Measurer (Assessment)')))]
    else:
        df =  df[(df['type'] == 'Assessment') &  ((df['event_count'] == 1) | ((df['event_code'] == 4100) & (df['title'] != 'Bird Measurer (Assessment)')) | ((df['event_code'] == 4110) & (df['title'] == 'Bird Measurer (Assessment)')))]
    session_count = df['game_session'].value_counts().to_dict()
    df['assessment_attempt_count'] = df['game_session'].map(session_count)

    df['contains_true_assessment'] = df['event_data'].map(lambda x: True if (x.find('"correct":true')>=0) else False)

    change_value = {
        True : 1,
        False : 0
    }
    df['contains_true_assessment'] = df['contains_true_assessment'].map(change_value)

    correct_attempt = dict(df.groupby('game_session',sort=False)['contains_true_assessment'].sum())
    df['contains_true_assessment_count'] = df['game_session'].map(correct_attempt)

    for c in ['contains_true_assessment']:
        df.pop(c)

    df['accumulated_accuracy'] = np.where((df['contains_true_assessment_count'] == 0),0,(df['contains_true_assessment_count']/df['assessment_attempt_count']))

    df.loc[(df['type'] == 'Assessment'), 'accuracy_group'] = 0
    df.loc[(df['accumulated_accuracy'] == 1) & (df['type'] == 'Assessment'), 'accuracy_group'] = 3
    df.loc[(df['accumulated_accuracy'] == 0.5) & (df['type'] == 'Assessment'), 'accuracy_group'] = 2
    df.loc[(df['accumulated_accuracy'] < 0.5) & (df['accumulated_accuracy'] > 0) & (df['assessment_attempt_count'] > 0) & (df['type'] == 'Assessment'), 'accuracy_group'] = 1

    df.rename(columns = {'contains_true_assessment_count': 'num_correct',
                            'accumulated_accuracy':'accuracy',
                            'assessment_attempt_count': 'total_attempt'},inplace=True)
    df = df.drop_duplicates(subset = 'game_session',keep = 'last')

In [8]:
def get_validation_data(df,num_samples,random_state):
    validation_data = df[(df['type'] == 'Assessment') & (((df['event_code'] == 4100) & (df['title'] != 'Bird Measurer (Assessment)')) | ((df['event_code'] == 4110) & (df['title'] == 'Bird Measurer (Assessment)')))]
    validation_data.drop_duplicates(subset = 'game_session',keep = 'last',inplace=True)
    if(isinstance(num_samples,float)):
            validation_data = validation_data.sample(frac = num_samples,random_state = random_state)
            print(validation_data.shape)
    else:
            validation_data = validation_data.sample(n = num_samples,random_state = random_state)
            print(validation_data.shape)
    return validation_data

In [69]:
test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')

In [75]:
train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')
test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')
submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')

In [40]:
assessment_id = list(train[train['type'] == 'Assessment']['installation_id'].unique())
train = train.loc[train['installation_id'].isin(assessment_id)]

In [41]:
train = train.reset_index(drop=False)
train.drop(columns = ['index'],axis = 1, inplace = True)

**For Game**

In [12]:
#game_list = set(train[train['type'] == 'Game']['title'].unique()).union(set(test[test['type'] == 'Game']['title'].unique()))
#game_list = list(game_list)

In [13]:
#get_game_feature(train,game_list)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




In [14]:
#get_game_feature(test,game_list)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




In [None]:
#activities_list = set(train[train['type'] == 'Activity']['title'].unique()).union(set(test[test['type'] == 'Activity']['title'].unique()))
#activities_list = list(activities_list)

In [None]:
#assessment_list = set(train[train['type'] == 'Assessment']['title'].unique()).union(set(test[test['type'] == 'Assessment']['title'].unique()))
#assessment_list = list(assessment_list)

In [None]:
#train.shape

**Train**

In [59]:
train = train[(train['type'] == 'Assessment') & (((train['event_code'] == 4100) & (train['title'] != 'Bird Measurer (Assessment)')) | ((train['event_code'] == 4110) & (train['title'] == 'Bird Measurer (Assessment)')))]

session_count = train['game_session'].value_counts().to_dict()
train['assessment_attempt_count'] = train['game_session'].map(session_count)

train['contains_true_assessment'] = train['event_data'].map(lambda x: True if (x.find('"correct":true')>=0) else False)

change_value = {
    True : 1,
    False : 0
}
train['contains_true_assessment'] = train['contains_true_assessment'].map(change_value)

correct_attempt = dict(train.groupby('game_session',sort=False)['contains_true_assessment'].sum())
train['contains_true_assessment_count'] = train['game_session'].map(correct_attempt)

for c in ['contains_true_assessment']:
    train.pop(c)
    
train['accumulated_accuracy'] = np.where((train['contains_true_assessment_count'] == 0),0,(train['contains_true_assessment_count']/train['assessment_attempt_count']))

train.loc[(train['type'] == 'Assessment'), 'accuracy_group'] = 0
train.loc[(train['accumulated_accuracy'] == 1) & (train['type'] == 'Assessment'), 'accuracy_group'] = 3
train.loc[(train['accumulated_accuracy'] == 0.5) & (train['type'] == 'Assessment'), 'accuracy_group'] = 2
train.loc[(train['accumulated_accuracy'] < 0.5) & (train['accumulated_accuracy'] > 0) & (train['assessment_attempt_count'] > 0) & (train['type'] == 'Assessment'), 'accuracy_group'] = 1

train.rename(columns = {'contains_true_assessment_count': 'num_correct',
                        'accumulated_accuracy':'accuracy',
                        'assessment_attempt_count': 'total_attempt'},inplace=True)
train = train.drop_duplicates(subset = 'game_session',keep = 'last')
train = train.reset_index(drop=False)
train.drop(columns = ['index'],axis = 1, inplace = True)
train.shape

(17690, 15)

In [60]:
#train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,total_attempt,num_correct,accuracy,accuracy_group
0,25fa8af4,901acc108f55a5a1,2019-08-06T05:22:32.357Z,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,44,4100,31011,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,1,1.0,3.0
1,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:49.155Z,"{""correct"":false,""caterpillars"":[5,8,6],""event...",0006a69f,85,4110,90032,Bird Measurer (Assessment),Assessment,TREETOPCITY,11,0,0.0,0.0
2,25fa8af4,6bdf9623adc94d89,2019-08-06T05:38:08.036Z,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,30,4100,18026,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,1,1.0,3.0
3,25fa8af4,9501794defd84e4d,2019-08-06T20:35:16.846Z,"{""correct"":true,""stumps"":[2,3,5],""event_count""...",0006a69f,38,4100,23043,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,2,1,0.5,2.0
4,17113b36,a9ef3ecb3d1acc6a,2019-08-06T20:50:12.115Z,"{""correct"":true,""caterpillars"":[4,8,5],""event_...",0006a69f,14,4110,13050,Bird Measurer (Assessment),Assessment,TREETOPCITY,1,1,1.0,3.0


**Test**

In [61]:
test =  test[(test['type'] == 'Assessment') &  ((test['event_count'] == 1) | ((test['event_code'] == 4100) & (test['title'] != 'Bird Measurer (Assessment)')) | ((test['event_code'] == 4110) & (test['title'] == 'Bird Measurer (Assessment)')))]

In [62]:
session_count = test['game_session'].value_counts().to_dict()
test['assessment_attempt_count'] = test['game_session'].map(session_count)

test['contains_true_assessment'] = test['event_data'].map(lambda x: True if (x.find('"correct":true')>=0) else False)

change_value = {
    True : 1,
    False : 0
}
test['contains_true_assessment'] = test['contains_true_assessment'].map(change_value)

correct_attempt = dict(test.groupby('game_session',sort=False)['contains_true_assessment'].sum())
test['contains_true_assessment_count'] = test['game_session'].map(correct_attempt)


for c in ['contains_true_assessment']:
    test.pop(c)

test['accumulated_accuracy'] = np.where((test['contains_true_assessment_count'] == 0),0,(test['contains_true_assessment_count']/test['assessment_attempt_count']))

test.loc[(test['type'] == 'Assessment'), 'accuracy_group'] = 0
test.loc[(test['accumulated_accuracy'] == 1) & (test['type'] == 'Assessment'), 'accuracy_group'] = 3
test.loc[(test['accumulated_accuracy'] == 0.5) & (test['type'] == 'Assessment'), 'accuracy_group'] = 2
test.loc[(test['accumulated_accuracy'] < 0.5) & (test['accumulated_accuracy'] > 0) & (test['assessment_attempt_count'] > 0) & (test['type'] == 'Assessment'), 'accuracy_group'] = 1

test.rename(columns = {'contains_true_assessment_count': 'num_correct',
                        'accumulated_accuracy':'accuracy',
                        'assessment_attempt_count': 'total_attempt'},inplace=True)

test = test.drop_duplicates(subset = 'game_session',keep = 'last')
test = test.reset_index(drop=False)
test.drop(columns = ['index'],axis = 1, inplace = True)
test.shape

(3347, 15)

In [None]:
#validation_data = get_validation_data(test,0.2,45)#
#validation_data = validation_data.reset_index(drop=False)
#validation_data.drop(columns = ['index'],axis = 1, inplace = True)

In [None]:
#validation_install_id = validation_data['installation_id'].unique().tolist()#

In [None]:
#test = test.loc[~test['installation_id'].isin(validation_install_id)]#
#test.shape

In [63]:
for c in tqdm(['event_id','game_session','installation_id','event_code','event_data','event_count','type','timestamp','world']):
    for df in [train,test]:
        df.pop(c)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [64]:
list_of_user_activities = list(set(train['title'].value_counts().index).union(set(test['title'].value_counts().index)))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

train['title'] = train['title'].map(activities_map)
test['title'] = test['title'].map(activities_map)
train_labels['title'] = train_labels['title'].map(activities_map)

In [None]:
#validation_data['title'] = validation_data['title'].map(activities_map)

In [65]:
x = train.drop(columns=['accuracy_group'],axis=1)
y = train['accuracy_group']

In [None]:
#val_x = validation_data.drop(columns=['accuracy_group'],axis=1)
#val_y = validation_data['accuracy_group']

In [36]:
def make_classifier():
    clf = CatBoostClassifier(
                               loss_function='MultiClass',
                               task_type="CPU",
                               learning_rate=0.01,
                               iterations=100,
                               od_type="Iter",
                               early_stopping_rounds=50,
                               random_seed=2019,
                               colsample_bylevel=0.87,
                               eval_metric='Kappa',
                              )
        
    return clf
oof = np.zeros(len(x))

In [51]:
#train.head()

Unnamed: 0,game_time,title,total_attempt,num_correct,accuracy,accuracy_group
690,31011,0,1,1,1.0,3.0
1171,35771,2,11,0,0.0,0.0
1177,42805,2,11,0,0.0,0.0
1182,47388,2,11,0,0.0,0.0
1187,50605,2,11,0,0.0,0.0


In [56]:
#y.head(10000)

690        3.0
1171       0.0
1177       0.0
1182       0.0
1187       0.0
1192       0.0
1195       0.0
1200       0.0
1205       0.0
1212       0.0
1218       0.0
1227       0.0
1259       3.0
2187       2.0
2196       2.0
2599       3.0
5362       3.0
5741       0.0
5754       0.0
5774       0.0
5803       0.0
5849       2.0
5855       2.0
6041       3.0
6890       2.0
6916       2.0
6956       3.0
6981       0.0
6985       0.0
6990       0.0
6995       0.0
7014       0.0
7144       3.0
7259       0.0
8513       0.0
8522       0.0
11365      0.0
11377      0.0
11387      0.0
11398      0.0
11408      0.0
11415      0.0
11441      0.0
11456      0.0
11460      0.0
11468      0.0
11471      0.0
15077      0.0
15084      0.0
15092      0.0
15097      0.0
15106      0.0
16721      2.0
16727      2.0
16739      1.0
16745      1.0
16750      1.0
16757      1.0
16765      1.0
16773      1.0
16779      1.0
16784      1.0
16790      1.0
16796      1.0
16802      1.0
16856      3.0
16884     

In [66]:
oof = np.zeros(len(x))
NFOLDS = 5
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)


for fold, (trn_idx, test_idx) in enumerate(folds.split(x, y)):
    
    print(f'Training on fold {fold+1}')
    clf = make_classifier()
    clf.fit(x.loc[trn_idx], y.loc[trn_idx], eval_set=(x.loc[test_idx], y.loc[test_idx]),
                          use_best_model=True, verbose=500)
    
    oof[test_idx] = clf.predict(x.loc[test_idx]).reshape(len(test_idx))
    print('OOF QWK:', qwk(y, oof))
    
print('-' * 30)
print('OOF QWK:', qwk(y, oof))
print('-' * 30)

Training on fold 1
0:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 8.59ms	remaining: 851ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.
OOF QWK: 0.13317534257028285
Training on fold 2
0:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 7.69ms	remaining: 761ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.
OOF QWK: 0.2887943292164954
Training on fold 3
0:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 8.14ms	remaining: 806ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.
OOF QWK: 0.4773752552123316
Training on fold 4
0:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 8.35ms	remaining: 827ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.