**Notes**

**1) Description of the all the columns**





*   **event_id** - Randomly generated unique identifier for the event type. Maps to event_id column in specs table.
*   **game_session** - Randomly generated unique identifier grouping events within a single game or video play session.
*   **timestamp** - Client-generated datetime
*   **event_data** - Semi-structured JSON formatted string containing the events parameters. Default fields are: event_count, event_code, and game_time; otherwise fields are determined by the event type.
*   **installation_id** - Randomly generated unique identifier grouping game sessions within a single installed application instance.
*   **event_count** - Incremental counter of events within a game session (offset at 1). Extracted from event_data.
*   **event_code** - Identifier of the event 'class'. Unique per game, but may be duplicated across games. E.g. event code '2000' always identifies the 'Start Game' event for all games. Extracted from event_data.
*   **game_time** - Time in milliseconds since the start of the game session. Extracted from event_data.
*   **title** - Title of the game or video.
*   **type** - Media type of the game or video. Possible values are: 'Game', 'Assessment', 'Activity', 'Clip'.
*   **world** - The section of the application the game or video belongs to. Helpful to identify the educational curriculum goals of the media. Possible values are: 'NONE' (at the app's start screen), TREETOPCITY' (Length/Height), 'MAGMAPEAK' (Capacity/Displacement), 'CRYSTALCAVES' (Weight).






**2) Groupby data to get the number of attempts each installation_id played**


*   train_data.groupby(['game_session','installation_id'],as_index =False)['title'].agg({'value_counts'}).rename(columns={'value_counts':'Total_no'}).head()

*   test_data.groupby(['game_session','installation_id'])['title'].agg({'value_counts'}).rename(columns={'value_counts':'Total_no'}).index.get_level_values(3)


**3) Event Codes Meaning**

*   2000 : Start of the game
*   3010 : Voice description of what to do in the game
*   3110 : Starting of game with the voice description in the background
*   4070 : Player starting to play the game


**4) Data Analysis**
* All Event Id has the same value for a particular game title event though they have different installation id
* All Event Code have same value for a particular Event Id

**5) Approach to solutions**
* first 

**Importing the modules**

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns', None)
import datetime
from catboost import CatBoostClassifier
from time import time
from tqdm import tqdm_notebook as tqdm
import os
import random
import json
import pprint
import gc
from sklearn.model_selection import KFold,GroupKFold
from sklearn.metrics import confusion_matrix

**Making Event Determenistic**

In [None]:
def seed_everything(seed=0):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)

In [None]:
seed_everything(70)

**The Competition Eval Metric : Quadratic Weight Kappa**

In [None]:
def quadratic_weight_kappa(actual, prediction,n=4,hist_range=(0,3)):
  O = confusion_matrix(actual,prediction)
  O = np.divide(O,np.sum(O))

  W = np.zeros((n,n))
  for i in range(n):
    for j in range(n):
      W[i][j] = ((i-j)**2)/((n-1)**2)

  actual_histogram = np.histogram(actual,bins=n,range=hist_range)[0]
  prediction_histogram = np.histogram(prediction,bins=n,range=hist_range)[0]

  E = np.outer(actual_histogram,prediction_histogram)
  E = np.divide(E,np.sum(E))

  num = np.sum(np.multiply(W,O))
  density = np.sum(np.multiply(W,E))

  return 1 - np.divide(num,density)
    

In [None]:
def pretty_json(data):
    return pprint.pprint(json.loads(data))

In [None]:
def read_file():
    train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')
    train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
    specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')
    test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
    submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')
    
    return train,train_labels,specs,test,submission

**Reading the files**

In [None]:
train,train_labels,specs,test,submission = read_file()

**Filtering out the train set having installation id that have atleast one assessment done**

In [None]:
train_install_id = list(train['installation_id'].unique())

In [None]:
assessment_id = list(train[train['type'] == 'Assessment']['installation_id'].unique())
train = train.loc[train['installation_id'].isin(assessment_id)]

In [None]:
list_of_user_activities = list(set(train['title'].value_counts().index).union(set(test['title'].value_counts().index)))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

train['title'] = train['title'].map(activities_map)
test['title'] = test['title'].map(activities_map)
train_labels['title'] = train_labels['title'].map(activities_map)

In [None]:
ids = list(train_labels['installation_id'].unique())
train = train.loc[train['installation_id'].isin(ids)]

In [None]:
session = train_labels['game_session'].values
acc_group = train_labels['accuracy_group'].values
match_data = dict(zip(session,acc_group))

In [None]:
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [None]:
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [296]:
train = train.reset_index(drop=False)
train.drop(columns = ['index'],axis = 1, inplace = True)

In [None]:
train = train[train['type'] == 'Assessment']

In [266]:
train['contains_true'] = train['event_data'].map(lambda x: True if (x.find('"correct":true')>=0) else None)
train['contains_true_assessment'] = np.where((((train['event_code'] == 4100) | (train['event_code'] == 4110)) & train['contains_true'] == True),True,None)

In [267]:
train['contains_false'] = train['event_data'].map(lambda x: True if (x.find('"correct":false')>=0) else None)
train['contains_false_assessment'] = np.where((((train['event_code'] == 4100) | (train['event_code'] == 4110)) & train['contains_false'] == True),True,None)

In [268]:
for c in ['contains_false','contains_true']:
    train.pop(c)

In [None]:
train.shape

In [None]:
train = train[(train['event_code'] == 4100) | (train['event_code'] == 4110)]
train.shape

In [None]:
train = train.reset_index(drop=False)
train.drop(columns = ['index'],axis = 1, inplace = True)

In [None]:
session_count = train['game_session'].value_counts().to_dict()
train['assessment_attempt_count'] = train['game_session'].map(session_count)

In [None]:
train.groupby('game_session')['contains_true_assessment'].value_counts()

In [272]:
attempt_true_count = train.groupby('game_session')['contains_true_assessment'].value_counts().to_dict()

In [274]:
true_attempt = {k : v for (k,i),v in attempt_true_count.items()}
true_attempt

{'00097cda27afb726': 1,
 '0014403daadf67aa': 2,
 '0014daa1d3e26eb2': 1,
 '001c49e9e9968dbe': 2,
 '0020fdaa239f55d4': 1,
 '002135f1e6df45c9': 1,
 '002396a496a0c1c7': 2,
 '0023fe98a0e66a8b': 1,
 '0034c95c473c0dd5': 1,
 '003a74ec8f56ef45': 1,
 '0060b5da47c841d3': 2,
 '00622da5dd5996f9': 1,
 '0067974a3f62e5b0': 1,
 '006c93ae4c78cd28': 1,
 '006ec667711e7708': 1,
 '007161ce1f543b24': 1,
 '00716ced96b7ada9': 1,
 '0073b15e5ff91b14': 1,
 '0078c238d5d45e61': 2,
 '007b3133e00e97e8': 1,
 '0083b64b5a3579f4': 1,
 '008a884d5604084d': 1,
 '00a35ec9a518d748': 1,
 '00a48b277d0268aa': 2,
 '00a63e7446844b73': 1,
 '00a715833d0cc35b': 1,
 '00a7420b803cdeb4': 1,
 '00b0fd253b62a00a': 1,
 '00ba742b1d76ed0e': 1,
 '00be283f1ac8adde': 1,
 '00c09991e764d247': 1,
 '00d13ab81e9a0623': 1,
 '00d678603e206f11': 1,
 '00df325eab5704a5': 1,
 '00e17f1d6e32959c': 1,
 '00e351a021e7726e': 1,
 '00ec10cbcd61bcd6': 1,
 '00ef3777547f55d6': 1,
 '00f4c6c013d5b6cf': 1,
 '00f567680739a6e0': 1,
 '00feedf9a0aa2d4e': 2,
 '010047c9b14b7f

In [291]:
train['correct_attempt'] = train['game_session'].map(true_attempt)

In [None]:
change_value = {
    True : 1,
    False : 0
}
train['contains_true_assessment'] = train['contains_true_assessment'].map(change_value)
train['contains_false_assessment'] = train['contains_false_assessment'].map(change_value)

In [None]:
train['accumulated_accuracy'] = (train['correct_attempt']/train['assessment_attempt_count'])

In [288]:
train_labels[train_labels['game_session'] == 'dc6bc0284039c9fb']

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
13,dc6bc0284039c9fb,001d0ed0,Mushroom Sorter (Assessment),1,0,1.0,3


In [301]:
for i in tqdm(range(train['correct_attempt'].shape[0])):
    if(train['correct_attempt'][i] == 'NaN'):
        train['correct_attempt'][i] == 0
    else:
        continue

HBox(children=(IntProgress(value=0, max=43901), HTML(value='')))




In [302]:
train.head(100)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,contains_true_assessment,contains_false_assessment,assessment_attempt_count,correct_attempt,accumulated_accuracy
0,25fa8af4,901acc108f55a5a1,2019-08-06T05:22:32.357Z,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,44,4100,31011,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,True,,1,1.0,1.0
1,17113b36,77b8ee947eb84b4e,2019-08-06T05:35:54.898Z,"{""correct"":false,""caterpillars"":[11,8,3],""even...",0006a69f,29,4110,35771,Bird Measurer (Assessment),Assessment,TREETOPCITY,,True,11,,1.0
2,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:01.927Z,"{""correct"":false,""caterpillars"":[11,8,11],""eve...",0006a69f,35,4110,42805,Bird Measurer (Assessment),Assessment,TREETOPCITY,,True,11,,1.0
3,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:06.512Z,"{""correct"":false,""caterpillars"":[11,8,5],""even...",0006a69f,40,4110,47388,Bird Measurer (Assessment),Assessment,TREETOPCITY,,True,11,,1.0
4,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:09.739Z,"{""correct"":false,""caterpillars"":[11,8,7],""even...",0006a69f,45,4110,50605,Bird Measurer (Assessment),Assessment,TREETOPCITY,,True,11,,1.0
5,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:13.951Z,"{""correct"":false,""caterpillars"":[11,8,4],""even...",0006a69f,50,4110,54822,Bird Measurer (Assessment),Assessment,TREETOPCITY,,True,11,,1.0
6,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:17.407Z,"{""correct"":false,""caterpillars"":[11,8,4],""even...",0006a69f,53,4110,58280,Bird Measurer (Assessment),Assessment,TREETOPCITY,,True,11,,1.0
7,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:21.390Z,"{""correct"":false,""caterpillars"":[11,8,2],""even...",0006a69f,58,4110,62256,Bird Measurer (Assessment),Assessment,TREETOPCITY,,True,11,,1.0
8,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:26.296Z,"{""correct"":false,""caterpillars"":[11,8,1],""even...",0006a69f,63,4110,67164,Bird Measurer (Assessment),Assessment,TREETOPCITY,,True,11,,1.0
9,17113b36,77b8ee947eb84b4e,2019-08-06T05:36:32.187Z,"{""correct"":false,""caterpillars"":[11,8,1],""even...",0006a69f,70,4110,73056,Bird Measurer (Assessment),Assessment,TREETOPCITY,,True,11,,1.0


In [271]:
train.groupby(['installation_id','game_session'])['contains_true_assessment'].value_counts()

installation_id  game_session      contains_true_assessment
0006a69f         6bdf9623adc94d89  True                        1
                 901acc108f55a5a1  True                        1
                 9501794defd84e4d  True                        1
                 a9ef3ecb3d1acc6a  True                        2
0006c192         197a373a77101924  True                        1
                 957406a905d59afd  True                        1
00129856         ae691ec5ad5652cf  True                        2
001d0ed0         9480b026e3e7d371  True                        1
                 c046a858e7c8bf03  True                        1
                 dc6bc0284039c9fb  True                        1
002db7e3         03e73cfb40d8d204  True                        1
                 113a557b60770369  True                        1
                 1c48bdca76efc7b0  True                        1
                 470a1bd6b99152a7  True                        1
                 5025f22d6e944

In [None]:
true_attempt['346d5f57cf40bac0']

In [None]:
def make_classifier():
    clf = CatBoostClassifier(
                               loss_function='MultiClass',
                               task_type="CPU",
                               learning_rate=0.01,
                               iterations=100,
                               od_type="Iter",
                               early_stopping_rounds=50,
                               random_seed=2019,
                               colsample_bylevel=0.87,
                               eval_metric='Kappa',
                              )
        
    return clf
oof = np.zeros(len(x))

In [None]:
params = {
       'loss_function':'MultiClass',
       'task_type':'CPU',
       'learning_rate':0.05,
       'iterations':20,
       'early_stopping_rounds':5,
       'random_seed':89,
       'colsample_bylevel':0.87,
       'eval_metric':'Kappa',
}

In [None]:
#cat_features = ['Clip','Game','session_title','Activity']

In [None]:
cat_features = ['Clip']

In [None]:
oof = np.zeros(len(x))
NFOLDS = 4
folds = KFold(n_splits=NFOLDS)

for fold, (trn_idx, test_idx) in enumerate(folds.split(x, y)):
    print(f'Training on fold {fold+1}')
    clf = make_classifier()
    clf.fit(x.loc[trn_idx], y.loc[trn_idx], eval_set=(x.loc[test_idx], y.loc[test_idx]),
                          use_best_model=True, verbose=500,cat_features=cat_features)
    oof[test_idx] = clf.predict(x.loc[test_idx]).reshape(len(test_idx))

    
print('-' * 30)
print('OOF QWK:', quadratic_weight_kappa(y, oof))
print('-' * 30)

In [None]:
# process test set
new_test = []
for ins_id,user_sample in tqdm(test.groupby(['installation_id'], sort=False), total=1000):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)
    
test = pd.DataFrame(new_test)

In [None]:
test.head()

In [None]:
preds = clf.predict(test)
del test

## Make Submission

In [None]:
submission['accuracy_group'] = np.round(preds).astype('int')
submission.to_csv('submission.csv', index=None)
submission.head()

In [None]:
!rm -rf catboost_info