**Notes**

**1) Description of the all the columns**





*   **event_id** - Randomly generated unique identifier for the event type. Maps to event_id column in specs table.
*   **game_session** - Randomly generated unique identifier grouping events within a single game or video play session.
*   **timestamp** - Client-generated datetime
*   **event_data** - Semi-structured JSON formatted string containing the events parameters. Default fields are: event_count, event_code, and game_time; otherwise fields are determined by the event type.
*   **installation_id** - Randomly generated unique identifier grouping game sessions within a single installed application instance.
*   **event_count** - Incremental counter of events within a game session (offset at 1). Extracted from event_data.
*   **event_code** - Identifier of the event 'class'. Unique per game, but may be duplicated across games. E.g. event code '2000' always identifies the 'Start Game' event for all games. Extracted from event_data.
*   **game_time** - Time in milliseconds since the start of the game session. Extracted from event_data.
*   **title** - Title of the game or video.
*   **type** - Media type of the game or video. Possible values are: 'Game', 'Assessment', 'Activity', 'Clip'.
*   **world** - The section of the application the game or video belongs to. Helpful to identify the educational curriculum goals of the media. Possible values are: 'NONE' (at the app's start screen), TREETOPCITY' (Length/Height), 'MAGMAPEAK' (Capacity/Displacement), 'CRYSTALCAVES' (Weight).






**2) Groupby data to get the number of attempts each installation_id played**


*   train_data.groupby(['game_session','installation_id'],as_index =False)['title'].agg({'value_counts'}).rename(columns={'value_counts':'Total_no'}).head()

*   test_data.groupby(['game_session','installation_id'])['title'].agg({'value_counts'}).rename(columns={'value_counts':'Total_no'}).index.get_level_values(3)


**3) Event Codes Meaning**

*   2000 : Start of the game
*   3010 : Voice description of what to do in the game
*   3110 : Starting of game with the voice description in the background
*   4070 : Player starting to play the game


**4) Data Analysis**
* All Event Id has the same value for a particular game title event though they have different installation id
* All Event Code have same value for a particular Event Id

**5) Approach to solutions**
* first 

**Importing the modules**

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns', None)
import datetime
from catboost import CatBoostClassifier
from time import time
from tqdm import tqdm_notebook as tqdm
import os
import random
import json
import pprint
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold,GroupKFold
from sklearn.metrics import confusion_matrix

**Making Event Determenistic**

In [None]:
def seed_everything(seed=0):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)

In [None]:
seed_everything(70)

**The Competition Eval Metric : Quadratic Weight Kappa**

In [None]:
def quadratic_weight_kappa(actual, prediction,n=4,hist_range=(0,3)):
  O = confusion_matrix(actual,prediction)
  O = np.divide(O,np.sum(O))

  W = np.zeros((n,n))
  for i in range(n):
    for j in range(n):
      W[i][j] = ((i-j)**2)/((n-1)**2)

  actual_histogram = np.histogram(actual,bins=n,range=hist_range)[0]
  prediction_histogram = np.histogram(prediction,bins=n,range=hist_range)[0]

  E = np.outer(actual_histogram,prediction_histogram)
  E = np.divide(E,np.sum(E))

  num = np.sum(np.multiply(W,O))
  density = np.sum(np.multiply(W,E))

  return 1 - np.divide(num,density)
    

In [None]:
def pretty_json(data):
    return pprint.pprint(json.loads(data))

In [None]:
def read_file():
    train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')
    train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
    specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')
    test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
    submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')
    
    return train,train_labels,specs,test,submission

**Reading the files**

In [None]:
train,train_labels,specs,test,submission = read_file()

**Filtering out the train set having installation id that have atleast one assessment done**

In [None]:
train_install_id = list(train['installation_id'].unique())

In [None]:
assessment_id = list(train[train['type'] == 'Assessment']['installation_id'].unique())
train = train.loc[train['installation_id'].isin(assessment_id)]

In [None]:
train.shape

In [None]:
list_of_user_activities = list(set(train['title'].value_counts().index).union(set(test['title'].value_counts().index)))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

train['title'] = train['title'].map(activities_map)
test['title'] = test['title'].map(activities_map)
train_labels['title'] = train_labels['title'].map(activities_map)

In [None]:
ids = list(train_labels['installation_id'].unique())
train = train.loc[train['installation_id'].isin(ids)]

In [None]:
session = train_labels['game_session'].values
acc_group = train_labels['accuracy_group'].values
match_data = dict(zip(session,acc_group))

In [None]:
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [None]:
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [None]:
train = train[(train['type'] == 'Assessment') & (((train['event_code'] == 4100) & (train['title'] != 'Bird Measurer (Assessment)')) | ((train['event_code'] == 4110) & (train['title'] == 'Bird Measurer (Assessment)')))]
train.shape

In [None]:
train = train.reset_index(drop=False)
train.drop(columns = ['index'],axis = 1, inplace = True)

In [None]:
session_count = train['game_session'].value_counts().to_dict()
train['assessment_attempt_count'] = train['game_session'].map(session_count)

In [None]:
train['contains_true_assessment'] = train['event_data'].map(lambda x: True if (x.find('"correct":true')>=0) else False)
#train['contains_true_assessment'] = np.where((((train['event_code'] == 4100) | (train['event_code'] == 4110)) & train['contains_true'] == True),True,None)

In [None]:
change_value = {
    True : 1,
    False : 0
}
train['contains_true_assessment'] = train['contains_true_assessment'].map(change_value)

In [None]:
correct_attempt = dict(train.groupby('game_session',sort=False)['contains_true_assessment'].sum())
train['contains_true_assessment_count'] = train['game_session'].map(correct_attempt)

In [None]:
for c in ['contains_true_assessment']:
    train.pop(c)

In [None]:
train['accumulated_accuracy'] = np.where((train['contains_true_assessment_count'] == 0),0,(train['contains_true_assessment_count']/train['assessment_attempt_count']))

In [None]:
train.loc[(train['type'] == 'Assessment'), 'accuracy_group'] = 0
train.loc[(train['accumulated_accuracy'] == 1) & (train['type'] == 'Assessment'), 'accuracy_group'] = 3
train.loc[(train['accumulated_accuracy'] == 0.5) & (train['type'] == 'Assessment'), 'accuracy_group'] = 2
train.loc[(train['accumulated_accuracy'] < 0.5) & (train['accumulated_accuracy'] > 0) & (train['assessment_attempt_count'] > 0) & (train['type'] == 'Assessment'), 'accuracy_group'] = 1

In [None]:
train.rename(columns = {'contains_true_assessment_count': 'num_correct',
                        'accumulated_accuracy':'accuracy',
                        'assessment_attempt_count': 'total_attempt'},inplace=True)

In [None]:
train['num_incorrect'] = train['total_attempt'] - train['num_correct']

In [None]:
train = train[['event_id','game_session','timestamp','event_data','installation_id','event_count','game_time','event_code','title','type','world','num_correct','num_incorrect','total_attempt','accuracy','accuracy_group']]

In [None]:
for c in ['event_id','timestamp','event_data','event_code','event_count','type']:
    train.pop(c)

In [None]:
train.to_csv('./../working/train_data.csv',index=False)

In [115]:
train = pd.read_csv('./../working/train_data.csv')
train.shape

(32743, 11)

In [117]:
train.head(10)

Unnamed: 0,game_session,installation_id,game_time,title,type,world,num_correct,num_incorrect,total_attempt,accuracy,accuracy_group
0,901acc108f55a5a1,0006a69f,31011,37,Assessment,TREETOPCITY,1,0,1,1.0,3.0
1,6bdf9623adc94d89,0006a69f,18026,37,Assessment,TREETOPCITY,1,0,1,1.0,3.0
2,9501794defd84e4d,0006a69f,18484,37,Assessment,TREETOPCITY,1,1,2,0.5,2.0
3,9501794defd84e4d,0006a69f,23043,37,Assessment,TREETOPCITY,1,1,2,0.5,2.0
4,a9ef3ecb3d1acc6a,0006a69f,34209,11,Assessment,TREETOPCITY,1,0,1,1.0,3.0
5,197a373a77101924,0006c192,12635,35,Assessment,MAGMAPEAK,1,0,1,1.0,3.0
6,b2297d292892745a,0006c192,32388,37,Assessment,TREETOPCITY,0,4,4,0.0,0.0
7,b2297d292892745a,0006c192,38139,37,Assessment,TREETOPCITY,0,4,4,0.0,0.0
8,b2297d292892745a,0006c192,54974,37,Assessment,TREETOPCITY,0,4,4,0.0,0.0
9,b2297d292892745a,0006c192,79992,37,Assessment,TREETOPCITY,0,4,4,0.0,0.0


In [86]:
test = test.reset_index(drop=False)
test.drop(columns = ['index'],axis = 1, inplace = True)

In [88]:
session_count = test['game_session'].value_counts().to_dict()
test['assessment_attempt_count'] = test['game_session'].map(session_count)

In [89]:
test['contains_true_assessment'] = test['event_data'].map(lambda x: True if (x.find('"correct":true')>=0) else False)

In [90]:
change_value = {
    True : 1,
    False : 0
}
test['contains_true_assessment'] = test['contains_true_assessment'].map(change_value)

In [91]:
correct_attempt = dict(test.groupby('game_session',sort=False)['contains_true_assessment'].sum())
test['contains_true_assessment_count'] = test['game_session'].map(correct_attempt)

In [92]:
for c in ['contains_true_assessment']:
    test.pop(c)

In [94]:
test['accumulated_accuracy'] = np.where((test['contains_true_assessment_count'] == 0),0,(test['contains_true_assessment_count']/test['assessment_attempt_count']))

In [95]:
test.loc[(test['type'] == 'Assessment'), 'accuracy_group'] = 0
test.loc[(test['accumulated_accuracy'] == 1) & (test['type'] == 'Assessment'), 'accuracy_group'] = 3
test.loc[(test['accumulated_accuracy'] == 0.5) & (test['type'] == 'Assessment'), 'accuracy_group'] = 2
test.loc[(test['accumulated_accuracy'] < 0.5) & (test['accumulated_accuracy'] > 0) & (test['assessment_attempt_count'] > 0) & (test['type'] == 'Assessment'), 'accuracy_group'] = 1

In [96]:
test.rename(columns = {'contains_true_assessment_count': 'num_correct',
                        'accumulated_accuracy':'accuracy',
                        'assessment_attempt_count': 'total_attempt'},inplace=True)

In [97]:
test['num_incorrect'] = test['total_attempt'] - test['num_correct']

In [98]:
test = test[['event_id','game_session','timestamp','event_data','installation_id','event_count','game_time','event_code','title','type','world','num_correct','num_incorrect','total_attempt','accuracy','accuracy_group']]

In [99]:
for c in ['event_id','timestamp','event_data','event_code','event_count','type']:
    test.pop(c)

In [100]:
test.to_csv('./../working/test_data.csv',index=False)

In [103]:
test.head(100)

Unnamed: 0,game_session,installation_id,game_time,title,world,num_correct,num_incorrect,total_attempt,accuracy,accuracy_group
0,8b38fc0d2fd315dc,00abaee7,22737,Cart Balancer (Assessment),CRYSTALCAVES,1,0,1,1.0,3.0
1,009c890ce6c4f3e3,01242218,11474,Cauldron Filler (Assessment),MAGMAPEAK,1,1,2,0.5,2.0
2,009c890ce6c4f3e3,01242218,19775,Cauldron Filler (Assessment),MAGMAPEAK,1,1,2,0.5,2.0
3,e8e62de939f916bc,01242218,8504,Cart Balancer (Assessment),CRYSTALCAVES,1,0,1,1.0,3.0
4,ab61cae5e3215355,01242218,13935,Chest Sorter (Assessment),CRYSTALCAVES,0,3,3,0.0,0.0
5,ab61cae5e3215355,01242218,28219,Chest Sorter (Assessment),CRYSTALCAVES,0,3,3,0.0,0.0
6,ab61cae5e3215355,01242218,51687,Chest Sorter (Assessment),CRYSTALCAVES,0,3,3,0.0,0.0
7,31423dbcd717919e,01242218,22479,Mushroom Sorter (Assessment),TREETOPCITY,1,1,2,0.5,2.0
8,31423dbcd717919e,01242218,29958,Mushroom Sorter (Assessment),TREETOPCITY,1,1,2,0.5,2.0
9,597a8839a5a3468d,01242218,14080,Bird Measurer (Assessment),TREETOPCITY,1,2,3,0.333333,1.0


In [109]:
result = dict(test.groupby('installation_id')['accuracy_group'].last())

In [110]:
t = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')

In [113]:
t[t['installation_id'] == '017c5718']

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
3587,27253bdc,d1706431c69d0f17,2019-08-02T23:24:03.145Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
3588,27253bdc,7f8e671b050cfc16,2019-09-21T11:23:14.319Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
3589,27253bdc,9cbc7871cb68348e,2019-09-21T11:23:49.822Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
3590,27253bdc,dbe0b9903177b7ab,2019-09-21T11:24:14.904Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
3591,27253bdc,bada8e54f3bb8b3e,2019-09-21T11:24:34.545Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Tree Top City - Level 1,Clip,TREETOPCITY
3592,4901243f,804275af3b58a38e,2019-09-21T11:25:28.440Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",017c5718,1,2000,0,Fireworks (Activity),Activity,TREETOPCITY
3593,beb0a7b9,804275af3b58a38e,2019-09-21T11:25:28.450Z,"{""description"":""Let's set off these fireworks....",017c5718,2,3010,0,Fireworks (Activity),Activity,TREETOPCITY
3594,02a42007,804275af3b58a38e,2019-09-21T11:25:30.441Z,"{""rocket"":1,""coordinates"":{""x"":128,""y"":614,""st...",017c5718,3,4030,2081,Fireworks (Activity),Activity,TREETOPCITY
3595,b88f38da,804275af3b58a38e,2019-09-21T11:25:32.355Z,"{""description"":""Let's set off these fireworks....",017c5718,4,3110,4014,Fireworks (Activity),Activity,TREETOPCITY
3596,e694a35b,804275af3b58a38e,2019-09-21T11:25:34.296Z,"{""rocket"":1,""height"":562,""duration"":3866,""coor...",017c5718,5,4020,5947,Fireworks (Activity),Activity,TREETOPCITY


In [114]:
pprint.pprint(json.loads(t['event_data'][3731]))

{'coordinates': {'stage_height': 762, 'stage_width': 1015, 'x': 695, 'y': 399},
 'event_code': 4030,
 'event_count': 67,
 'flower': 4,
 'flowers': [0, 0, 2, 4, 5],
 'game_time': 51473}


In [None]:
def make_classifier():
    clf = CatBoostClassifier(
                               loss_function='MultiClass',
                               task_type="CPU",
                               learning_rate=0.01,
                               iterations=100,
                               od_type="Iter",
                               early_stopping_rounds=50,
                               random_seed=2019,
                               colsample_bylevel=0.87,
                               eval_metric='Kappa',
                              )
        
    return clf
oof = np.zeros(len(x))

In [None]:
params = {
       'loss_function':'MultiClass',
       'task_type':'CPU',
       'learning_rate':0.05,
       'iterations':20,
       'early_stopping_rounds':5,
       'random_seed':89,
       'colsample_bylevel':0.87,
       'eval_metric':'Kappa',
}

In [None]:
#cat_features = ['Clip','Game','session_title','Activity']

In [None]:
cat_features = ['Clip']

In [None]:
oof = np.zeros(len(x))
NFOLDS = 4
folds = KFold(n_splits=NFOLDS)
clf_in_predict = []

for fold, (trn_idx, test_idx) in enumerate(folds.split(x, y)):
    print(f'Training on fold {fold+1}')
    clf = make_classifier()
    clf.fit(x.loc[trn_idx], y.loc[trn_idx], eval_set=(x.loc[test_idx], y.loc[test_idx]),
                          use_best_model=True, verbose=500,cat_features=cat_features)
    
    clf_in_predict = clf.predict(x.loc[test_idx].reshape(len(tets_idx)))
    
    print('-' * 30)
    print('Traning_predict QWK:', quadratic_weight_kappa(y, clf_in_predict))
    print('-' * 30)
    
    oof[test_idx] = clf.predict(x.loc[test_idx]).reshape(len(test_idx))

    
print('-' * 30)
print('OOF QWK:', quadratic_weight_kappa(y, oof))
print('-' * 30)

In [None]:
# process test set
new_test = []
for ins_id,user_sample in tqdm(test.groupby(['installation_id'], sort=False), total=1000):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)
    
test = pd.DataFrame(new_test)

In [None]:
test.head()

In [None]:
preds = clf.predict(test)
del test

## Make Submission

In [None]:
submission['accuracy_group'] = np.round(preds).astype('int')
submission.to_csv('submission.csv', index=None)
submission.head()

In [None]:
!rm -rf catboost_info