**Notes**

**1) Description of the all the columns**





*   **event_id** - Randomly generated unique identifier for the event type. Maps to event_id column in specs table.
*   **game_session** - Randomly generated unique identifier grouping events within a single game or video play session.
*   **timestamp** - Client-generated datetime
*   **event_data** - Semi-structured JSON formatted string containing the events parameters. Default fields are: event_count, event_code, and game_time; otherwise fields are determined by the event type.
*   **installation_id** - Randomly generated unique identifier grouping game sessions within a single installed application instance.
*   **event_count** - Incremental counter of events within a game session (offset at 1). Extracted from event_data.
*   **event_code** - Identifier of the event 'class'. Unique per game, but may be duplicated across games. E.g. event code '2000' always identifies the 'Start Game' event for all games. Extracted from event_data.
*   **game_time** - Time in milliseconds since the start of the game session. Extracted from event_data.
*   **title** - Title of the game or video.
*   **type** - Media type of the game or video. Possible values are: 'Game', 'Assessment', 'Activity', 'Clip'.
*   **world** - The section of the application the game or video belongs to. Helpful to identify the educational curriculum goals of the media. Possible values are: 'NONE' (at the app's start screen), TREETOPCITY' (Length/Height), 'MAGMAPEAK' (Capacity/Displacement), 'CRYSTALCAVES' (Weight).






**2) Groupby data to get the number of attempts each installation_id played**


*   train_data.groupby(['game_session','installation_id'],as_index =False)['title'].agg({'value_counts'}).rename(columns={'value_counts':'Total_no'}).head()

*   test_data.groupby(['game_session','installation_id'])['title'].agg({'value_counts'}).rename(columns={'value_counts':'Total_no'}).index.get_level_values(3)


**3) Event Codes Meaning**

*   2000 : Start of the game
*   3010 : Voice description of what to do in the game
*   3110 : Starting of game with the voice description in the background
*   4070 : Player starting to play the game


**4) Data Analysis**
* All Event Id has the same value for a particular game title event though they have different installation id
* All Event Code have same value for a particular Event Id

**5) Approach to solutions**
* first 

**Importing the modules**

In [32]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns', None)
import datetime
from catboost import CatBoostClassifier
from time import time
from tqdm import tqdm_notebook as tqdm
import os
import random
import json
import pprint
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold,GroupKFold
from sklearn.metrics import confusion_matrix

**Making Event Determenistic**

In [2]:
def seed_everything(seed=0):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)

In [3]:
seed_everything(70)

**The Competition Eval Metric : Quadratic Weight Kappa**

In [4]:
def quadratic_weight_kappa(actual, prediction,n=4,hist_range=(0,3)):
  O = confusion_matrix(actual,prediction)
  O = np.divide(O,np.sum(O))

  W = np.zeros((n,n))
  for i in range(n):
    for j in range(n):
      W[i][j] = ((i-j)**2)/((n-1)**2)

  actual_histogram = np.histogram(actual,bins=n,range=hist_range)[0]
  prediction_histogram = np.histogram(prediction,bins=n,range=hist_range)[0]

  E = np.outer(actual_histogram,prediction_histogram)
  E = np.divide(E,np.sum(E))

  num = np.sum(np.multiply(W,O))
  density = np.sum(np.multiply(W,E))

  return 1 - np.divide(num,density)
    

In [5]:
def pretty_json(data):
    return pprint.pprint(json.loads(data))

In [6]:
def read_file():
    train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')
    train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
    specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')
    test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
    submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')
    
    return train,train_labels,specs,test,submission

**Reading the files**

In [7]:
train,train_labels,specs,test,submission = read_file()

**Filtering out the train set having installation id that have atleast one assessment done**

In [8]:
train_install_id = list(train['installation_id'].unique())

In [9]:
assessment_id = list(train[train['type'] == 'Assessment']['installation_id'].unique())
train = train.loc[train['installation_id'].isin(assessment_id)]

In [10]:
train.shape

(8294138, 11)

In [80]:
list_of_user_activities = list(set(train['title'].value_counts().index).union(set(test['title'].value_counts().index)))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

train['title'] = train['title'].map(activities_map)
test['title'] = test['title'].map(activities_map)
train_labels['title'] = train_labels['title'].map(activities_map)

In [11]:
ids = list(train_labels['installation_id'].unique())
train = train.loc[train['installation_id'].isin(ids)]

In [12]:
session = train_labels['game_session'].values
acc_group = train_labels['accuracy_group'].values
match_data = dict(zip(session,acc_group))

In [None]:
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [13]:
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [57]:
train = train.reset_index(drop=False)
train.drop(columns = ['index'],axis = 1, inplace = True)

In [15]:
train = train[train['type'] == 'Assessment']

In [16]:
train = train[(train['type'] == 'Assessment') & ((train['event_code'] == 4100) | ((train['event_code'] == 4110) & (train['title'] == 'Bird Measurer (Assessment)')))]
train.shape

(43901, 11)

In [69]:
train_labels[train_labels['game_session'] == '3beae5a17e0b26f6']

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
28,3beae5a17e0b26f6,003372b0,Cart Balancer (Assessment),1,0,1.0,3


In [79]:
train.head(100)

Unnamed: 0,game_session,installation_id,game_time,title,type,world,num_correct,num_incorrect,total_attempt,accuracy,accuracy_group
0,901acc108f55a5a1,0006a69f,31011,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1,1.0,3.0
1,77b8ee947eb84b4e,0006a69f,35771,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,11,11,0.0,0.0
2,77b8ee947eb84b4e,0006a69f,42805,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,11,11,0.0,0.0
3,77b8ee947eb84b4e,0006a69f,47388,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,11,11,0.0,0.0
4,77b8ee947eb84b4e,0006a69f,50605,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,11,11,0.0,0.0
5,77b8ee947eb84b4e,0006a69f,54822,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,11,11,0.0,0.0
6,77b8ee947eb84b4e,0006a69f,58280,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,11,11,0.0,0.0
7,77b8ee947eb84b4e,0006a69f,62256,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,11,11,0.0,0.0
8,77b8ee947eb84b4e,0006a69f,67164,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,11,11,0.0,0.0
9,77b8ee947eb84b4e,0006a69f,73056,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,11,11,0.0,0.0


In [59]:
session_count = train['game_session'].value_counts().to_dict()
train['assessment_attempt_count'] = train['game_session'].map(session_count)

In [60]:
train['contains_true_assessment'] = train['event_data'].map(lambda x: True if (x.find('"correct":true')>=0) else False)
#train['contains_true_assessment'] = np.where((((train['event_code'] == 4100) | (train['event_code'] == 4110)) & train['contains_true'] == True),True,None)

In [61]:
change_value = {
    True : 1,
    False : 0
}
train['contains_true_assessment'] = train['contains_true_assessment'].map(change_value)

In [62]:
correct_attempt = dict(train.groupby('game_session',sort=False)['contains_true_assessment'].sum())
train['contains_true_assessment_count'] = train['game_session'].map(correct_attempt)

In [66]:
for c in ['contains_true_assessment']:
    train.pop(c)

In [63]:
train['accumulated_accuracy'] = np.where((train['contains_true_assessment_count'] == 0),0,(train['contains_true_assessment_count']/train['assessment_attempt_count']))

In [64]:
train.loc[(train['type'] == 'Assessment'), 'accuracy_group'] = 0
train.loc[(train['accumulated_accuracy'] == 1) & (train['type'] == 'Assessment'), 'accuracy_group'] = 3
train.loc[(train['accumulated_accuracy'] == 0.5) & (train['type'] == 'Assessment'), 'accuracy_group'] = 2
train.loc[(train['accumulated_accuracy'] < 0.5) & (train['accumulated_accuracy'] > 0) & (train['assessment_attempt_count'] > 0) & (train['type'] == 'Assessment'), 'accuracy_group'] = 1

In [None]:
train.rename(columns = {'contains_true_assessment_count': 'num_correct',
                        'accumulated_accuracy':'accuracy',
                        'assessment_attempt_count': 'total_attempt'},inplace=True)

In [None]:
train['num_incorrect'] = train['total_attempt'] - train['contains_true_assessment_count']

In [None]:
for c in ['event_id','timestamp','event_data','event_code','event_count']:
    train.pop(c)

In [85]:
train.to_csv('./../working/train_data.csv',index=False)

In [86]:
test[(test['type'] == 'Assessment') & ((test['event_code'] == 4100) | ((test['event_code'] == 4110) & (test['title'] = 'Bird Measurer (Assessment)')))].shape

(1156414, 11)

In [89]:
test_labels = test[(test['type'] == 'Assessment') & ((test['event_code'] == 4100) | ((test['event_code'] == 4110) & (test['title'] == 'Bird Measurer (Assessment)')))]

In [90]:
test_labels.head(100)

(4756, 11)

In [None]:
def make_classifier():
    clf = CatBoostClassifier(
                               loss_function='MultiClass',
                               task_type="CPU",
                               learning_rate=0.01,
                               iterations=100,
                               od_type="Iter",
                               early_stopping_rounds=50,
                               random_seed=2019,
                               colsample_bylevel=0.87,
                               eval_metric='Kappa',
                              )
        
    return clf
oof = np.zeros(len(x))

In [None]:
params = {
       'loss_function':'MultiClass',
       'task_type':'CPU',
       'learning_rate':0.05,
       'iterations':20,
       'early_stopping_rounds':5,
       'random_seed':89,
       'colsample_bylevel':0.87,
       'eval_metric':'Kappa',
}

In [None]:
#cat_features = ['Clip','Game','session_title','Activity']

In [None]:
cat_features = ['Clip']

In [None]:
oof = np.zeros(len(x))
NFOLDS = 4
folds = KFold(n_splits=NFOLDS)
clf_in_predict = []

for fold, (trn_idx, test_idx) in enumerate(folds.split(x, y)):
    print(f'Training on fold {fold+1}')
    clf = make_classifier()
    clf.fit(x.loc[trn_idx], y.loc[trn_idx], eval_set=(x.loc[test_idx], y.loc[test_idx]),
                          use_best_model=True, verbose=500,cat_features=cat_features)
    
    clf_in_predict = clf.predict(x.loc[test_idx].reshape(len(tets_idx)))
    
    print('-' * 30)
    print('Traning_predict QWK:', quadratic_weight_kappa(y, clf_in_predict))
    print('-' * 30)
    
    oof[test_idx] = clf.predict(x.loc[test_idx]).reshape(len(test_idx))

    
print('-' * 30)
print('OOF QWK:', quadratic_weight_kappa(y, oof))
print('-' * 30)

In [None]:
# process test set
new_test = []
for ins_id,user_sample in tqdm(test.groupby(['installation_id'], sort=False), total=1000):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)
    
test = pd.DataFrame(new_test)

In [None]:
test.head()

In [None]:
preds = clf.predict(test)
del test

## Make Submission

In [None]:
submission['accuracy_group'] = np.round(preds).astype('int')
submission.to_csv('submission.csv', index=None)
submission.head()

In [None]:
!rm -rf catboost_info