In [1]:
import numpy as np
from tqdm import tqdm
import json
import pandas as pd
import os
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
#import lightgbm as lgb

In [2]:
# Load every input CSV and report its shape
def _load_csv(file_name, label):
    """Read ``data/<file_name>`` and print how many rows and columns it has.

    ``label`` is the name used in the shape message (it is not always the
    capitalised file name, e.g. ``Training.csv`` for ``train.csv``).
    """
    print('Reading {} file....'.format(file_name))
    frame = pd.read_csv('data/' + file_name)
    n_rows, n_cols = frame.shape
    print('{} file have {} rows and {} columns'.format(label, n_rows, n_cols))
    return frame


train = _load_csv('train.csv', 'Training.csv')
test = _load_csv('test.csv', 'Test.csv')
train_labels = _load_csv('train_labels.csv', 'Train_labels.csv')
specs = _load_csv('specs.csv', 'Specs.csv')
sample_submission = _load_csv('sample_submission.csv', 'Sample_submission.csv')

Reading train.csv file....
Training.csv file have 11341042 rows and 11 columns
Reading test.csv file....
Test.csv file have 1156414 rows and 11 columns
Reading train_labels.csv file....
Train_labels.csv file have 17690 rows and 7 columns
Reading specs.csv file....
Specs.csv file have 386 rows and 3 columns
Reading sample_submission.csv file....
Sample_submission.csv file have 1000 rows and 2 columns


In [3]:
# Preview the first rows of the raw training event log
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [4]:
# Event code that marks an attempt: 4100 for every title except
# 'Bird Measurer (Assessment)', which uses 4110.
DEFAULT_ATTEMPT_CODE = 4100
BIRD_MEASURER_ATTEMPT_CODE = 4110

# Take the titles from both train and test: feature_engineering looks every
# session title up in this map and is run on the test set as well, so a title
# that only occurs in test must not raise KeyError.  NaN titles are dropped
# explicitly instead of relying on unique()/nunique() having the same length.
all_titles = pd.concat([train['title'], test['title']]).dropna().unique()
activities_map = {title: DEFAULT_ATTEMPT_CODE for title in all_titles}
activities_map['Bird Measurer (Assessment)'] = BIRD_MEASURER_ATTEMPT_CODE

In [233]:
def extracting_duration(durations):
    """Summarise a series of cumulative time stamps.

    Parameters
    ----------
    durations : pandas.Series
        Values in chronological order (callers pass the ``game_time`` column
        of the selected events).

    Returns
    -------
    tuple
        ``(dur_mean, dur_sum, dur_std)`` where ``dur_sum`` is the last value
        of ``durations`` and ``dur_mean`` / ``dur_std`` are the mean and
        standard deviation of the differences between consecutive values.
        Mean and std are reported as 0 unless there are at least two
        differences; all three are 0 for an empty input.
    """
    # Nothing recorded: every statistic defaults to zero.
    if len(durations) == 0:
        return 0, 0, 0

    last_value = durations.iloc[-1]
    steps = durations.diff().dropna()

    # Fewer than two gaps: keep the total but leave mean/std at zero.
    if len(steps) < 2:
        return 0, last_value, 0

    return steps.mean(), last_value, steps.std()


def feature_engineering(user_sample, test_data=False):
    """Turn one installation's event log into per-session feature dicts.

    Sessions are visited in the order they first appear in ``user_sample``
    (``groupby('game_session', sort=False)``).  For each session the function
    first stores the running totals accumulated over the *earlier* sessions,
    then the session's own statistics, and only then folds the session into
    the running totals.  Only "attempt" events are counted: rows whose
    ``event_code`` equals the code registered for the session title in the
    module-level ``activities_map``.

    Parameters
    ----------
    user_sample : pandas.DataFrame
        Events of a single installation.  The columns read are
        ``game_session``, ``installation_id``, ``event_count``,
        ``event_code``, ``event_data``, ``game_time``, ``title``, ``type``
        and ``world``.
    test_data : bool, default False
        When False, only sessions with at least one counted assessment
        attempt (``Assess > 0``) are kept and the whole list is returned.
        When True, every session is kept and only the last one is returned.

    Returns
    -------
    list of dict or dict
        The list of feature dicts (``test_data=False``) or the feature dict
        of the last session (``test_data=True``).

    Raises
    ------
    IndexError
        If ``test_data`` is True and ``user_sample`` contains no session.
    """
    # Attempt code for titles missing from ``activities_map``; 4100 is the
    # value the map assigns to every title except Bird Measurer, so an unseen
    # title is treated like any other instead of raising KeyError.
    default_attempt_code = 4100
    # Session types that are processed identically, mapped to the prefix of
    # their ``<prefix>_dur_*`` feature names.
    dur_prefix_by_type = {'Clip': 'clip', 'Activity': 'activity',
                          'Game': 'game'}

    output = []
    # Running totals over the sessions already processed.
    cum_count = {'Clip': 0, 'Activity': 0, 'Assess': 0, 'Game': 0}
    cum_dur = {'clip': 0, 'assess': 0, 'activity': 0, 'game': 0}
    cum_corr, cum_incorr, cum_acc = 0, 0, 0
    counter = 0          # number of assessment sessions seen so far
    cum_acc_group = []   # accuracy group of every assessment seen so far

    # Iterate through each session of one installation_id
    for _, session in user_sample.groupby('game_session', sort=False):

        # Key order matters: it determines the column order of the frames
        # built from these dicts.
        features = {'Clip': 0, 'Activity': 0,
                    'Assess': 0, 'Game': 0,
                    'Cum_Clip': cum_count['Clip'],
                    'Cum_Activity': cum_count['Activity'],
                    'Cum_Assess': cum_count['Assess'],
                    'Cum_Game': cum_count['Game'],
                    'cum_dur_clip': cum_dur['clip'],
                    'cum_dur_asses': cum_dur['assess'],
                    'cum_dur_activity': cum_dur['activity'],
                    'cum_dur_game': cum_dur['game']}

        # The first row's value is what ``unique()[0]`` would return, without
        # scanning the whole column.
        features['installation_id'] = session['installation_id'].iloc[0]
        features['game_session'] = session['game_session'].iloc[0]
        # event_counter includes all event codes and all types
        features['event_counter'] = session['event_count'].iloc[-1]
        features['type'] = session['type'].iloc[0]
        features['title'] = session['title'].iloc[0]
        features['world'] = session['world'].iloc[0]

        # Keep only the attempt events (event code 4100, or 4110 for
        # Bird Measurer).
        # NOTE(review): session types that never emit these codes always get
        # a count and duration of 0 here - confirm that this is intended.
        attempt_code = activities_map.get(features['title'],
                                          default_attempt_code)
        all_attempts = session[session['event_code'] == attempt_code]
        n_attempts = len(all_attempts)
        session_type = features['type']

        if session_type == 'Assessment':
            features['Assess'] += n_attempts
            cum_count['Assess'] += features['Assess']

            # Durations
            (features['assess_dur_mean'], features['assess_dur_sum'],
             features['assess_dur_std']) = extracting_duration(
                all_attempts['game_time'])
            cum_dur['assess'] += features['assess_dur_sum']

            # Number of correct attempts (event_data mentions 'true')
            features['cum_corr'] = cum_corr
            features['correct'] = all_attempts['event_data'].str.contains(
                'true').sum()
            cum_corr += features['correct']

            # Number of incorrect attempts (event_data mentions 'false')
            features['cum_incorrect'] = cum_incorr
            features['incorrect'] = all_attempts['event_data'].str.contains(
                'false').sum()
            cum_incorr += features['incorrect']

            # Averages over the previous assessments, taken before this
            # session is added to the running totals.
            features['cum_acc'] = cum_acc / counter if counter > 0 else 0
            features['mean_acc_group'] = \
                sum(cum_acc_group) / counter if counter > 0 else 0
            counter += 1
            features['acc'] = features['correct'] / features['Assess'] \
                if features['Assess'] != 0 else 0
            cum_acc += features['acc']

            # Accuracy group: 0 -> 0, 1 -> 3, 0.5 -> 2, anything else -> 1
            if features['acc'] == 0:
                features['acc_group'] = 0
            elif features['acc'] == 1:
                features['acc_group'] = 3
            elif features['acc'] == 0.5:
                features['acc_group'] = 2
            else:
                features['acc_group'] = 1
            cum_acc_group.append(features['acc_group'])

        elif session_type in dur_prefix_by_type:
            prefix = dur_prefix_by_type[session_type]

            # Number of attempt events in this Clip / Activity / Game session
            features[session_type] += n_attempts
            cum_count[session_type] += features[session_type]

            # Durations
            (features[f'{prefix}_dur_mean'], features[f'{prefix}_dur_sum'],
             features[f'{prefix}_dur_std']) = extracting_duration(
                all_attempts['game_time'])
            cum_dur[prefix] += features[f'{prefix}_dur_sum']

        if features['Assess'] > 0 or test_data:
            output.append(features)

    if test_data:
        return output[-1]
    return output

In [250]:
# groups_train = train.groupby('installation_id', sort = False)
# g_train = groups_train.get_group('0006a69f')
# ss = pd.DataFrame(feature_engineering(g_train, False))

# groups = test.groupby('installation_id', sort = False)
# g_test = groups.get_group('00abaee7')
# ss = pd.DataFrame(feature_engineering(g_test, True), index=[0])
# ss.T

In [36]:
# Preview the provided labels for comparison with the engineered features
train_labels.head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


### Process train set

In [262]:
# Run feature_engineering on every installation_id of the train set and
# gather the per-session feature dicts into a single frame.
count = 0
df_train = pd.DataFrame()
groups = train.groupby('installation_id', sort=False)

temp_out = []
for ins_id, user_sample in tqdm(groups):
    temp_out.extend(feature_engineering(user_sample))

df_train = pd.DataFrame(temp_out)
print(df_train.shape)
# Displays True when the installation ids line up row by row with train_labels
df_train['installation_id'].equals(train_labels['installation_id'])

100%|██████████| 17000/17000 [19:25<00:00, 17.29it/s]  


(17690, 29)


True

In [263]:
# Preview the engineered training features
df_train.head()

Unnamed: 0,Activity,Assess,Clip,Cum_Activity,Cum_Assess,Cum_Clip,Cum_Game,Game,acc,acc_group,...,cum_dur_game,cum_incorrect,event_counter,game_session,incorrect,installation_id,mean_acc_group,title,type,world
0,0,1,0,0,0,0,0,0,1.0,3,...,0,0,48,901acc108f55a5a1,0,0006a69f,0.0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1,0,11,0,0,1,0,4,0,0.0,0,...,185103,0,87,77b8ee947eb84b4e,11,0006a69f,3.0,Bird Measurer (Assessment),Assessment,TREETOPCITY
2,0,1,0,0,12,0,4,0,1.0,3,...,185103,11,35,6bdf9623adc94d89,0,0006a69f,1.5,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
3,0,2,0,0,13,0,4,0,0.5,2,...,185103,11,42,9501794defd84e4d,1,0006a69f,1.5,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
4,0,1,0,0,15,0,8,0,1.0,3,...,320634,12,32,a9ef3ecb3d1acc6a,0,0006a69f,1.6,Bird Measurer (Assessment),Assessment,TREETOPCITY


### Process test set

In [236]:
# For each test installation keep only the features of its last session
temp_data = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False)):
    temp_data.append(feature_engineering(user_sample, test_data=True))

df_test = pd.DataFrame(temp_data)
del temp_data
print(df_test.shape)
# Displays True when the installation ids line up row by row with sample_submission
df_test['installation_id'].equals(sample_submission['installation_id'])

100%|██████████| 1000/1000 [02:11<00:00,  5.70it/s]

(1000, 29)





True

In [268]:
# Persist the engineered features.  to_csv does not create missing folders,
# so make sure the output directory exists first.
os.makedirs('data_compiled', exist_ok=True)
df_test.to_csv('data_compiled/df_test.csv', index=False)
df_train.to_csv('data_compiled/df_train.csv', index=False)