In [13]:
import pandas as pd
import numpy as np, gc

In [9]:
tmp = pd.read_csv('predict-student-performance-from-game-play/train.csv', usecols=[0])
tmp = tmp.groupby('session_id').session_id.agg('count')

PIECES = 10
CHUNK = int(np.ceil(len(tmp)/PIECES))

reads = []
skips = [0]
for k in range(PIECES):
    a = k * CHUNK
    b = (k + 1) * CHUNK
    if b > len(tmp): b = len(tmp)
    r = tmp.iloc[a:b].sum()
    reads.append(r)
    skips.append(skips[-1]+r)    
    
print(f'To avoid memory error, we will read train in {PIECES} pieces of sizes:')
print(reads)

To avoid memory error, we will read train in 10 pieces of sizes:
[2684191, 2631991, 2638304, 2657670, 2644229, 2629801, 2596616, 2602258, 2619995, 2591891]


In [11]:
CATS = ['event_name', 'fqid', 'room_fqid', 'text']
NUMS = ['elapsed_time', 'level', 'page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']

EVENTS = ['navigate_click', 'person_click', 'cutscene_click', 'object_click','object_hover','map_hover','notification_click','notebook_click','map_click','boservation_click','checkpoint']

In [4]:
def feature_engineer(train):
    dfs = []
    for c in CATS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('mean')
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    for c in EVENTS:
        train[c] = (train.event_name == c).astype('int8')
    for c in EVENTS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
    train = train.drop(EVENTS, axis=1)
    
    df = pd.concat(dfs, axis=1)
    df = df.fillna(-1)
    df = df.reset_index()
    df = df.set_index('session_id')
    
    return df

In [14]:
%%time

all_pieces = []
print(f'Processing train as {PIECES} pieces to avoid memory error...')
for k in range(PIECES):
    print(k,', ',end='')
    SKIPS = 0
    if k>0: SKIPS = range(1, skips[k]+1)
    train = pd.read_csv('predict-student-performance-from-game-play/train.csv', nrows=reads[k], skiprows=SKIPS)
    df = feature_engineer(train)
    all_pieces.append(df)
    
print('\n')
del train; gc.collect()
df = pd.concat(all_pieces, axis=0)
print('Shape of all train data after feature enginerring : ', df.shape)
df.head()

Processing train as 10 pieces to avoid memory error...
0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 

Shape of all train data after feature enginerring :  (70686, 32)
Wall time: 4min 58s


Unnamed: 0_level_0,level_group,event_name_nunique,fqid_nunique,room_fqid_nunique,text_nunique,elapsed_time_mean,level_mean,page_mean,room_coor_x_mean,room_coor_y_mean,...,person_click_sum,cutscene_click_sum,object_click_sum,object_hover_sum,map_hover_sum,notification_click_sum,notebook_click_sum,map_click_sum,boservation_click_sum,checkpoint_sum
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090312431273200,0-4,10,30,7,56,85793.56,1.945455,-1.0,7.701275,-71.413749,...,22.0,28.0,11.0,4.0,4.0,8,0.0,2.0,0,1
20090312431273200,13-22,10,49,12,168,1040601.0,17.402381,-1.0,-130.34717,-162.00431,...,123.0,60.0,20.0,13.0,14.0,10,0.0,6.0,0,1
20090312431273200,5-12,10,39,11,124,357205.2,8.054054,-1.0,14.306062,-57.269322,...,104.0,12.0,28.0,21.0,9.0,9,0.0,8.0,0,1
20090312433251036,0-4,11,22,6,49,97633.42,1.870504,0.0,-84.04596,-53.671082,...,18.0,36.0,15.0,5.0,3.0,5,2.0,3.0,0,1
20090312433251036,13-22,11,73,16,183,2498852.0,17.762529,5.1,-30.762282,-142.861892,...,145.0,65.0,83.0,66.0,186.0,14,50.0,45.0,0,1
