In [51]:
import pandas as pd
import numpy as np

# Load Train Data and Labels

In [52]:
%%time

# Load the raw event log (one row per in-game event).
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous ".\..." form only resolved on Windows.
train = pd.read_csv("./predict-student-performance-from-game-play/train.csv")
print(train.shape)
print(train.head())


(13174211, 20)
          session_id  index  elapsed_time      event_name   name  level  page  \
0  20090312431273200      0             0  cutscene_click  basic      0   NaN   
1  20090312431273200      1          1323    person_click  basic      0   NaN   
2  20090312431273200      2           831    person_click  basic      0   NaN   
3  20090312431273200      3          1147    person_click  basic      0   NaN   
4  20090312431273200      4          1863    person_click  basic      0   NaN   

   room_coor_x  room_coor_y  screen_coor_x  screen_coor_y  hover_duration  \
0  -413.991405  -159.314686          380.0          494.0             NaN   
1  -413.991405  -159.314686          380.0          494.0             NaN   
2  -413.991405  -159.314686          380.0          494.0             NaN   
3  -413.991405  -159.314686          380.0          494.0             NaN   
4  -412.991405  -159.314686          381.0          494.0             NaN   

                            text   

In [53]:
# Load the labels. Each `session_id` here looks like "<session>_q<question>"
# (e.g. "20090312431273200_q1"), so split it into its two numeric parts.
# Forward slashes keep the relative path portable across operating systems.
targets = pd.read_csv("./predict-student-performance-from-game-play/train_labels.csv")

# Vectorised string ops replace the per-row Python lambda. The cast is spelled
# "int64" explicitly because the session numbers do not fit in 32 bits.
id_parts = targets["session_id"].str.split("_")
targets["session"] = id_parts.str[0].astype("int64")
targets["question"] = id_parts.str[-1].str[1:].astype("int64")
print(targets.shape)
print(targets.head())


(212022, 4)
             session_id  correct            session  question
0  20090312431273200_q1        1  20090312431273200         1
1  20090312433251036_q1        0  20090312433251036         1
2  20090314121766812_q1        1  20090314121766812         1
3  20090314363702160_q1        1  20090314363702160         1
4  20090314441803444_q1        1  20090314441803444         1


# Feature Engineering

In [54]:
# Show the column index, then each column's name followed by its distinct values.
print(train.columns)
for column_name in train.columns:
    distinct_values = train[column_name].unique()
    print(column_name, distinct_values, sep="\n")


Index(['session_id', 'index', 'elapsed_time', 'event_name', 'name', 'level',
       'page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
       'hover_duration', 'text', 'fqid', 'room_fqid', 'text_fqid',
       'fullscreen', 'hq', 'music', 'level_group'],
      dtype='object')
session_id
[20090312431273200 20090312433251036 20090314121766812 ...
 22100217104993650 22100219442786200 22100221145014656]
index
[    0     1     2 ... 20471 20472 20473]
elapsed_time
[      0    1323     831 ... 5485166 5486753 5487952]
event_name
['cutscene_click' 'person_click' 'navigate_click' 'observation_click'
 'notification_click' 'object_click' 'object_hover' 'map_hover'
 'map_click' 'checkpoint' 'notebook_click']
name
['basic' 'undefined' 'close' 'open' 'prev' 'next']
level
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
page
[nan  0.  1.  3.  4.  5.  6.  2.]
room_coor_x
[-413.99140522 -412.99140522  478.48507949 ...  332.69606976  369.91285894
  252.29965302

In [55]:
# Columns treated as categorical.
CAT_COLS = ["event_name", "fqid", "room_fqid", "text"]

# Columns treated as numeric.
NUM_COLS = [
    "elapsed_time",
    "level",
    "page",
    "room_coor_x",
    "room_coor_y",
    "screen_coor_x",
    "screen_coor_y",
    "hover_duration",
]

# Every distinct event type found in the training log.
EVENTS = list(train["event_name"].unique())

In [56]:
def feature_engineer(data: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw event rows into one feature row per (session_id, level_group).

    Features produced, in column order:
      * ``<col>_nunique`` for every column in ``CAT_COLS``
      * ``<col>_mean`` then ``<col>_std`` for every column in ``NUM_COLS``
      * ``<event>_sum`` (number of rows with that event) for every entry in ``EVENTS``
      * ``elapsed_time_sum``

    Args:
        data: Event log containing ``session_id``, ``level_group``,
            ``event_name``, ``elapsed_time`` and every column named in
            ``CAT_COLS`` / ``NUM_COLS``. It is NOT modified.

    Returns:
        DataFrame indexed by ``session_id`` with ``level_group`` as the first
        column, followed by the features above. Missing aggregates (for
        example the std of a single-row group) are filled with -1.
    """
    keys = ["session_id", "level_group"]
    # Build the groupby once and reuse it instead of re-grouping for every column.
    grouped = data.groupby(keys)

    features = []
    for col in CAT_COLS:
        features.append(grouped[col].agg("nunique").rename(f"{col}_nunique"))

    # All means first, then all stds, to keep the original column order.
    for stat in ("mean", "std"):
        for col in NUM_COLS:
            features.append(grouped[col].agg(stat).rename(f"{col}_{stat}"))

    # One 0/1 indicator column per event type. They live in a separate frame
    # holding only the two key columns, so the caller's DataFrame never gains
    # extra columns.
    event_flags = data[keys].copy()
    for ev in EVENTS:
        event_flags[ev] = (data["event_name"] == ev).astype("int8")
    flag_groups = event_flags.groupby(keys)
    for ev in EVENTS:
        features.append(flag_groups[ev].agg("sum").rename(f"{ev}_sum"))

    features.append(grouped["elapsed_time"].agg("sum").rename("elapsed_time_sum"))

    df = pd.concat(features, axis=1)
    df = df.fillna(-1)
    # Move level_group out of the index so only session_id indexes the rows.
    df = df.reset_index()
    df = df.set_index("session_id")

    return df

In [57]:
%%time
# Build the per-(session, level_group) feature table from the raw event log.
refined_train = train.pipe(feature_engineer)
n_rows, n_cols = refined_train.shape
print((n_rows, n_cols))
refined_train.head()


(35337, 33)
Wall time: 1min 1s


Unnamed: 0_level_0,level_group,event_name_nunique,fqid_nunique,room_fqid_nunique,text_nunique,elapsed_time_mean,level_mean,page_mean,room_coor_x_mean,room_coor_y_mean,...,navigate_click_sum,observation_click_sum,notification_click_sum,object_click_sum,object_hover_sum,map_hover_sum,map_click_sum,checkpoint_sum,notebook_click_sum,elapsed_time_sum
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090312431273200,0-4,10,30,7,56,85793.56,1.945455,-1.0,7.701275,-71.413749,...,81.0,4,8,11.0,4.0,4.0,2.0,1,0.0,14155937
20090312431273200,13-22,10,49,12,168,1040601.0,17.402381,-1.0,-130.34717,-162.00431,...,170.0,3,10,20.0,13.0,14.0,6.0,1,0.0,437052322
20090312431273200,5-12,10,39,11,124,357205.2,8.054054,-1.0,14.306062,-57.269322,...,103.0,1,9,28.0,21.0,9.0,8.0,1,0.0,105732736
20090312433251036,0-4,11,22,6,49,97633.42,1.870504,0.0,-84.04596,-53.671082,...,49.0,2,5,15.0,5.0,3.0,3.0,1,2.0,13571045
20090312433251036,13-22,11,73,16,183,2498852.0,17.762529,5.1,-30.762282,-142.861892,...,637.0,5,14,83.0,66.0,186.0,45.0,1,50.0,3241011333


# Train XGBoost Model

In [58]:
# Model inputs: every column of the feature table except level_group.
FEATURES = refined_train.columns[refined_train.columns != "level_group"].tolist()
print("Number of features is:", len(FEATURES))

# Distinct session ids in the feature table.
ALL_USERS = refined_train.index.unique()
print("Number of users is:", len(ALL_USERS))

Number of features is: 32
Number of users is: 11779


In [59]:
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm

In [66]:
# Grouped 5-fold cross-validation: one XGBoost classifier per question (1-18)
# per fold. Folds are grouped by session_id (the index of refined_train), so
# all level-group rows of a session fall on the same side of each split.
gkf = GroupKFold(n_splits=5)

# Out-of-fold predicted probabilities: one row per session, column q-1 for question q.
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS)

# NOTE(review): keys do not include the fold number, so each fold overwrites
# the previous fold's models and only the last fold's classifiers survive.
models = {}

# The hyper-parameters are the same for every fold and question: build them once.
# NOTE(review): 'use_label_encoder' is kept from the original cell; newer
# XGBoost releases may warn that it is unused - confirm against the installed version.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.05,
    'max_depth': 4,
    'n_estimators': 1000,
    'early_stopping_rounds': 50,
    'tree_method': 'hist',
    'subsample': 0.8,
    'colsample_bytree': 0.4,
    'use_label_encoder': False,
}

# Label tables per question, indexed by session. They do not depend on the
# fold, so compute them once instead of twice per question per fold.
labels_by_question = {
    q: targets.loc[targets["question"] == q].set_index("session")
    for q in range(1, 19)
}

# Wrap the fold generator itself (with an explicit total) so tqdm can show
# real progress; wrapping enumerate(...) left it without a known length.
fold_splits = gkf.split(X=refined_train, groups=refined_train.index)
for i, (train_index, test_index) in enumerate(tqdm(fold_splits, total=gkf.get_n_splits())):
    print("#=" * 20)
    print("#=" * 9, f" {i+1} ", "#=" * 9)
    print("#=" * 20)

    # Row slices for this fold are shared by all 18 questions.
    fold_train = refined_train.iloc[train_index]
    fold_valid = refined_train.iloc[test_index]

    # Iterate through question 1 to 18
    for q in range(1, 19):

        # Pick the level group whose rows are used as features for this question.
        if q <= 3:
            grp = '0-4'
        elif q <= 13:
            grp = '5-12'
        else:
            grp = '13-22'

        # Train data
        train_x = fold_train.loc[fold_train["level_group"] == grp]
        train_users = train_x.index.values
        train_y = labels_by_question[q].loc[train_users]

        # Valid data
        valid_x = fold_valid.loc[fold_valid["level_group"] == grp]
        valid_users = valid_x.index.values
        valid_y = labels_by_question[q].loc[valid_users]

        # Cast once and reuse for both early stopping and prediction.
        train_features = train_x[FEATURES].astype("float32")
        valid_features = valid_x[FEATURES].astype("float32")

        # Train model.
        # NOTE(review): the validation fold drives early stopping and is also
        # the fold scored into `oof`, so the out-of-fold scores may be optimistic.
        clf = XGBClassifier(**xgb_params)
        clf.fit(
            train_features,
            train_y["correct"],
            eval_set=[(valid_features, valid_y["correct"])],
            verbose=0,
        )

        # `best_ntree_limit` is gone from recent XGBoost releases;
        # best_iteration is 0-based, so +1 gives the number of trees kept.
        print(f"{q}({clf.best_iteration + 1}), ", end="")

        # Save model
        models[f"{grp}_{q}"] = clf
        oof.loc[valid_users, q-1] = clf.predict_proba(valid_features)[:, 1]

0it [00:00, ?it/s]

#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=  1  #=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
1(134), 2(123), 3(94), 4(279), 5(85), 6(204), 7(127), 8(79), 9(97), 10(206), 11(213), 12(68), 13(165), 14(134), 15(141), 16(70), 17(65), 

1it [00:07,  7.83s/it]

18(132), #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=  2  #=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
1(136), 2(105), 3(125), 4(109), 5(86), 6(174), 7(117), 8(66), 9(112), 10(160), 11(68), 12(84), 13(74), 14(152), 15(186), 16(85), 17(40), 

2it [00:14,  7.14s/it]

18(102), #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=  3  #=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
1(54), 2(131), 3(97), 4(168), 5(66), 6(74), 7(80), 8(56), 9(150), 10(110), 11(72), 12(74), 13(153), 14(166), 15(133), 16(88), 17(84), 

3it [00:21,  6.86s/it]

18(117), #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=  4  #=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
1(161), 2(122), 3(111), 4(134), 5(154), 6(122), 7(168), 8(46), 9(63), 10(92), 11(85), 12(99), 13(90), 14(123), 15(162), 16(45), 17(82), 

4it [00:28,  7.29s/it]

18(151), #=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=  5  #=#=#=#=#=#=#=#=#=
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=
1(147), 2(151), 3(94), 4(78), 5(156), 6(144), 7(134), 8(50), 9(147), 10(138), 11(60), 12(79), 13(123), 14(128), 15(232), 16(98), 17(98), 

5it [00:37,  7.54s/it]

18(71), 




In [48]:
# Inspection only: show the feature-table index values and the layout of the label table.
print(
    refined_train.index.values,
    targets.columns,
    targets.index,
    sep="\n",
)

[20090312431273200 20090312431273200 20090312431273200 ...
 22100221145014656 22100221145014656 22100221145014656]
Index(['session_id', 'correct', 'session', 'question'], dtype='object')
RangeIndex(start=0, stop=212022, step=1)
