This is a modified version of Chris's notebook: https://www.kaggle.com/code/cdeotte/xgboost-baseline-0-676

In [3]:
import pandas as pd, numpy as np
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# Load Train Data and Labels

In [None]:
# Load the raw event log, drop the columns no later cell reads, and downcast
# the numeric columns to save memory.
# `elapsed_time` is cast to int32 rather than int16: int16 tops out at 32767 and
# `astype` wraps out-of-range integers silently instead of raising.
# NOTE(review): confirm the maximum `elapsed_time` in the data; switch to int64
# if it can exceed 2**31 - 1. Keep these dtypes identical to the ones used when
# preprocessing the inference batches.
train = (
    pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv')
    .drop(['index', 'fullscreen', 'hq', 'music', 'name', 'text'], axis=1)
    .astype({
        'level': 'int16',
        'elapsed_time': 'int32',
        'room_coor_x': 'float32',
        'room_coor_y': 'float32',
        'screen_coor_x': 'float32',
        'screen_coor_y': 'float32',
    })
)

In [None]:
def df_scale(train):
    """Fill missing values in the coordinate, hover-duration and page columns.

    Despite the name, no scaling is performed. The frame is modified in place
    and the same object is returned.

    Parameters
    ----------
    train : pd.DataFrame
        Frame containing `room_coor_x`, `room_coor_y`, `screen_coor_x`,
        `screen_coor_y`, `hover_duration` and `page`.

    Returns
    -------
    pd.DataFrame
        `train` itself, with NaNs replaced by 0 in the coordinate and hover
        columns and by -1 in `page`.
    """
    zero_filled = ['room_coor_x', 'room_coor_y',
                   'screen_coor_x', 'screen_coor_y',
                   'hover_duration']
    for col in zero_filled:
        train[col] = train[col].fillna(0)
    # `fillna` returns a new Series; the result has to be assigned back,
    # otherwise the NaNs in `page` are left untouched.
    train['page'] = train['page'].fillna(-1)
    return train


In [None]:
# Fill the missing values of the training events (df_scale also modifies the frame in place).
train = df_scale(train)

In [None]:
# Load the labels and split each `session_id` on '_' into the numeric session
# (first piece) and the question number (last piece without its leading character).
targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')
targets['session'] = targets['session_id'].str.split('_').str[0].astype('int64')
targets['q'] = targets['session_id'].str.split('_').str[-1].str[1:].astype('int64')
print(targets.shape)
targets.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The encoder expects 2-D input, so reshape the `page` values into a single column.
items = train['page'].values
labels = items.reshape(-1, 1)

# Fit on the training pages and encode them in one call; the encoded matrix is
# densified later with `.toarray()`.
oh_page_encoder = OneHotEncoder()
oh_page_labels = oh_page_encoder.fit_transform(labels)


In [None]:
import os

# Scratch directory for intermediate CSV files. `exist_ok=True` replaces the
# separate existence check, so there is no gap between checking and creating.
path = '/kaggle/temp'
os.makedirs(path, exist_ok=True)

In [None]:
def oh_page_out(df: pd.DataFrame) -> None:
    """One-hot encode `df['page']` and write it, with its keys, to a scratch CSV.

    The already-fitted module-level `oh_page_encoder` is used, so every frame
    is encoded with the same set of categories. The output goes to
    ``/kaggle/temp/page_train.csv`` (overwritten on each call) with the columns
    `session_id`, `level_group`, `page_-1` ... `page_6`.

    Parameters
    ----------
    df : pd.DataFrame
        Event-level frame that still holds `session_id`, `level_group` and
        `page`.
    """
    # Encode the rows of *this* frame. Reusing the matrix that was computed
    # from the training events would pair any other frame's keys with the
    # training rows' encodings.
    page_values = df['page'].values.reshape(-1, 1)
    encoded = oh_page_encoder.transform(page_values).astype('int8').toarray()

    # The eight column names assume the encoder learned exactly eight
    # categories; pandas raises if the encoded width differs.
    # Sharing df's index keeps the axis=1 concat row-aligned even when df does
    # not carry a default RangeIndex.
    tmp_df = pd.DataFrame(
        encoded,
        index=df.index,
        columns=['page_' + str(col) for col in range(-1, 7)],
    )
    pd.concat([df[['session_id', 'level_group']], tmp_df], axis=1).to_csv(
        "/kaggle/temp/page_train.csv", index=False)


In [None]:
# Write the one-hot page columns for the training events to /kaggle/temp/page_train.csv.
oh_page_out(train)

In [None]:
# The page column has already been one-hot encoded and written out, so drop it here.
train = train.drop(columns='page')

In [None]:
# Inspect the columns that remain after dropping `page`.
train.columns

# Feature Engineer
We create basic aggregate features. Try creating more features to boost CV and LB! The idea for EVENTS feature is from [here][1]

[1]: https://www.kaggle.com/code/kimtaehun/lightgbm-baseline-with-aggregated-log-data

In [None]:
# Column groups read by feature_engineer / feature_engineer_page. Every
# aggregate is computed per (session_id, level_group).

# Columns for which the number of distinct values is computed.
NUNIQUE = ['event_name','fqid', 'room_fqid']
# Columns for which max, min and std are computed (and, for elapsed_time, a sum).
MMS = ['elapsed_time', 'hover_duration'] # max, min, std
# Columns for which the mean is computed.
MEAN = ['level']
# Columns for which only the std is computed.
STD = ['room_coor_x', 'room_coor_y','screen_coor_x','screen_coor_y']
# event_name values that are turned into 0/1 indicators and summed (i.e. counted).
EVENTS = ['navigate_click','person_click','cutscene_click','object_click',
          'map_hover','notification_click','map_click','observation_click',
          'checkpoint']
# One-hot page columns that feature_engineer_page sums.
PAGE = ['page_-1', 'page_0', 'page_1', 'page_2',
        'page_3', 'page_4', 'page_5', 'page_6']


In [None]:
# Same inspection as the earlier `train.columns` cell; `train` has not been changed in between.
train.columns

In [None]:
# Frequency of each text_fqid value.
train['text_fqid'].value_counts() # name: 6, event_name: 11 (translated note; presumably distinct-value counts - TODO confirm)

In [None]:
def feature_engineer(train):
    """Aggregate event rows into one feature row per (session_id, level_group).

    The output columns are produced in a fixed order, which matters when the
    result is renamed positionally afterwards:

    1. `<col>_nunique` for each column in NUNIQUE
    2. `<col>_max`, then `<col>_min`, then `<col>_std` for each column in MMS
    3. `<col>_mean` for each column in MEAN
    4. `<col>_std` for each column in STD
    5. `<event>_sum` for each name in EVENTS (count of rows with that event_name)
    6. `elapsed_time_sum`

    Parameters
    ----------
    train : pd.DataFrame
        Event-level frame holding `session_id`, `level_group`, `event_name`
        and every column listed in NUNIQUE, MMS, MEAN and STD. It is not
        modified.

    Returns
    -------
    pd.DataFrame
        Indexed by `session_id`, with `level_group` as the first column
        followed by the aggregates above. Missing values are not filled
        (a std over a single-row group is NaN).
    """
    group_keys = ['session_id', 'level_group']
    grouped = train.groupby(group_keys)
    dfs = []

    def _aggregate(columns, how):
        # Append one `<col>_<how>` Series per column, in the given order.
        for c in columns:
            dfs.append(grouped[c].agg(how).rename(c + '_' + how))

    _aggregate(NUNIQUE, 'nunique')
    _aggregate(MMS, 'max')
    _aggregate(MMS, 'min')
    _aggregate(MMS, 'std')
    _aggregate(MEAN, 'mean')
    _aggregate(STD, 'std')

    # Count each event type per group from a temporary 0/1 Series. Writing the
    # indicators into `train` would leave extra columns on the caller's frame.
    key_series = [train[k] for k in group_keys]
    for c in EVENTS:
        indicator = (train.event_name == c).astype('int8')
        dfs.append(indicator.groupby(key_series).sum().rename(c + '_sum'))
    dfs.append(grouped['elapsed_time'].agg('sum').rename('elapsed_time_sum'))

    df = pd.concat(dfs, axis=1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df

In [None]:
def feature_engineer_page(train):
    """Sum every one-hot page column per (session_id, level_group).

    Parameters
    ----------
    train : pd.DataFrame
        Frame holding `session_id`, `level_group` and the columns listed in PAGE.

    Returns
    -------
    pd.DataFrame
        Indexed by `session_id`, with `level_group` first and then one
        `<page column>_sum` column per entry of PAGE, in PAGE order.
    """
    page_sums = train.groupby(['session_id', 'level_group'])[PAGE].sum()
    page_sums.columns = [name + '_sum' for name in page_sums.columns]
    return page_sums.reset_index().set_index('session_id')


In [None]:
# dtypes applied when reading /kaggle/temp/page_train.csv back in:
# the two key columns plus the eight one-hot page columns page_-1 ... page_6.
d_types = {"session_id": 'int64', "level_group": 'str'}
d_types.update({f"page_{page}": 'int16' for page in range(-1, 7)})

In [None]:
%%time
# Join the event aggregates with the per-page sums read back from the scratch CSV.
# Both frames are indexed by session_id and each carries its own `level_group`
# column, so the result contains `level_group` twice.
# NOTE(review): this column-wise concat appears to rely on both frames having
# the same index in the same order - confirm.
train = pd.concat([feature_engineer(train), feature_engineer_page(pd.read_csv('/kaggle/temp/page_train.csv', dtype = d_types))], axis=1)


In [None]:
# Columns after the concat; `level_group` is present twice (once from each feature frame).
train.columns

In [None]:
# Column names assigned positionally to the concatenated feature frame, so the
# order must match the order in which feature_engineer and feature_engineer_page
# emit their columns. 'TRASH_level_group' names the second, duplicate
# `level_group` column so that it can be dropped by name.
t_col = ['level_group', 'event_name_nunique', 'fqid_nunique',
       'room_fqid_nunique', 'elapsed_time_max', 'hover_duration_max',
       'elapsed_time_min', 'hover_duration_min', 'elapsed_time_std',
       'hover_duration_std', 'level_mean', 'room_coor_x_std',
       'room_coor_y_std', 'screen_coor_x_std', 'screen_coor_y_std',
       'navigate_click_sum', 'person_click_sum', 'cutscene_click_sum',
       'object_click_sum', 'map_hover_sum', 'notification_click_sum',
       'map_click_sum', 'observation_click_sum', 'checkpoint_sum',
       'elapsed_time_sum', 'TRASH_level_group', 'page_-1_sum', 'page_0_sum',
       'page_1_sum', 'page_2_sum', 'page_3_sum', 'page_4_sum', 'page_5_sum',
       'page_6_sum']


In [None]:
# Positional rename; pandas raises if the frame does not have exactly len(t_col) columns.
train.columns = t_col

In [None]:
# Confirm the renamed columns.
train.columns

In [None]:
# Discard the duplicate level_group column that the rename labelled TRASH_level_group.
train = train.drop(columns='TRASH_level_group')


# Train XGBoost Model
We train one model for each of the 18 questions. We use data from `level_group = '0-4'` to train the models for questions 1-3, `level_group = '5-12'` for questions 4-13, and `level_group = '13-22'` for questions 14-18, because this is the data we receive (to predict the corresponding questions) from Kaggle's inference API during test inference. We can improve our model by saving a user's previous data from earlier `level_groups` and using that to predict future `level_groups`.

In [None]:
# Every column except the first one (`level_group`) is used as a model feature.
FEATURES = train.columns[1:]
# Prints "<n> features" (the message text is Korean).
print(f'{len(FEATURES)}개의 특성')
# Distinct session ids; `train` is indexed by session_id at this point.
ALL_USERS = train.index.unique()
# Prints "<n> users" (the message text is Korean).
print(f'{len(ALL_USERS)}명의 유저 정보')


In [None]:
# NOTE(review): `train` already appears to be indexed by session_id here, so this
# reset/set round trip looks like a no-op - confirm before removing.
train = train.reset_index()
train = train.set_index('session_id')


In [None]:
# Display the session-indexed feature table.
train

You can also tune the XGBoost hyperparameters below with grid search or Bayesian optimization.

In [None]:
gkf = GroupKFold(n_splits=2)
# Out-of-fold probabilities: one row per session, column k holds question k+1.
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)
# Fitted classifiers keyed by '<level_group>_<question>'. The key does not
# include the fold, so each fold overwrites the previous one and only the
# models of the last fold remain after the loop.
models = {}

# The hyperparameters are the same for every fold and question: build them once.
xgb_params = {
    'objective' : 'binary:logistic',
    'eval_metric':'logloss',
    'learning_rate': 0.005,
    'max_depth': 4,
    'n_estimators': 1500,
    'early_stopping_rounds': 50,
    'tree_method':'hist',
    'subsample':0.8,
    'colsample_bytree': 0.4,
    'use_label_encoder' : None}

# Labels of each question indexed by session, built once instead of twice per
# (fold, question) pair.
labels_by_q = {t: targets.loc[targets.q==t].set_index('session') for t in range(1, 19)}

# COMPUTE CV SCORE WITH GROUP K FOLD (gkf.n_splits FOLDS, GROUPED BY SESSION)
for i, (train_index, test_index) in enumerate(gkf.split(X=train, groups=train.index)):
    print('#'*25)
    print('### Fold',i+1)
    print('#'*25)

    # The fold split does not depend on the question, so slice once per fold.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]

    # ITERATE THRU QUESTIONS 1 THRU 18
    for t in range(1, 19):

        # USE THIS TRAIN DATA WITH THESE QUESTIONS
        if t<=3: grp = '0-4'
        elif t<=13: grp = '5-12'
        else: grp = '13-22'

        # TRAIN DATA
        train_x = fold_train.loc[fold_train.level_group == grp]
        train_users = train_x.index.values
        train_y = labels_by_q[t].loc[train_users]

        # VALID DATA
        valid_x = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels_by_q[t].loc[valid_users]

        # TRAIN MODEL
        clf = XGBClassifier(**xgb_params)
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[ (valid_x[FEATURES].astype('float32'), valid_y['correct']) ],
                verbose=0)
        # `best_iteration` is 0-based; +1 gives the number of trees kept.
        # (`best_ntree_limit` is not available in XGBoost 2.x.)
        print(f'{t}({clf.best_iteration + 1}), ',end='')

        # SAVE MODEL, PREDICT VALID OOF
        models[f'{grp}_{t}'] = clf
        oof.loc[valid_users, t-1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:,1]

    print()

# Compute CV Score
We need to convert prediction probabilities into `1s` and `0s`. The competition metric is F1 Score which is the harmonic mean of precision and recall. Let's find the optimal threshold for `p > threshold` when to predict `1` and when to predict `0` to maximize F1 Score.

In [None]:
# PUT TRUE LABELS INTO DATAFRAME WITH 18 COLUMNS
# Start from a copy of `oof` so the index and columns match, then overwrite
# column k with the ground truth of question k+1 in ALL_USERS order.
true = oof.copy()
for k in range(18):
    question_labels = targets[targets.q == k + 1].set_index('session')
    true[k] = question_labels.loc[ALL_USERS, 'correct'].values

In [None]:
# FIND BEST THRESHOLD TO CONVERT PROBS INTO 1s AND 0s
scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4,0.81,0.01):
    print(f'{threshold:.02f}, ',end='')
    preds = (oof.values.reshape((-1))>threshold).astype('int')
    m = f1_score(true.values.reshape((-1)), preds, average='macro')   
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score = m
        best_threshold = threshold

In [None]:
import matplotlib.pyplot as plt

# PLOT THRESHOLD VS. F1_SCORE
# The large marker highlights the best (threshold, score) pair.
fig, ax = plt.subplots(figsize=(20,5))
ax.plot(thresholds, scores, '-o', color='blue')
ax.scatter([best_threshold], [best_score], color='blue', s=300, alpha=1)
ax.set_xlabel('Threshold', size=14)
ax.set_ylabel('Validation F1 Score', size=14)
ax.set_title(f'Threshold vs. F1_Score with Best F1_Score = {best_score:.3f} at Best Threshold = {best_threshold:.3}', size=18)
plt.show()

In [None]:
print('When using optimal threshold...')
for k in range(18):

    # COMPUTE F1 SCORE PER QUESTION
    # Column k of `oof` / `true` holds question k+1, so the label uses the
    # 1-based question number.
    m = f1_score(true[k].values, (oof[k].values>best_threshold).astype('int'), average='macro')
    print(f'Q{k+1}: F1 =',m)

# COMPUTE F1 SCORE OVERALL
m = f1_score(true.values.reshape((-1)), (oof.values.reshape((-1))>best_threshold).astype('int'), average='macro')
print('==> Overall F1 =',m)

# Infer Test Data

In [None]:
# IMPORT KAGGLE API
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

# CLEAR MEMORY
import gc
del train, targets, oof, true
_ = gc.collect()

In [None]:
# Half-open question range [a, b) answered after each level group.
limits = {'0-4': (1, 4), '5-12': (4, 14), '13-22': (14, 19)}

counter = 0

for (sample_submission, test) in iter_test:
    df = df_scale(test)

    # `elapsed_time` is cast to int32: int16 tops out at 32767 and `astype`
    # wraps out-of-range integers silently. Keep these dtypes identical to the
    # ones used when preprocessing the training data. `index` is dropped right
    # away, so it is not cast.
    df = df.drop(['index', 'fullscreen', 'hq', 'music', 'name', 'text'], axis=1).astype({
        'level': 'int16', 'elapsed_time': 'int32',
        'room_coor_x': 'float32', 'room_coor_y': 'float32',
        'screen_coor_x': 'float32', 'screen_coor_y': 'float32'})

    # Diagnostics for the first batch only; printing on every batch floods the log.
    if counter == 0:
        print(sample_submission.head())
        print(df.head())
        print(df.shape)

    # Write this batch's one-hot page columns to the scratch CSV, then drop `page`.
    oh_page_out(df)
    df = df.drop(columns=['page'], axis=1)

    # FEATURE ENGINEER TEST DATA
    df = pd.concat([feature_engineer(df), feature_engineer_page(
        pd.read_csv('/kaggle/temp/page_train.csv', dtype=d_types))], axis=1)

    # Same positional rename and duplicate-column removal as for the training features.
    df.columns = t_col
    df = df.drop('TRASH_level_group', axis=1)
    df = df.reset_index()
    df = df.set_index('session_id')

    # INFER TEST DATA
    grp = test.level_group.values[0]
    a, b = limits[grp]
    for t in range(a, b):
        clf = models[f'{grp}_{t}']
        p = clf.predict_proba(df[FEATURES].astype('float32'))[:, 1]
        # Match the question suffix exactly: a substring test for 'q1' would
        # also hit 'q10' ... 'q18'.
        # Assumes ids look like '<session>_q<N>', as in train_labels - TODO confirm.
        mask = sample_submission.session_id.str.endswith(f'_q{t}')
        # `.item()` requires exactly one prediction, i.e. one feature row per batch.
        sample_submission.loc[mask, 'correct'] = int(p.item() > best_threshold)

    env.predict(sample_submission)
    counter += 1

In [None]:
# NOTE(review): everything in this cell is commented out; it reads like an
# earlier draft of the inference loop in the previous cell and is never executed.
# limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

# for (sample_submission, test) in iter_test:
#     test = df_scale(test)
#     print(test.columns)
#     test = test.astype({'level':'int16', 'index': 'int16', 'elapsed_time':'int16','room_coor_x':'float32','room_coor_y':'float32','screen_coor_x':'float32','screen_coor_y':'float32'}).drop(['index','fullscreen','hq','music','name','text'], axis=1)
#     oh_page_out(test)
#     test = test.drop(columns=['page'],axis=1)
    
#     test = test.drop('TRASH_level_group',axis=1)
#     test = test.reset_index()
#     test = test.set_index('session_id')
#     test.columns = t_col

#     # FEATURE ENGINEER TEST DATA
#     test = pd.concat([feature_engineer(test), feature_engineer_page(pd.read_csv('/kaggle/temp/page_train.csv'))], axis=1)
    
#     # INFER TEST DATA
#     grp = test.level_group.values[0]
#     a,b = limits[grp]
#     for t in range(a,b):
#         clf = models[f'{grp}_{t}']
#         p = clf.predict_proba(df[FEATURES].astype('float32'))[:,1]
#         mask = sample_submission.session_id.str.contains(f'q{t}')
#         sample_submission.loc[mask,'correct'] = int(p.item()>best_threshold)
    
#     env.predict(sample_submission)

# EDA submission.csv

In [None]:
# Load submission.csv (presumably written by the inference API - confirm) and
# check its row count and column dtypes.
df = pd.read_csv('submission.csv')
print( df.shape )
df.info()

In [None]:
# Mean of the `correct` column, i.e. the share of rows predicted as 1 when the column holds 0/1 values.
print(df.correct.mean())