In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import f1_score

In [None]:
# Load the raw event log of the training sessions.
train = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv')

# Missing coordinates and hover durations become 0; a missing page becomes -1.
for zero_filled_col in ('room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration'):
    train[zero_filled_col] = train[zero_filled_col].fillna(0)
train['page'] = train['page'].fillna(-1)

In [None]:
# Load the per-question labels.
targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

# session_id is split on '_': the first piece is the integer session, and the
# last piece with its leading character removed is the integer question number.
id_parts = targets['session_id'].str.split('_')
targets['session'] = id_parts.str[0].astype('int64')
targets['q'] = id_parts.str[-1].str[1:].astype('int64')

In [None]:
# `targets` (including its parsed `session` and `q` columns) is already built by
# the cell directly above; reading and parsing the same CSV a second time here
# was redundant work, so this cell only inspects the frame.
print(targets.shape)
targets.head()

In [None]:
# Columns summarised by their number of distinct values per group.
NUNIQUE = [
    'room_fqid',
    'text',
    'fqid',
    'event_name',
]

# Numeric columns summarised by mean and standard deviation per group.
MMS = [
    'elapsed_time',
    'hover_duration',
    'index',
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
]

# `event_name` values that are counted per group.
EVENTS = [
    'navigate_click',
    'person_click',
    'cutscene_click',
    'object_click',
    'notification_click',
    'map_click',
    'observation_click',
    'checkpoint',
    'map_hover',
    'notebook_click',
]

In [None]:
def feature_engineer(train):
    """Aggregate raw event rows into one row per (session_id, level_group, level).

    Reads the module-level column lists ``NUNIQUE``, ``MMS`` and ``EVENTS``.
    For every group the output holds, in this column order:

    * ``<col>_nunique`` for each column in ``NUNIQUE``;
    * ``<col>_mean`` and then ``<col>_std`` for each column in ``MMS``;
    * ``<event>_sum``: how many rows have ``event_name == <event>``, for each
      entry of ``EVENTS``;
    * ``elapsed_time_sum``, ``hover_duration_sum`` and ``index_sum``.

    Parameters
    ----------
    train : pandas.DataFrame
        Event-level data with ``session_id``, ``level_group``, ``level`` and
        ``event_name`` columns plus every column named in ``NUNIQUE``/``MMS``.
        The frame is NOT modified (the previous implementation left ten
        indicator columns behind in the caller's frame).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id``; ``level_group`` and ``level`` are ordinary
        columns. Missing aggregates (such as the standard deviation of a
        one-row group) are replaced with -1.
    """
    group_keys = ['session_id', 'level_group', 'level']
    # Build the grouping once instead of once per aggregated column.
    grouped = train.groupby(group_keys)
    dfs = []

    for c in NUNIQUE:
        dfs.append(grouped[c].agg('nunique').rename(c + '_nunique'))
    for stat in ('mean', 'std'):
        for c in MMS:
            dfs.append(grouped[c].agg(stat).rename(c + '_' + stat))

    # Count events from standalone indicator Series grouped by the key columns,
    # so no temporary column is ever written into the caller's frame.
    key_columns = [train[k] for k in group_keys]
    for c in EVENTS:
        is_event = (train['event_name'] == c).astype('int8')
        dfs.append(is_event.groupby(key_columns).agg('sum').rename(c + '_sum'))

    for c in ['elapsed_time', 'hover_duration', 'index']:
        dfs.append(grouped[c].agg('sum').rename(c + '_sum'))

    df = pd.concat(dfs, axis=1)
    df = df.fillna(-1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df

In [None]:
%%time
# Build the per-(session_id, level_group, level) feature table from the raw events.
df = feature_engineer(train)
print(df.shape)
df.head()

In [None]:
# Show every column of the feature table except the first one.
df.columns[1:]

In [None]:
# Per-level columns that df_feature_engineer aggregates again per level group.
feature = [
    'level',
    # distinct-value counts
    'room_fqid_nunique', 'text_nunique', 'fqid_nunique', 'event_name_nunique',
    # means
    'elapsed_time_mean', 'hover_duration_mean', 'index_mean', 'page_mean',
    'room_coor_x_mean', 'room_coor_y_mean', 'screen_coor_x_mean', 'screen_coor_y_mean',
    # standard deviations
    'elapsed_time_std', 'hover_duration_std', 'index_std', 'page_std',
    'room_coor_x_std', 'room_coor_y_std', 'screen_coor_x_std', 'screen_coor_y_std',
    # event counts
    'navigate_click_sum', 'person_click_sum', 'cutscene_click_sum', 'object_click_sum',
    'notification_click_sum', 'map_click_sum', 'observation_click_sum', 'checkpoint_sum',
    'map_hover_sum', 'notebook_click_sum',
    # plain sums
    'elapsed_time_sum', 'hover_duration_sum', 'index_sum',
]

In [None]:
def df_feature_engineer(df):
    """Collapse the per-level feature table to one row per (session_id, level_group).

    Every column named in the module-level ``feature`` list is aggregated with
    mean, std, max, min and sum (in that order); each result is stored as
    ``<column>_<statistic>``. All columns of one statistic come before the
    columns of the next statistic.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that can be grouped by ``session_id`` and ``level_group`` and
        that contains every column listed in ``feature``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id`` with ``level_group`` as an ordinary column,
        followed by the aggregated columns.
    """
    per_group = df.groupby(['session_id', 'level_group'])
    # The statistic is the outer loop so the column order is mean, std, max, min, sum.
    pieces = [
        per_group[col].agg(stat).rename(col + '_' + stat)
        for stat in ('mean', 'std', 'max', 'min', 'sum')
        for col in feature
    ]
    wide = pd.concat(pieces, axis=1)
    return wide.reset_index().set_index('session_id')

In [None]:
# Aggregate the per-level table once more, down to one row per (session_id, level_group).
df_df = df_feature_engineer(df)

In [None]:
# Every column of the aggregated table except the grouping label is a model input.
FEATURES = [col for col in df_df.columns if col != 'level_group']
print('We will train with', len(FEATURES), 'features')

# Distinct session ids found in the index of the aggregated table.
ALL_USERS = df_df.index.unique()
print('We will train with', len(ALL_USERS), 'users info')

In [None]:
# Folds are grouped by session id (the index of df_df), so all rows of one
# session always land in the same fold.
N_FOLDS = 20
gkf = GroupKFold(n_splits=N_FOLDS)

# Out-of-fold probabilities: one row per session, column t-1 holds question t.
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)

# NOTE: the key is only '<level_group>_<question>', so each fold overwrites the
# entry written by the previous fold; after the loop only the models of the
# last fold remain.
models = {}

# Hyper-parameters are the same for every fold and question, so build them once.
xgb_params = {
    'objective' : 'binary:logistic',
    'eval_metric':'logloss',
    'learning_rate': 0.05,
    'max_depth': 4,
    'n_estimators': 1000,
    'early_stopping_rounds': 50,
    'tree_method':'hist',
    'subsample':0.8,
    'colsample_bytree': 0.58}

# Labels of each question indexed by session. They do not depend on the fold,
# so they are built once instead of twice per (fold, question).
labels_by_question = {
    t: targets.loc[targets.q == t].set_index('session') for t in range(1, 19)
}

# COMPUTE CV SCORE WITH GROUP K FOLD (N_FOLDS SPLITS)
for i, (train_index, test_index) in enumerate(gkf.split(X=df_df, groups=df_df.index)):
    print(' ')
    print('Fold',i+1)
    print(' ')

    # Row slices of this fold, shared by all 18 questions.
    fold_train = df_df.iloc[train_index]
    fold_valid = df_df.iloc[test_index]

    # ITERATE THRU QUESTIONS 1 THRU 18
    for t in range(1,19):

        # USE THIS TRAIN DATA WITH THESE QUESTIONS
        if t<=3: grp = '0-4'
        elif t<=13: grp = '5-12'
        else: grp = '13-22'

        question_labels = labels_by_question[t]

        # TRAIN DATA
        train_x = fold_train.loc[fold_train.level_group == grp]
        train_users = train_x.index.values
        train_y = question_labels.loc[train_users]

        # VALID DATA
        valid_x = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = question_labels.loc[valid_users]

        # TRAIN MODEL
        # NOTE(review): the validation fold drives early stopping and is also the
        # fold the out-of-fold predictions are made on, so the CV score may be
        # slightly optimistic.
        clf =  XGBClassifier(**xgb_params)
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[ (valid_x[FEATURES].astype('float32'), valid_y['correct']) ],
                verbose=0)
        # `best_ntree_limit` was removed in XGBoost 2.0; `best_iteration` is
        # 0-based, so +1 prints the same tree count as before.
        print(f'{t}({clf.best_iteration + 1}), ',end='')

        # SAVE MODEL, PREDICT VALID OOF
        models[f'{grp}_{t}'] = clf
        oof.loc[valid_users, t-1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:,1]

    print()

In [None]:
# Ground-truth matrix with the same index and columns as `oof`.
true = oof.copy()
for q_idx in range(18):
    # Column q_idx holds the labels of question q_idx + 1, ordered like ALL_USERS.
    question_rows = targets.loc[targets.q == q_idx + 1]
    true[q_idx] = question_rows.set_index('session').loc[ALL_USERS].correct.values

In [None]:
# Scan candidate decision thresholds and keep the one with the highest macro F1
# over all (session, question) out-of-fold predictions.
scores = []
thresholds = []
best_score = 0
best_threshold = 0

for threshold in np.arange(0.4, 0.81, 0.01):
    print(f'{threshold:.02f}, ', end='')
    preds = (oof.values.reshape((-1)) > threshold).astype('int')
    m = f1_score(true.values.reshape((-1)), preds, average='macro')
    scores.append(m)
    thresholds.append(threshold)
    # Strict comparison: on a tie the earlier (smaller) threshold is kept.
    if m > best_score:
        best_score, best_threshold = m, threshold

In [None]:
print('When using optimal threshold...')
for k in range(18):

    # COMPUTE F1 SCORE PER QUESTION
    m = f1_score(true[k].values, (oof[k].values>best_threshold).astype('int'), average='macro')
    # Column k stores question k+1, so label the line with the 1-based question
    # number used by the labels instead of the 0-based column index.
    print(f'Q{k+1}: F1 =',m)

# COMPUTE F1 SCORE OVERALL
m = f1_score(true.values.reshape((-1)), (oof.values.reshape((-1))>best_threshold).astype('int'), average='macro')
print('==> Overall F1 =',m)

In [None]:
# IMPORT KAGGLE API
# NOTE(review): jo_wilder is not defined in this notebook; presumably it is the
# competition-provided module that serves the test batches — confirm.
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

# CLEAR MEMORY
# Drop the training-time objects listed below and force a garbage collection.
# Re-running this cell without first re-running the cells that create these
# names raises NameError, because `del` needs every listed name to exist.
import gc
del train, targets, df, oof, true
_ = gc.collect()

In [None]:
# For each level group: the half-open range [first, last) of question numbers
# whose models are applied to that group.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (sample_submission, test) in iter_test:

    # Same missing-value handling as applied to the training data.
    for zero_filled_col in ('room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration'):
        test[zero_filled_col] = test[zero_filled_col].fillna(0)
    test['page'] = test['page'].fillna(-1)

    # FEATURE ENGINEER TEST DATA
    df = feature_engineer(test)
    df_df = df_feature_engineer(df)

    # INFER TEST DATA
    # The level group is read from the first row only, i.e. the batch is
    # treated as belonging to a single level group.
    grp = test.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        clf = models[f'{grp}_{t}']
        # `.item()` below requires exactly one feature row in the batch.
        p = clf.predict_proba(df_df[FEATURES].astype('float32'))[:,1]
        # Match the question suffix exactly: a substring test for 'q1' would
        # also select 'q10'..'q18' if they were present in the batch.
        mask = sample_submission.session_id.str.endswith(f'_q{t}')
        sample_submission.loc[mask,'correct'] = int(p.item()>best_threshold)

    env.predict(sample_submission)

In [None]:
# Load the submission file and inspect its size and first rows.
df = pd.read_csv('submission.csv')
print(df.shape)
df.head()

In [None]:
# Mean of the submitted `correct` column.
print(df['correct'].mean())