In [124]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [143]:
train = pd.read_csv('predict-student-performance-from-game-play/train.csv')

In [144]:
def df_scale(train):
    train.drop(['fullscreen','hq','music','screen_coor_x','screen_coor_y','name','text'], axis=1, inplace=True)
    train['room_coor_x'].fillna(0, inplace = True)
    train['room_coor_y'].fillna(0, inplace=True)
    train['hover_duration'].fillna(0, inplace=True)
    train['page'].fillna(-1, inplace=True)
    train['fqid'].fillna(-1,  inplace=True).astype('category')

    return train

In [147]:
train['fqid'].value_counts()

worker           939555
archivist        563259
gramps           561000
wells            394234
toentry          392221
                  ...  
block_badge         854
need_glasses        675
block_badge_2       508
fox                 166
block_1              25
Name: fqid, Length: 127, dtype: int64

In [148]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

items = train['page'].values
labels = items.reshape(-1,1)

oh_encoder = OneHotEncoder()


oh_encoder.fit(labels)
oh_labels = oh_encoder.transform(labels)

train_cat = oh_labels.toarray()

tmp_df = pd.DataFrame(train_cat, columns=['page_' + str(col) for col in range(-1,7)])
train = pd.concat([train.drop(columns=['page'],axis=1),tmp_df],axis=1)

In [149]:
lb_encoder = LabelEncoder()

lb_items = train['fqid'].values
lb_labels = items.reshape(-1, 1)
lb_encoder.fit(lb_labels)
lb_labels_transform = lb_encoder.transform(lb_labels)
train['fqid'] = lb_labels_transform


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [150]:
train['fqid'].value_counts()

7    12889465
5       51154
1       50714
6       45417
4       44207
0       37232
3       31350
2       24672
Name: fqid, dtype: int64

In [151]:
targets = pd.read_csv('predict-student-performance-from-game-play/train_labels.csv')
targets['session'] = targets.session_id.apply(lambda x: int(x.split('_')[0]))
targets['q'] = targets.session_id.apply(lambda x: int(x.split('_')[-1][1:]))

In [134]:
OBJECT = ['event_name', 'room_fqid']
NUMS = ['elapsed_time', 'room_coor_x',
        'level', 'room_coor_y', 'hover_duration']

EVENTS = ['navigate_click', 'person_click', 'cutscene_click', 'object_click',
          'map_hover', 'notification_click', 'map_click', 'observation_click',
          'checkpoint']

PAGE = ['page_-1', 'page_0', 'page_1', 'page_2',
        'page_3', 'page_4', 'page_5', 'page_6']
FQID = ['fqid']
# FQID = ['fqid' + str(col) for col in range(128)]
# TEXT_FQID = ['text_fqid' + str(col) for col in range(127)]

In [135]:
def feature_engineer(train):

    dfs = []
    for c in OBJECT:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('mean')
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    for c in PAGE:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
#     for c in TEXT_FQID:
#         tmp = train.groupby(['session_id', 'level_group'])[c].agg('sum')
#         tmp.name = tmp.name + '_sum'
#         dfs.append(tmp)
    for c in EVENTS:
        train[c] = (train.event_name == c).astype('int8')
    for c in EVENTS + ['elapsed_time']:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
    train = train.drop(EVENTS, axis=1)

    df = pd.concat(dfs, axis=1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df


In [136]:
%%time
df = feature_engineer(train)
print(df.shape)

(35337, 32)
CPU times: user 24.2 s, sys: 2.62 s, total: 26.8 s
Wall time: 27.1 s


In [137]:
FEATURES = df.columns[1:]
print(f'{len(FEATURES)}개의 특성')
ALL_USERS = df.index.unique()
print(f'{len(ALL_USERS)}명의 유저 정보')

31개의 특성
11779명의 유저 정보


In [138]:
gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS)
models = {}

# COMPUTE CV SCORE WITH 5 GROUP K FOLD
for i, (train_index, test_index) in enumerate(gkf.split(X=df, groups=df.index)):
    print('')
    print('Fold', i+1)
    print('')

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'max_depth': 4,
        'n_estimators': 1000,
        'early_stopping_rounds': 50,
        'tree_method': 'hist',
        'subsample': 0.8,
        'colsample_bytree': 0.4
    }

    # ITERATE THRU QUESTIONS 1 THRU 18
    for t in range(1, 19):

        # USE THIS TRAIN DATA WITH THESE QUESTIONS
        if t <= 4:
            grp = '0-4'
        elif t <= 12:
            grp = '5-12'
        else:
            grp = '13-22'

        # TRAIN DATA
        train_x = df.iloc[train_index]
        train_x = train_x.loc[train_x.level_group == grp]
        train_users = train_x.index.values
        train_y = targets.loc[targets.q == t].set_index(
            'session').loc[train_users]

        # VALID DATA
        valid_x = df.iloc[test_index]
        valid_x = valid_x.loc[valid_x.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = targets.loc[targets.q == t].set_index(
            'session').loc[valid_users]

        # TRAIN MODEL
        clf = XGBClassifier(**xgb_params)
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[(valid_x[FEATURES].astype(
                    'float32'), valid_y['correct'])],
                verbose=0)
        print(f'{t}({clf.best_ntree_limit}), ', end='')

        # SAVE MODEL, PREDICT VALID OOF
        models[f'{grp}_{t}'] = clf
        oof.loc[valid_users, t -
                1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:, 1]

    print()



Fold 1

1(161), 2(132), 3(99), 4(169), 5(131), 6(174), 7(86), 8(51), 9(168), 10(160), 11(107), 12(83), 13(180), 14(117), 15(161), 16(75), 17(85), 18(122), 

Fold 2

1(128), 2(101), 3(133), 4(114), 5(93), 6(126), 7(190), 8(46), 9(97), 10(248), 11(88), 12(83), 13(191), 14(151), 15(84), 16(65), 17(36), 18(103), 

Fold 3

1(71), 2(111), 3(108), 4(88), 5(129), 6(61), 7(85), 8(66), 9(77), 10(182), 11(51), 12(80), 13(93), 14(157), 15(183), 16(77), 17(81), 18(115), 

Fold 4

1(122), 2(127), 3(113), 4(73), 5(151), 6(122), 7(188), 8(53), 9(55), 10(105), 11(63), 12(100), 13(114), 14(126), 15(119), 16(89), 17(78), 18(133), 

Fold 5

1(99), 2(120), 3(107), 4(78), 5(105), 6(140), 7(119), 8(60), 9(118), 10(81), 11(74), 12(100), 13(81), 14(239), 15(119), 16(69), 17(90), 18(80), 


In [121]:
# True 레이블을 18개의 열로 구성된 데이터 프레임에 넣습니다
true = oof.copy()
for k in range(18):
    # GET TRUE LABELS
    tmp = targets.loc[targets.q == k+1].set_index('session').loc[ALL_USERS]
    true[k] = tmp.correct.values

In [139]:
# 확률을 1과 0으로 변환하기 위한 최적의 임계값을 찾습니다
scores = []
thresholds = []
best_score = 0
best_threshold = 0

for threshold in np.arange(0.4, 0.81, 0.01):
    preds = (oof.values.reshape((-1)) > threshold).astype('int')
    m = f1_score(true.values.reshape((-1)), preds, average='macro')
    scores.append(m)
    thresholds.append(threshold)
    if m > best_score:
        best_score = m
        best_threshold = threshold
print(f'Best 임계점: {best_threshold:.2f}')
print(f'Best F1_score: {best_score:.4f}')


Best 임계점: 0.62
Best F1_score: 0.6766


In [82]:
del train
del df
del oof
del targets

In [21]:
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi


In [22]:
api = KaggleApi()


In [23]:
api.authenticate()

In [180]:
sample_submission = pd.read_csv("predict-student-performance-from-game-play/sample_submission.csv")
test = pd.read_csv("predict-student-performance-from-game-play/test.csv")


In [181]:
limits = {'0-4': (1, 4), '5-12': (5, 14), '13-22': (14, 19)}


test = df_scale(test)

items = test['page'].values
labels = items.reshape(-1, 1)

oh_labels = oh_encoder.transform(labels)

train_cat = oh_labels.toarray()

tmp_df = pd.DataFrame(train_cat, columns=['page_' + str(col) for col in range(-1, 7)])
test = pd.concat([test.drop(columns=['page'], axis=1), tmp_df], axis=1)

    # FEATURE ENGINEER TEST DATA
df = feature_engineer(test)



32

In [169]:
# grp = test.level_group.values[0]
# a, b = limits[grp]
# for t in range(a, b):
#     clf = models[f'{grp}_{t}']
#     p = clf.predict_proba(df[FEATURES].astype('float32'))[:, 1]
#     mask = sample_submission.session_id.str.contains(f'q{t}')
#     print(mask)
#     sample_submission.loc[mask, 'correct'] = int(p.item() > best_threshold)

# # clf.predict(sample_submission)


0

In [179]:
df[FEATURES]


Unnamed: 0_level_0,event_name_nunique,room_fqid_nunique,fqid_nunique,elapsed_time_mean,room_coor_x_mean,level_mean,room_coor_y_mean,hover_duration_mean,elapsed_time_std,room_coor_x_std,...,navigate_click_sum,person_click_sum,cutscene_click_sum,object_click_sum,map_hover_sum,notification_click_sum,map_click_sum,observation_click_sum,checkpoint_sum,elapsed_time_sum
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090109393214576,11,6,24,117119.8,39.449107,2.05,-64.625451,2006.8,66619.89,440.144543,...,62,21,27,9,2,5,3,3,1,16396773
20090109393214576,11,12,52,6165666.0,-124.577142,17.918089,-220.648267,973.864865,227751.2,629.713345,...,240,137,78,40,20,10,7,4,1,3613080500
20090109393214576,11,12,43,2092368.0,48.26039,7.686567,-43.350532,2301.315789,2124842.0,378.100318,...,130,127,11,175,17,9,6,2,1,1121509230
20090312143683264,11,7,27,142418.3,91.577137,1.803681,-112.486741,3066.555556,78257.63,440.053605,...,62,18,33,22,2,9,2,4,1,23214178
20090312143683264,11,15,55,2195108.0,-21.133174,17.335626,-198.835547,864.096154,307875.7,576.561514,...,414,113,55,37,31,8,12,5,1,1595843559
20090312143683264,11,13,56,791150.4,42.823079,8.445172,-78.412808,1379.492958,245688.4,414.653052,...,276,102,16,92,22,9,13,8,1,483392868
20090312331414616,11,6,23,112832.8,67.333464,1.861538,-61.57197,2176.818182,61004.48,431.739627,...,41,19,30,10,4,6,2,1,1,14668268
20090312331414616,11,13,62,1262481.0,-93.299934,18.332689,-156.284179,794.542373,193471.9,575.032012,...,202,102,55,50,26,11,9,6,1,652702482
20090312331414616,11,11,46,558520.5,8.512538,8.459119,-70.478232,1561.096154,136600.2,357.487622,...,92,97,11,36,25,9,7,1,1,177609534


In [172]:
counter = 0
# The API will deliver two dataframes in this specific order,
# for every session+level grouping (one group per session for each checkpoint)
# if counter==0:
#     display(sample_submission.head())
#     display(test.head())
#     print(test.shape)
best = best_threshold # best_threshold는 상수 

print(best)
p = clf.predict_proba(df[FEATURES].astype('float32'))[:, 1] # df. = feature engineer 돌린 test
print(p) # 리스트  # 사람 3 문제 54 인당 18개 ? 사람 3명 3그룹 9 


0.6200000000000002
[0.80293745 0.2460512  0.53370196 0.63770825 0.18540768 0.32812896
 0.828757   0.30069208 0.41950473]


In [174]:
# users make predictions here using the test data
for index, row in sample_submission.iterrows():
    print(f"index = {index}, row = {row}")
    q = int(row['session_id'].split('_')[-1][1:])

    # if p.item() > best.item():
    #     sample_submission.loc[index, 'correct'] = True
    # else:
    #     sample_submission.loc[index, 'correct'] = False
    # sample_submission.loc[index, 'correct'] = (lambda x: True if p > best else False)

    # env.predict appends the session+level sample_submission to the overall
    # submission
counter += 1


index = 0, row = session_id        20090109393214576_q1
correct                              0
session_level    20090109393214576_0-4
Name: 0, dtype: object
index = 1, row = session_id        20090312143683264_q1
correct                              0
session_level    20090312143683264_0-4
Name: 1, dtype: object
index = 2, row = session_id        20090312331414616_q1
correct                              0
session_level    20090312331414616_0-4
Name: 2, dtype: object
index = 3, row = session_id        20090109393214576_q2
correct                              0
session_level    20090109393214576_0-4
Name: 3, dtype: object
index = 4, row = session_id        20090312143683264_q2
correct                              0
session_level    20090312143683264_0-4
Name: 4, dtype: object
index = 5, row = session_id        20090312331414616_q2
correct                              0
session_level    20090312331414616_0-4
Name: 5, dtype: object
index = 6, row = session_id        20090109393214576_q3
co

In [None]:
limits = {'0-4': (1, 4), '5-12': (5, 14), '13-22': (14, 19)}

for (sample_submission, test) in iter_test:

    test = df_scale(test)

    items = test['page'].values
    labels = items.reshape(-1, 1)

    oh_labels = oh_encoder.transform(labels)

    train_cat = oh_labels.toarray()

    tmp_df = pd.DataFrame(train_cat, columns=[
                          'page_' + str(col) for col in range(-1, 7)])
    test = pd.concat([test.drop(columns=['page'], axis=1), tmp_df], axis=1)

    # FEATURE ENGINEER TEST DATA
    df = feature_engineer(test)

    grp = test.level_group.values[0]
    a, b = limits[grp]
    for t in range(a, b):
        clf = models[f'{grp}_{t}']
        p = clf.predict_proba(df[FEATURES].astype('float32'))[:, 1]
        mask = sample_submission.session_id.str.contains(f'q{t}')
        sample_submission.loc[mask, 'correct'] = int(p.item() > best_threshold)

    env.predict(sample_submission)


In [121]:
sample_submission


Unnamed: 0,session_id,correct,session_level
0,20090109393214576_q1,<function <lambda> at 0x15acbe550>,20090109393214576_0-4
1,20090312143683264_q1,<function <lambda> at 0x15acd25e0>,20090312143683264_0-4
2,20090312331414616_q1,<function <lambda> at 0x15ac49670>,20090312331414616_0-4
3,20090109393214576_q2,<function <lambda> at 0x15ab3ddc0>,20090109393214576_0-4
4,20090312143683264_q2,<function <lambda> at 0x15ac618b0>,20090312143683264_0-4
5,20090312331414616_q2,<function <lambda> at 0x15ac61d30>,20090312331414616_0-4
6,20090109393214576_q3,<function <lambda> at 0x15ace0af0>,20090109393214576_0-4
7,20090312143683264_q3,<function <lambda> at 0x15ace0790>,20090312143683264_0-4
8,20090312331414616_q3,<function <lambda> at 0x15ace08b0>,20090312331414616_0-4
9,20090109393214576_q4,<function <lambda> at 0x15ace0ee0>,20090109393214576_5-12


In [107]:
## the end result is a submission file containing all test session predictions
! head submission.csv
df = pd.read_csv('submission.csv')
print('Sample submission shape:', df.shape )
print('Sample submission average prediction:', df.correct.mean() )
df.head()

head: submission.csv: No such file or directory


FileNotFoundError: [Errno 2] No such file or directory: 'submission.csv'

In [None]:
print(df.correct.mean())
