In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
train = pd.read_csv('predict-student-performance-from-game-play/train.csv')

In [3]:
def df_scale(train):
    train.drop(['fullscreen','hq','music','screen_coor_x','screen_coor_y','name','text'], axis=1, inplace=True)
    train['room_coor_x'].fillna(0)
    train['room_coor_y'].fillna(0)
    train['hover_duration'].fillna(0)
    train['page'].fillna(-1)
    return train

In [4]:
train = df_scale(train)

In [5]:
from sklearn.preprocessing import OneHotEncoder

items = train['page'].values
labels = items.reshape(-1,1)

oh_encoder = OneHotEncoder()

oh_encoder.fit(labels)
oh_labels = oh_encoder.transform(labels)

train_cat = oh_labels.toarray()

tmp_df = pd.DataFrame(train_cat, columns=['page_' + str(col) for col in range(-1,7)])
train = pd.concat([train.drop(columns=['page'],axis=1),tmp_df],axis=1)

In [6]:
targets = pd.read_csv('predict-student-performance-from-game-play/train_labels.csv')
targets['session'] = targets.session_id.apply(lambda x: int(x.split('_')[0]))
targets['q'] = targets.session_id.apply(lambda x: int(x.split('_')[-1][1:]))

In [7]:
OBJECT = ['event_name', 'room_fqid', 'fqid']
NUMS = ['elapsed_time', 'room_coor_x',
        'level', 'room_coor_y', 'hover_duration']

EVENTS = ['navigate_click', 'person_click', 'cutscene_click', 'object_click',
          'map_hover', 'notification_click', 'map_click', 'observation_click',
          'checkpoint']

PAGE = ['page_-1', 'page_0', 'page_1', 'page_2',
        'page_3', 'page_4', 'page_5', 'page_6']
# FQID = ['fqid' + str(col) for col in range(128)]
# TEXT_FQID = ['text_fqid' + str(col) for col in range(127)]

In [8]:
def feature_engineer(train):

    dfs = []
    for c in OBJECT:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('mean')
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    for c in PAGE:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
#     for c in TEXT_FQID:
#         tmp = train.groupby(['session_id', 'level_group'])[c].agg('sum')
#         tmp.name = tmp.name + '_sum'
#         dfs.append(tmp)
    for c in EVENTS:
        train[c] = (train.event_name == c).astype('int8')
    for c in EVENTS + ['elapsed_time']:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
    train = train.drop(EVENTS, axis=1)

    df = pd.concat(dfs, axis=1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df


In [9]:
%%time
df = feature_engineer(train)
print(df.shape)

(35337, 32)
CPU times: user 27 s, sys: 2.79 s, total: 29.8 s
Wall time: 30.2 s


In [10]:
FEATURES = df.columns[1:]
print(f'{len(FEATURES)}개의 특성')
ALL_USERS = df.index.unique()
print(f'{len(ALL_USERS)}명의 유저 정보')

31개의 특성
11779명의 유저 정보


In [11]:
gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS)
models = {}

# COMPUTE CV SCORE WITH 5 GROUP K FOLD
for i, (train_index, test_index) in enumerate(gkf.split(X=df, groups=df.index)):
    print('')
    print('Fold', i+1)
    print('')

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'max_depth': 4,
        'n_estimators': 1000,
        'early_stopping_rounds': 50,
        'tree_method': 'hist',
        'subsample': 0.8,
        'colsample_bytree': 0.4
    }

    # ITERATE THRU QUESTIONS 1 THRU 18
    for t in range(1, 19):

        # USE THIS TRAIN DATA WITH THESE QUESTIONS
        if t <= 4:
            grp = '0-4'
        elif t <= 12:
            grp = '5-12'
        else:
            grp = '13-22'

        # TRAIN DATA
        train_x = df.iloc[train_index]
        train_x = train_x.loc[train_x.level_group == grp]
        train_users = train_x.index.values
        train_y = targets.loc[targets.q == t].set_index(
            'session').loc[train_users]

        # VALID DATA
        valid_x = df.iloc[test_index]
        valid_x = valid_x.loc[valid_x.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = targets.loc[targets.q == t].set_index(
            'session').loc[valid_users]

        # TRAIN MODEL
        clf = XGBClassifier(**xgb_params)
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[(valid_x[FEATURES].astype(
                    'float32'), valid_y['correct'])],
                verbose=0)
        print(f'{t}({clf.best_ntree_limit}), ', end='')

        # SAVE MODEL, PREDICT VALID OOF
        models[f'{grp}_{t}'] = clf
        oof.loc[valid_users, t -
                1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:, 1]

    print()



Fold 1

1(118), 2(111), 3(99), 4(140), 5(65), 6(189), 7(96), 8(60), 9(113), 10(125), 11(125), 12(89), 13(194), 14(137), 15(116), 16(84), 17(86), 18(165), 

Fold 2

1(106), 2(106), 3(137), 4(112), 5(61), 6(108), 7(106), 8(46), 9(116), 10(139), 11(50), 12(111), 13(136), 14(118), 15(122), 16(64), 17(39), 18(105), 

Fold 3

1(53), 2(110), 3(82), 4(88), 5(99), 6(58), 7(101), 8(55), 9(133), 10(104), 11(69), 12(79), 13(158), 14(199), 15(112), 16(120), 17(127), 18(103), 

Fold 4

1(151), 2(175), 3(102), 4(114), 5(130), 6(145), 7(72), 8(68), 9(66), 10(111), 11(98), 12(113), 13(82), 14(110), 15(152), 16(59), 17(56), 18(189), 

Fold 5

1(138), 2(141), 3(85), 4(74), 5(78), 6(208), 7(93), 8(64), 9(246), 10(92), 11(69), 12(93), 13(116), 14(209), 15(200), 16(66), 17(71), 18(78), 


In [12]:
# True 레이블을 18개의 열로 구성된 데이터 프레임에 넣습니다
true = oof.copy()
for k in range(18):
    # GET TRUE LABELS
    tmp = targets.loc[targets.q == k+1].set_index('session').loc[ALL_USERS]
    true[k] = tmp.correct.values

In [13]:
# 확률을 1과 0으로 변환하기 위한 최적의 임계값을 찾습니다
scores = []
thresholds = []
best_score = 0
best_threshold = 0

for threshold in np.arange(0.4, 0.81, 0.01):
    preds = (oof.values.reshape((-1)) > threshold).astype('int')
    m = f1_score(true.values.reshape((-1)), preds, average='macro')
    scores.append(m)
    thresholds.append(threshold)
    if m > best_score:
        best_score = m
        best_threshold = threshold
print(f'Best 임계점: {best_threshold:.2f}')
print(f'Best F1_score: {best_score:.4f}')


Best 임계점: 0.62
Best F1_score: 0.6774


In [15]:
del train
del df
del oof
del targets

NameError: name 'train' is not defined

In [21]:
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi


In [22]:
api = KaggleApi()


In [23]:
api.authenticate()

In [25]:
api.competitions_submissions_list(id='jo_wilder')

ApiException: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'Date': 'Fri, 17 Feb 2023 07:30:23 GMT', 'Access-Control-Allow-Credentials': 'true', 'Set-Cookie': 'ka_sessionid=8d50a517192c3da34f5f85ddbf0be859; max-age=2626560; path=/, GCLB=CPz708Xw_7q-YQ; path=/; HttpOnly', 'Transfer-Encoding': 'chunked', 'Vary': 'Accept-Encoding', 'Turbolinks-Location': 'https://www.kaggle.com/api/v1/competitions/submissions/list/jo_wilder', 'X-Kaggle-MillisecondsElapsed': '50', 'X-Kaggle-RequestId': '551ba5d4629e0c3c4fad32842bd63a09', 'X-Kaggle-ApiVersion': '1.5.12', 'X-Frame-Options': 'SAMEORIGIN', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload', 'Content-Security-Policy': "object-src 'none'; script-src 'nonce-pGvQ6fwMeTnxR4R4tP2uig==' 'report-sample' 'unsafe-inline' 'unsafe-eval' 'strict-dynamic' https: http:; frame-src 'self' https://www.kaggleusercontent.com https://www.youtube.com/embed/ https://polygraph-cool.github.io https://www.google.com/recaptcha/ https://form.jotform.com https://submit.jotform.us https://submit.jotformpro.com https://submit.jotform.com https://www.docdroid.com https://www.docdroid.net https://kaggle-static.storage.googleapis.com https://kaggle-static-staging.storage.googleapis.com https://kkb-dev.jupyter-proxy.kaggle.net https://kkb-staging.jupyter-proxy.kaggle.net https://kkb-production.jupyter-proxy.kaggle.net https://kkb-dev.firebaseapp.com https://kkb-staging.firebaseapp.com https://kkb-production.firebaseapp.com https://kaggle-metastore-test.firebaseapp.com https://kaggle-metastore.firebaseapp.com https://apis.google.com https://content-sheets.googleapis.com/ https://accounts.google.com/ https://storage.googleapis.com https://docs.google.com https://drive.google.com https://calendar.google.com/; base-uri 'none'; report-uri https://csp.withgoogle.com/csp/kaggle/20201130;", 'X-Content-Type-Options': 'nosniff', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":403,"message":"Permission \u0027competitions.participate\u0027 was denied"}


In [26]:
import sys
sys.path.append('predict-student-performance-from-game-play')

In [20]:

env = jo_wilder.make_env()
# when did i call make_env mf!!!!!


ModuleNotFoundError: No module named 'jo_wilder'

In [20]:
iter_test = env.iter_test()

In [18]:
sample_submission = pd.read_csv("predict-student-performance-from-game-play/sample_submission.csv")
test = pd.read_csv("predict-student-performance-from-game-play/test.csv")


In [26]:
# limits = {'0-4':(1,4), '5-12':(5,14), '13-22':(14,19)}

# for (sample_submission, test) in iter_test:
    
#     test = df_scale(test)
    
    
#     items = test['page'].values
#     labels = items.reshape(-1,1)

    
#     oh_labels = oh_encoder.transform(labels)

#     train_cat = oh_labels.toarray()

#     tmp_df = pd.DataFrame(train_cat, columns=['page_' + str(col) for col in range(-1,7)])
#     test = pd.concat([test.drop(columns=['page'],axis=1),tmp_df],axis=1)
    
    
#     # FEATURE ENGINEER TEST DATA
#     df = feature_engineer(test) 
    
#     grp = test.level_group.values[0]
#     a,b = limits[grp]
#     for t in range(a,b):
#         clf = models[f'{grp}_{t}']
#         p = clf.predict_proba(df[FEATURES].astype('float32'))[:,1]
#         mask = sample_submission.session_id.str.contains(f'q{t}')
#         sample_submission.loc[mask,'correct'] = int(p.item()>best_threshold)
    
#     env.predict(sample_submission)

In [19]:
limits = {'0-4': (1, 4), '5-12': (5, 14), '13-22': (14, 19)}


test = df_scale(test)

items = test['page'].values
labels = items.reshape(-1, 1)

oh_labels = oh_encoder.transform(labels)

train_cat = oh_labels.toarray()

tmp_df = pd.DataFrame(train_cat, columns=['page_' + str(col) for col in range(-1, 7)])
test = pd.concat([test.drop(columns=['page'], axis=1), tmp_df], axis=1)

    # FEATURE ENGINEER TEST DATA
df = feature_engineer(test)

grp = test.level_group.values[0]
a, b = limits[grp]
for t in range(a, b):
    clf = models[f'{grp}_{t}']
    p = clf.predict_proba(df[FEATURES].astype('float32'))[:, 1]
    mask = sample_submission.session_id.str.contains(f'q{t}')
    sample_submission.loc[mask, 'correct'] = int(p.item() > best_threshold)

# clf.predict(sample_submission)


ValueError: can only convert an array of size 1 to a Python scalar

In [29]:
counter = 0
# The API will deliver two dataframes in this specific order,
# for every session+level grouping (one group per session for each checkpoint)
for (sample_submission, test) in iter_test:
    if counter==0:
        display(sample_submission.head())
        display(test.head())
        print(test.shape)
        
    ## users make predictions here using the test data
    for index,row in sample_submission.iterrows():
        q = int( row['session_id'].split('_')[-1][1:] )
        p = int( question_means[q]>best_threshold )
        sample_submission.loc[index,'correct'] = p
    
    ## env.predict appends the session+level sample_submission to the overall
    ## submission
    env.predict(sample_submission)
    counter += 1

In [33]:
## the end result is a submission file containing all test session predictions
! head submission.csv
df = pd.read_csv('submission.csv')
print('Sample submission shape:', df.shape )
print('Sample submission average prediction:', df.correct.mean() )
df.head()

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)


EmptyDataError: No columns to parse from file

In [None]:
print(df.correct.mean())
