In [1]:
import pandas as pd, numpy as np
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
import pickle
import sys
import os
import gc

# Load Train Data and Labels

In [2]:
dtypes = {"session_id": 'int64',
          "index": np.int16,
          "elapsed_time": np.int32,
          "event_name": 'category',
          "name": 'category',
          "level": np.int8,
          "page": np.float16,
          "room_coor_x": np.float16,
          "room_coor_y": np.float16,
          "screen_coor_x": np.float16,
          "screen_coor_y": np.float16,
          "hover_duration": np.float32,
          "text": 'category',
          "fqid": 'category',
          "room_fqid": 'category',
          "text_fqid": 'category',
          "fullscreen": np.int8,
          "hq": np.int8,
          "music": np.int8,
          "level_group": 'category'
          }
use_col = ['session_id', 'index', 'elapsed_time', 'event_name', 'name', 'level', 'page',
           'room_coor_x', 'room_coor_y', 'hover_duration', 'text', 'fqid', 'room_fqid', 'text_fqid', 'level_group']

In [3]:
targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')
targets['session'] = targets.session_id.apply(lambda x: int(x.split('_')[0]) )
targets['q'] = targets.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )
print( targets.shape )
targets.head()

(424116, 4)


Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


In [4]:
feature_df = pd.read_csv('/kaggle/input/featur/feature_sort.csv')

# Feature Engineer

In [5]:
def delt_time_def(df):
    df.sort_values(by=['session_id', 'elapsed_time'], inplace=True)
    df['d_time'] = df['elapsed_time'].diff(1)
    df['d_time'].fillna(0, inplace=True)
    df['delt_time'] = df['d_time'].clip(0, 103000)
    df['delt_time_next'] = df['delt_time'].shift(-1)
    return df

In [6]:
def feature_engineer(train, kol_f):
    global kol_col, kol_col_max
    kol_col = 9
    kol_col_max = 11+kol_f*2
    col = [i for i in range(0,kol_col_max)]
    new_train = pd.DataFrame(index=train['session_id'].unique(), columns=col, dtype=np.float16)  
    new_train[10] = new_train.index # "session_id"    

    new_train[0] = train.groupby(['session_id'])['d_time'].quantile(q=0.3)
    new_train[1] = train.groupby(['session_id'])['d_time'].quantile(q=0.8)
    new_train[2] = train.groupby(['session_id'])['d_time'].quantile(q=0.5)
    new_train[3] = train.groupby(['session_id'])['d_time'].quantile(q=0.65)
    new_train[4] = train.groupby(['session_id'])['hover_duration'].agg('mean')
    new_train[5] = train.groupby(['session_id'])['hover_duration'].agg('std')    
    new_train[6] = new_train[10].apply(lambda x: int(str(x)[:2])).astype(np.uint8) # "year"
    new_train[7] = new_train[10].apply(lambda x: int(str(x)[2:4])+1).astype(np.uint8) # "month"
    new_train[8] = new_train[10].apply(lambda x: int(str(x)[4:6])).astype(np.uint8) # "day"
    new_train[9] = new_train[10].apply(lambda x: int(str(x)[6:8])).astype(np.uint8) + new_train[10].apply(lambda x: int(str(x)[8:10])).astype(np.uint8)/60
    new_train[10] = 0
    new_train = new_train.fillna(-1)
    
    return new_train

In [7]:
def feature_next_t(row_f, new_train, train, gran_1, gran_2, i):
    global kol_col
    kol_col +=1
    col1 = row_f['col1']
    val1 = row_f['val1']
    maska = (train[col1] == val1)
    if row_f['kol_col'] == 1:       
        new_train[kol_col] = train[maska].groupby(['session_id'])['delt_time_next'].sum()
        if gran_1:
            kol_col +=1
            new_train[kol_col] = train[maska].groupby(['session_id'])['delt_time'].mean()
        if gran_2:
            kol_col +=1
            new_train[kol_col] = train[maska].groupby(['session_id'])['index'].count()          
    elif row_f['kol_col'] == 2: 
        col2 = row_f['col2']
        val2 = row_f['val2']
        maska = maska & (train[col2] == val2)        
        new_train[kol_col] = train[maska].groupby(['session_id'])['delt_time_next'].sum()
        if gran_1:
            kol_col +=1
            new_train[kol_col] = train[maska].groupby(['session_id'])['delt_time'].mean()
        if gran_2:
            kol_col +=1
            new_train[kol_col] = train[maska].groupby(['session_id'])['index'].count()
    return new_train

In [8]:
def feature_next_t_otvet(row_f, new_train, train, gran_1, gran_2, i):
    global kol_col
    kol_col +=1
    col1 = row_f['col1']
    val1 = row_f['val1']
    maska = (train[col1] == val1)
    if row_f['kol_col'] == 1:      
        new_train[kol_col] = train[maska]['delt_time_next'].sum()
        if gran_1:
            kol_col +=1
            new_train[kol_col] = train[maska]['delt_time'].mean()
        if gran_2:
            kol_col +=1
            new_train[kol_col] = train[maska]['index'].count()          
    elif row_f['kol_col'] == 2: 
        col2 = row_f['col2']
        val2 = row_f['val2']
        maska = maska & (train[col2] == val2)        
        new_train[kol_col] = train[maska]['delt_time_next'].sum()
        if gran_1:
            kol_col +=1
            new_train[kol_col] = train[maska]['delt_time'].mean()
        if gran_2:
            kol_col +=1
            new_train[kol_col] = train[maska]['index'].count()
    return new_train

In [9]:
def experiment_feature_next_t_otvet(row_f, new_train, train, gran_1, gran_2, i):
    global kol_col
    kol_col +=1
    if row_f['kol_col'] == 1: 
        maska = train[row_f['col1']] == row_f['val1']
        new_train[kol_col] = train[maska]['delt_time_next'].sum()
        if gran_1:
            kol_col +=1
            new_train[kol_col] = train[maska]['delt_time'].mean()
        if gran_2:
            kol_col +=1
            new_train[kol_col] = train[maska]['index'].count()          
    elif row_f['kol_col'] == 2: 
        col2 = row_f['col2']
        val2 = row_f['val2']
        maska = (train[col1] == val1) & (train[col2] == val2)        
        new_train[kol_col] = train[maska]['delt_time_next'].sum()
        if gran_1:
            kol_col +=1
            new_train[kol_col] = train[maska]['delt_time'].mean()
        if gran_2:
            kol_col +=1
            new_train[kol_col] = train[maska]['index'].count()
    return new_train

In [10]:
def feature_quest_otvet(new_train, train, quest, kol_f):
    global kol_col
    kol_col = 9
    g1 = 0.7 
    g2 = 0.3 

    feature_q = feature_df[feature_df['quest'] == quest].copy()
    feature_q.reset_index(drop=True, inplace=True)
    
    gran1 = round(kol_f * g1)
    gran2 = round(kol_f * g2)    
    for i in range(0, kol_f):         
        row_f = feature_q.loc[i]
        new_train = feature_next_t_otvet(row_f, new_train, train, i < gran1, i <  gran2, i) 
    col = [i for i in range(0,kol_col+1)]
    return new_train[col]

In [11]:
def feature_engineer_new(new_train, train, feature_q, kol_f):
    g1 = 0.7 
    g2 = 0.3 
    gran1 = round(kol_f * g1)
    gran2 = round(kol_f * g2)    
    for i in range(0, kol_f): 
        row_f = feature_q.loc[i]       
        new_train = feature_next_t(row_f, new_train, train, i < gran1, i <  gran2, i)         
    return new_train

In [12]:
def feature_quest(new_train, train, quest, kol_f):
    global kol_col
    kol_col = 9
    feature_q = feature_df[feature_df['quest'] == quest].copy()
    feature_q.reset_index(drop=True, inplace=True)
    new_train = feature_engineer_new(new_train, train, feature_q, kol_f)
    col = [i for i in range(0,kol_col+1)]
    return new_train[col]

In [13]:
def create_model(old_train, quests, models, list_kol_f):
    
    kol_quest = len(quests)
    # ITERATE THRU QUESTIONS
    for q in quests:
        print('### quest ', q, end='')
        new_train = feature_engineer(old_train, list_kol_f[q])
        train_x = feature_quest(new_train, old_train, q, list_kol_f[q])
        print (' ---- ', 'train_q.shape = ', train_x.shape)
           
        # TRAIN DATA
        train_users = train_x.index.values
        train_y = targets.loc[targets.q==q].set_index('session').loc[train_users]

        # TRAIN MODEL 

        model = CatBoostClassifier(
            n_estimators = 300,
            learning_rate= 0.045,
            depth = 6
        )
        
        model.fit(train_x.astype('float32'), train_y['correct'], verbose=False)

        # SAVE MODEL, PREDICT VALID OOF
        models[f'{q}'] = model
    print('***')
    
    return models

In [14]:
models = {}
best_threshold = 0.63

In [15]:
list_kol_f = {
    1:140,3:110,
    4:120, 5:220, 6:130, 7:110, 8:110, 9:100, 10:140, 11:120,
    14: 160, 15:160, 16:130, 17:140             
             }

In [16]:
os.chdir(r'/kaggle/working/')

In [17]:
df0_4 = pd.read_csv('/kaggle/input/featur/train_0_4t.csv', dtype=dtypes) 
kol_lvl = (df0_4 .groupby(['session_id'])['level'].agg('nunique') < 5)
list_session = kol_lvl[kol_lvl].index
df0_4  = df0_4 [~df0_4 ['session_id'].isin(list_session)]
df0_4 = delt_time_def(df0_4)

quests_0_4 = [1, 3] 
# list_kol_f = {1:140,3:110}

models = create_model(df0_4, quests_0_4, models, list_kol_f)
del df0_4

### quest  1 ----  train_q.shape =  (23562, 290)





### quest  3 ----  train_q.shape =  (23562, 230)
***


In [18]:
df5_12 = pd.read_csv('/kaggle/input/featur/train_5_12t.csv', dtype=dtypes)
kol_lvl = (df5_12.groupby(['session_id'])['level'].agg('nunique') < 8)
list_session = kol_lvl[kol_lvl].index
df5_12 = df5_12[~df5_12['session_id'].isin(list_session)]
df5_12 = delt_time_def(df5_12)
quests_5_12 = [4, 5, 6, 7, 8, 9, 10, 11] 

# list_kol_f = {4:110, 5:220, 6:120, 7:110, 8:110, 9:100, 10:140, 11:120}

models = create_model(df5_12, quests_5_12, models, list_kol_f)
del df5_12

### quest  4 ----  train_q.shape =  (23561, 250)
### quest  5 ----  train_q.shape =  (23561, 450)
### quest  6 ----  train_q.shape =  (23561, 270)
### quest  7 ----  train_q.shape =  (23561, 230)
### quest  8 ----  train_q.shape =  (23561, 230)
### quest  9 ----  train_q.shape =  (23561, 210)
### quest  10 ----  train_q.shape =  (23561, 290)
### quest  11 ----  train_q.shape =  (23561, 250)
***


In [19]:
df13_22 = pd.read_csv('/kaggle/input/featur/train_13_22t.csv', dtype=dtypes) 
kol_lvl = (df13_22 .groupby(['session_id'])['level'].agg('nunique') < 10)
list_session = kol_lvl[kol_lvl].index
df13_22  = df13_22 [~df13_22 ['session_id'].isin(list_session)]
df13_22 = delt_time_def(df13_22)

quests_13_22 = [14, 15, 16, 17] 
# list_kol_f = {14: 160, 15:160, 16:105, 17:140}

models = create_model(df13_22, quests_13_22, models, list_kol_f)


### quest  14 ----  train_q.shape =  (22986, 330)
### quest  15 ----  train_q.shape =  (22986, 330)
### quest  16 ----  train_q.shape =  (22986, 270)
### quest  17 ----  train_q.shape =  (22986, 290)
***


In [20]:
# # Saving a Model
# import os
# dr = '/kaggle/working/models'

# if not os.path.exists(dr):
#     os.mkdir(r'/kaggle/working/models')
    
# os.chdir(r'/kaggle/working/models')
# # for q in quests_0_4 + quests_5_12 + quests_13_22:
# #     print(q)
# #     fadf
# #     models[q].save_model(f'cat_model_{q}.bin')

# qnx = ['1','3','4','5','6','7','8','9','10','11','14','15','16','17']

# for q in qnx:
#     models[q].save_model(f'cat_model_{q}.bin')
    

In [21]:
#Model Reading
# dir = '/kaggle/input/catbust/'
# for q in quests_0_4 + quests_5_12 + quests_13_22:
#     models[q] = CatBoostClassifier().load_model(dir+f'cat_model_{q}.bin')

**Infer Test Data**

In [22]:
# import jo_wilder

# try:
#     jo_wilder.make_env.__called__ = False
#     env.__called__ = False
#     type(env)._state = type(type(env)._state).__dict__['INIT']
# except:
#     pass

# env = jo_wilder.make_env()
# iter_test = env.iter_test()    

In [23]:
import time

In [24]:
def convert_arr(dataset_df):
     
    max_rows = 200 #dataset_df['session_id'].value_counts().max()

    # Create a list to store reshaped arrays
#     df_arrays = []  
    session_rows = dataset_df.head(max_rows)
    session_rows.drop('session_id',inplace=True, axis=1)
    
#     session_rows.drop('index',inplace=True, axis=1)                

    sc = StandardScaler()   
    session_rows  = sc.fit_transform(session_rows) #[NUMERICAL]) 
    
    num_rows = session_rows.shape[0]
    pad_rows = max_rows - num_rows
    padded_arr = np.pad(session_rows, ((0, pad_rows), (0, 0)), mode='constant')
#     padded_arrays.append(padded_arr)


 
    # Append the reshaped array to the list
#     df_arrays.append(session_rows) 

    # Display the reshaped array
#     print(len(df_arrays))
    return np.array(padded_arr)

In [25]:
# import numpy as np

# def pad_arrays(arrays):
#     max_rows = 200 #max(arr.shape[0] for arr in arrays)
#     padded_arrays = []

#     for arr in arrays:
#         num_rows = arr.shape[0]
#         pad_rows = max_rows - num_rows
#         padded_arr = np.pad(arr, ((0, pad_rows), (0, 0)), mode='constant')
#         padded_arrays.append(padded_arr)

#     return np.array(padded_arrays)

In [26]:
def feature_engineer1(test_df):
    cols = ['session_id','elapsed_time','level','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration',#'level_group',
            'd_time','delt_time','delt_time_next']
    test_df = test_df[cols]
    
    
#     test_df.drop(['fullscreen','hq','music','text'],inplace=True, axis=1)
#     test_object_columns = test_df.select_dtypes(include='object').columns
#     test_object_columns = test_object_columns.drop('level_group')
     
#     test_df[test_object_columns] = test_df[test_object_columns].astype('category') 
    
#     print("in fe ",test_df.columns)
#     test_df = fillnull(test_df)
#     print("after fillnull ",test_df.columns)
#     test_df = pd.read_csv(f,usecols=cols)
#     test_df = test_df[cols]
#     test_df.fillna(0,inplace=True)
    
    
#     test_df.drop('level_group',inplace=True, axis=1)
    
    test_df = test_df.reset_index(drop=True)
#     print("in fe after ind 1",test_df.columns)
    test_df = test_df.set_index('session_id',drop=False)
#     print("in fe after ind 2",test_df.columns)
#     df = 
    for column in cols:
        if test_df[column].isna().sum()>0:
            test_df[column].fillna(0,inplace=True,axis=0) 
    
    df = convert_arr(test_df)
    df = np.expand_dims(df, axis=0)
#     test_data1 = pad_arrays(df)
    
            
       
    
    
    
    return df

In [27]:
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import f1_score
from keras.callbacks import Callback,ModelCheckpoint
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend as K

# def f1_score_metric(y_true, y_pred):
#     y_pred_binary = tf.round(y_pred)  # Convert probabilities to binary predictions
#     return f1_score(y_true, y_pred_binary)

def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [28]:
import os
os.chdir(r'/kaggle/working/')

import jo_wilder_310
jo_wilder_310.make_env.func_dict['__called__'] = False
env = jo_wilder_310.make_env()

iter_test = env.iter_test()
# g_end4 = 0
# g_end5 = 0

# print("dfasdfa")

quests_0_4 = [1,2,3]
quests_5_12 = [4, 5, 6, 7, 8, 9, 10, 11] 
quests_13_22 = [14, 15, 16, 17] 

list_q = {'0-4':quests_0_4, '5-12':quests_5_12, '13-22':quests_13_22}
for (test, sam_sub) in iter_test:
#     print(test.shape)
    sam_sub['question'] = [int(label.split('_')[1][1:]) for label in sam_sub['session_id']]    
    grp = test.level_group.values[0]   
    sam_sub['correct'] = 1
    sam_sub.loc[sam_sub.question.isin([5, 8, 10, 13, 15]), 'correct'] = 0  
    old_train = delt_time_def(test[test.level_group == grp])
    
    tpuqns = [1,2,3] #,14,15,16,17,18]
    
    if grp=='0-4':
#         print("grp here ",grp)
        test = delt_time_def(test)
        test_data = feature_engineer1(test)  
    
#     print("dafasdfas")
       
    for q in list_q[grp]:
#         print('q here ',q)
        
#         start4 = time.time() 
        
#         end4 = time.time() - start4
#         g_end4 += end4
        
#         start5 = time.time()     
        
        if q in tpuqns:

            os.chdir(r'/kaggle/input/std-pred-transformer-6a/models') 
            filename = 'tpu'+'_'+f'{grp}_{q}'
#             print("file ",filename)

            tr_model = pickle.load(open(filename, 'rb')) 

            pred= tr_model(test_data).numpy().ravel()


            predictions = [1 if x > 0.5 else 0 for x in pred]
#             print("prediccted for q ",q,"using tpu model")
        else:
            
            new_train = feature_engineer(old_train, list_kol_f[q])
            new_train = feature_quest_otvet(new_train, old_train, q, list_kol_f[q])
        
        
            clf = models[f'{q}']
            p = clf.predict_proba(new_train.astype('float32'))[:,1]   
            predictions = [1 if x > best_threshold else 0 for x in p]
#             print("predicted for q ",q,"using cb model")

#             end5 = time.time() - start5
#             g_end5 += end5


        mask = sam_sub.question == q 
#         print("q before update",q)
#             x = int(p[0]>best_threshold)
        sam_sub.loc[mask,'correct'] = predictions  
        try:
            del predictions
        except NameError:
            pass
    try:
        del test_data
    except NameError:
        pass

        
        
    sam_sub = sam_sub[['session_id', 'correct']]  
    os.chdir(r'/kaggle/working/')
    env.predict(sam_sub)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  session_rows.drop('session_id',inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  session_rows.drop('session_id',inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  session_rows.drop('session_id',inplace=True, axis=1)


# EDA submission.csv

In [29]:
df = pd.read_csv('submission.csv')
print( df.shape )
df.iloc[-54:]

(54, 2)


Unnamed: 0,session_id,correct
0,20090109393214576_q1,1
1,20090109393214576_q2,1
2,20090109393214576_q3,1
3,20090109393214576_q4,1
4,20090109393214576_q5,0
5,20090109393214576_q6,1
6,20090109393214576_q7,1
7,20090109393214576_q8,0
8,20090109393214576_q9,1
9,20090109393214576_q10,0
