In [1]:
%load_ext Cython

In [2]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
import lightgbm as lgb
import pickle
import datetime
import collections
from sklearn.preprocessing import LabelEncoder
import random
import os

In [3]:
def seed_everything(seed: int):
    """Seed every RNG this notebook touches with the same value.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both Python's
    ``random`` module and NumPy's global random state.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(707)

In [4]:
# Input file locations.
train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
question_file = '../input/riiid-test-answer-prediction/questions.csv'

# Run-mode switches.
debug = False
build = False

# Columns kept when the train/valid pickles are read.
feld_needed = [
    'row_id',
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'answered_correctly',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_answer',
]

## feature engineering

In [5]:
# Load the question metadata and derive compact per-question features.
questions_df = pd.read_csv(question_file)
# Shift `part` down by one; later cells index parts with range(7).
questions_df['part'] = (questions_df['part'] - 1).astype('uint8')
questions_df['correct_answer'] = questions_df['correct_answer'].astype('uint8')
# Transform tags into lists of ints. A missing value becomes the string 'nan'
# after str(), which is filtered out, so such questions get an empty list.
questions_df['tags'] = questions_df['tags'].apply(lambda ts: [int(x) for x in str(ts).split() if x != 'nan'])

# Collect every tag occurrence (for ranking) and the set of questions per tag.
# zip over the two columns avoids the per-row Series construction of iterrows().
tag_columns = []
tag_to_questions = {}
tag_occurrences = []
for question_id, tags in zip(questions_df['question_id'], questions_df['tags']):
    for t in tags:
        tag_occurrences.append(t)
        tag_to_questions.setdefault(t, set()).add(question_id)
tags_df = pd.DataFrame([{'tag': t, 'questions': qs} for t, qs in tag_to_questions.items()])

# Keep only the single most frequent tag as a one-hot indicator column.
tag_rank, counts = zip(*collections.Counter(tag_occurrences).most_common(1))
print(tag_rank)
for t in tag_rank:
    tag_col = 'tags_' + str(t)
    tag_columns.append(tag_col)
    # Vectorized membership test instead of a Python loop over iloc/.at,
    # which is very slow and relied on the default RangeIndex.
    questions_df[tag_col] = questions_df['tags'].apply(lambda ts, t=t: t in ts).astype('uint8')

del questions_df['bundle_id']
print(tag_columns)

# Encode the full tag list of each question (as its string form) into one integer id.
le = LabelEncoder()
encoded = le.fit_transform(questions_df['tags'].astype(str))
decoded = le.inverse_transform(encoded)
questions_df['enc_tags'] = encoded.astype('uint16')
del questions_df['tags'], le

questions_df.head(3)

(92,)
['tags_92']


Unnamed: 0,question_id,correct_answer,part,tags_92,enc_tags
0,0,0,0,0,981
1,1,1,0,0,306
2,2,0,0,1,250


In [6]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd
import gc

DTYPE = np.int32
ctypedef np.int32_t np_int_t


# add_user_feats_without_update
#
# Attach per-user and per-content history features to `df` by reading the
# running-state dictionaries. No dictionary entry is assigned in this function
# (NOTE(review): if the dictionaries are defaultdicts, plain lookups can still
# insert default entries -- confirm against the caller).
#
# `df` must provide the columns user_id, part, content_id, timestamp and
# correct_answer (read in the row loop) and is returned with the new feature
# columns added in place.
# `answer_per_dict` is accepted but never referenced in the body.
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def add_user_feats_without_update(df,
                                  answer_per_dict,
                                  answered_correctly_sum_u_dict,
                                  answered_correctly_cumsum_u_dict,
                                  answered_incorrectly_cumsum_u_dict,
                                  count_u_dict,
                                  parts_u_dict,
                                  parts_count_u_dict,
                                  answered_diff_sum_u_dict,
                                  avg_c_sum_u_dict,
                                  parts_avg_c_dict,
                                  user_answer_per_sum_dict,
                                  parts_user_answer_per_sum_dict,
                                  content_correct_user_mean_dict,
                                  content_correct_count_dict,
                                  parts_content_correct_user_mean_dict,
                                  last_correct_timestamp_dict,
                                  last_incorrect_timestamp_dict,
                                  like_answer_dict,
                                  like_answer_three_dict,
                                  dislike_answer_dict,
                                  dislike_answer_three_dict,
                                  parts_answered_correctly_cumsum_u_dict,
                                  parts_answered_incorrectly_cumsum_u_dict):
    # One output buffer per feature, one slot per row of df.
    cdef int arr_size = len(df)
    cdef np.ndarray[np_int_t, ndim=1] acsu = np.zeros(arr_size, dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] accu = np.zeros(arr_size, dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] aicu = np.zeros(arr_size, dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] paccu = np.zeros(arr_size, dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] paicu = np.zeros(arr_size, dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] cu = np.zeros(arr_size, dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=2] ptu = np.zeros([arr_size,7], dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=2] ptcu = np.zeros([arr_size,7], dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] adsu = np.zeros(arr_size, dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] avcu = np.zeros(arr_size, dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] pavc = np.zeros(arr_size, dtype=DTYPE)
    cdef np.ndarray[float, ndim=1] uaps = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[float, ndim=1] puaps = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[float, ndim=1] cucm = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[float, ndim=1] pcucm = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[long, ndim=1] lct = np.zeros(arr_size, dtype=long)
    cdef np.ndarray[long, ndim=1] lit = np.zeros(arr_size, dtype=long)
    cdef np.ndarray[float, ndim=1] like = np.zeros(arr_size, dtype=np.float32)
    cdef np.ndarray[float, ndim=1] dislike = np.zeros(arr_size, dtype=np.float32)
    cdef int cnt, i
    cdef np.ndarray[long, ndim=1] row
    
    # row layout: 0 user_id, 1 part, 2 content_id, 3 timestamp, 4 correct_answer
    for cnt,row in enumerate(df[['user_id','part','content_id','timestamp','correct_answer']].values):
        # Per-user totals and running counters.
        acsu[cnt] = answered_correctly_sum_u_dict[row[0]]
        accu[cnt] = answered_correctly_cumsum_u_dict[row[0]]
        aicu[cnt] = answered_incorrectly_cumsum_u_dict[row[0]]
        paccu[cnt] = parts_answered_correctly_cumsum_u_dict[row[1]][row[0]]
        paicu[cnt] = parts_answered_incorrectly_cumsum_u_dict[row[1]][row[0]]
        cu[cnt] = count_u_dict[row[0]]
        adsu[cnt] = answered_diff_sum_u_dict[row[0]]
        avcu[cnt] = avg_c_sum_u_dict[row[0]] 
        uaps[cnt] = user_answer_per_sum_dict[row[0]]
        # Time since the user's last correct / incorrect answer.
        lct[cnt] = row[3] - last_correct_timestamp_dict[row[0]]
        lit[cnt] = row[3] - last_incorrect_timestamp_dict[row[0]]
        # like/dislike: counters keyed by this question's correct_answer, divided by
        # (count - correct sum). Part 1 reads the *_three_dict variants.
        # NaN when timestamp is 0 or when the correct sum equals the count
        # (the divisor would be zero).
        if row[1] == 1:
            if (row[3] == 0) or (dict_sub(answered_correctly_sum_u_dict[row[0]] ,count_u_dict[row[0]])==0):
                like[cnt] = np.nan
                dislike[cnt] = np.nan
            else:
                like[cnt] = like_answer_three_dict[row[4]][row[0]] / (dict_sub(cu[cnt] ,acsu[cnt]))
                dislike[cnt] = dislike_answer_three_dict[row[4]][row[0]] / dict_sub(cu[cnt] ,acsu[cnt])
        else:
            if (row[3] == 0) or (dict_sub(answered_correctly_sum_u_dict[row[0]] , count_u_dict[row[0]])==0):
                like[cnt] = np.nan
                dislike[cnt] = np.nan
            else:
                like[cnt] = like_answer_dict[row[4]][row[0]] / (dict_sub(cu[cnt] , acsu[cnt]))
                dislike[cnt] = dislike_answer_dict[row[4]][row[0]] / dict_sub(cu[cnt] , acsu[cnt])
                
        # Content-level averages; left at 0 when the content has no counted entries
        # (zeros are replaced by 0.5 further down).
        if content_correct_count_dict[row[2]] > 0:
            cucm[cnt] = content_correct_user_mean_dict[row[2]] / content_correct_count_dict[row[2]]
            pcucm[cnt] = parts_content_correct_user_mean_dict[row[2]] / content_correct_count_dict[row[2]]

        # Per-part sums and counts for all 7 parts; for the row's own part also the
        # accumulated avg_c divided by (count + 1).
        for i in range(7):
            ptu[cnt,i] = parts_u_dict[i][row[0]]
            ptcu[cnt,i] = parts_count_u_dict[i][row[0]]
            if i == row[1]:
                pavc[cnt] = parts_avg_c_dict[i][row[0]] / dict_sum(parts_count_u_dict[i][row[0]] , 1)

                
        # Mean answer-share within the row's part; stays 0 when the part count is 0.
        if ptcu[cnt,row[1]] != 0:
            puaps[cnt] = parts_user_answer_per_sum_dict[row[1]][row[0]] / ptcu[cnt,row[1]]
            
    # Write the buffers to df and downcast each column.
    df['answered_correctly_sum_u'] = acsu
    df['answered_correctly_sum_u'] = df['answered_correctly_sum_u'].astype('uint16')
    # Correct counter minus incorrect counter.
    # NOTE(review): int8 holds only -128..127; confirm the counters stay in range.
    df['answered_cumsum_u'] = accu - aicu
    df['answered_cumsum_u'] = df['answered_cumsum_u'].astype('int8')
    df['part_answered_cumsum_u'] = paccu - paicu
    df['part_answered_cumsum_u'] = df['part_answered_cumsum_u'].astype('int8') 
    df['count_u'] = cu
    df['count_u'] = df['count_u'].astype('uint16')
    # Ratios below divide by count_u; inf values are replaced by NaN near the end.
    df['answered_correctly_avg_u'] = df['answered_correctly_sum_u'] / df['count_u']
    df['answered_correctly_avg_u'] = df['answered_correctly_avg_u'].astype('float16')
    df['answered_diff_mean'] = adsu  / cu
    df['answered_diff_mean'] = df['answered_diff_mean'].astype('float16')
    df['avg_c_mean'] = avcu / cu
    df['avg_c_mean'] = df['avg_c_mean'].astype('float16')
    df['part_avg_c_mean'] = pavc
    df['part_avg_c_mean'] = df['part_avg_c_mean'].astype('uint8')
    df['avg_c_per_u'] = df['avg_c_mean'] / (df['answered_correctly_avg_u'] * 100)
    df['avg_c_per_u'] = df['avg_c_per_u'].astype('float16')
    
    df['user_answer_per_mean'] = uaps
    df['user_answer_per_mean'] = df['user_answer_per_mean']  / df['count_u']
    df['user_answer_per_mean'] = df['user_answer_per_mean'].astype('float16')
    
    df['part_user_answer_per_mean']= puaps
    df['part_user_answer_per_mean'] = df['part_user_answer_per_mean'].astype('float16')
    
    df['content_lv'] = cucm
    df['content_lv'] = df['content_lv'].astype('float16')
    df['part_content_lv'] = pcucm
    df['part_content_lv'] = df['part_content_lv'].astype('float16')
    # A value of exactly 0 (including the "no data" default) is replaced by 0.5.
    df.loc[df['content_lv']==0,'content_lv']=0.5
    df.loc[df['part_content_lv']==0,'part_content_lv']=0.5
    
    # NOTE(review): the unsigned cast assumes the time lags are non-negative -- confirm.
    df['last_correct_timelag'] = lct
    df['last_correct_timelag'] = df['last_correct_timelag'].astype('uint32')
    df['last_incorrect_timelag'] = lit
    df['last_incorrect_timelag'] = df['last_incorrect_timelag'].astype('uint32')

    df['is_like_answer'] = like
    df['is_like_answer'] = df['is_like_answer'].astype('float16')
    df['is_dislike_answer'] = dislike
    df['is_dislike_answer'] = df['is_dislike_answer'].astype('float16')
    
    # Placeholders, filled per part / per part group below.
    df['part_count_per'] = 0
    df['lr_count_per'] = 0
    
    cdef str pnum
    for i in range(7):
        pnum = str(i)
        df['p' + pnum + '_count_u'] = ptcu[:,i]
        # The next line assigns the column to itself (no effect).
        df['p' + pnum + '_count_u'] = df['p' + pnum + '_count_u']
        df['p' + pnum + '_count_u'] = df['p' + pnum + '_count_u'].astype('uint32')
        # Per-part mean weighted by the part's share of the user's answers.
        df['p' + pnum + '_mean_u'] = ptu[:,i] / ptcu[:,i]
        df['p' + pnum + '_mean_u']  = df['p' + pnum + '_mean_u'] * (df['p' + pnum + '_count_u'] / df['count_u'])
        df['p' + pnum + '_mean_u'] = df['p' + pnum + '_mean_u'].astype('float16')
        # Share of the user's answers that fall in the row's own part.
        df.loc[df['part']==i,'part_count_per'] = df['p' + pnum + '_count_u'] / df['count_u']
    # Division by zero above yields +/-inf; normalise to NaN across the whole frame.
    df.replace([np.inf, -np.inf], np.nan,inplace=True)
    df['part_count_per'] = df['part_count_per'].astype('float16')
        
    # Share of answers in parts 0-3 (for rows with part < 4) or parts 4-6 (part > 3).
    df.loc[df['part']<4,'lr_count_per'] = ((df['p0_count_u'] + df['p1_count_u'] + df['p2_count_u'] + df['p3_count_u']) / df['count_u'])
    df.loc[df['part']>3,'lr_count_per'] = ((df['p4_count_u'] + df['p5_count_u'] + df['p6_count_u']) / df['count_u']).astype('float16')
    df['lr_count_per'] = df['lr_count_per'].astype('float16')
    
    # NOTE(review): inplace replace on a column selection; under pandas copy-on-write
    # this may not write back to df -- confirm it is still needed after the
    # frame-wide replace above.
    df['part_count_per'].replace([np.inf, -np.inf], np.nan,inplace=True)
    return df

# update_user_feats
#
# Fold the answered rows of `df` into the running-state dictionaries (mutated in
# place; nothing is returned). Only rows with content_type_id == 0 update state.
# `df` must provide the columns listed in the row loop below.
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def update_user_feats(df, 
                      dict answer_per_dict,
                      answered_correctly_sum_u_dict,
                      answered_correctly_cumsum_u_dict,
                      answered_incorrectly_cumsum_u_dict,
                      count_u_dict,
                      parts_u_dict,
                      parts_count_u_dict,
                      answered_diff_sum_u_dict,
                      avg_c_sum_u_dict,
                      parts_avg_c_dict,
                      user_answer_per_sum_dict,
                      parts_user_answer_per_sum_dict,
                      content_correct_user_mean_dict,
                      content_correct_count_dict,
                      parts_content_correct_user_mean_dict,
                      last_correct_timestamp_dict,
                      last_incorrect_timestamp_dict,
                      like_answer_dict,
                      like_answer_three_dict,
                      dislike_answer_dict,
                      dislike_answer_three_dict,
                      parts_answered_correctly_cumsum_u_dict,
                      parts_answered_incorrectly_cumsum_u_dict):
    # df may contain lecture rows; they are skipped by the content_type_id check.
    cdef np.ndarray[long, ndim=1] row
    #                   0             1                    2           3               4                     5            6           7              8            
    for row in df[['user_id','answered_correctly','content_type_id','part','answered_correctly_avg_c','content_id','user_answer','timestamp','correct_answer']].values:
        if row[2] == 0:
            if row[1] == 1:
                # Correct answer: bump the correct total and the correct counters,
                # reset the incorrect counters, remember the timestamp.
                answered_correctly_sum_u_dict[row[0]] = dict_sum(answered_correctly_sum_u_dict[row[0]],1)
                answered_correctly_cumsum_u_dict[row[0]] = dict_sum(answered_correctly_cumsum_u_dict[row[0]],1)
                answered_incorrectly_cumsum_u_dict[row[0]] = 0
                parts_answered_correctly_cumsum_u_dict[row[3]][row[0]] = dict_sum(parts_answered_correctly_cumsum_u_dict[row[3]][row[0]],1)
                parts_answered_incorrectly_cumsum_u_dict[row[3]][row[0]] = 0
                last_correct_timestamp_dict[row[0]] = row[7]
            else:
                # Incorrect answer: mirror image of the branch above.
                answered_correctly_cumsum_u_dict[row[0]] = 0
                answered_incorrectly_cumsum_u_dict[row[0]] = dict_sum(answered_incorrectly_cumsum_u_dict[row[0]],1)
                parts_answered_correctly_cumsum_u_dict[row[3]][row[0]] = 0
                parts_answered_incorrectly_cumsum_u_dict[row[3]][row[0]] = dict_sum(parts_answered_incorrectly_cumsum_u_dict[row[3]][row[0]],1)
                last_incorrect_timestamp_dict[row[0]] = row[7]
                # Count which choice the user picked when wrong ("like") and which
                # choices were not picked ("dislike"). Part 1 uses the *_three_dict
                # variants and iterates over choices [0,1,3].
                # NOTE(review): choice 2 is absent from the part-1 list -- confirm intended.
                if row[3] == 1:
                    like_answer_three_dict[row[6]][row[0]] = dict_sum(like_answer_three_dict[row[6]][row[0]],1)
                    for j in [0,1,3]:
                        if row[6] != j:
                            dislike_answer_three_dict[j][row[0]] = dict_sum(dislike_answer_three_dict[j][row[0]],1)
                else:
                    like_answer_dict[row[6]][row[0]] = dict_sum(like_answer_dict[row[6]][row[0]],1)
                    for j in [0,1,2,3]:
                        if row[6] != j:
                            dislike_answer_dict[j][row[0]] = dict_sum(dislike_answer_dict[j][row[0]],1)
                    
            # |answered_correctly_avg_c - answered_correctly * 100| accumulated per user.
            answered_diff_sum_u_dict[row[0]] = dict_sum(answered_diff_sum_u_dict[row[0]],abs(row[4] - (row[1] * 100)))
            
            # Add the share of responses that chose this answer for this content;
            # 0.33 is used when the (answer, content) pair is not in answer_per_dict.
            if row[5] in answer_per_dict[row[6]]:
                user_answer_per_sum_dict[row[0]] = user_answer_per_sum_dict[row[0]] + answer_per_dict[row[6]][row[5]]
                parts_user_answer_per_sum_dict[row[3]][row[0]] = parts_user_answer_per_sum_dict[row[3]][row[0]] + answer_per_dict[row[6]][row[5]]
            else:
                user_answer_per_sum_dict[row[0]] = user_answer_per_sum_dict[row[0]] + 0.33
                parts_user_answer_per_sum_dict[row[3]][row[0]] = parts_user_answer_per_sum_dict[row[3]][row[0]] + 0.33
            
            # Per-user totals.
            avg_c_sum_u_dict[row[0]] = dict_sum(avg_c_sum_u_dict[row[0]],row[4])
            count_u_dict[row[0]] = dict_sum(count_u_dict[row[0]],1)

            # Per-part, per-user totals.
            parts_u_dict[row[3]][row[0]] = dict_sum(parts_u_dict[row[3]][row[0]],row[1])
            parts_count_u_dict[row[3]][row[0]] = dict_sum(parts_count_u_dict[row[3]][row[0]],1)
            parts_avg_c_dict[row[3]][row[0]] = dict_sum(parts_avg_c_dict[row[3]][row[0]],row[4])
            
            # For correct answers, accumulate the answering user's current accuracy
            # (overall and within the part) on the content; this runs after the
            # user counters above were updated with this row.
            if row[1] == 1:
                content_correct_count_dict[row[5]] =  dict_sum(content_correct_count_dict[row[5]],1)
                content_correct_user_mean_dict[row[5]] = content_correct_user_mean_dict[row[5]] + (answered_correctly_sum_u_dict[row[0]] / count_u_dict[row[0]])
                parts_content_correct_user_mean_dict[row[5]] = parts_content_correct_user_mean_dict[row[5]] + (parts_u_dict[row[3]][row[0]] / parts_count_u_dict[row[3]][row[0]])
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int dict_sum(int a, int b):
    # Add two values after coercion to C int and return the C-int result.
    cdef int total = a + b
    return total

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int dict_sub(int a, int b):
    # Subtract b from a after coercion to C int and return the C-int result.
    cdef int difference = a - b
    return difference

In [7]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def add_time_feats(df,time_u_dict,lect_u_dict):
    """Add lag-time and lecture-count features, updating the per-user state.

    Parameters
    ----------
    df : DataFrame with integer columns ``user_id``, ``timestamp`` and
        ``content_type_id``; rows are processed in their stored order.
    time_u_dict : mapping user_id -> timestamp of that user's previous row.
        Updated in place for every row.
    lect_u_dict : mapping user_id -> number of rows seen so far with
        ``content_type_id == 1``. Updated in place.

    Returns
    -------
    The same ``df`` with three added columns:
    ``lag_time`` (uint16) -- lag modulo one day, saturated at 65535, and forced
    to 65535 when the lag spans at least one day;
    ``lag_day`` (uint16) -- whole days of lag, saturated at 65535;
    ``lecture_count`` (uint8) -- lecture rows seen before this row, saturated at 255.
    """
    cdef int arr_size = len(df)
    cdef int cnt
    cdef long delta
    cdef np.ndarray[long, ndim=1] row
    cdef np.ndarray[long, ndim=1] tu = np.zeros(arr_size,dtype=long)
    cdef np.ndarray[long, ndim=1] lc = np.zeros(arr_size,dtype=long)
    # row layout: 0 user_id, 1 timestamp, 2 content_type_id
    for cnt,row in enumerate(df[['user_id','timestamp','content_type_id']].values):
        # Compute the difference once, as a C long, so it is never narrowed to int.
        delta = row[1] - time_u_dict[row[0]]
        if delta > 0:
            tu[cnt] = delta
        elif row[1] == 0 or cnt == 0:
            # First interaction, or nothing to carry over. The cnt == 0 guard matters:
            # with wraparound disabled, tu[cnt - 1] would read outside the buffer.
            tu[cnt] = 0
        else:
            # Same timestamp as the user's previous row: reuse the previous row's lag.
            tu[cnt] = tu[cnt - 1]
        lc[cnt] = lect_u_dict[row[0]]

        time_u_dict[row[0]] = row[1]
        if (row[2] == 1):
            lect_u_dict[row[0]] = lect_u_dict[row[0]] + 1

    cdef int split = 60*60*24
    cdef np.ndarray[long, ndim=1] tu_day = tu // split
    cdef np.ndarray[long, ndim=1] tu_time = tu % split

    uint16_max = np.iinfo(np.uint16).max
    uint8_max = np.iinfo(np.uint8).max

    # Saturate before the narrowing cast: tu_time can reach split - 1 = 86399,
    # which would otherwise wrap around when cast to uint16.
    df['lag_time'] = np.minimum(tu_time, uint16_max).astype('uint16')
    df['lag_day'] = np.minimum(tu_day, uint16_max).astype('uint16')
    df.loc[df['lag_day']>0,'lag_time'] = uint16_max
    df['lecture_count'] = np.minimum(lc, uint8_max).astype('uint8')
    return df
            
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef long dict_sub(long a, long b):
    # Difference of two timestamps. Both operands and the result are C long so a
    # long-valued difference is not silently narrowed to C int.
    return a - b

In [8]:
# Index the question table by question_id and expose that index under the name
# 'content_id'.
questions_df = questions_df.set_index('question_id').rename_axis('content_id')

In [9]:
# Build answer_per_dict: for each answer choice 0-3, a mapping
# content_id -> share of that question's responses which picked the choice.
# The pickle is read once (the original read the same file twice).
# NOTE(review): read_pickle executes arbitrary code on load; only use trusted files.
train = pd.read_pickle(train_pickle)[feld_needed]
train = train.loc[train['content_type_id'] == 0, ['content_id', 'user_answer']]

# Responses per question, and responses per (question, answer) pair.
tmp = train.groupby('content_id')['user_answer'].count().rename('count')
tmp2 = train.groupby(['content_id', 'user_answer']).size().rename('n').reset_index()
tmp2 = tmp2.merge(tmp, left_on='content_id', right_index=True, how='left')

# Assign the filled result instead of calling fillna(inplace=True) on a column
# selection, which is not guaranteed to write back to the frame.
tmp2['answer_per'] = (tmp2['n'] / tmp2['count']).fillna(0.3)

answer_per_dict = {}
for i in range(4):
    chosen = tmp2.loc[tmp2['user_answer'] == i]
    answer_per_dict[i] = chosen.set_index('content_id')['answer_per'].to_dict()
del train, tmp2, tmp, chosen

In [10]:
train = pd.read_pickle(train_pickle)[feld_needed]
valid = pd.read_pickle(valid_pickle)[feld_needed]
# Mean correctness per question, stored as an integer percentage.
# The same content_id can appear under different content_type_id values, so only
# question rows (content_type_id == 0) are used.
content_df = train.loc[train['content_type_id']==0][['content_id','answered_correctly']].groupby(['content_id']).agg(['mean']).reset_index()
content_df.columns = ['content_id', 'answered_correctly_avg_c']
content_df['answered_correctly_avg_c'] = (content_df['answered_correctly_avg_c'] * 100).astype(np.uint8)
content_df = content_df.set_index('content_id')
content_df.index.name = 'content_id'

if debug:
    # Debug mode: dump the first 1,000,000 raw rows (all columns) and work on small slices.
    train = pd.read_pickle(train_pickle)
    train[:1000000].to_csv('debug.csv')
    train = train[feld_needed]
    train = train[:1000000]
    valid = valid[:10000]
else:
    # Sample users rather than rows so each kept user's history stays intact.
    # Candidates are the users present in the second half of the data by row_id.
    print('all =',train['row_id'].min(),train['row_id'].max())
    # Sort only the two columns that are needed; `train` itself is left untouched,
    # so the pickle does not have to be read again afterwards.
    recent = train[['row_id', 'user_id']].sort_values('row_id').iloc[int(len(train)/2):]
    print('current =',recent['row_id'].min(),recent['row_id'].max())
    active_users = recent['user_id'].unique()
    del recent
    # NOTE(review): replace=True draws with replacement, so `users` contains
    # duplicates and fewer than 80% of the distinct users are kept. Kept as-is to
    # preserve the sampled set; switch to replace=False if a true 80% is intended.
    users = np.random.choice(active_users, int(len(active_users) * 8 / 10), replace=True)

    train = train.loc[train['user_id'].isin(users)]

print(train.shape)

all = 0 101230331
current = 50611238 101230331
(27270555, 9)


In [11]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def data_format(df,questions_df,content_df,float prior_question_elapsed_time_mean):
    """Downcast the raw interaction columns and join question/content features.

    The columns of ``df`` are converted in place; the returned frame is a new
    object made of ``df`` followed by the ``questions_df`` and ``content_df`` rows
    looked up by ``content_id`` (both tables are expected to be indexed by it).
    Timestamps and elapsed times are divided by 1000.
    """
    for col, dtype in (('row_id', 'uint32'), ('user_id', 'int32'), ('content_type_id', 'uint8')):
        df[col] = df[col].astype(dtype)

    # Rows that are not questions get a fixed content_id (tentative placeholder).
    not_question = df['content_type_id'] != 0
    df.loc[not_question,'content_id'] = 532
    df['content_id'] = df['content_id'].astype('uint16')

    # Changing dtype to avoid a lightgbm error.
    had_explanation = df.prior_question_had_explanation.fillna(False)
    df['prior_question_had_explanation'] = had_explanation.astype('uint8')
    elapsed = df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
    df['prior_question_elapsed_time'] = (elapsed / 1000).astype('uint16')
    df['timestamp'] = (df['timestamp'] / 1000).astype(np.uint32)

    # Positional merge: look both tables up by content_id, then align by row position.
    lookup_ids = df['content_id'].values
    pieces = [
        df.reset_index(drop=True),
        questions_df.reindex(lookup_ids).reset_index(drop=True),
        content_df.reindex(lookup_ids).reset_index(drop=True),
    ]
    return pd.concat(pieces, axis=1)

In [12]:
# Fill missing prior_question_elapsed_time with the mean of the train values.
# Note that `train.prior_question_elapsed_time.mean()` does not work!
# Please refer to https://www.kaggle.com/its7171/can-we-trust-pandas-mean for details.
prior_question_elapsed_time_mean = train['prior_question_elapsed_time'].dropna().to_numpy().mean()

train = data_format(train, questions_df, content_df, prior_question_elapsed_time_mean)
valid = data_format(valid, questions_df, content_df, prior_question_elapsed_time_mean)

# Memory compaction: negative labels are set to 0 so both columns fit in uint8.
for frame in (train, valid):
    frame.loc[frame['answered_correctly'] < 0, 'answered_correctly'] = 0
    for col in ('answered_correctly', 'user_answer'):
        frame[col] = frame[col].astype('uint8')

In [13]:
# Keep only rows whose content_type_id is 0 and renumber the index.
train = train[train['content_type_id'] == 0].reset_index(drop=True)
valid = valid[valid['content_type_id'] == 0].reset_index(drop=True)

In [14]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

DTYPE = np.int32
ctypedef np.int32_t np_int_t

# add_prior_feats
#
# Add features describing each user's previous one or two interactions and update
# the per-user "prior" dictionaries row by row, in the stored order of `df`.
# `df` must provide the columns read in the row loop plus `timestamp`; it is
# returned with the new columns added in place.
# The buffers `pt`, `ppc` and `pe` are declared (pt and ppc are also filled) but
# never written to `df`.
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def add_prior_feats(df, q_stats_dict,q_enc_tag_dict,q_prior_root_dict,
                    prior_content_dict, prior_prior_content_dict, prior_time_dict, 
                    prior_time_per_sum_dict,
                    prior_lag_dict, prior_prior_lag_dict,
                    lag_sum_dict, prior_avg_c_dict, prior_prior_avg_c_dict,
                    part_lag_sum_dict, prior_part_dict):
    cdef int arr_size = len(df)
    cdef int cnt
    cdef np.ndarray[int, ndim=1] row
    cdef np.ndarray[np_int_t, ndim=1] pc = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] ppc = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] eqtag = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[float, ndim=1] tp = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[np_int_t, ndim=1] pt = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] ul = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] ull = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] pe = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] ls = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] pac = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] ppac = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[float, ndim=1] ptps = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[np_int_t, ndim=1] pls = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[float, ndim=1] pr = np.zeros(arr_size,dtype=np.float32)
    
    # row layout: 0 user_id, 1 content_id, 2 prior_question_elapsed_time, 3 lag_time,
    #             4 prior_question_had_explanation, 5 answered_correctly_avg_c, 6 part
    for cnt,row in enumerate(df[['user_id','content_id','prior_question_elapsed_time','lag_time' ,'prior_question_had_explanation','answered_correctly_avg_c','part']].values):
        # Snapshot of the user's previous and second-previous content ids.
        ppc[cnt] = prior_prior_content_dict[row[0]]
        pc[cnt] = prior_content_dict[row[0]]
        # Stored previous elapsed time relative to the q_stats_dict value of the
        # second-previous content; 1 when that content has no entry.
        if prior_prior_content_dict[row[0]] in q_stats_dict:
            pt[cnt] = prior_time_dict[row[0]] / q_stats_dict[prior_prior_content_dict[row[0]]]
        else:
            pt[cnt] = 1
        ls[cnt] = lag_sum_dict[row[0]]
        pac[cnt] = prior_avg_c_dict[row[0]]
        ppac[cnt] = prior_prior_avg_c_dict[row[0]]
        # pls is an int32 buffer, so the value read here is stored as an integer.
        pls[cnt] = part_lag_sum_dict[row[6]][row[0]]
        # 1 when the previous content has the same encoded tag set as the current one.
        if (q_enc_tag_dict[pc[cnt]] == q_enc_tag_dict[row[1]]):
            eqtag[cnt] = 1
        else:
            eqtag[cnt] = 0
        # prior_root: lookup keyed by (content_id, previous content_id); when the stored
        # previous content id is not > 0 the key (content_id, -999) is used instead.
        # pr stays 0 when the key is absent.
        if prior_content_dict[row[0]] > 0:
            if (row[1],prior_content_dict[row[0]]) in q_prior_root_dict:
                pr[cnt] = q_prior_root_dict[(row[1],prior_content_dict[row[0]])]
        else:
            if (row[1],-999) in q_prior_root_dict:
                pr[cnt] = q_prior_root_dict[(row[1],-999)]
        # Elapsed time of the previous question relative to that content's q_stats_dict
        # value; also accumulated per user. Defaults to 1 when unavailable.
        if (prior_content_dict[row[0]] > 0) & (prior_content_dict[row[0]] in q_stats_dict):
            tp[cnt] = row[2] / q_stats_dict[prior_content_dict[row[0]]]
            prior_time_per_sum_dict[row[0]] = prior_time_per_sum_dict[row[0]] + tp[cnt]
        else:
            tp[cnt] = 1
        ptps[cnt] = prior_time_per_sum_dict[row[0]]
        
        # NOTE(review): prior_part_dict is overwritten with the CURRENT part before it
        # is used as the key below, so the ratio is accumulated under the current
        # row's part -- confirm this is intended.
        if prior_content_dict[row[0]] > 0:
            prior_part_dict[row[0]] = row[6]
            part_lag_sum_dict[prior_part_dict[row[0]]][row[0]] = part_lag_sum_dict[prior_part_dict[row[0]]][row[0]] + tp[cnt]
        # Rotate the per-user state: previous -> second-previous, current -> previous.
        prior_prior_content_dict[row[0]] = prior_content_dict[row[0]]
        prior_content_dict[row[0]] = row[1]
        prior_time_dict[row[0]] = row[2] # answer time of the previous content
        ul[cnt] = prior_lag_dict[row[0]]
        ull[cnt] = prior_prior_lag_dict[row[0]]
        prior_prior_lag_dict[row[0]] = prior_lag_dict[row[0]]
        prior_lag_dict[row[0]] = row[3]
        lag_sum_dict[row[0]] = lag_sum_dict[row[0]] + row[3]
        prior_prior_avg_c_dict[row[0]] = prior_avg_c_dict[row[0]]
        prior_avg_c_dict[row[0]] = row[5]
            
    df['prior_content_id'] = pc
    df['prior_content_id'] = df['prior_content_id'].astype('uint16')
    # NOTE(review): prior_content_id is uint16 here; if content_id is unsigned as well,
    # negative differences wrap around instead of going negative -- confirm.
    df['prior_content_diff'] = df['content_id'] - df['prior_content_id']
    # Rows with timestamp 0 get the sentinel -999.
    df.loc[df['timestamp']==0, 'prior_content_diff'] = -999
    #
    df['prior_root'] = pr
    df['prior_root'] = df['prior_root'].astype('float16')
    
    # is_same_prior: tag-equality flag, plus 2 when the content id itself repeats.
    df['is_same_prior'] = eqtag
    df.loc[df['prior_content_id'] == df['content_id'],'is_same_prior'] = df['is_same_prior'] + 2
    df['is_same_prior'] = df['is_same_prior'].astype('uint8')
    df['lag_time_per'] = df['lag_time'] / df['prior_question_elapsed_time'].astype('float32')
    df['elapsed_lag_per'] = tp
    df['elapsed_lag_per'] = df['elapsed_lag_per'].astype('float16')
    # Despite the "_mean" suffix these two columns hold running sums at this point.
    df['elapsed_time_per_mean'] = ptps
    df['elapsed_time_per_mean'] = df['elapsed_time_per_mean'].astype('float16')
    
    df['part_elapsed_time_per_mean'] = pls
    df['part_elapsed_time_per_mean'] = df['part_elapsed_time_per_mean'].astype('float16')

    df['prior_prior_lag_time'] = ull
    df['prior_prior_lag_time'] = df['prior_prior_lag_time'].astype('uint16')
    df['prior_lag_time'] = ul
    df['prior_lag_time'] = df['prior_lag_time'].astype('uint16')
    # Ratio of the current lag to the previous lag (division by 0 is possible).
    df['lag_lag_time'] = df['lag_time'] / df['prior_lag_time']
    df['lag_lag_time'] = df['lag_lag_time'].astype('float16')
    df['lag_sum'] = ls
    df['prior_avg_c'] = pac
    df['prior_avg_c'] = df['prior_avg_c'].astype('uint8')
    df['prior_prior_avg_c'] = ppac
    df['prior_prior_avg_c'] = df['prior_prior_avg_c'].astype('uint8')
    return df

In [15]:
# Alternative source kept for reference (all answers instead of correct answers only):
#q_stats_dict = pd.read_csv('../input/riiiddataset/question_stats.csv').set_index('content_id')[['q_elapsed_time_mean']].to_dict()['q_elapsed_time_mean']

# content_id -> mean elapsed time, computed from correct answers only.
q_stats_df = pd.read_csv('../input/riiiddataset/correct_q_elapsed_time_mean.csv')
q_stats_dict = q_stats_df.set_index('content_id')['correct_q_elapsed_time_mean'].to_dict()
del q_stats_df

# content_id -> encoded tag id / correct answer, taken from the question table.
q_enc_tag_dict = questions_df['enc_tags'].to_dict()
q_ans_dict = questions_df['correct_answer'].to_dict()

# (content_id, prior_content_id) -> prior_position_per; missing values become -999.
q_prior_root_df = pd.read_csv('../input/riiiddataset/prior_position_per.csv').fillna(-999)
q_prior_root_dict = q_prior_root_df.set_index(['content_id','prior_content_id'])['prior_position_per'].to_dict()
del q_prior_root_df

In [16]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

DTYPE = np.int32
ctypedef np.int32_t np_int_t

# add_user_content_feats
#
# Add `done_content_tag` and `repeat_part_per` to `df` and, in the same pass,
# record each row's outcome in the per-user dictionaries. Features for a row are
# read before that row's own outcome is stored. Returns `df`.
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def add_user_content_feats(df, user_content_dict, user_tags_dict, user_repeat_count_dict):
    cdef int arr_size = len(df)
    cdef int cnt,i
    cdef np.ndarray[np_int_t, ndim=1] row
    cdef np.ndarray[np_int_t, ndim=1] uc = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] rc = np.zeros(arr_size,dtype=DTYPE)
    # row layout: 0 user_id, 1 content_id, 2 answered_correctly, 3 enc_tags, 4 part
    for cnt,row in enumerate(df[['user_id','content_id','answered_correctly','enc_tags','part']].values):
        # Stored status of this content for the user (0 when never seen) ...
        if (row[1] in user_content_dict[row[0]]):
            uc[cnt] = user_content_dict[row[0]][row[1]]
        else:
            uc[cnt] = 0
        # ... plus twice the stored status of this tag set, when present.
        if (row[3] in user_tags_dict[row[0]]):
            uc[cnt] = uc[cnt] + (user_tags_dict[row[0]][row[3]] * 2)

        rc[cnt] = user_repeat_count_dict[row[4]][row[0]]
        
        # Record the outcome: 1 when answered_correctly == 0, otherwise 2.
        if row[2] == 0:
            user_content_dict[row[0]][row[1]] = 1
            user_tags_dict[row[0]][row[3]] = 1
        else:
            user_content_dict[row[0]][row[1]] = 2
            user_tags_dict[row[0]][row[3]] = 2
            
        # NOTE(review): content_id was stored just above, so this membership test
        # appears to be always true and the counter grows on every row -- confirm.
        if row[1] in user_content_dict[row[0]]:
            if not row[0] in user_repeat_count_dict[row[4]]:
                user_repeat_count_dict[row[4]][row[0]] = 0
            user_repeat_count_dict[row[4]][row[0]] = user_repeat_count_dict[row[4]][row[0]] + 1
    df['done_content_tag'] = uc
    df['done_content_tag'] = df['done_content_tag'].astype('uint8')
    # Stored as the raw per-part counter (no division happens here).
    df['repeat_part_per'] = rc
    return df

# update_user_content_feats
#
# Record the outcomes in `df` in the per-user dictionaries without producing any
# feature columns. Nothing is returned; the dictionaries are mutated in place.
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def update_user_content_feats(df,user_content_dict, user_tags_dict, user_repeat_count_dict):
    # df may contain lecture rows; they are skipped by the content_type_id check.
    cdef int arr_size = len(df)
    cdef int cnt
    cdef np.ndarray[np_int_t, ndim=1] row
    # row layout: 0 user_id, 1 content_id, 2 answered_correctly, 3 enc_tags,
    #             4 content_type_id, 5 part
    for cnt,row in enumerate(df[['user_id','content_id','answered_correctly','enc_tags','content_type_id','part']].values):
        if row[4] == 0:
            # Outcome status: 1 when answered_correctly == 0, otherwise 2.
            if row[2] == 0:
                user_content_dict[row[0]][row[1]] = 1
                user_tags_dict[row[0]][row[3]] = 1
            else:
                user_content_dict[row[0]][row[1]] = 2
                user_tags_dict[row[0]][row[3]] = 2
            # Initialise the per-part counter on first use. The membership test follows
            # the assignment above, so it appears to be always true.
            if row[1] in user_content_dict[row[0]]:
                if not row[0] in user_repeat_count_dict[row[5]]:
                    user_repeat_count_dict[row[5]][row[0]] = 0
            # Incremented for every row with content_type_id == 0.
            user_repeat_count_dict[row[5]][row[0]] = user_repeat_count_dict[row[5]][row[0]] + 1
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def add_user_content_feats_without_update(df,user_content_dict, user_tags_dict, user_repeat_count_dict):
    """Add `done_content_tag` and `repeat_part_per` without recording outcomes.

    Reads the same per-user dictionaries as `add_user_content_feats` but assigns
    nothing to them. `df` needs the columns user_id, content_id, enc_tags and
    part; content_type_id is not consulted. Returns `df`.
    """
    cdef int arr_size = len(df)
    cdef int cnt
    cdef np.ndarray[np_int_t, ndim=1] row
    cdef np.ndarray[np_int_t, ndim=1] uc = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] rc = np.zeros(arr_size,dtype=DTYPE)
    # row layout: 0 user_id, 1 content_id, 2 enc_tags, 3 part
    for cnt,row in enumerate(df[['user_id','content_id','enc_tags','part']].values):
        # Stored status of this content; uc is zero-initialised, so an unseen
        # content simply leaves the slot at 0.
        seen_contents = user_content_dict[row[0]]
        if row[1] in seen_contents:
            uc[cnt] = seen_contents[row[1]]

        # Add twice the stored status of this tag set, when present.
        seen_tags = user_tags_dict[row[0]]
        if row[2] in seen_tags:
            uc[cnt] = uc[cnt] + (seen_tags[row[2]] * 2)

        rc[cnt] = user_repeat_count_dict[row[3]][row[0]]

    df['done_content_tag'] = uc
    df['done_content_tag'] = df['done_content_tag'].astype('uint8')
    df['repeat_part_per'] = rc
    return df

In [17]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def post_round(df,list use_tags,list use_content):
    """Collapse category values that are not in the allowed lists.

    enc_tags values missing from use_tags become 65535 and content_id values
    missing from use_content become 532. df is modified in place and returned.
    """
    rare_tag = ~df['enc_tags'].isin(use_tags)
    rare_content = ~df['content_id'].isin(use_content)
    df.loc[rare_tag,'enc_tags'] = 65535
    df.loc[rare_content,'content_id'] = 532  # provisional placeholder id
    return df

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def post_features(df):
    """Turn accumulated sums on df into the ratio features and return df.

    Columns written (df is modified in place):
      * lag_mean = lag_sum / count_u, capped at 65535, stored as float16;
      * elapsed_time_per_mean /= count_u, NaN replaced by 1, stored as float16;
      * part_elapsed_time_per_mean, divided by p<part>_count_u on the rows of
        each part present in df;
      * repeat_part_per, divided by p<part>_count_u where that count is > 0,
        stored as float16;
      * prior_question_elapsed_time /= part_elapsed_time_per_mean;
      * part_user_answer_per_mean: 0 replaced by NaN.
    """
    cdef int i
    cdef list parts
    cdef str p

    df['lag_mean'] = (df['lag_sum'] / df['count_u'])
    # NOTE(review): 65535 is above the largest finite float16 (65504), so the
    # capped rows presumably become inf after the cast - confirm this matches
    # the features the loaded models were trained on before changing it.
    df.loc[df['lag_mean'] > 65535,'lag_mean'] = 65535
    df['lag_mean'] = df['lag_mean'].astype('float16')

    # Assign the filled result back explicitly: calling fillna(inplace=True)
    # on a column selection is not guaranteed to write through to df.
    elapsed_mean = (df['elapsed_time_per_mean'] / df['count_u']).fillna(1)
    df['elapsed_time_per_mean'] = elapsed_mean.astype('float16')

    parts = list(df['part'].unique())
    for i in parts:
        p = str(i)
        count_col = 'p' + p + '_count_u'
        df.loc[df['part']==i,'part_elapsed_time_per_mean'] = df['part_elapsed_time_per_mean'] / df[count_col]
        # NOTE(review): this mask is not restricted to rows of part i, so each
        # row is divided by every present part's count that is > 0 - confirm
        # this matches how repeat_part_per was built for training.
        df.loc[df[count_col]>0,'repeat_part_per'] = df['repeat_part_per'] / df[count_col]

    df['prior_question_elapsed_time'] = df['prior_question_elapsed_time'] / df['part_elapsed_time_per_mean']

    df.loc[df['part_user_answer_per_mean'] == 0, 'part_user_answer_per_mean'] = np.nan
    df['repeat_part_per'] = df['repeat_part_per'].astype('float16')
    return df

In [18]:
# Category rounding: keep only the values that occur more than 3 times in train.
round_max = np.iinfo(np.uint16).max


def _values_seen_more_than(series, threshold):
    """Return the values of `series` whose occurrence count exceeds `threshold`."""
    counts = series.value_counts()
    return list(counts[counts > threshold].index)


use_tags = _values_seen_more_than(train['enc_tags'], 3)
use_content = _values_seen_more_than(train['content_id'], 3)

In [19]:
# The training/validation frames are not referenced below this cell; drop them
# and run a collection pass.
del train
del valid
_ = gc.collect()

## modeling

In [20]:
TARGET = 'answered_correctly'

# Model input columns, in the exact order of the original list (the frame
# handed to predict is built as test_df[FEATS]).
# Candidates that were commented out in the original list: answered_correctly_sum_u,
# prior_question_had_explanation, q_elapsed_time_mean,
# prior_prior_question_elapsed_time_per, q_elapsed_time_per, lag_time_per,
# prior_prior_avg_c, lag_day, is_same_prior_content, done_tags, enc_tags,
# lag_lag_time, lag_tail_half_mean.
FEATS = [
    'answered_correctly_avg_u',
    'prior_root',
    'answered_cumsum_u',
    'answered_correctly_avg_c',
    'avg_c_per_u',
    'part_avg_c_mean',
    'elapsed_time_per_mean',
    'part_answered_cumsum_u',
    'part',
    'lag_mean',
    'user_answer_per_mean',
    'part_user_answer_per_mean',
    'count_u',
    'prior_question_elapsed_time',
    'lag_time',
    'prior_avg_c',
    'part_count_per',
    'lr_count_per',
    'part_elapsed_time_per_mean',
    'last_incorrect_timelag',
    'last_correct_timelag',
    # p0_mean_u ... p6_mean_u
    *['p' + str(part_no) + '_mean_u' for part_no in range(7)],
    'done_content_tag',
    'avg_c_mean',
    'content_id',
    'lecture_count',
    'content_lv',
    'part_content_lv',
    'is_same_prior',
    'prior_content_diff',
    'answered_diff_mean',
    'prior_lag_time',
    'prior_prior_lag_time',
    'elapsed_lag_per',
    'is_like_answer',
    'is_dislike_answer',
    'repeat_part_per',
]

# categorical_feature: high-cardinality categorical variables
CATEGORICAL = ['part', 'content_id']

In [21]:
# Load the five pickled models into `models`, in this fixed order; the
# weighting code below refers to them by list position.
# SECURITY: pickle.load can execute arbitrary code - only point these paths at
# trusted model files.
MODEL_DIRS = [
    '../input/model-deploy-20210104-root-samplingoutput/',
    '../input/modeldeploy20210104rootsamplinguserchangeoutput/',
    '../input/modeldeploy20210104rootsamplinguserchangev2output/',
    '../input/model-deploy-20210106-loutput/',
    '../input/model-deploy-20210106-woutput/',
]

models = []
for root in MODEL_DIRS:
    file = root + 'lgb.pkl'
    # `with` closes the handle; the bare open(...) used before never did.
    with open(file, 'rb') as f:
        models.append(pickle.load(f))

In [22]:
root = '../input/data-deploy-20210101output/'

content_df = pd.read_csv(root + 'content_df.csv').set_index('content_id')
content_df.index.name = 'content_id'
content_df['answered_correctly_avg_c'] = content_df['answered_correctly_avg_c'].astype(np.uint8)


def _load_state(filename, base_dir=root):
    """Unpickle one precomputed state object stored under `base_dir`.

    SECURITY: pickle.load can execute arbitrary code - use trusted files only.
    """
    with open(base_dir + filename, 'rb') as f:
        return pickle.load(f)


# Precomputed state, one pickle per object. File names are kept exactly as
# stored (two of them have no '.pickle' suffix).
time_u_dict = _load_state('time_u_dict.pickle')
prior_content_dict = _load_state('prior_content_dict.pickle')
prior_prior_content_dict = _load_state('prior_prior_content_dict.pickle')
answered_correctly_sum_u_dict = _load_state('answered_correctly_sum_u_dict')
answered_correctly_cumsum_u_dict = _load_state('answered_correctly_cumsum_u_dict.pickle')
answered_incorrectly_cumsum_u_dict = _load_state('answered_incorrectly_cumsum_u_dict.pickle')
count_u_dict = _load_state('count_u_dict')
parts_count_u_dict = _load_state('parts_count_u_dict.pickle')
parts_u_dict = _load_state('parts_u_dict.pickle')
answered_diff_sum_u_dict = _load_state('answered_diff_sum_u_dict.pickle')
user_content_dict = _load_state('user_content_dict.pickle')
user_tags_dict = _load_state('user_tags_dict.pickle')
prior_lag_dict = _load_state('prior_lag_dict.pickle')
prior_prior_lag_dict = _load_state('prior_prior_lag_dict.pickle')
prior_time_dict = _load_state('prior_time_dict.pickle')
prior_time_per_sum_dict = _load_state('prior_time_per_sum_dict.pickle')
part_lag_sum_dict = _load_state('part_lag_sum_dict.pickle')
prior_part_dict = _load_state('prior_part_dict.pickle')
lag_sum_dict = _load_state('lag_sum_dict.pickle')
avg_c_sum_u_dict = _load_state('avg_c_sum_u_dict.pickle')
user_answer_per_sum_dict = _load_state('user_answer_per_sum_dict.pickle')
parts_user_answer_per_sum_dict = _load_state('parts_user_answer_per_sum_dict.pickle')
content_correct_user_mean_dict = _load_state('content_correct_user_mean_dict.pickle')
content_correct_count_dict = _load_state('content_correct_count_dict.pickle')
parts_content_correct_user_mean_dict = _load_state('parts_content_correct_user_mean_dict.pickle')
prior_avg_c_dict = _load_state('prior_avg_c_dict.pickle')
prior_prior_avg_c_dict = _load_state('prior_prior_avg_c_dict.pickle')
parts_avg_c_dict = _load_state('parts_avg_c_dict.pickle')
lect_u_dict = _load_state('lect_u_dict.pickle')
last_correct_timestamp_dict = _load_state('last_correct_timestamp_dict.pickle')
last_incorrect_timestamp_dict = _load_state('last_incorrect_timestamp_dict.pickle')
like_answer_three_dict = _load_state('like_answer_three_dict.pickle')
like_answer_dict = _load_state('like_answer_dict.pickle')
dislike_answer_three_dict = _load_state('dislike_answer_three_dict.pickle')
dislike_answer_dict = _load_state('dislike_answer_dict.pickle')
parts_answered_correctly_cumsum_u_dict = _load_state('parts_answered_correctly_cumsum_u_dict.pickle')
parts_answered_incorrectly_cumsum_u_dict = _load_state('parts_answered_incorrectly_cumsum_u_dict.pickle')
user_repeat_count_dict = _load_state('user_repeat_count_dict.pickle')

# Model weight initialization

In [23]:
# Per-user ensemble state:
#   model_weight[m][user] - weight counter of model m for that user
#   users[m]              - users loaded from model m's train_users.pickle
#   loss[m][user]         - accumulated loss of model m (first three models only)
#   user_count[user]      - denominator used when weights are read back
user_count = defaultdict(int)
model_weight = {}
users = {}
loss = {}
for m in range(len(models)):
    model_weight[m] = defaultdict(int)
    users[m] = defaultdict(int)

# Only the first three models ship a train_users.pickle.
# SECURITY: pickle.load can execute arbitrary code - use trusted files only.
train_user_dirs = [
    '../input/model-deploy-20210104-root-samplingoutput/',
    '../input/modeldeploy20210104rootsamplinguserchangeoutput/',
    '../input/modeldeploy20210104rootsamplinguserchangev2output/',
]
for model_idx, root in enumerate(train_user_dirs):
    file = root + 'train_users.pickle'
    # `with` closes the handle; the bare open(...) used before never did.
    with open(file, 'rb') as f:
        users[model_idx] = pickle.load(f)
    loss[model_idx] = defaultdict(int)

# Every user found in a model's train_users starts with weight 1 for that
# model and a count of 1.
for d in range(len(users)):
    for u in users[d]:
        model_weight[d][u] = 1
        user_count[u] = 1

In [24]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

DTYPE = np.int32
ctypedef np.int32_t np_int_t

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def get_model_weight(df,dict model_weight,user_count):
    """Add normalised per-row blending weights weight0..weight4 to df.

    For rows with content_type_id == 0, models 0-2 get
    model_weight[k][user_id] / user_count[user_id], or 1/3 each when the
    user's count is 0; on other rows they stay 0. Model 3 gets a fixed 0.15
    when part < 4, otherwise model 4 gets it. Each weight is then divided by
    the row's total, and weight_all holds the sum of the normalised weights.
    Returns df.
    """
    cdef np.ndarray[np_int_t, ndim=1] row
    cdef int arr_size = len(df)
    cdef int cnt
    cdef np.ndarray[float, ndim=1] weight0 = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[float, ndim=1] weight1 = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[float, ndim=1] weight2 = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[float, ndim=1] weight3 = np.zeros(arr_size,dtype=np.float32)
    cdef np.ndarray[float, ndim=1] weight4 = np.zeros(arr_size,dtype=np.float32)
    for cnt,row in enumerate(df[['user_id','content_type_id','part']].values):
        if row[1] == 0:
            seen = user_count[row[0]]
            if seen == 0:
                # no history for this user: split evenly between models 0-2
                weight0[cnt] = 1/3
                weight1[cnt] = 1/3
                weight2[cnt] = 1/3
            else:
                weight0[cnt] = model_weight[0][row[0]] / seen
                weight1[cnt] = model_weight[1][row[0]] / seen
                weight2[cnt] = model_weight[2][row[0]] / seen
        # The arrays start at zero, so only the non-zero fixed weight is written.
        if row[2]<4:
            weight3[cnt] = 0.15
        else:
            weight4[cnt] = 0.15

    total = weight0 + weight1 + weight2  + weight3 + weight4
    df['weight0'] = weight0 / total
    df['weight1'] = weight1 / total
    df['weight2'] = weight2 / total
    df['weight3'] = weight3 / total
    df['weight4'] = weight4 / total

    df['weight_all'] = df['weight0'] + df['weight1'] + df['weight2'] + df['weight3'] + df['weight4']
    return df

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def update_model_weight(df,dict model_weight,user_count,loss):
    """Credit, per answered question, the model with the lowest running loss.

    predicts0..predicts2 on df are rescaled to 0-100 and cast to uint8 (df is
    modified in place). For every row with content_type_id == 0 the absolute
    error of each of the three predictions against answered_correctly * 100
    is added to loss[k][user_id]; the model whose accumulated loss is lowest
    (first one on ties, per np.argmin) gets model_weight[best][user_id] += 1,
    and user_count[user_id] is incremented. Nothing is returned.
    """
    # `target` was previously declared under the misspelled name `taget`,
    # leaving the variable actually used untyped.
    cdef int best, target, cnt, k
    cdef np_int_t user
    cdef np.ndarray[np_int_t, ndim=1] row

    pred_cols = ['predicts0', 'predicts1', 'predicts2']
    for col in pred_cols:
        df[col] = (df[col] * 100).astype('uint8')

    # row layout: 0 user_id, 1 answered_correctly, 2 content_type_id, 3..5 predictions
    for cnt,row in enumerate(df[['user_id','answered_correctly','content_type_id'] + pred_cols].values):
        if row[2] != 0:
            continue
        user = row[0]
        target = row[1] * 100
        for k in range(3):
            loss[k][user] = loss[k][user] + abs(target - row[3 + k])
        best = np.argmin([loss[0][user],loss[1][user],loss[2][user]])
        model_weight[best][user] = model_weight[best][user] + 1
        user_count[user] = user_count[user] + 1

In [25]:
class Iter_Valid(object):
    """Iterate over a labelled frame in batches of (df, sample_df) pairs.

    Each batch carries, on its first row, the previous batch's user answers
    and correctness values as bracketed, comma-separated strings in
    prior_group_responses / prior_group_answers_correct. sample_df holds the
    row_id of the batch's question rows (content_type_id == 0) with
    answered_correctly set to 0.
    """

    def __init__(self, df, max_user=1000):
        """
        df: frame with row_id, user_id, task_container_id, content_type_id,
            user_answer and answered_correctly columns; it is re-indexed
            0..n-1 internally and the caller's frame is not modified.
        max_user: maximum number of distinct users per batch.
        """
        df = df.reset_index(drop=True)
        self.df = df
        self.user_answer = df['user_answer'].astype(str).values
        self.answered_correctly = df['answered_correctly'].astype(str).values
        df['prior_group_responses'] = "[]"
        df['prior_group_answers_correct'] = "[]"
        # .copy() so the column assignment below does not write into a slice of df.
        self.sample_df = df.loc[df['content_type_id'] == 0, ['row_id']].copy()
        self.sample_df['answered_correctly'] = 0
        self.len = len(df)
        self.user_id = df.user_id.values
        self.task_container_id = df.task_container_id.values
        self.content_type_id = df.content_type_id.values
        self.max_user = max_user
        self.current = 0
        self.pre_user_answer_list = []
        self.pre_answered_correctly_list = []

    def __iter__(self):
        return self

    def fix_df(self, user_answer_list, answered_correctly_list, pre_start):
        """Build the batch for rows [pre_start, self.current).

        Writes the previous batch's answers onto the first row of the new
        batch, then remembers the given lists for the next call.
        Returns (df, sample_df).
        """
        df = self.df.iloc[pre_start:self.current].copy()
        # sample_df only contains question rows, so it must be selected by the
        # batch's index labels; slicing it by position with offsets of the
        # full frame drifts as soon as a lecture row has been seen.
        question_rows = (df['content_type_id'] == 0).values
        sample_df = self.sample_df.loc[df.index[question_rows]].copy()
        df.loc[pre_start,'prior_group_responses'] = '[' + ",".join(self.pre_user_answer_list) + ']'
        df.loc[pre_start,'prior_group_answers_correct'] = '[' + ",".join(self.pre_answered_correctly_list) + ']'
        self.pre_user_answer_list = user_answer_list
        self.pre_answered_correctly_list = answered_correctly_list
        return df, sample_df

    def __next__(self):
        """Return the next (df, sample_df) batch or raise StopIteration.

        A batch is closed when a user already in the batch shows up again
        after another user, or with a different task container while both the
        current and the previous row are questions, or when max_user distinct
        users have been collected and the current row does not continue the
        last user's container (lecture rows of that user are still appended).
        """
        added_user = set()
        pre_start = self.current
        pre_added_user = -1
        pre_task_container_id = -1
        pre_content_type_id = -1
        user_answer_list = []
        answered_correctly_list = []
        while self.current < self.len:
            crr_user_id = self.user_id[self.current]
            crr_task_container_id = self.task_container_id[self.current]
            crr_content_type_id = self.content_type_id[self.current]
            if crr_user_id in added_user and (crr_user_id != pre_added_user or (crr_task_container_id != pre_task_container_id and crr_content_type_id == 0 and pre_content_type_id == 0)):
                # known user (not the previous user, or a different task container with both rows being questions)
                return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            if len(added_user) == self.max_user:
                if  crr_user_id == pre_added_user and (crr_task_container_id == pre_task_container_id or crr_content_type_id == 1):
                    user_answer_list.append(self.user_answer[self.current])
                    answered_correctly_list.append(self.answered_correctly[self.current])
                    self.current += 1
                    continue
                else:
                    return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            added_user.add(crr_user_id)
            pre_added_user = crr_user_id
            pre_task_container_id = crr_task_container_id
            pre_content_type_id = crr_content_type_id
            user_answer_list.append(self.user_answer[self.current])
            answered_correctly_list.append(self.answered_correctly[self.current])
            self.current += 1
        if pre_start < self.current:
            return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
        else:
            raise StopIteration()

In [26]:
# Choose where batches come from and where predictions go:
#   debug=True  -> replay debug.csv through Iter_Valid and collect predictions
#   debug=False -> use the competition environment
if debug:
    target_df = pd.read_csv('debug.csv',index_col=0)
    iter_test = Iter_Valid(target_df,max_user=1000)
    predicted = []
    def set_predict(df):
        """Collect a prediction frame in `predicted` instead of submitting it."""
        predicted.append(df)
else:
    # Imported only on this branch so that debug=True does not require the
    # competition module to be installed.
    import riiideducation
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict

In [27]:
#0.55 sec/iter
#test_itr = 0:00:01.182652
# Inference loop. For every batch from iter_test:
#   1. format the batch and collapse unlisted tag/content ids;
#   2. if there was a previous batch, read its answers from the first row of
#      the current batch, fold them into the state dicts and update the
#      per-user model weights;
#   3. build features for the current batch without updating the state;
#   4. predict with the five models, blend with the per-row weights, submit.
# predicts3/predicts4 are first assigned at the bottom of the loop body; they
# are only read inside the `previous_test_df is not None` branch.
previous_test_df = None
predicts0 = None
predicts1 = None
predicts2 = None
predicts_final = None
for (test_df, sample_prediction_df) in iter_test:
    start = datetime.datetime.now()
    # Lectures are not handled yet, so provisionally overwrite the lecture content_id (lectures are excluded from the aggregates)
    test_df.loc[test_df['content_type_id'] == 1,'content_id'] = 0
    test_df = data_format(test_df,questions_df,content_df,prior_question_elapsed_time_mean)
    test_df = post_round(test_df,use_tags,use_content)
    
    if previous_test_df is not None:
        # SECURITY: eval() runs whatever expression the string contains; the
        # input here comes from the batch's own columns. A literal-only parser
        # would be safer - NOTE(review): confirm the string format first.
        previous_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])
        previous_test_df[TARGET] = previous_test_df[TARGET].astype('uint8')
        previous_test_df['user_answer'] = eval(test_df["prior_group_responses"].iloc[0])
        previous_test_df['user_answer'] = previous_test_df['user_answer'].astype('uint8')

        # Fold the now-labelled previous batch into the state dicts.
        update_user_content_feats(previous_test_df, user_content_dict, user_tags_dict, user_repeat_count_dict)
        update_user_feats(previous_test_df,
                          answer_per_dict,
                          answered_correctly_sum_u_dict,
                          answered_correctly_cumsum_u_dict,
                          answered_incorrectly_cumsum_u_dict,
                          count_u_dict,
                          parts_u_dict,
                          parts_count_u_dict,
                          answered_diff_sum_u_dict,
                          avg_c_sum_u_dict,
                          parts_avg_c_dict,
                          user_answer_per_sum_dict,
                          parts_user_answer_per_sum_dict,
                          content_correct_user_mean_dict,
                          content_correct_count_dict,
                          parts_content_correct_user_mean_dict,
                          last_correct_timestamp_dict,
                          last_incorrect_timestamp_dict,
                          like_answer_dict,
                          like_answer_three_dict,
                          dislike_answer_dict,
                          dislike_answer_three_dict,
                          parts_answered_correctly_cumsum_u_dict,
                          parts_answered_incorrectly_cumsum_u_dict)
        
        # Keep question rows only so the stored prediction arrays (made from
        # question rows of that batch) line up row for row.
        previous_test_df = previous_test_df[previous_test_df['content_type_id'] == 0].reset_index(drop=True)
        previous_test_df['predicts0'] = predicts0
        previous_test_df['predicts1'] = predicts1
        previous_test_df['predicts2'] = predicts2
        previous_test_df['predicts3'] = predicts3
        previous_test_df['predicts4'] = predicts4
        ##DEBUG
        #previous_test_df['predicts_final'] = predicts_final
        #display(previous_test_df[['weight0','weight1','weight2','weight3','weight4','weight_all','part',
        #                          'predicts0','predicts1','predicts2','predicts3','predicts4',TARGET]])
        ##
        update_model_weight(previous_test_df,model_weight,user_count,loss)
    
    test_df = add_time_feats(test_df,time_u_dict,lect_u_dict)
    test_df = add_user_content_feats_without_update(test_df,
                                                    user_content_dict,
                                                    user_tags_dict,
                                                    user_repeat_count_dict)
    
    test_df = get_model_weight(test_df,model_weight,user_count)
    
    # To stay type-safe, provisionally replaced with an existing content
    # The copy (lecture rows included) is kept for the next iteration's update step.
    previous_test_df = test_df.copy()
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = add_user_feats_without_update(test_df, 
                                            answer_per_dict,
                                            answered_correctly_sum_u_dict,
                                            answered_correctly_cumsum_u_dict,
                                            answered_incorrectly_cumsum_u_dict,
                                            count_u_dict,
                                            parts_u_dict,
                                            parts_count_u_dict,
                                            answered_diff_sum_u_dict,
                                            avg_c_sum_u_dict,
                                            parts_avg_c_dict,
                                            user_answer_per_sum_dict,
                                            parts_user_answer_per_sum_dict,
                                            content_correct_user_mean_dict,
                                            content_correct_count_dict,
                                            parts_content_correct_user_mean_dict,
                                            last_correct_timestamp_dict,
                                            last_incorrect_timestamp_dict,
                                            like_answer_dict,
                                            like_answer_three_dict,
                                            dislike_answer_dict,
                                            dislike_answer_three_dict,
                                            parts_answered_correctly_cumsum_u_dict,
                                            parts_answered_incorrectly_cumsum_u_dict)
    test_df = add_prior_feats(test_df, 
                              q_stats_dict,
                              q_enc_tag_dict,
                              q_prior_root_dict,
                              prior_content_dict,
                              prior_prior_content_dict,
                              prior_time_dict,
                              prior_time_per_sum_dict,
                              prior_lag_dict,
                              prior_prior_lag_dict,
                              lag_sum_dict,
                              prior_avg_c_dict,
                              prior_prior_avg_c_dict,
                              part_lag_sum_dict,
                              prior_part_dict)
    
    test_df = post_features(test_df)
    
    # One prediction array per model, all on the same feature frame.
    predicts0 =  models[0].predict(test_df[FEATS])
    predicts1 =  models[1].predict(test_df[FEATS])
    predicts2 =  models[2].predict(test_df[FEATS])
    predicts3 =  models[3].predict(test_df[FEATS])
    predicts4 =  models[4].predict(test_df[FEATS])
    
    # Blend with the per-row weights written by get_model_weight.
    predicts_final = ((predicts0 * test_df['weight0']) + (predicts1 * test_df['weight1']) + (predicts2 * test_df['weight2']) + (predicts3 * test_df['weight3']) + (predicts4 * test_df['weight4']))
    test_df[TARGET] = predicts_final 
    
    set_predict(test_df[['row_id', TARGET]])
    print('test_itr =',(datetime.datetime.now() - start))



test_itr = 0:00:00.213102




test_itr = 0:00:00.248566




test_itr = 0:00:00.248934
test_itr = 0:00:00.256500


