In [None]:
%load_ext Cython

In [None]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
import lightgbm as lgb
import pickle
import datetime
import collections
from sklearn.preprocessing import LabelEncoder
import random
import os

In [None]:
def seed_everything(seed: int):
    """Seed the global RNGs used by this notebook.

    Exports ``PYTHONHASHSEED`` into the process environment and seeds both
    Python's ``random`` module and NumPy's legacy global generator.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = f"{seed}"
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(707)

In [None]:
# Input locations: cross-validation split pickles and the question metadata CSV.
train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
question_file = '../input/riiid-test-answer-prediction/questions.csv'
# When True, later cells truncate train/valid to a small prefix for quick runs.
debug = False
# NOTE(review): `build` and `root` are not referenced in the cells visible here;
# presumably they are consumed further down the notebook -- confirm.
build = True
root = '../input/base-0794-20201224output/'

# read data
# Columns selected from the train/valid pickles everywhere they are loaded.
# (The name is presumably a typo for "field_needed"; it is kept because other cells use it.)
feld_needed = ['row_id','timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation','user_answer']

## feature engineering

In [None]:
# Load the question metadata and derive tag-based features.
questions_df = pd.read_csv(question_file)
# Shift `part` to be zero-based so it can be used directly as a small index.
questions_df['part'] = (questions_df['part'] - 1).astype('uint8')
questions_df['correct_answer'] = questions_df['correct_answer'].astype('uint8')
# Transform tags into lists of ints (a missing tag string becomes an empty list):
questions_df['tags'] = questions_df['tags'].apply(lambda ts: [int(x) for x in str(ts).split() if x != 'nan'])

# Single pass over (question_id, tags) pairs instead of DataFrame.iterrows():
# collects every tag occurrence (for ranking) and the set of questions per tag.
tag_rank = []
tag_columns = []
tag_to_questions = {}
for question_id, question_tags in zip(questions_df['question_id'], questions_df['tags']):
    for t in question_tags:
        tag_rank.append(t)
        tag_to_questions.setdefault(t, set()).add(question_id)
tags_df = pd.DataFrame([{'tag':t,'questions':qs} for t,qs in tag_to_questions.items()])

# Keep only the single most frequent tag; `tag_rank` becomes a 1-tuple.
tag_rank, counts = zip(*collections.Counter(tag_rank).most_common(1))
print(tag_rank)
for t in tag_rank:
    tag_column = 'tags_' + str(t)
    tag_columns.append(tag_column)
    # Vectorised 0/1 indicator: replaces the per-row `.iloc[i]` / `.at[i, ...]` loop,
    # which built a full row Series for every question.
    questions_df[tag_column] = questions_df['tags'].apply(lambda ts, t=t: t in ts).astype('uint8')

del questions_df['bundle_id']
print(tag_columns)

# Encode each distinct tag list (via its string form) as one integer id.
le = LabelEncoder()
encoded = le.fit_transform(questions_df['tags'].astype(str))
# NOTE(review): `decoded` is not used in the cells visible here; kept in case a later cell reads it.
decoded = le.inverse_transform(encoded)
questions_df['enc_tags'] = encoded.astype('uint16')
del questions_df['tags'], le

questions_df.head(3)

In [None]:
# Per-question mean correctness (integer percentage) over train + valid,
# written to content_df.csv; all intermediates are released afterwards.
train = pd.read_pickle(train_pickle)[feld_needed]
valid = pd.read_pickle(valid_pickle)[feld_needed]
tmp = pd.concat([
    frame[['content_type_id','content_id','answered_correctly']]
    for frame in (train, valid)
])
content_df = (
    tmp.loc[tmp['content_type_id']==0]      # question rows only
    .groupby('content_id')['answered_correctly']
    .mean()
    .mul(100)
    .astype(np.uint8)
    .to_frame('answered_correctly_avg_c')   # index is already named 'content_id'
)
content_df.to_csv('content_df.csv')
del content_df,tmp,train,valid
_=gc.collect()

In [None]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd
import gc

# NumPy dtype and matching C element type used for the int32 feature buffers in this cell.
DTYPE = np.int32
ctypedef np.int32_t np_int_t
#dict
# add_user_feats: walk `df` row by row and attach per-user history features.
#
# For every row the current values of the running-state dicts are copied into
# output buffers first, and only afterwards are the dicts updated with that
# row's outcome, so each row's features are built from earlier rows only.
# All state dicts are mutated in place, which lets a second call (e.g. on the
# validation frame) continue from the state left by the first call.
#
# Parameters (all state arguments are dict-like and are indexed without a
# membership check, except `answer_per_dict`):
#   df                       DataFrame holding the columns selected in the loop below
#                            plus 'part'; feature columns are added to it in place.
#   answer_per_dict          answer option -> {content_id -> value added to the answer-share sums}
#   *_u_dict                 user_id -> running value
#   parts_*_dict             part -> {user_id -> running value}
#   content_correct_*_dict   content_id -> running value
#   like_*/dislike_*_dict    answer option -> {user_id -> count}
# Returns: the same `df` with the new feature columns; +/-inf values replaced by NaN.
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def add_user_feats(df, 
                   answer_per_dict,
                   answered_correctly_sum_u_dict,
                   answered_correctly_cumsum_u_dict,
                   answered_incorrectly_cumsum_u_dict,
                   count_u_dict,
                   parts_u_dict,
                   parts_count_u_dict,
                   answered_diff_sum_u_dict,
                   avg_c_sum_u_dict,
                   parts_avg_c_dict,
                   user_answer_per_sum_dict,
                   parts_user_answer_per_sum_dict,
                   content_correct_user_mean_dict,
                   content_correct_count_dict,
                   parts_content_correct_user_mean_dict,
                   last_correct_timestamp_dict,
                   last_incorrect_timestamp_dict,
                   like_answer_dict,
                   like_answer_three_dict,
                   dislike_answer_dict,
                   dislike_answer_three_dict,
                   parts_answered_correctly_cumsum_u_dict,
                   parts_answered_incorrectly_cumsum_u_dict):
    
    cdef int arr_size = len(df)
    
    # One output buffer per feature, filled inside the row loop.
    cdef np.ndarray[np_int_t, ndim=1] acsu = np.zeros(arr_size, dtype=DTYPE)    # correct answers so far
    cdef np.ndarray[np_int_t, ndim=1] accu = np.zeros(arr_size, dtype=DTYPE)    # current correct streak
    cdef np.ndarray[np_int_t, ndim=1] aicu = np.zeros(arr_size, dtype=DTYPE)    # current incorrect streak
    cdef np.ndarray[np_int_t, ndim=1] cu = np.zeros(arr_size, dtype=DTYPE)      # answers so far
    cdef np.ndarray[np_int_t, ndim=2] ptu = np.zeros([arr_size,7], dtype=DTYPE)   # per-part correct sums
    cdef np.ndarray[np_int_t, ndim=2] ptcu = np.zeros([arr_size,7], dtype=DTYPE)  # per-part answer counts
    cdef np.ndarray[np_int_t, ndim=1] paccu = np.zeros(arr_size, dtype=DTYPE)#  correct streak within the row's part
    cdef np.ndarray[np_int_t, ndim=1] paicu = np.zeros(arr_size, dtype=DTYPE)#  incorrect streak within the row's part
    cdef np.ndarray[np_int_t, ndim=1] adsu = np.zeros(arr_size, dtype=DTYPE)    # sum of |avg_c - 100*answered_correctly|
    cdef np.ndarray[np_int_t, ndim=1] avcu = np.zeros(arr_size, dtype=DTYPE)    # sum of avg_c over answered questions
    cdef np.ndarray[np_int_t, ndim=1] pavc = np.zeros(arr_size, dtype=DTYPE)    # per-part avg_c ratio (see loop)
    cdef np.ndarray[float, ndim=1] uaps = np.zeros(arr_size,dtype=np.float32)   # sum of answer shares
    cdef np.ndarray[float, ndim=1] puaps = np.zeros(arr_size,dtype=np.float32)  # per-part mean of answer shares
    cdef np.ndarray[float, ndim=1] cucm = np.zeros(arr_size,dtype=np.float32)   # content level (overall)
    cdef np.ndarray[float, ndim=1] pcucm = np.zeros(arr_size,dtype=np.float32)  # content level (per part)
    cdef np.ndarray[long, ndim=1] lct = np.zeros(arr_size, dtype=long)          # time since last correct answer
    cdef np.ndarray[long, ndim=1] lit = np.zeros(arr_size, dtype=long)          # time since last incorrect answer
    cdef np.ndarray[float, ndim=1] like = np.zeros(arr_size, dtype=np.float32)
    cdef np.ndarray[float, ndim=1] dislike = np.zeros(arr_size, dtype=np.float32)
    cdef int cnt, i, j
    # NOTE(review): the typed buffer requires the selected columns to upcast to a single
    # C-long-compatible integer dtype; a float column (e.g. NaN in answered_correctly_avg_c)
    # would presumably fail the buffer assignment -- confirm against the caller.
    cdef np.ndarray[long, ndim=1] row
    #                                0             1              2                3                     4           5            6                7                     
    for cnt,row in enumerate(df[['user_id','answered_correctly','part','answered_correctly_avg_c','content_id','user_answer','timestamp','correct_answer']].values):
        # --- read phase: snapshot the user's state before this row is counted ---
        acsu[cnt] = answered_correctly_sum_u_dict[row[0]]
        accu[cnt] = answered_correctly_cumsum_u_dict[row[0]]
        aicu[cnt] = answered_incorrectly_cumsum_u_dict[row[0]]
        paccu[cnt] = parts_answered_correctly_cumsum_u_dict[row[2]][row[0]]
        paicu[cnt] = parts_answered_incorrectly_cumsum_u_dict[row[2]][row[0]]
        cu[cnt] = count_u_dict[row[0]]
        adsu[cnt] = answered_diff_sum_u_dict[row[0]]
        avcu[cnt] = avg_c_sum_u_dict[row[0]]
        uaps[cnt] = user_answer_per_sum_dict[row[0]]
        lct[cnt] = row[6] - last_correct_timestamp_dict[row[0]]
        lit[cnt] = row[6] - last_incorrect_timestamp_dict[row[0]]

        # like / dislike: counts keyed by this question's correct answer (row[7]),
        # divided by the user's number of incorrect answers so far (cu - acsu).
        # NaN when timestamp == 0 or when the user has no incorrect answers yet.
        # Part 1 uses the separate "three" dicts, which are only updated for options 0, 1 and 3.
        if row[2] == 1:
            if (row[6] == 0) or (dict_sub(answered_correctly_sum_u_dict[row[0]] ,count_u_dict[row[0]])==0):
                like[cnt] = np.nan
                dislike[cnt] = np.nan
            else:
                like[cnt] = like_answer_three_dict[row[7]][row[0]] / dict_sub(cu[cnt] ,acsu[cnt])
                dislike[cnt] = dislike_answer_three_dict[row[7]][row[0]] / dict_sub(cu[cnt] ,acsu[cnt])
        else:
            if (row[6] == 0) or (dict_sub(answered_correctly_sum_u_dict[row[0]] , count_u_dict[row[0]])==0):
                like[cnt] = np.nan
                dislike[cnt] = np.nan
            else:
                like[cnt] = like_answer_dict[row[7]][row[0]] / dict_sub(cu[cnt] , acsu[cnt])
                dislike[cnt] = dislike_answer_dict[row[7]][row[0]] / dict_sub(cu[cnt] , acsu[cnt])
        # Content level: mean accuracy of the users who answered this content correctly so far.
        # Left at 0 when nobody has answered it correctly yet (replaced by 0.5 further below).
        if content_correct_count_dict[row[4]] > 0:
            cucm[cnt] = content_correct_user_mean_dict[row[4]] / content_correct_count_dict[row[4]]
            pcucm[cnt] = parts_content_correct_user_mean_dict[row[4]] / content_correct_count_dict[row[4]]
        for i in range(7):
            ptu[cnt,i] = parts_u_dict[i][row[0]]
            ptcu[cnt,i] = parts_count_u_dict[i][row[0]]
            if i == row[2]:
                # NOTE(review): the previous avg_c sum is divided by (previous count + 1), and the
                # quotient is stored in an int32 buffer; both look unintended -- confirm.
                pavc[cnt] = parts_avg_c_dict[i][row[0]] / dict_sum(parts_count_u_dict[i][row[0]] , 1)
                parts_avg_c_dict[i][row[0]] = dict_sum(parts_avg_c_dict[row[2]][row[0]],row[3])
                
        if ptcu[cnt,row[2]] != 0:
            puaps[cnt] = parts_user_answer_per_sum_dict[row[2]][row[0]] / ptcu[cnt,row[2]]
        
        # --- update phase: fold this row's outcome into the running state ---
        if row[1] == 1:
            answered_correctly_sum_u_dict[row[0]] = dict_sum(answered_correctly_sum_u_dict[row[0]],1)
            answered_correctly_cumsum_u_dict[row[0]] = dict_sum(answered_correctly_cumsum_u_dict[row[0]],1)
            answered_incorrectly_cumsum_u_dict[row[0]] = 0
            last_correct_timestamp_dict[row[0]] = row[6]
            parts_answered_correctly_cumsum_u_dict[row[2]][row[0]] = dict_sum(parts_answered_correctly_cumsum_u_dict[row[2]][row[0]],1)
            parts_answered_incorrectly_cumsum_u_dict[row[2]][row[0]] = 0
        else:
            answered_correctly_cumsum_u_dict[row[0]] = 0
            answered_incorrectly_cumsum_u_dict[row[0]] = dict_sum(answered_incorrectly_cumsum_u_dict[row[0]],1)
            last_incorrect_timestamp_dict[row[0]] = row[6]
            parts_answered_correctly_cumsum_u_dict[row[2]][row[0]] = 0
            parts_answered_incorrectly_cumsum_u_dict[row[2]][row[0]] = dict_sum(parts_answered_incorrectly_cumsum_u_dict[row[2]][row[0]],1) 
            # On an incorrect answer: the chosen option (row[5]) is counted as "liked",
            # every other option in the list as "disliked".
            if row[2] == 1:
                #answer0,1,3
                like_answer_three_dict[row[5]][row[0]] = dict_sum(like_answer_three_dict[row[5]][row[0]],1)
                for j in [0,1,3]:
                    if row[5] != j:
                        dislike_answer_three_dict[j][row[0]] = dict_sum(dislike_answer_three_dict[j][row[0]],1)
            else:
                like_answer_dict[row[5]][row[0]] = dict_sum(like_answer_dict[row[5]][row[0]],1)
                for j in [0,1,2,3]:
                    if row[5] != j:
                        dislike_answer_dict[j][row[0]] = dict_sum(dislike_answer_dict[j][row[0]],1)
            
            
            
        answered_diff_sum_u_dict[row[0]] = dict_sum(answered_diff_sum_u_dict[row[0]],abs(row[3] - (row[1] * 100)))
        count_u_dict[row[0]] = dict_sum(count_u_dict[row[0]],1)
        avg_c_sum_u_dict[row[0]] = dict_sum(avg_c_sum_u_dict[row[0]],row[3])
        
        # Add the share looked up for (chosen answer, content); 0.33 is the fallback
        # when the content has no entry for that answer.
        if row[4] in answer_per_dict[row[5]]:
            user_answer_per_sum_dict[row[0]] = user_answer_per_sum_dict[row[0]] + answer_per_dict[row[5]][row[4]]
            parts_user_answer_per_sum_dict[row[2]][row[0]] = parts_user_answer_per_sum_dict[row[2]][row[0]] + answer_per_dict[row[5]][row[4]]
        else:
            user_answer_per_sum_dict[row[0]] = user_answer_per_sum_dict[row[0]] + 0.33
            parts_user_answer_per_sum_dict[row[2]][row[0]] = parts_user_answer_per_sum_dict[row[2]][row[0]] + 0.33
       
        parts_u_dict[row[2]][row[0]] = dict_sum(parts_u_dict[row[2]][row[0]],row[1])
        parts_count_u_dict[row[2]][row[0]] = dict_sum(parts_count_u_dict[row[2]][row[0]],1)
        
        # Runs after the counters above were updated, so the accuracies added here
        # already include the current row.
        if row[1] == 1:
            content_correct_count_dict[row[4]] =  dict_sum(content_correct_count_dict[row[4]],1)
            content_correct_user_mean_dict[row[4]] = content_correct_user_mean_dict[row[4]] + (answered_correctly_sum_u_dict[row[0]] / count_u_dict[row[0]])
            parts_content_correct_user_mean_dict[row[4]] = parts_content_correct_user_mean_dict[row[4]] + (parts_u_dict[row[2]][row[0]] / parts_count_u_dict[row[2]][row[0]])
 
    # --- attach the buffers to df and downcast ---
    df['answered_correctly_sum_u'] = acsu
    df['answered_correctly_sum_u'] = df['answered_correctly_sum_u'].astype('uint16')
    # Signed streak: correct streak minus incorrect streak (one of the two is always 0).
    # NOTE(review): the int8 cast would presumably wrap for streaks outside [-128, 127] -- confirm.
    df['answered_cumsum_u'] = accu - aicu
    df['answered_cumsum_u'] = df['answered_cumsum_u'].astype('int8')
    df['part_answered_cumsum_u'] = paccu - paicu
    df['part_answered_cumsum_u'] = df['part_answered_cumsum_u'].astype('int8')    
    df['count_u'] = cu
    df['count_u'] = df['count_u'].astype('uint16')
    df['answered_correctly_avg_u'] = df['answered_correctly_sum_u'] / df['count_u']
    df['answered_correctly_avg_u'] = df['answered_correctly_avg_u'].astype('float16')
    df['answered_diff_mean'] = adsu  / cu
    df['answered_diff_mean'] = df['answered_diff_mean'].astype('float16')
    df['avg_c_mean'] = avcu / cu
    df['avg_c_mean'] = df['avg_c_mean'].astype('float16')
    df['part_avg_c_mean'] = pavc
    df['part_avg_c_mean'] = df['part_avg_c_mean'].astype('uint8')
    
    df['avg_c_per_u'] = df['avg_c_mean'] / (df['answered_correctly_avg_u'] * 100)
    df['avg_c_per_u'] = df['avg_c_per_u'].astype('float16')
    
    df['user_answer_per_mean'] = uaps
    df['user_answer_per_mean'] = df['user_answer_per_mean']  / df['count_u']
    df['user_answer_per_mean'] = df['user_answer_per_mean'].astype('float16')
    
    df['part_user_answer_per_mean']= puaps
    df['part_user_answer_per_mean'] = df['part_user_answer_per_mean'].astype('float16')
    
    df['content_lv'] = cucm
    df['content_lv'] = df['content_lv'].astype('float16')
    df['part_content_lv'] = pcucm
    df['part_content_lv'] = df['part_content_lv'].astype('float16')
    # Exact zeros (including rows where no value was computed) are replaced by 0.5.
    df.loc[df['content_lv']==0,'content_lv']=0.5
    df.loc[df['part_content_lv']==0,'part_content_lv']=0.5

    df['last_correct_timelag'] = lct
    df['last_correct_timelag'] = df['last_correct_timelag'].astype('uint32')
    df['last_incorrect_timelag'] = lit
    df['last_incorrect_timelag'] = df['last_incorrect_timelag'].astype('uint32')
    
    df['is_like_answer'] = like
    df['is_like_answer'] = df['is_like_answer'].astype('float16')
    df['is_dislike_answer'] = dislike
    df['is_dislike_answer'] = df['is_dislike_answer'].astype('float16')
    df['part_count_per'] = 0
    df['lr_count_per'] = 0
    cdef str pnum
    for i in range(7):
        pnum = str(i)
        df['p' + pnum + '_count_u'] = ptcu[:,i]
        # NOTE(review): the next line assigns the column to itself and has no effect.
        df['p' + pnum + '_count_u'] = df['p' + pnum + '_count_u']
        df['p' + pnum + '_count_u'] = df['p' + pnum + '_count_u'].astype('uint32')
        # Per-part accuracy weighted by the share of the user's answers that fall in that part.
        df['p' + pnum + '_mean_u'] = ptu[:,i] / ptcu[:,i]
        df['p' + pnum + '_mean_u']  = df['p' + pnum + '_mean_u'] * (df['p' + pnum + '_count_u'] / df['count_u'])
        df['p' + pnum + '_mean_u'] = df['p' + pnum + '_mean_u'].astype('float16')
        df.loc[df['part']==i,'part_count_per'] = df['p' + pnum + '_count_u'] / df['count_u']
    df['part_count_per'] = df['part_count_per'].astype('float16')    
    # Share of the user's answers in parts 0-3 (for rows with part < 4) or parts 4-6 (otherwise).
    df.loc[df['part']<4,'lr_count_per'] = ((df['p0_count_u'] + df['p1_count_u'] + df['p2_count_u'] + df['p3_count_u']) / df['count_u'])
    df.loc[df['part']>3,'lr_count_per'] = ((df['p4_count_u'] + df['p5_count_u'] + df['p6_count_u']) / df['count_u']).astype('float16')
    df['lr_count_per'] = df['lr_count_per'].astype('float16')
    # Divisions by a zero count above produce inf; normalise them to NaN.
    df.replace([np.inf, -np.inf], np.nan,inplace=True)
    return df

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int dict_sum(int a, int b):
    # Add two values as C ints; arguments are converted to C int at the call.
    cdef int total = a + b
    return total
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int dict_sub(int a, int b):
    # Subtract b from a as C ints; arguments are converted to C int at the call.
    cdef int difference = a - b
    return difference

In [None]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def add_time_feats(df,time_u_dict,lect_u_dict):
    """Add lag-time and lecture-count features, updating the per-user state in place.

    Rows are processed in frame order. For each row the lag is the difference
    between its timestamp and the timestamp last stored for that user in
    ``time_u_dict``. When the difference is not positive, the lag of the
    previous row in the frame is reused (0 for a zero timestamp or for the
    first row of the frame).

    Args:
        df: DataFrame with 'user_id', 'timestamp' and 'content_type_id' columns whose
            values form a C-long integer array. The day split of 60*60*24 implies
            timestamps in seconds.
        time_u_dict: user_id -> last seen timestamp (mutated).
        lect_u_dict: user_id -> number of rows with content_type_id == 1 seen so far (mutated).

    Returns:
        ``df`` with three added columns:
        - 'lag_time' (uint16): lag within the day, saturated at the uint16 maximum,
          and set to that maximum whenever the lag is one day or more.
        - 'lag_day' (uint16): whole days of lag.
        - 'lecture_count' (uint8): lecture rows seen before this row, capped at 255.
    """
    cdef int arr_size = len(df)
    cdef int cnt
    cdef np.ndarray[long, ndim=1] row
    cdef np.ndarray[long, ndim=1] tu = np.zeros(arr_size,dtype=long)
    cdef np.ndarray[long, ndim=1] lc = np.zeros(arr_size,dtype=long)
    for cnt,row in enumerate(df[['user_id','timestamp','content_type_id']].values):
        if (row[1] - time_u_dict[row[0]]>0):
            tu[cnt] = dict_sub(row[1],time_u_dict[row[0]])
        elif (row[1] == 0) or (cnt == 0):
            # cnt == 0 guard: with wraparound/boundscheck disabled, tu[cnt - 1] would read
            # outside the buffer on the first row (reachable when the frame starts with a
            # row whose timestamp equals the user's stored timestamp).
            tu[cnt] = 0
        else:
            # Same timestamp as the user's previous row: reuse the previous row's lag.
            tu[cnt] = tu[cnt - 1]
        lc[cnt] = lect_u_dict[row[0]]
        
        time_u_dict[row[0]] = row[1]
        if (row[2] == 1):
            lect_u_dict[row[0]] = lect_u_dict[row[0]] + 1
    
    cdef int split = 60*60*24
    cdef int lag_cap = np.iinfo(np.uint16).max
    cdef np.ndarray[long, ndim=1] tu_day = tu // split
    # tu % split can reach 86399, which does not fit in uint16; saturate instead of
    # letting the cast wrap large same-day lags around to small values.
    cdef np.ndarray[long, ndim=1] tu_time = np.minimum(tu % split, lag_cap)

    df['lag_time'] = tu_time
    df['lag_time'] = df['lag_time'].astype('uint16')
    df['lag_day'] = tu_day
    df['lag_day'] = df['lag_day'].astype('uint16')
    df.loc[df['lag_day']>0,'lag_time'] = np.iinfo(np.uint16).max
    df['lecture_count'] = lc
    df.loc[df['lecture_count']>np.iinfo(np.uint8).max,'lecture_count'] = np.iinfo(np.uint8).max
    df['lecture_count'] = df['lecture_count'].astype('uint8')
    return df

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def update_time_feats(df,time_u_dict,lect_u_dict):
    """Advance the per-user time/lecture state with the rows of ``df``; adds no columns.

    For each row, in frame order, stores the row's timestamp as the user's last
    seen timestamp and increments the user's lecture counter when
    content_type_id == 1. Both dicts are mutated in place; nothing is returned.
    """
    cdef int arr_size = len(df)
    cdef int cnt
    cdef np.ndarray[long, ndim=1] rec
    records = df[['user_id','timestamp','content_type_id']].values
    for cnt in range(arr_size):
        rec = records[cnt]
        user = rec[0]
        time_u_dict[user] = rec[1]
        if rec[2] == 1:
            lect_u_dict[user] = lect_u_dict[user] + 1
            
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int dict_sub(long a, int b):
    # Subtract b from a; the difference is computed as a C long and narrowed to C int.
    cdef int difference = <int>(a - b)
    return difference

In [None]:
# Index the question metadata by question id, exposed under the name 'content_id'
# so it can be looked up with the interaction frames' content_id values.
questions_df = questions_df.set_index('question_id').rename_axis('content_id')

In [None]:
# Build answer_per_dict: for each answer option i in 0..3, a dict mapping
# content_id -> fraction of that question's training responses that chose option i.
# The pickle is read once; the original read it a second time for the same rows.
train = pd.read_pickle(train_pickle)[feld_needed]
train = train.loc[train['content_type_id']==0][['content_id','user_answer','content_type_id']]
# Responses per question.
tmp = train.groupby('content_id')[['user_answer']].count()
tmp = tmp.rename(columns={'user_answer':'count'})
# Responses per (question, chosen answer); after count() the remaining column
# 'content_type_id' holds the group size.
tmp2 = train.groupby(['content_id','user_answer']).count().reset_index()
tmp2 = tmp2.merge(tmp,left_on='content_id',right_index=True,how='left')
# Assign the filled result back: a chained `tmp2[col].fillna(..., inplace=True)`
# does not update tmp2 when pandas copy-on-write is active.
tmp2['answer_per'] = (tmp2['content_type_id'] / tmp2['count']).fillna(0.3)
tmp2 = tmp2[['content_id','user_answer','answer_per']]
answer_per_dict = {}
for i in range(4):
    answer_per_dict[i] = tmp2.loc[tmp2['user_answer']==i].set_index('content_id')['answer_per'].to_dict()
del train, tmp2, tmp

In [None]:
train = pd.read_pickle(train_pickle)[feld_needed]
valid = pd.read_pickle(valid_pickle)[feld_needed]
# Mean correctness per question as an integer percentage, from train only.
# The same content_id exists under different content_type_id values, so the
# frame is restricted to question rows (content_type_id == 0) first.
content_df = (
    train.loc[train['content_type_id']==0]
    .groupby('content_id')['answered_correctly']
    .mean()
    .mul(100)
    .astype(np.uint8)
    .to_frame('answered_correctly_avg_c')   # index is already named 'content_id'
)

if not debug:
    # Sample users from the most recent half of the rows (by row_id), i.e. users
    # that are currently active, then keep the full history of the sampled users.
    print('all =',train['row_id'].min(),train['row_id'].max())
    train = train.sort_values('row_id').iloc[len(train) // 2:]
    print('current =',train['row_id'].min(),train['row_id'].max())
    active_users = train['user_id'].unique()
    # NOTE(review): replace=True draws with replacement, so the number of distinct
    # sampled users is below 80% of the active users -- confirm this is intended.
    users = np.random.choice(active_users, int(len(active_users) * 8 / 10), replace=True)

    train = pd.read_pickle(train_pickle)[feld_needed]
    train = train.loc[train['user_id'].isin(users)]
else:
    train = train[:1000000]
    valid = valid[:10000]

print(train.shape)

In [None]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def data_format(df,questions_df,content_df,prior_question_elapsed_time_mean):
    """Downcast the interaction columns and join question/content metadata.

    ``df`` is modified in place (dtype casts, rows with content_type_id != 0 get
    content_id 532, missing values filled, the two time columns divided by
    1000). A new frame is returned with the columns of ``questions_df`` and
    ``content_df`` appended, both looked up by ``content_id``.

    Args:
        df: interaction rows.
        questions_df: question metadata indexed by content id.
        content_df: per-content statistics indexed by content id.
        prior_question_elapsed_time_mean: fill value for missing
            'prior_question_elapsed_time'.

    Returns:
        DataFrame with a fresh 0..n-1 index: df columns, then questions_df
        columns, then content_df columns.
    """
    for col, dtype in (('row_id','uint32'), ('user_id','int32'), ('content_type_id','uint8')):
        df[col] = df[col].astype(dtype)
    not_a_question = df['content_type_id'] != 0
    df.loc[not_a_question,'content_id'] = 532 # provisional placeholder id
    df['content_id'] = df['content_id'].astype('uint16')
    # changing dtype to avoid lightgbm error
    df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(False).astype('uint8')
    filled_elapsed = df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
    df['prior_question_elapsed_time'] = (filled_elapsed / 1000).astype('uint16')
    df['timestamp'] = (df['timestamp'] / 1000).astype(np.uint32)
    # merge: align both lookup tables to the rows of df by content_id
    lookup_ids = df['content_id'].values
    pieces = [df.reset_index(drop=True)]
    for table in (questions_df, content_df):
        pieces.append(table.reindex(lookup_ids).reset_index(drop=True))
    return pd.concat(pieces, axis=1)

In [None]:
# Fill value for missing prior_question_elapsed_time: the mean of the observed values.
# Note that `train.prior_question_elapsed_time.mean()` does not work!
# Please refer to https://www.kaggle.com/its7171/can-we-trust-pandas-mean for details.
prior_question_elapsed_time_mean = train['prior_question_elapsed_time'].dropna().to_numpy().mean()

train = data_format(train,questions_df,content_df,prior_question_elapsed_time_mean)
valid = data_format(valid,questions_df,content_df,prior_question_elapsed_time_mean)

# memory compaction: negative answered_correctly values are set to 0 so both
# label-like columns fit into uint8.
for frame in (train, valid):
    frame.loc[frame['answered_correctly'] < 0,'answered_correctly'] = 0
    for col in ('answered_correctly', 'user_answer'):
        frame[col] = frame[col].astype('uint8')
# Drop the loop names so no extra reference to the frames is kept alive.
del frame, col

In [None]:
# Elapsed-time features (lecture rows are taken into account as well).
# Reference timings from an earlier run:
#train add_time_feats = 0:00:40.409987
#valid add_time_feats = 0:00:03.369860
# The two state dicts are shared, so valid continues from the state left by train.
time_u_dict, lect_u_dict = defaultdict(int), defaultdict(int)

start = datetime.datetime.now()
train = add_time_feats(train,time_u_dict,lect_u_dict)
print('train add_time_feats =',datetime.datetime.now() - start)

start = datetime.datetime.now()
valid = add_time_feats(valid,time_u_dict,lect_u_dict)
print('valid add_time_feats =',datetime.datetime.now() - start)

In [None]:
# Keep question rows only (content_type_id == 0) and renumber the index.
train = train[train['content_type_id'] == 0].reset_index(drop=True)
valid = valid[valid['content_type_id'] == 0].reset_index(drop=True)

In [None]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

# NumPy dtype and matching C element type used for the int32 feature buffers in this cell.
DTYPE = np.int32
ctypedef np.int32_t np_int_t

# add_prior_feats: attach features describing each user's previous interactions.
#
# Rows are processed in frame order. Per-user state (previous / second-previous
# content id, previous elapsed time, previous lags, running sums) is read from
# the state dicts, written to output buffers, and then updated with the current
# row. All state dicts except q_stats_dict and q_enc_tag_dict are mutated in
# place, so a later call continues from the state left by an earlier one.
#
# Parameters:
#   df                 DataFrame with the columns selected in the loop below.
#   q_stats_dict       content_id -> reference elapsed time used as the divisor (must be a dict).
#   q_enc_tag_dict     content_id -> encoded tag id (must be a dict; indexed without a
#                      membership check, so a missing content id raises KeyError).
#   remaining args     user_id -> running value; part_lag_sum_dict is part -> {user_id -> value}.
# Returns: `df` with the added feature columns.
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def add_prior_feats(df, dict q_stats_dict,dict q_enc_tag_dict,
                    prior_content_dict, prior_prior_content_dict, prior_time_dict, 
                    prior_time_per_sum_dict,
                    prior_lag_dict, prior_prior_lag_dict,
                    lag_sum_dict, prior_avg_c_dict, prior_prior_avg_c_dict,
                    part_lag_sum_dict, prior_part_dict):
    cdef int arr_size = len(df)
    cdef int cnt
    # NOTE(review): the typed buffer requires the selected columns to upcast to a single
    # C-int-compatible dtype -- confirm against the dtypes produced by the earlier cells.
    cdef np.ndarray[int, ndim=1] row
    cdef np.ndarray[np_int_t, ndim=1] pc = np.zeros(arr_size,dtype=DTYPE)     # previous content id
    cdef np.ndarray[np_int_t, ndim=1] ppc = np.zeros(arr_size,dtype=DTYPE)    # second-previous content id
    cdef np.ndarray[np_int_t, ndim=1] eqtag = np.zeros(arr_size,dtype=DTYPE)  # 1 if tags equal the previous content's
    cdef np.ndarray[float, ndim=1] tp = np.zeros(arr_size,dtype=np.float32)   # elapsed time / reference time of previous content
    cdef np.ndarray[np_int_t, ndim=1] pt = np.zeros(arr_size,dtype=DTYPE)     # same ratio, one step further back
    cdef np.ndarray[np_int_t, ndim=1] ul = np.zeros(arr_size,dtype=DTYPE)     # previous lag_time
    cdef np.ndarray[np_int_t, ndim=1] ull = np.zeros(arr_size,dtype=DTYPE)    # second-previous lag_time
    #cdef np.ndarray[np_int_t, ndim=1] pe = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] ls = np.zeros(arr_size,dtype=DTYPE)     # sum of earlier lag_time values
    cdef np.ndarray[np_int_t, ndim=1] pac = np.zeros(arr_size,dtype=DTYPE)    # previous row's answered_correctly_avg_c
    #cdef np.ndarray[np_int_t, ndim=1] ppac = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[float, ndim=1] ptps = np.zeros(arr_size,dtype=np.float32) # running sum of tp
    cdef np.ndarray[np_int_t, ndim=1] pls = np.zeros(arr_size,dtype=DTYPE)    # per-part running sum of tp
    
    # Column positions in `row`: 0 user_id, 1 content_id, 2 prior_question_elapsed_time,
    # 3 lag_time, 4 prior_question_had_explanation (not read below),
    # 5 answered_correctly_avg_c, 6 part.
    for cnt,row in enumerate(df[['user_id','content_id','prior_question_elapsed_time','lag_time' ,'prior_question_had_explanation','answered_correctly_avg_c','part']].values):
        ppc[cnt] = prior_prior_content_dict[row[0]]
        pc[cnt] = prior_content_dict[row[0]]
        # NOTE(review): pt is an int32 buffer, so this ratio is presumably truncated to an
        # integer before the later float16 cast -- confirm whether that is intended.
        if prior_prior_content_dict[row[0]] in q_stats_dict:
            pt[cnt] = prior_time_dict[row[0]] / q_stats_dict[prior_prior_content_dict[row[0]]]
        else:
            pt[cnt] = 1
        ls[cnt] = lag_sum_dict[row[0]]
        pac[cnt] = prior_avg_c_dict[row[0]]
        # NOTE(review): pls is an int32 buffer fed from a sum of float ratios -- same
        # truncation concern as pt.
        pls[cnt] = part_lag_sum_dict[row[6]][row[0]]
        if (q_enc_tag_dict[pc[cnt]] == q_enc_tag_dict[row[1]]):
            eqtag[cnt] = 1
        else:
            eqtag[cnt] = 0

        # A stored previous content id of 0 is treated as "no previous content".
        if (prior_content_dict[row[0]] > 0) & (prior_content_dict[row[0]] in q_stats_dict):
            tp[cnt] = row[2] / q_stats_dict[prior_content_dict[row[0]]]
            prior_time_per_sum_dict[row[0]] = prior_time_per_sum_dict[row[0]] + tp[cnt]
        else:
            tp[cnt] = 1
        # Read after the update above, so the sum already includes this row's tp.
        ptps[cnt] = prior_time_per_sum_dict[row[0]]
        
        if prior_content_dict[row[0]] > 0:
            # NOTE(review): prior_part_dict is overwritten with the CURRENT part before it is
            # used as the key, so tp is accumulated under the current row's part rather than
            # the previous row's -- confirm which was intended.
            prior_part_dict[row[0]] = row[6]
            part_lag_sum_dict[prior_part_dict[row[0]]][row[0]] = part_lag_sum_dict[prior_part_dict[row[0]]][row[0]] + tp[cnt]
        prior_prior_content_dict[row[0]] = prior_content_dict[row[0]]
        prior_content_dict[row[0]] = row[1]
        prior_time_dict[row[0]] = row[2] # elapsed (answer) time reported for the previous content
        ul[cnt] = prior_lag_dict[row[0]]
        ull[cnt] = prior_prior_lag_dict[row[0]]
        prior_prior_lag_dict[row[0]] = prior_lag_dict[row[0]]
        prior_lag_dict[row[0]] = row[3]
        lag_sum_dict[row[0]] = lag_sum_dict[row[0]] + row[3]
        # prior_prior_avg_c_dict is maintained here but not read anywhere in this function.
        prior_prior_avg_c_dict[row[0]] = prior_avg_c_dict[row[0]]
        prior_avg_c_dict[row[0]] = row[5]
            
    df['prior_content_id'] = pc
    df['prior_content_id'] = df['prior_content_id'].astype('uint16')
    df['prior_content_diff'] = df['content_id'] - df['prior_content_id'].astype('int16')
    # is_same_tags: 0/1 from the tag comparison, plus 2 when the content id itself repeats.
    df['is_same_tags'] = eqtag
    df.loc[df['prior_content_id'] == df['content_id'],'is_same_tags'] = df['is_same_tags'] + 2
    df['is_same_tags'] = df['is_same_tags'].astype('uint8')
    df['lag_time_per'] = df['lag_time'] / df['prior_question_elapsed_time'].astype('float32')
    df['elapsed_lag_per'] = tp
    df['elapsed_lag_per'] = df['elapsed_lag_per'].astype('float16')
    # Despite the "_mean" suffix, the buffers below hold running sums; no division happens here.
    df['elapsed_time_per_mean'] = ptps
    df['elapsed_time_per_mean'] = df['elapsed_time_per_mean'].astype('float16')
    
    df['part_elapsed_time_per_mean'] = pls
    df['part_elapsed_time_per_mean'] = df['part_elapsed_time_per_mean'].astype('float16')
    
    df['prior_prior_question_elapsed_time_per'] = pt
    df['prior_prior_question_elapsed_time_per'] = df['prior_prior_question_elapsed_time_per'].astype('float16')
    #df['prior_prior_question_had_explanation'] = pe
    #df['prior_prior_question_had_explanation'] = df['prior_prior_question_had_explanation'].astype('uint8')
    df['prior_prior_lag_time'] = ull
    df['prior_prior_lag_time'] = df['prior_prior_lag_time'].astype('uint16')
    df['prior_lag_time'] = ul
    df['prior_lag_time'] = df['prior_lag_time'].astype('uint16')
    df['lag_lag_time'] = df['lag_time'] / df['prior_lag_time']
    df['lag_lag_time'] = df['lag_lag_time'].astype('float16')
    df['lag_sum'] = ls
    df['prior_avg_c'] = pac
    df['prior_avg_c'] = df['prior_avg_c'].astype('uint8')
    #df['prior_prior_avg_c'] = ppac
    #df['prior_prior_avg_c'] = df['prior_prior_avg_c'].astype('uint8')
    return df

In [None]:
# Alternative source (all answers) kept for reference:
#q_stats_dict = pd.read_csv('../input/riiiddataset/question_stats.csv').set_index('content_id')[['q_elapsed_time_mean']].to_dict()['q_elapsed_time_mean']
# In use: per-content mean elapsed time computed from correct answers only.
q_stats_dict = (
    pd.read_csv('../input/riiiddataset/correct_q_elapsed_time_mean.csv')
    .set_index('content_id')['correct_q_elapsed_time_mean']
    .to_dict()
)
# content_id -> encoded tag id / correct answer, taken from the question metadata.
q_enc_tag_dict = questions_df['enc_tags'].to_dict()
q_ans_dict = questions_df['correct_answer'].to_dict()

In [None]:
#train add_prior_feats = 0:02:22.203537
#valid add_prior_feats = 0:00:12.576596

# Per-user running state for add_prior_feats. The same objects are passed to the
# train and valid calls, so valid continues from the state left by train.
prior_content_dict = defaultdict(int)
prior_prior_content_dict = defaultdict(int)
prior_time_dict = defaultdict(int)
prior_time_per_sum_dict = defaultdict(int)
prior_lag_dict = defaultdict(int)
prior_prior_lag_dict = defaultdict(int)
lag_sum_dict = defaultdict(int)
prior_avg_c_dict = defaultdict(int)
prior_prior_avg_c_dict = defaultdict(int)
part_lag_sum_dict = {}
for p in range(7):
    part_lag_sum_dict[p] = defaultdict(int)
prior_part_dict = defaultdict(int)

# Positional arguments after the frame, in the order of add_prior_feats' signature.
prior_feats_args = (
    q_stats_dict,
    q_enc_tag_dict,
    prior_content_dict,
    prior_prior_content_dict,
    prior_time_dict,
    prior_time_per_sum_dict,
    prior_lag_dict,
    prior_prior_lag_dict,
    lag_sum_dict,
    prior_avg_c_dict,
    prior_prior_avg_c_dict,
    part_lag_sum_dict,
    prior_part_dict,
)

start = datetime.datetime.now()
train = add_prior_feats(train, *prior_feats_args)
print('train add_prior_feats =',datetime.datetime.now() - start)

start = datetime.datetime.now()
valid = add_prior_feats(valid, *prior_feats_args)
print('valid add_prior_feats =',datetime.datetime.now() - start)

In [None]:
# Reference timings from an earlier run:
#train add_user_feats = 0:05:06.433533
#valid add_user_feats = 0:00:27.512900

# --- per-user running state ---
answered_correctly_sum_u_dict = defaultdict(int)        # number of correct answers
answered_correctly_cumsum_u_dict = defaultdict(int)     # current streak of correct answers
answered_incorrectly_cumsum_u_dict = defaultdict(int)   # current streak of incorrect answers
count_u_dict = defaultdict(int)                         # number of answers

# --- per-part, per-user running state (part -> user -> value) ---
parts_count_u_dict = {}
parts_u_dict = {}
parts_avg_c_dict = {}
parts_user_answer_per_sum_dict = {}
parts_answered_correctly_cumsum_u_dict = {}
parts_answered_incorrectly_cumsum_u_dict = {}
for p in range(7):
    for per_part_state in (parts_u_dict,
                           parts_count_u_dict,
                           parts_avg_c_dict,
                           parts_user_answer_per_sum_dict,
                           parts_answered_correctly_cumsum_u_dict,
                           parts_answered_incorrectly_cumsum_u_dict):
        per_part_state[p] = defaultdict(int)

# Difference between the outcome and the question's expected correctness, plus related sums.
answered_diff_sum_u_dict = defaultdict(int)
avg_c_sum_u_dict = defaultdict(int)
user_answer_per_sum_dict = defaultdict(int)

# --- per-content running state ---
content_correct_user_mean_dict = defaultdict(int)
content_correct_count_dict = defaultdict(int)
parts_content_correct_user_mean_dict = defaultdict(int)

last_correct_timestamp_dict = defaultdict(int)
last_incorrect_timestamp_dict = defaultdict(int)

# --- answer option -> user -> count ---
like_answer_dict = {}
like_answer_three_dict = {}
dislike_answer_dict = {}
dislike_answer_three_dict = {}
for p in range(4):
    for per_answer_state in (like_answer_dict,
                             like_answer_three_dict,
                             dislike_answer_dict,
                             dislike_answer_three_dict):
        per_answer_state[p] = defaultdict(int)

# Positional arguments after the frame, in the order of add_user_feats' signature.
user_feats_args = (
    answer_per_dict,
    answered_correctly_sum_u_dict,
    answered_correctly_cumsum_u_dict,
    answered_incorrectly_cumsum_u_dict,
    count_u_dict,
    parts_u_dict,
    parts_count_u_dict,
    answered_diff_sum_u_dict,
    avg_c_sum_u_dict,
    parts_avg_c_dict,
    user_answer_per_sum_dict,
    parts_user_answer_per_sum_dict,
    content_correct_user_mean_dict,
    content_correct_count_dict,
    parts_content_correct_user_mean_dict,
    last_correct_timestamp_dict,
    last_incorrect_timestamp_dict,
    like_answer_dict,
    like_answer_three_dict,
    dislike_answer_dict,
    dislike_answer_three_dict,
    parts_answered_correctly_cumsum_u_dict,
    parts_answered_incorrectly_cumsum_u_dict,
)

start = datetime.datetime.now()
train = add_user_feats(train, *user_feats_args)

print('train add_user_feats =',datetime.datetime.now() - start)

start = datetime.datetime.now()
valid = add_user_feats(valid,
                       answer_per_dict,
                       answered_correctly_sum_u_dict,
                       answered_correctly_cumsum_u_dict,
                       answered_incorrectly_cumsum_u_dict,
                       count_u_dict,
                       parts_u_dict,
                       parts_count_u_dict,
                       answered_diff_sum_u_dict,
                       avg_c_sum_u_dict,
                       parts_avg_c_dict,
                       user_answer_per_sum_dict,
                       parts_user_answer_per_sum_dict,
                       content_correct_user_mean_dict,
                       content_correct_count_dict,
                       parts_content_correct_user_mean_dict,
                       last_correct_timestamp_dict,
                       last_incorrect_timestamp_dict,
                       like_answer_dict,
                       like_answer_three_dict,
                       dislike_answer_dict,
                       dislike_answer_three_dict,
                       parts_answered_correctly_cumsum_u_dict,
                       parts_answered_incorrectly_cumsum_u_dict)
print('valid add_user_feats =',(datetime.datetime.now() - start))

In [None]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

DTYPE = np.int32
ctypedef np.int32_t np_int_t

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def add_user_content_feats(df, user_content_dict, user_tags_dict, user_repeat_count_dict):
    """Add 'done_content_tag' and 'repeat_part_per' to ``df`` row by row.

    Rows are processed in frame order. Each row's features are read from the
    state dicts *before* that row's own answer is written back into them.

    Parameters
    ----------
    df : DataFrame with columns user_id, content_id, answered_correctly,
        enc_tags and part. Their combined ``.values`` must be an int32 array
        (required by the typed ``row`` buffer).
    user_content_dict : user_id -> {content_id: 1 or 2}; indexed with
        ``[user_id]`` for unseen users, so it must supply a default
        (the notebook passes ``defaultdict(dict)``).
    user_tags_dict : user_id -> {enc_tags: 1 or 2}; same requirement.
    user_repeat_count_dict : part -> {user_id: count}; the inner mapping is
        indexed for unseen users (the notebook passes ``defaultdict(int)``).

    Returns
    -------
    The same ``df`` with two new columns:
      * done_content_tag (uint8): stored status of this content for the user
        (0 = not stored, 1 = last answer had answered_correctly == 0,
        2 = otherwise) plus twice the stored status of this row's enc_tags.
      * repeat_part_per (uint16): user_repeat_count_dict[part][user_id] as it
        was before this row.

    All three state dicts are mutated in place.
    """
    cdef int arr_size = len(df)
    cdef int cnt
    cdef int user_id, content_id, is_correct, enc_tag, part
    cdef np.ndarray[np_int_t, ndim=1] row
    cdef np.ndarray[np_int_t, ndim=1] uc = np.zeros(arr_size,dtype=DTYPE)
    cdef np.ndarray[np_int_t, ndim=1] rc = np.zeros(arr_size,dtype=DTYPE)
    for cnt,row in enumerate(df[['user_id','content_id','answered_correctly','enc_tags','part']].values):
        user_id = row[0]
        content_id = row[1]
        is_correct = row[2]
        enc_tag = row[3]
        part = row[4]

        seen_contents = user_content_dict[user_id]
        seen_tags = user_tags_dict[user_id]
        part_repeats = user_repeat_count_dict[part]

        # Must be evaluated BEFORE the status update below. The previous code
        # tested membership after inserting the content, so the test was always
        # true and the "repeat" counter grew on every answer.
        seen_before = content_id in seen_contents

        if seen_before:
            uc[cnt] = seen_contents[content_id]
        if enc_tag in seen_tags:
            uc[cnt] = uc[cnt] + (seen_tags[enc_tag] * 2)

        # Indexing (not .get) on purpose: with a defaultdict this registers the
        # user under this part with a count of 0, as the original lookup did.
        rc[cnt] = part_repeats[user_id]

        # Remember the outcome of this answer: 1 = answered_correctly == 0, 2 = otherwise.
        if is_correct == 0:
            seen_contents[content_id] = 1
            seen_tags[enc_tag] = 1
        else:
            seen_contents[content_id] = 2
            seen_tags[enc_tag] = 2

        if seen_before:
            part_repeats[user_id] = part_repeats[user_id] + 1
    # NOTE(review): any inference code that mirrors this update rule outside
    # this cell has to apply the same "seen before" condition.
    df['done_content_tag'] = uc
    df['done_content_tag'] = df['done_content_tag'].astype('uint8')
    df['repeat_part_per'] = rc
    df['repeat_part_per'] = df['repeat_part_per'].astype('uint16')
    return df

In [None]:
# Fresh state for add_user_content_feats: two user-keyed dict-of-dicts and one
# defaultdict(int) for each key 0..6. The same objects are passed to the train
# call and then to the valid call.
user_content_dict = defaultdict(dict)
user_tags_dict = defaultdict(dict)
user_repeat_count_dict = {}
for p in range(7):
    user_repeat_count_dict[p] = defaultdict(int)
content_feat_args = (user_content_dict, user_tags_dict, user_repeat_count_dict)

start = datetime.datetime.now()
train = add_user_content_feats(train, *content_feat_args)
print('train add_user_content_feats =',(datetime.datetime.now() - start))

start = datetime.datetime.now()
valid = add_user_content_feats(valid, *content_feat_args)
print('valid add_user_content_feats =',(datetime.datetime.now() - start))

In [None]:
%%cython
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def post_round(df,list use_tags,list use_content):
    """Collapse categories that are not on the allow-lists into sentinel ids.

    Rows whose 'enc_tags' is not in ``use_tags`` get enc_tags = 65535; rows
    whose 'content_id' is not in ``use_content`` get content_id = 532 (marked
    "tentative" by the original author). ``df`` is modified in place and
    returned.
    """
    unlisted_tag_rows = ~df['enc_tags'].isin(use_tags)
    unlisted_content_rows = ~df['content_id'].isin(use_content)
    df.loc[unlisted_tag_rows,'enc_tags'] = 65535
    df.loc[unlisted_content_rows,'content_id'] = 532  # tentative placeholder id
    return df

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def post_features(df):
    """Turn accumulated sums into per-answer ratios; modifies and returns ``df``.

    Columns read: lag_sum, count_u, elapsed_time_per_mean, part,
    part_elapsed_time_per_mean, part_user_answer_per_mean, repeat_part_per and
    one 'p<part>_count_u' column for every distinct value of 'part'.

    Columns written:
      * lag_mean: lag_sum / count_u, capped at 65535, stored as float16.
      * elapsed_time_per_mean: divided by count_u, NaN -> 1, float16.
      * part_elapsed_time_per_mean: divided by the count column of the row's
        own part; NaN and 0 -> 1.
      * part_user_answer_per_mean: 0 -> NaN.
      * repeat_part_per: divided by the count column of the row's own part
        where that count is > 0 (left as-is otherwise), float16.
    """
    cdef int i
    df['lag_mean'] = (df['lag_sum'] / df['count_u'])
    # NOTE(review): 65535 is above the largest finite float16 (65504), so the
    # capped rows presumably end up as inf after the cast below. Behaviour is
    # left unchanged here; confirm whether a lower cap was intended.
    df.loc[df['lag_mean'] > 65535,'lag_mean'] = 65535
    df['lag_mean'] = df['lag_mean'].astype('float16')

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary under pandas Copy-on-Write.
    df['elapsed_time_per_mean'] = (df['elapsed_time_per_mean'] / df['count_u']).fillna(1)
    df['elapsed_time_per_mean'] = df['elapsed_time_per_mean'].astype('float16')

    # The ratios written below are fractional; widen first so they are not
    # assigned into an integer column.
    df['repeat_part_per'] = df['repeat_part_per'].astype('float64')

    cdef list parts = list(df['part'].unique())
    for i in parts:
        part_rows = df['part'] == i
        part_count = df['p' + str(i) + '_count_u']
        df.loc[part_rows,'part_elapsed_time_per_mean'] = df['part_elapsed_time_per_mean'] / part_count
        # Restrict to rows of part i. Without `part_rows` every row was divided
        # by every part's count, one part after another.
        df.loc[part_rows & (part_count > 0),'repeat_part_per'] = df['repeat_part_per'] / part_count

    df['part_elapsed_time_per_mean'] = df['part_elapsed_time_per_mean'].fillna(1)
    df.loc[df['part_elapsed_time_per_mean'] ==0,'part_elapsed_time_per_mean'] = 1
    df.loc[df['part_user_answer_per_mean'] == 0, 'part_user_answer_per_mean'] = np.nan
    df['repeat_part_per'] = df['repeat_part_per'].astype('float16')
    return df

In [None]:
# Category rounding: keep only enc_tags / content_id values that occur more
# than `min_occurrences` times in train; post_round maps everything else to a
# sentinel id. The allow-lists come from train only and are reused for valid.
round_max = np.iinfo(np.uint16).max

min_occurrences = 3
tag_counts = train['enc_tags'].value_counts()
content_counts = train['content_id'].value_counts()
use_tags = list(tag_counts[tag_counts > min_occurrences].index)
use_content = list(content_counts[content_counts > min_occurrences].index)

train, valid = [post_round(frame, use_tags, use_content) for frame in (train, valid)]
train, valid = [post_features(frame) for frame in (train, valid)]

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only added a stray "None" line to the output.
train.info()
print(train.memory_usage(deep=True))

In [None]:
# Drop the CV train/valid frames. The rebuild cell that follows does not read
# them; it streams train.csv in chunks instead.
del train,valid


## modeling

### Stack All Data

In [None]:
if build:
    # Rebuild all running state over the complete train.csv. The per-chunk
    # feature frame is thrown away (`del train_part`); what this cell produces
    # is the final content of the state dicts. Every dict that accumulates
    # state must therefore be re-initialised here, otherwise it carries over
    # whatever the earlier train/valid cells left in it.

    # elapsed time
    time_u_dict= defaultdict(int)
    lect_u_dict = defaultdict(int)
    prior_content_dict = defaultdict(int)
    # Passed to add_prior_feats below but previously not reset alongside its
    # siblings. clear() empties it in place and keeps its type and default
    # factory, which are defined outside this cell.
    prior_prior_content_dict.clear()

    # number of correct answers
    answered_correctly_sum_u_dict = defaultdict(int)
    # consecutive correct answers
    answered_correctly_cumsum_u_dict = defaultdict(int)
    # consecutive incorrect answers
    answered_incorrectly_cumsum_u_dict = defaultdict(int)
    # number of answers
    count_u_dict = defaultdict(int)
    # per-part containers, one defaultdict(int) for each key 0..6
    parts_count_u_dict = {}
    parts_u_dict = {}
    parts_avg_c_dict = {}
    parts_user_answer_per_sum_dict = {}
    part_lag_sum_dict = {}
    parts_answered_correctly_cumsum_u_dict = {}
    parts_answered_incorrectly_cumsum_u_dict = {}
    user_repeat_count_dict = {}
    for p in range(0,7):
        parts_u_dict[p] = defaultdict(int)
        parts_count_u_dict[p] = defaultdict(int)
        parts_avg_c_dict[p] = defaultdict(int)
        parts_user_answer_per_sum_dict[p] = defaultdict(int)
        part_lag_sum_dict[p] = defaultdict(int)
        parts_answered_correctly_cumsum_u_dict[p] = defaultdict(int)
        parts_answered_incorrectly_cumsum_u_dict[p] = defaultdict(int)
        user_repeat_count_dict[p] = defaultdict(int)
    answered_diff_sum_u_dict = defaultdict(int)
    avg_c_sum_u_dict= defaultdict(int)
    user_answer_per_sum_dict = defaultdict(int)
    user_content_dict = defaultdict(dict)
    user_tags_dict = defaultdict(dict)
    prior_lag_dict = defaultdict(int)
    prior_prior_lag_dict = defaultdict(int)
    prior_time_dict = defaultdict(int)
    prior_time_per_sum_dict = defaultdict(int)
    lag_sum_dict = defaultdict(int)
    prior_avg_c_dict = defaultdict(int)
    prior_prior_avg_c_dict = defaultdict(int)
    prior_part_dict = defaultdict(int)
    content_correct_user_mean_dict = defaultdict(int)
    content_correct_count_dict = defaultdict(int)
    parts_content_correct_user_mean_dict = defaultdict(int)
    last_correct_timestamp_dict = defaultdict(int)
    last_incorrect_timestamp_dict = defaultdict(int)
    # One defaultdict(int) for each key 0..3. The two dislike_* dicts are
    # passed to add_user_feats below and pickled afterwards, but used to be
    # left out of this reset, so they kept the contents from the earlier
    # train/valid pass on top of this one.
    like_answer_dict = {}
    like_answer_three_dict = {}
    dislike_answer_dict = {}
    dislike_answer_three_dict = {}
    for p in range(0,4):
        like_answer_dict[p] = defaultdict(int)
        like_answer_three_dict[p] = defaultdict(int)
        dislike_answer_dict[p] = defaultdict(int)
        dislike_answer_three_dict[p] = defaultdict(int)
    # Read in chunks of 10**7 rows (the original comment here just says "OOM").
    for n, train_part in enumerate(pd.read_csv('../input/riiid-test-answer-prediction/train.csv', chunksize=10**7, iterator=True)):
        start = datetime.datetime.now()
        train_part = train_part[feld_needed]
        train_part = data_format(train_part,questions_df,content_df,prior_question_elapsed_time_mean)
        del train_part['row_id']
        # Negative answered_correctly values are zeroed so the column fits uint8.
        train_part.loc[train_part['answered_correctly'] < 0,'answered_correctly'] = 0
        train_part['answered_correctly'] = train_part['answered_correctly'].astype('uint8')
        train_part['user_answer'] = train_part['user_answer'].astype('uint8')
        # add_time_feats runs before the content_type_id filter, i.e. it still
        # sees the rows that are dropped on the next line.
        train_part = add_time_feats(train_part,time_u_dict,lect_u_dict)
        train_part = train_part.loc[train_part.content_type_id == False].reset_index(drop=True)
        del train_part['content_type_id']
        _=gc.collect()
        train_part = add_prior_feats(train_part,
                                     q_stats_dict,
                                     q_enc_tag_dict,
                                     prior_content_dict,
                                     prior_prior_content_dict,
                                     prior_time_dict,
                                     prior_time_per_sum_dict,
                                     prior_lag_dict,
                                     prior_prior_lag_dict,
                                     lag_sum_dict,
                                     prior_avg_c_dict,
                                     prior_prior_avg_c_dict,
                                     part_lag_sum_dict,
                                     prior_part_dict)
        
        train_part = add_user_feats(train_part,
                       answer_per_dict,
                       answered_correctly_sum_u_dict, 
                       answered_correctly_cumsum_u_dict,
                       answered_incorrectly_cumsum_u_dict,
                       count_u_dict,
                       parts_u_dict,
                       parts_count_u_dict,
                       answered_diff_sum_u_dict,
                       avg_c_sum_u_dict,
                       parts_avg_c_dict,
                       user_answer_per_sum_dict,
                       parts_user_answer_per_sum_dict,
                       content_correct_user_mean_dict,
                       content_correct_count_dict,
                       parts_content_correct_user_mean_dict,
                       last_correct_timestamp_dict,
                       last_incorrect_timestamp_dict,
                       like_answer_dict,
                       like_answer_three_dict,
                       dislike_answer_dict,
                       dislike_answer_three_dict,
                       parts_answered_correctly_cumsum_u_dict,
                       parts_answered_incorrectly_cumsum_u_dict)
        # Called only for its side effect on the three state dicts; the
        # returned frame is not needed.
        add_user_content_feats(train_part, user_content_dict, user_tags_dict,user_repeat_count_dict)
        del train_part
        print('train add_feats =',(datetime.datetime.now() - start), n)

In [None]:
if True:
    # NOTE(review): the guard is always on, exactly as in the original
    # (`True == True`); it may have been meant to follow the `build` flag.
    #
    # Write every state object to its own pickle file in the working
    # directory, in the order listed.
    # NOTE(review): 'answered_correctly_sum_u_dict' and 'count_u_dict' have no
    # '.pickle' suffix, unlike the rest. The names are kept byte-for-byte
    # because whatever loads these files is not visible from this cell.
    state_files = (
        ('time_u_dict.pickle', time_u_dict),
        ('prior_content_dict.pickle', prior_content_dict),
        ('prior_prior_content_dict.pickle', prior_prior_content_dict),
        ('answered_correctly_sum_u_dict', answered_correctly_sum_u_dict),
        ('answered_correctly_cumsum_u_dict.pickle', answered_correctly_cumsum_u_dict),
        ('answered_incorrectly_cumsum_u_dict.pickle', answered_incorrectly_cumsum_u_dict),
        ('count_u_dict', count_u_dict),
        ('parts_count_u_dict.pickle', parts_count_u_dict),
        ('parts_u_dict.pickle', parts_u_dict),
        ('answered_diff_sum_u_dict.pickle', answered_diff_sum_u_dict),
        ('user_content_dict.pickle', user_content_dict),
        ('user_tags_dict.pickle', user_tags_dict),
        ('prior_lag_dict.pickle', prior_lag_dict),
        ('prior_prior_lag_dict.pickle', prior_prior_lag_dict),
        ('prior_time_dict.pickle', prior_time_dict),
        ('prior_time_per_sum_dict.pickle', prior_time_per_sum_dict),
        ('part_lag_sum_dict.pickle', part_lag_sum_dict),
        ('prior_part_dict.pickle', prior_part_dict),
        ('lag_sum_dict.pickle', lag_sum_dict),
        ('avg_c_sum_u_dict.pickle', avg_c_sum_u_dict),
        ('user_answer_per_sum_dict.pickle', user_answer_per_sum_dict),
        ('parts_user_answer_per_sum_dict.pickle', parts_user_answer_per_sum_dict),
        ('content_correct_user_mean_dict.pickle', content_correct_user_mean_dict),
        ('content_correct_count_dict.pickle', content_correct_count_dict),
        ('parts_content_correct_user_mean_dict.pickle', parts_content_correct_user_mean_dict),
        ('prior_avg_c_dict.pickle', prior_avg_c_dict),
        ('prior_prior_avg_c_dict.pickle', prior_prior_avg_c_dict),
        ('parts_avg_c_dict.pickle', parts_avg_c_dict),
        ('lect_u_dict.pickle', lect_u_dict),
        ('last_correct_timestamp_dict.pickle', last_correct_timestamp_dict),
        ('last_incorrect_timestamp_dict.pickle', last_incorrect_timestamp_dict),
        ('like_answer_dict.pickle', like_answer_dict),
        ('like_answer_three_dict.pickle', like_answer_three_dict),
        ('dislike_answer_dict.pickle', dislike_answer_dict),
        ('dislike_answer_three_dict.pickle', dislike_answer_three_dict),
        ('parts_answered_correctly_cumsum_u_dict.pickle', parts_answered_correctly_cumsum_u_dict),
        ('parts_answered_incorrectly_cumsum_u_dict.pickle', parts_answered_incorrectly_cumsum_u_dict),
        ('user_repeat_count_dict.pickle', user_repeat_count_dict),
    )
    for file_name, state in state_files:
        with open(file_name, 'wb') as f:
            pickle.dump(state, f)