datatable パッケージを使用した FTRL モデル  
序盤に作成したノートブック  
最終的には使用していません  


In [1]:
# Install datatable from a wheel file shipped in the attached input dataset:
# pip is given a local file path here, not a package name.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

Processing /kaggle/input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: datatable
Successfully installed datatable-0.11.0
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
## importing packages
import datatable as dt
import pandas as pd
import numpy as np
from datatable.models import Ftrl
from datatable import (dt, f, by, ifelse, update, sort, count, min, max, mean, sum, rowsum)
from sklearn.metrics import roc_auc_score
import datetime

import riiideducation

In [3]:
# Name of the label column.
TARGET = "answered_correctly"
# Debug switch; nothing in this notebook reads it.
DEBUG = True
# Lookup used to turn the boolean explanation flag into 0/1.
map_prior = {
    True: 1,
    False: 0,
}
# Number of leading rows used as the training split; the rest is validation.
TRAIN_SIZE = 90_000_000

## Loading data
Using the .jay format of the training data is the best option for datatable. It is available as a **Kaggle Dataset** [here](https://www.kaggle.com/rohanrao/riiid-train-data-multiple-formats).


In [4]:
## reading data
# Question metadata (CSV); joined onto the interaction rows later on.
questions = dt.fread(
    "../input/riiid-test-answer-prediction/questions.csv"
)
# Interaction log, read from the .jay file.
train = dt.fread(
    "../input/riiid-train-data-multiple-formats/riiid_train.jay"
)

In [5]:
## merging questions metadata with train data
# The join matches on the key column of `questions`, so `train` needs a
# column carrying the same name before the join runs.
train.names = {"content_id": "question_id"}
questions.key = "question_id"

# Keep only rows whose content_type_id is 0, then attach the question columns.
train = train[f.content_type_id == 0, :]
train = train[:, :, dt.join(questions)]

# Rescale both time columns by 1/1000 (presumably milliseconds -> seconds).
train['timestamp'] = f.timestamp / 1000
train['prior_question_elapsed_time'] = f.prior_question_elapsed_time / 1000

# Drop the columns that the later cells do not read.
del train['row_id']
del train['content_type_id']
del train['user_answer']
del train['correct_answer']

# Feature Engineering

In [6]:
# Columns copied from every batch into the timing history.
times_stack_features = [
    'user_id',
    'timestamp',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
# Growing history of timing rows; get_shift_features() reads it.
times_stack = dt.Frame()
# Empty placeholders for the two lookup frames returned by get_shift_features().
intervals_m_gl = dt.Frame()
times_m_gl = dt.Frame()

def get_shift_features():
    """Build timing lookup frames from the module-level ``times_stack``.

    Returns:
        tuple: ``(task_m, interval_m)``.

        * ``task_m`` is keyed on ``(user_id, timestamp)`` and holds
          ``interval`` (gap to the same user's previous timestamp) together
          with ``prior_question_elapsed_time`` and
          ``prior_question_had_explanation`` shifted by one row per user in
          descending time order.
        * ``interval_m`` is keyed on ``user_id`` and holds ``interval_mean``,
          the mean of ``interval`` per user.
    """
    # One row per (user_id, timestamp): take the first row of each group.
    task_m = times_stack[0, :, by('user_id', 'timestamp')]

    # Timestamp of the same user's previous row.
    # ``n`` is an argument of shift(); given to update() it would instead be
    # interpreted as a request to create a constant column named "n".
    task_m[:, update(shift_timestamp=dt.shift(f.timestamp, n=1)), by("user_id")]

    # Reverse the order so that a shift by one row reads from the user's
    # chronologically next row: the "prior question" values reported there
    # are moved back onto the row they describe.
    task_m = task_m[:, :, sort(-f.user_id, -f.timestamp)]
    task_m[:, update(prior_question_elapsed_time=dt.shift(f.prior_question_elapsed_time, n=1)), by("user_id")]
    task_m[:, update(prior_question_had_explanation=dt.shift(f.prior_question_had_explanation, n=1)), by("user_id")]

    # Gap between this row and the same user's previous row.
    task_m['interval'] = f.timestamp - f.shift_timestamp

    # Keep only the join columns and the features, then key the frame so it
    # can be used on the right-hand side of dt.join().
    task_m = task_m[:, ['user_id', 'timestamp', 'interval', 'prior_question_elapsed_time', 'prior_question_had_explanation']]
    task_m.key = (["user_id", "timestamp"])

    # Per-user mean gap, keyed on user_id for joining.
    interval_m = task_m[:, {'interval_mean': mean(f.interval)}, by('user_id')]
    interval_m.key = (["user_id"])
    return task_m, interval_m


# Columns copied from every labelled batch into the question history.
questions_stack_features = [
    'question_id',
    'part',
    TARGET,
    'prior_question_elapsed_time',
    'user_id',
]
# Growing history of labelled rows; get_question_features() reads it.
questions_stack = dt.Frame()
# Empty placeholders for the two lookup frames returned by get_question_features().
part_m_gl = dt.Frame()
question_m_gl = dt.Frame()
def get_question_features():
    """Aggregate the module-level ``questions_stack`` into two lookup frames.

    Returns:
        tuple: ``(question_m, part_m)``.

        * ``question_m`` is keyed on ``question_id`` and holds
          ``question_mean`` (mean of ``answered_correctly``) and
          ``elapsed_time_mean`` (mean of ``prior_question_elapsed_time``).
        * ``part_m`` is keyed on ``(part, user_id)`` and holds
          ``part_time_mean`` (mean of ``prior_question_elapsed_time``).
    """
    # Mean elapsed time for every (part, user) pair.
    part_user_stats = questions_stack[
        :,
        {'part_time_mean': mean(f.prior_question_elapsed_time)},
        by('part', 'user_id'),
    ]
    part_user_stats.key = ["part", 'user_id']

    # Mean correctness and mean elapsed time for every question.
    per_question_aggs = {
        'question_mean': mean(f.answered_correctly),
        'elapsed_time_mean': mean(f.prior_question_elapsed_time),
    }
    question_stats = questions_stack[:, per_question_aggs, by('question_id')]
    question_stats.key = ["question_id"]

    return question_stats, part_user_stats


# Growing history of labelled rows; get_user_features() reads it.
users_stack = dt.Frame()
# Columns copied from every labelled batch into ``users_stack``.
users_stack_features = ['user_id', TARGET, 'elapsed_time_per']
# Placeholder for the per-user lookup frame. Every later cell assigns and
# joins ``user_m_gl`` (singular), so that is the name declared here.
user_m_gl = dt.Frame()
# Original spelling of the placeholder, kept so any code that still refers
# to it keeps working; nothing in this notebook reads it.
users_m_gl = dt.Frame()

def get_user_features():
    """Aggregate the module-level ``users_stack`` into one row per user.

    Returns:
        A frame keyed on ``user_id`` with ``user_mean`` (mean of
        ``answered_correctly``), ``user_count`` (count of
        ``answered_correctly``) and ``elapsed_time_per_mean`` (mean of
        ``elapsed_time_per``).
    """
    aggregations = {
        'user_mean': mean(f.answered_correctly),
        'user_count': count(f.answered_correctly),
        'elapsed_time_per_mean': mean(f.elapsed_time_per),
    }
    per_user = users_stack[:, aggregations, by('user_id')]
    # Keyed so the frame can be used on the right-hand side of dt.join().
    per_user.key = ["user_id"]
    return per_user
             

In [7]:
# (rows, columns) of the filtered and merged training frame.
print(train.shape)
# Disabled experiment: derive the training split size from the frame length
# (one ninth of the rows) instead of using the fixed TRAIN_SIZE constant.
#TRAIN_SIZE = int(len(train) / 9)
# Printed again after the (currently disabled) override above.
print(train.shape)

(99271300, 10)
(99271300, 10)


In [8]:
# Create training set
# Wall-clock start; only used to print how long this cell takes.
dt_now = datetime.datetime.now()

# The first TRAIN_SIZE rows form the training split.
X_train = train[:TRAIN_SIZE, :]

# Timing features: append this split to the timing history, rebuild the keyed
# lookup frames, then replace the raw prior_question_* columns with the
# shifted versions that come back through the join.
times_stack.rbind(X_train[:,times_stack_features])
times_m_gl,intervals_m_gl = get_shift_features()
del X_train['prior_question_elapsed_time'], X_train['prior_question_had_explanation']
X_train = X_train[:, :, dt.join(times_m_gl)]
X_train = X_train[:, :, dt.join(intervals_m_gl)]

# Question-level and (part, user)-level aggregates. This must run after the
# timing join because the stack reads the shifted prior_question_elapsed_time.
questions_stack.rbind(X_train[:,questions_stack_features])
question_m_gl, part_m_gl = get_question_features()
X_train = X_train[:, :, dt.join(question_m_gl)]
X_train = X_train[:, :, dt.join(part_m_gl)]
# Ratio of the row's elapsed time to the mean elapsed time of its question.
X_train['elapsed_time_per'] = f.prior_question_elapsed_time / f.elapsed_time_mean

# Per-user aggregates; they need elapsed_time_per, so they are computed last.
# NOTE(review): the aggregates are computed from the same rows they are joined
# onto, so user_mean / question_mean include each row's own label - confirm
# that this is intended.
users_stack.rbind(X_train[:,users_stack_features])
user_m_gl = get_user_features()
X_train = X_train[:, :, dt.join(user_m_gl)]
y_train = X_train[:,TARGET]
# Elapsed time for building the training set.
print(datetime.datetime.now() - dt_now)

0:03:03.475692


In [9]:
# Create validation set
# Every row after the first TRAIN_SIZE rows forms the validation split.
X_valid = train[TRAIN_SIZE:, :]

# Same pipeline as for the training split. The stacks already contain the
# training rows, so each lookup frame rebuilt here covers both splits.
times_stack.rbind(X_valid[:,times_stack_features])
times_m_gl,intervals_m_gl = get_shift_features()
del X_valid['prior_question_elapsed_time'], X_valid['prior_question_had_explanation']
X_valid = X_valid[:, :, dt.join(times_m_gl)]
X_valid = X_valid[:, :, dt.join(intervals_m_gl)]

# Question-level and (part, user)-level aggregates.
questions_stack.rbind(X_valid[:,questions_stack_features])
question_m_gl,part_m_gl = get_question_features()
X_valid = X_valid[:, :, dt.join(question_m_gl)]
X_valid = X_valid[:, :, dt.join(part_m_gl)]
# Ratio of the row's elapsed time to the mean elapsed time of its question.
X_valid['elapsed_time_per'] = f.prior_question_elapsed_time / f.elapsed_time_mean

# Per-user aggregates; they need elapsed_time_per, so they are computed last.
users_stack.rbind(X_valid[:,users_stack_features])
user_m_gl = get_user_features()
X_valid = X_valid[:, :, dt.join(user_m_gl)]
y_valid = X_valid[:,TARGET]

In [10]:
# Columns taken from the interaction rows themselves.
train_features = [
    "user_id",
    "question_id",
    "prior_question_elapsed_time",
]
# Columns that come from the questions metadata join
# (currently left out: 'interval', "elapsed_time_per", "tags").
question_features = [
    "bundle_id",
    "part",
]
# Engineered columns produced by the feature-building cells.
create_features = [
    "user_mean",
    "question_mean",
    "elapsed_time_mean",
    'elapsed_time_per_mean',
    'prior_question_had_explanation',
    'part_time_mean',
]

# Restrict both splits to the model inputs, in the same column order.
X_valid = X_valid[:, train_features + question_features + create_features]
X_train = X_train[:, train_features + question_features + create_features]

In [11]:
## building and validating FTRL model
# Rule of thumb noted by the author: a feature that does not raise the
# validation AUC by at least 0.005 is considered weak.
# Recorded results of earlier runs:
#Base Validation AUC: 0.7604151853254311
#part_time_mean Validation AUC: 0.7614426241046937
#timestamp 0 only f.timestamp > 1000 only Validation AUC: 0.5131733347825258
model_ftrl = Ftrl()
# Feature pairs for which the model additionally builds interaction features.
model_ftrl.interactions = [["question_id","part"],["question_id","bundle_id"]]
# The validation frames are handed to fit() as well as being scored below.
model_ftrl.fit(X_train, y_train, X_validation=X_valid, y_validation=y_valid)
y_pred = model_ftrl.predict(X_valid)

print(f"Validation AUC: {roc_auc_score(y_valid.to_numpy(), y_pred.to_numpy())}")
# Drop the split frames; the following cells rebuild features on all of `train`.
del X_train, y_train, X_valid, y_valid, y_pred

Validation AUC: 0.7613873374174995


In [12]:
# Apply the same feature joins to the complete training frame, using the
# lookup frames last rebuilt in the validation cell (their stacks hold both
# the training and the validation rows).
# The raw prior_question_* columns are removed first because the timing join
# brings back the shifted versions under the same names.
del train['prior_question_elapsed_time'], train['prior_question_had_explanation']
train = train[:, :, dt.join(times_m_gl)]
train = train[:, :, dt.join(intervals_m_gl)]

train = train[:, :, dt.join(question_m_gl)]
train = train[:, :, dt.join(part_m_gl)]
# Ratio of the row's elapsed time to the mean elapsed time of its question.
train['elapsed_time_per'] = f.prior_question_elapsed_time / f.elapsed_time_mean
train = train[:, :, dt.join(user_m_gl)]

In [13]:
# Discard what the model learned on the training split, then fit it again
# on every available row.
model_ftrl.reset()
model_ftrl.fit(
    train[:, train_features + question_features + create_features],
    train[:, TARGET],
)
# The full frame is not needed once the model is fitted.
del train
# Notebook display: feature importances, sorted in ascending order.
model_ftrl.feature_importances[:, :, sort(f.feature_importance)]

Unnamed: 0_level_0,feature_name,feature_importance
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪
0,elapsed_time_per_mean,0.0289361
1,user_id,0.0445998
2,elapsed_time_mean,0.046575
3,part_time_mean,0.0484037
4,bundle_id,0.0513587
5,question_id:bundle_id,0.0547996
6,question_id,0.0582589
7,question_id:part,0.0585604
8,question_mean,0.157969
9,prior_question_elapsed_time,0.176467


In [14]:
## initializing test environment
# `env` receives the predictions via env.predict(); `iter_test` yields the
# (test batch, prediction frame) pairs that the inference loop iterates over.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
## inferencing and incremental learning
# For every batch: (1) once the labels of the previous batch arrive, learn
# from that batch, (2) build the features of the current batch, (3) predict
# and submit.
# Countdown, in rows, until the aggregate lookup frames are rebuilt. Starting
# below zero forces a rebuild at the first opportunity.
refresh_interval = -1
# Feature frame of the previous batch, kept until its labels arrive with the
# next batch. It starts as an empty pandas frame so the first iteration skips
# the learning step.
prev_test = pd.DataFrame()
for (current_test, current_prediction_df) in iter_test:
    # Encode the explanation flag as int8, treating missing values as False.
    current_test['prior_question_had_explanation'].fillna(False,inplace=True)
    current_test['prior_question_had_explanation'] = current_test['prior_question_had_explanation'].map(map_prior).astype(np.int8)

    # extracting previous batch's targets
    # The first row carries them as a string, which is parsed here.
    # NOTE(review): eval() executes whatever the string contains; if the field
    # only ever holds a list literal, ast.literal_eval would be a safer
    # parser - confirm before changing.
    prev_target = eval(current_test["prior_group_answers_correct"].iloc[0])

    # incremental learning of FTRL model
    if (prev_test.shape[0] > 0):
        # Attach the labels, then keep only rows with content_type_id == 0.
        prev_test[TARGET] = np.array(prev_target)
        prev_test = prev_test[dt.f.content_type_id == 0, :]
        del prev_test["content_type_id"]

        # Add the now-labelled rows to the aggregate histories.
        questions_stack.rbind(prev_test[:,questions_stack_features])
        users_stack.rbind(prev_test[:,users_stack_features])

        # Rebuild the question/user lookup frames only when the countdown has
        # run out, then restart the countdown at 50000 rows.
        if refresh_interval < 0:
            question_m_gl, part_m_gl = get_question_features()
            user_m_gl = get_user_features()
            print('feature refresh')
            refresh_interval = 50000

        # Continue fitting the model on the previous batch.
        y_prev_test = prev_test[:, TARGET]
        X_prev_test = prev_test[:, train_features + question_features + create_features]
        model_ftrl.fit(X_prev_test, y_prev_test)

    # inferencing of current batch
    X_test = dt.Frame(current_test)
    ## merging questions metadata with the test batch
    X_test.names = {"content_id": "question_id"}
    X_test = X_test[:, :, dt.join(questions)]
    # Same 1/1000 rescaling as applied to the training data.
    X_test['prior_question_elapsed_time'] = dt.f.prior_question_elapsed_time / 1000
    X_test['timestamp'] = dt.f.timestamp / 1000

    # The timing history does not use the target, so the current batch is
    # appended right away. The countdown decreases by the number of rows kept
    # from the previous batch.
    refresh_interval -= prev_test.shape[0]
    times_stack.rbind(X_test[:,times_stack_features])
    # Author's note: restricting the rebuild to the batch's user_ids would be
    # faster, but the result could then not be reused across batches; if it
    # fits in the time limit, restricting by user_id is the better choice.
    # NOTE(review): between rebuilds the rows just appended are not yet part
    # of times_m_gl, so the timing join below presumably yields missing values
    # for them - confirm.
    if refresh_interval < 0:
        times_m_gl,intervals_m_gl = get_shift_features()

    # Swap the raw prior_question_* columns for the ones from the timing
    # lookup, then attach the remaining lookup frames.
    del X_test['prior_question_elapsed_time'], X_test['prior_question_had_explanation']
    X_test = X_test[:, :, dt.join(times_m_gl)]
    X_test = X_test[:, :, dt.join(intervals_m_gl)]
    X_test = X_test[:, :, dt.join(question_m_gl)]
    X_test = X_test[:, :, dt.join(part_m_gl)]

    # Ratio of the row's elapsed time to the mean elapsed time of its question.
    X_test['elapsed_time_per'] = f.prior_question_elapsed_time / f.elapsed_time_mean

    X_test = X_test[:, :, dt.join(user_m_gl)]
    # Users/questions without a match in the lookup frames get a fixed 0.65
    # (presumably an overall prior for the correct-answer rate - confirm).
    X_test[f.user_mean == None,'user_mean'] = 0.65
    X_test[f.question_mean == None,'question_mean'] = 0.65

    # retaining current batch data for next batch
    prev_test = X_test.copy(deep = True)

    # Predict for rows with content_type_id == 0 and submit the batch.
    current_prediction_df.answered_correctly = model_ftrl.predict(X_test[dt.f.content_type_id == 0, :][:, train_features + question_features + create_features]).to_numpy().ravel()
    env.predict(current_prediction_df)