In [1]:
!pip install datatable



In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation
import torch


# Error handling, ignore all
np.seterr(divide = 'ignore', invalid = 'ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
data_types_dict = {
    'user_id': 'int32', 
    'timestamp': 'int64',
    'content_id': 'int16', 
    'answered_correctly': 'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool'
}

target = 'answered_correctly'

In [4]:
train_df = dt.fread('../input/riiid-test-answer-prediction/train.csv', columns = set(data_types_dict.keys())).to_pandas()

In [5]:
print('Training dataset detailed information')
print('*' * 50)
print('Columns:', train_df.columns)
print('*' * 50)
print('Shape:', train_df.shape)
print('*' * 50)
print('NA values in each column:', sum(train_df.isna().sum()))
print('*' * 50)

Training dataset detailed information
**************************************************
Columns: Index(['timestamp', 'user_id', 'content_id', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation'],
      dtype='object')
**************************************************
Shape: (101230332, 6)
**************************************************
NA values in each column: 2744044
**************************************************


In [6]:
train_df = train_df[train_df[target] != -1].reset_index(drop = True, inplace = False)#获取target非-1的样本

train_df['prior_question_had_explanation'].fillna(False, inplace = True)#用False填充nan

train_df = train_df.astype(data_types_dict)

In [7]:
#train_df.head()

In [8]:
train_df['lag'] = train_df.groupby('user_id')[target].shift()

cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount']) # 列方向上求累积和 和累计个数
#  学习进步的增长率
train_df['user_correctness'] = cum['cumsum'] / cum['cumcount']
# 
train_df.drop(columns = ['lag'], inplace = True)

In [9]:
# Overall correctness of users 用户回答问题正确的比例，数目和次数 sum是回答正确的次数，count是回答的xx题目的总次数
user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
# Overall difficulty of questions每个content出现的次数和被回答正确的比例
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count'])

In [10]:
# Take only 24 last observations of each user
train_df = train_df.groupby('user_id').tail(24).reset_index(drop = True)

In [11]:
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv', 
    usecols = [0, 3],
    dtype = {'question_id': 'int16', 'part': 'int8'}
)
train_df = pd.merge(train_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
train_df.drop(columns = ['question_id'], inplace = True)

In [12]:
train_df['content_count'] = train_df['content_id'].map(content_agg['count']).astype('int32')#某问题被回答的次数

train_df['content_id'] = train_df['content_id'].map(content_agg['sum'] / content_agg['count'])#某问题被回答正确的比例

In [13]:
train_df.prior_question_had_explanation=train_df.prior_question_had_explanation.astype('int8')

train_df['lag'] = train_df.groupby('user_id')['prior_question_had_explanation'].shift()#用户是否

In [14]:
#train_df['lag'] = train_df.groupby('user_id')['prior_question_had_explanation'].shift()#用户是否看到上一个问题的答案，第一个题目为null。通常前几个都为false，因为那是测试。
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])#看上一题解释的总数和列数
train_df['explanation_mean'] = cum['cumsum'] / cum['cumcount']#解释的平均
train_df['explanation_cumsum'] = cum['cumsum'] 

train_df.drop(columns=['lag'], inplace=True)

train_df['explanation_mean'].fillna(0, inplace=True)
train_df['explanation_cumsum'].fillna(0, inplace=True)
train_df.explanation_mean=train_df.explanation_mean.astype('float16')
train_df.explanation_cumsum=train_df.explanation_cumsum.astype('int16')

In [15]:
explanation_agg = train_df.groupby('user_id')['prior_question_had_explanation'].agg(['sum', 'count'])#与上面cusum和cucount的区别
explanation_agg = explanation_agg.astype('int16')

In [16]:
max_timestamp_u = train_df[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()#取出timestamp的最大值
max_timestamp_u.columns = ['user_id', 'max_time_stamp']#重新设置columns

train_df['lagtime'] = train_df.groupby('user_id')['timestamp'].shift()
train_df['lagtime']=train_df['timestamp']-train_df['lagtime']#此用户交互与该用户完成第一个事件之间的时间（毫秒）。
train_df['lagtime'].fillna(0, inplace=True)#用0填充空值
train_df.lagtime=train_df.lagtime.astype('int32')#数据格式转换

lagtime_agg = train_df.groupby('user_id')['lagtime'].agg(['mean'])#完成每一题的平均时间
train_df['lagtime_mean'] = train_df['user_id'].map(lagtime_agg['mean'])#map映射
train_df.lagtime_mean=train_df.lagtime_mean.astype('int32')#转换数据格式


train_df['timestamp']=train_df['timestamp']/(1000*3600)#时间转换为小时
train_df.timestamp=train_df.timestamp.astype('int16')

提取验证集

In [17]:
# Ratio is 6/24 = 25%
valid_df = train_df.groupby('user_id').tail(6)
train_df.drop(valid_df.index, inplace = True)

训练

In [18]:

features = [ 'timestamp','lagtime',#'lagtime_mean',
    'content_id', 'prior_question_elapsed_time', 
            'prior_question_had_explanation', 'user_correctness', 
            'part', 'content_count','explanation_mean','explanation_cumsum']
'''
features = [ 'timestamp','lagtime','lagtime_mean',
    'content_id', 'prior_question_elapsed_time', 
            'prior_question_had_explanation', 'user_correctness', 
            'part', 'content_count']'''
cat_features = ['part']


params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'task_type': 'GPU',
    'grow_policy': 'Lossguide',
    'iterations': 2500,
    'learning_rate': 0.07,
    'random_seed': 0,
    'l2_leaf_reg': 1e-1,
    'depth': 15,
    'max_leaves': 10,
    'border_count': 128,
    'verbose': 100,
    'cat_features':cat_features,
}

In [19]:
from catboost import CatBoostClassifier, Pool

# Training and validating data
train_set = Pool(train_df[features], label = train_df[target],cat_features = cat_features)
val_set = Pool(valid_df[features], label = valid_df[target],cat_features = cat_features)

In [20]:
# Model definition
model = CatBoostClassifier(**params)

# Fitting
model.fit(train_set, eval_set = val_set, use_best_model = True)

0:	learn: 0.7303881	test: 0.7096431	best: 0.7096431 (0)	total: 102ms	remaining: 4m 15s
100:	learn: 0.7550097	test: 0.7375029	best: 0.7375029 (100)	total: 10.7s	remaining: 4m 13s
200:	learn: 0.7571999	test: 0.7400687	best: 0.7400687 (200)	total: 21s	remaining: 4m
300:	learn: 0.7581694	test: 0.7409446	best: 0.7409446 (300)	total: 31.1s	remaining: 3m 47s
400:	learn: 0.7587903	test: 0.7414724	best: 0.7414724 (400)	total: 40.4s	remaining: 3m 31s
500:	learn: 0.7591971	test: 0.7417947	best: 0.7417947 (500)	total: 49.6s	remaining: 3m 18s
600:	learn: 0.7595699	test: 0.7420488	best: 0.7420488 (600)	total: 58.7s	remaining: 3m 5s
700:	learn: 0.7598254	test: 0.7422498	best: 0.7422498 (700)	total: 1m 9s	remaining: 2m 58s
800:	learn: 0.7600787	test: 0.7424541	best: 0.7424613 (799)	total: 1m 19s	remaining: 2m 47s
900:	learn: 0.7603100	test: 0.7426743	best: 0.7426743 (900)	total: 1m 28s	remaining: 2m 37s
1000:	learn: 0.7605163	test: 0.7428354	best: 0.7428354 (1000)	total: 1m 38s	remaining: 2m 27s
1100:

<catboost.core.CatBoostClassifier at 0x7f74fc2e55d0>

In [21]:
import joblib
# 模型存储
joblib.dump(model, './1228_2_loan_model.pkl')
# 模型加载
#model = joblib.load('../input/loan_model.pkl')

['./1228_2_loan_model.pkl']

接口，这里是相比于国内的比赛有很大的不同。

In [22]:
user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

explanation_sum_dict = explanation_agg['sum'].astype('int16').to_dict(defaultdict(int))
explanation_count_dict = explanation_agg['count'].astype('int16').to_dict(defaultdict(int))


lagtime_mean_dict = lagtime_agg['mean'].astype('int32').to_dict(defaultdict(int))
max_timestamp_u_dict = max_timestamp_u.set_index('user_id').to_dict()

In [23]:
try:
    env = riiideducation.make_env()
except:
    pass
iter_test = env.iter_test()
prior_test_df = None

In [24]:
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop = True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        
        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    prior_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)#测试数据
    test_df = pd.merge(test_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')    
    
    user_sum = np.zeros(len(test_df), dtype = np.int16)
    user_count = np.zeros(len(test_df), dtype = np.int16)
    content_sum = np.zeros(len(test_df), dtype = np.int32)
    content_count = np.zeros(len(test_df), dtype = np.int32)
    explanation_sum = np.zeros(len(test_df), dtype=np.int32)
    explanation_count = np.zeros(len(test_df), dtype=np.int32)
    
    lagtime = np.zeros(len(test_df), dtype=np.int32)
    lagtime_mean = np.zeros(len(test_df), dtype=np.int32)
    
    for i, (user_id, content_id,timestamp) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values,test_df['timestamp'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        explanation_sum[i] = explanation_sum_dict[user_id]
        explanation_count[i] = explanation_count_dict[user_id]
        
        if user_id in max_timestamp_u_dict['max_time_stamp'].keys():
            lagtime[i] = timestamp-max_timestamp_u_dict['max_time_stamp'][user_id]
            max_timestamp_u_dict['max_time_stamp'][user_id]=timestamp
            lagtime_mean[i] = (lagtime_mean_dict[user_id]+lagtime[i])/2           
        else:
            lagtime[i]=0
            max_timestamp_u_dict['max_time_stamp'].update({user_id:timestamp})
            lagtime_mean_dict.update({user_id:timestamp})
            lagtime_mean[i]=(lagtime_mean_dict[user_id]+lagtime[i])/2

    test_df['user_correctness'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df['explanation_mean'] = explanation_sum / explanation_count
    test_df['explanation_cumsum'] = explanation_sum 
    test_df["lagtime"] = lagtime
    test_df["lagtime_mean"] = lagtime_mean
    test_df['timestamp']=test_df['timestamp']/(1000*3600)#时间转换为小时    

       
    test_df[target] = model.predict_proba(test_df[features])[:,1]
    env.predict(test_df[['row_id', target]])