In [2]:
# Run-mode switch. True: load the saved LightGBM model and serve predictions
# through the riiideducation API. False: pull data from S3, train the model
# and save it to disk.
KAGGLE_RUN = True
# Libraries
import ast
import gc
import io
import os
import random
import sys
from collections import defaultdict

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
if not KAGGLE_RUN:
    sys.path.insert(0, './input')
import riiideducation

# S3 handles for the local (non-Kaggle) workflow.
# boto3 is imported here, inside the guard, so it is only required off Kaggle.
if not KAGGLE_RUN:
    import boto3

    # Bucket that holds the competition files.
    bucket_name = 'npa02012-main'

    s3_session = boto3.Session()
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_bucket = s3_resource.Bucket(bucket_name)

# Fix every source of randomness used by this script to one shared seed.
SEED = 0
os.environ['PYTHONHASHSEED'] = str(SEED)  # exported for hash seeding
np.random.seed(SEED)                      # NumPy global generator
random.seed(SEED)                         # stdlib `random` generator

# Get data from S3 (local runs only).
# Files are downloaded straight to their final location and the destination
# directories are created first. The previous download-then-os.rename() flow
# failed when the target directory was missing or lived on a different
# filesystem than the staging directory.
# NOTE(review): `import riiideducation` at the top of the file executes before
# this download, so on a fresh machine the package must already be present in
# ./input for the import to succeed.
if not KAGGLE_RUN:
    s3_prefix = 'kaggle_data/riiid-test-answer-prediction/'

    # riiideducation package: pure-Python entry point plus compiled extension.
    package_dir = './input/riiideducation/'
    os.makedirs(package_dir, exist_ok=True)
    tmp = 'competition.cpython-37m-x86_64-linux-gnu.so'
    for file_name in ('__init__.py', tmp):
        s3_client.download_file(bucket_name
                                ,s3_prefix + 'riiideducation/' + file_name
                                ,package_dir + file_name)

    # Example files, placed under /kaggle/input.
    competition_dir = '/kaggle/input/riiid-test-answer-prediction/'
    os.makedirs(competition_dir, exist_ok=True)
    for file_name in ('example_sample_submission.csv', 'example_test.csv'):
        s3_client.download_file(bucket_name
                                ,s3_prefix + file_name
                                ,competition_dir + file_name)
    
# Label column, and the train.csv columns this script keeps with the dtype
# each one is cast to.
target = 'answered_correctly'
data_types_dict = dict(
    user_id='int32',
    content_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)

# Load data.
# Both branches load the same columns so the frames have the same schema
# whether the script runs locally or on Kaggle.
question_cols = ['question_id', 'part']
if not KAGGLE_RUN:
    # Train (sample): only the first 1,000,000-row chunk is used locally.
    key = 'kaggle_data/riiid-test-answer-prediction/train.csv'
    obj = s3_client.get_object(Bucket=bucket_name, Key=key)
    train_iter = pd.read_csv(obj['Body']
                             ,usecols=list(data_types_dict)
                             ,chunksize=1000000)
    df_train = train_iter.get_chunk()

    # Questions
    key = 'kaggle_data/riiid-test-answer-prediction/questions.csv'
    obj = s3_client.get_object(Bucket=bucket_name, Key=key)
    df_questions = pd.read_csv(io.BytesIO(obj['Body'].read())
                               ,usecols=question_cols)
else:
    # datatable is imported only in this branch, where fread loads train.csv.
    import datatable as dt
    df_train = dt.fread('../input/riiid-test-answer-prediction/train.csv'
                        ,columns=set(data_types_dict.keys())).to_pandas()
    df_questions = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv'
                                ,usecols=question_cols
                                ,dtype={'question_id': 'int16', 'part': 'int8'}
                               )

# Keep only labelled rows: a target of -1 marks rows without an answer
# (presumably lectures -- confirm against the data description).
df_train = df_train[df_train[target] != -1].reset_index(drop=True)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form operates on a temporary and
# leaves df_train untouched under pandas Copy-on-Write.
df_train['prior_question_had_explanation'] = (
    df_train['prior_question_had_explanation'].fillna(False)
)
df_train = df_train.astype(data_types_dict)

# Running accuracy of each user *before* the current row: shift the label by
# one within the user so the current answer never feeds its own feature.
# A user's first row has no history and yields NaN.
lag = df_train.groupby('user_id')[target].shift()
lag_by_user = lag.groupby(df_train['user_id'])
df_train['user_correctness'] = lag_by_user.cumsum() / lag_by_user.cumcount()

# Totals over the loaded history; the inference loop starts from these.
user_agg = df_train.groupby('user_id')[target].agg(['sum', 'count'])
content_agg = df_train.groupby('content_id')[target].agg(['sum', 'count'])

# Model on the 24 most recent rows of each user only.
df_train = df_train.groupby('user_id').tail(24).reset_index(drop=True)

# Attach question metadata (the `part` feature).
df_train = pd.merge(df_train, df_questions, left_on='content_id', right_on='question_id', how='left')
df_train.drop(columns=['question_id'], inplace=True)

# Replace content_id with the question's mean correctness and keep how many
# answers that mean is based on.
# NOTE(review): content_agg is computed before the validation split below, so
# this encoding also contains labels of rows that end up in df_valid.
df_train['content_count'] = df_train['content_id'].map(content_agg['count']).astype('int32')
df_train['content_id'] = df_train['content_id'].map(content_agg['sum'] / content_agg['count'])

# Hold out the last 6 rows of every user for validation.
df_valid = df_train.groupby('user_id').tail(6)
df_train.drop(df_valid.index, inplace=True)

# Model inputs, in the column order used for both training and prediction.
features = [
    'content_id',                      # replaced upstream by mean correctness
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_correctness',                # user's running accuracy
    'part',                            # merged in from df_questions
    'content_count',                   # answers behind the content_id mean
]

if not KAGGLE_RUN:
    # ---- Local run: fit a LightGBM binary classifier and save it ----
    print('Building Model')
    params = {
        'objective': 'binary',
        'seed': 42,
        'metric': 'auc',
        'learning_rate': 0.05,
        'max_bin': 800,
        'num_leaves': 80
    }
    d_train = lgb.Dataset(df_train[features], label=df_train[target])
    d_valid = lgb.Dataset(df_valid[features], label=df_valid[target])

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older LightGBM releases; newer releases appear to expect
    # callbacks instead -- confirm against the installed version before
    # upgrading.
    model = lgb.train(
        params,
        d_train,
        num_boost_round=10000,
        valid_sets=[d_train, d_valid],
        early_stopping_rounds=50,
        verbose_eval=50
    )
    model.save_model('input/test_model.txt')
else:
    # ---- Kaggle run: load the saved model and answer the test stream ----
    print('Loading Model and Making Predictions')
    model = lgb.Booster(model_file='/kaggle/input/riiid-test-model/test_model.txt')

    # Running totals seeded from the training history. defaultdict(int) makes
    # unseen users/questions read as 0. int32 is used throughout so tallies
    # above 32767 cannot wrap.
    user_sum_dict = user_agg['sum'].astype('int32').to_dict(defaultdict(int))
    user_count_dict = user_agg['count'].astype('int32').to_dict(defaultdict(int))
    content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
    content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

    # Make env
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    df_prior_test = None

    # Make predictions
    for (df_test, df_sample_prediction) in iter_test:
        if df_prior_test is not None:
            # The first row of each batch carries the previous batch's labels
            # as a list literal in text form. Parse it with ast.literal_eval
            # rather than eval so the externally supplied text is never
            # executed as code.
            df_prior_test[target] = ast.literal_eval(
                df_test['prior_group_answers_correct'].iloc[0]
            )
            # Rows labelled -1 carry no answer and must not touch the tallies.
            df_prior_test = df_prior_test[df_prior_test[target] != -1].reset_index(drop=True)

            user_ids = df_prior_test['user_id'].values
            content_ids = df_prior_test['content_id'].values
            targets = df_prior_test[target].values

            for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
                user_sum_dict[user_id] += answered_correctly
                user_count_dict[user_id] += 1
                content_sum_dict[content_id] += answered_correctly
                content_count_dict[content_id] += 1

        # Keep the unfiltered batch: the next batch's label list lines up with
        # these rows one-to-one.
        df_prior_test = df_test.copy()

        # Only rows with content_type_id == 0 are scored.
        df_test = df_test[df_test['content_type_id'] == 0].reset_index(drop=True)
        df_test = pd.merge(df_test, df_questions, left_on='content_id', right_on='question_id', how='left')
        df_test['prior_question_had_explanation'] = df_test['prior_question_had_explanation']\
                                                        .fillna(False).astype('bool')

        user_sum = np.zeros(len(df_test), dtype=np.int32)
        user_count = np.zeros(len(df_test), dtype=np.int32)
        content_sum = np.zeros(len(df_test), dtype=np.int32)
        content_count = np.zeros(len(df_test), dtype=np.int32)

        for i, (user_id, content_id) in enumerate(zip(df_test['user_id'].values, df_test['content_id'].values)):
            user_sum[i] = user_sum_dict[user_id]
            user_count[i] = user_count_dict[user_id]
            content_sum[i] = content_sum_dict[content_id]
            content_count[i] = content_count_dict[content_id]

        # Unseen users/questions divide 0 by 0. The resulting NaN is the
        # intended "no history" value, so silence the expected warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            df_test['user_correctness'] = user_sum / user_count
            df_test['content_count'] = content_count
            df_test['content_id'] = content_sum / content_count

        df_test[target] = model.predict(df_test[features])
        env.predict(df_test[['row_id', target]])
print("Finished")

Building Model
[LightGBM] [Info] Number of positive: 34969, number of negative: 28491
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2706
[LightGBM] [Info] Number of data points in the train set: 63460, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.551040 -> initscore=0.204874
[LightGBM] [Info] Start training from score 0.204874
Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.774488	valid_1's auc: 0.733889
[100]	training's auc: 0.787538	valid_1's auc: 0.733756
Early stopping, best iteration is:
[69]	training's auc: 0.779495	valid_1's auc: 0.734152
