In [1]:
import pandas as pd
import os
import gc
import numpy as np

In [2]:
def memory_usage(df, name):
    """Print how much memory ``df`` occupies, in megabytes, prefixed with ``name``.

    The size is the sum of ``df.memory_usage()`` converted from bytes to
    megabytes (1 mb = 1024 * 1024 bytes). Nothing is returned.
    """
    total_bytes = df.memory_usage().sum()
    size_mb = total_bytes / 1024 / 1024
    message = name + ' use {:.2f} mb in memory'.format(size_mb)
    print(message)

# TODO
* Попробовать всё-таки считать прогресс оконной функцией
* Погенерировать ещё признаков

# Загрузка данных

In [3]:
# Root folder that is searched for the competition files.
data_path = './data'
#data_path = '/kaggle/input'
# Walk the whole tree once, remember where each known file lives and print every path found.
# NOTE: `data_path` is rebound inside the loop from the root folder to the train.csv path;
# the walk itself is unaffected because os.walk() received the root before the loop started.
for folder, _, names in os.walk(data_path):
    for name in names:
        found_path = os.path.join(folder, name)
        # Independent substring checks: each marker that occurs in the path updates its variable.
        if 'lectures.csv' in found_path:
            lectures_path = found_path
        if 'questions.csv' in found_path:
            questions_path = found_path
        if 'train.csv' in found_path:
            data_path = found_path
        if 'example_test.csv' in found_path:
            test_data_path = found_path
        print(found_path)

./data/.DS_Store
./data/riiid-test-answer-prediction/lectures.csv
./data/riiid-test-answer-prediction/example_sample_submission.csv
./data/riiid-test-answer-prediction/questions.csv
./data/riiid-test-answer-prediction/train.csv
./data/riiid-test-answer-prediction/example_test.csv
./data/riiid-test-answer-prediction/riiideducation/competition.cpython-37m-x86_64-linux-gnu.so
./data/riiid-test-answer-prediction/riiideducation/__init__.py


In [4]:
%%time
# Column dtypes requested for both train.csv and example_test.csv.
shared_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
# Extra dtypes that are only requested for the example test file.
test_only_dtypes = {
    'group_num': 'int64',
    'prior_group_answers_correct': 'object',
    'prior_group_responses': 'object',
}

lectures_df = pd.read_csv(lectures_path)
questions_df = pd.read_csv(questions_path)
# Only the first 3e7 rows of the training log are read.
data = pd.read_csv(data_path, low_memory=False, nrows=3e7, dtype=shared_dtypes)
test = pd.read_csv(test_data_path, dtype={**shared_dtypes, **test_only_dtypes})

# Report the in-memory size of every loaded frame.
memory_usage(lectures_df, 'lectures')
memory_usage(questions_df, 'questions')
memory_usage(data, 'train')
memory_usage(test, 'test')

lectures use 0.01 mb in memory
questions use 0.52 mb in memory
train use 944.14 mb in memory
test use 0.01 mb in memory
CPU times: user 27.3 s, sys: 3.61 s, total: 30.9 s
Wall time: 30.9 s


# Формирование массива

In [5]:
def merge_data(data, questions_df, lectures_df=None, data_type='train'):
    """Join interaction rows with question (and optionally lecture) metadata.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction log. Must contain ``content_id`` and ``content_type_id``;
        with ``data_type='train'`` also ``user_answer`` and ``answered_correctly``.
    questions_df : pandas.DataFrame
        Question metadata with exactly five columns, interpreted positionally as
        ``content_id, bundle_id, correct_answer, part, tags``.
    lectures_df : pandas.DataFrame, optional
        Lecture metadata with exactly four columns, interpreted positionally as
        ``content_id, lecture_tag, lecture_part, type_of_lecture``. When omitted,
        rows with ``content_type_id == 1`` are dropped before merging.
    data_type : str, default 'train'
        With ``'train'`` the ``answered_correctly`` column is re-derived from
        ``user_answer == correct_answer`` and disagreeing rows are overwritten;
        the helper columns (including ``correct_answer``) are then dropped.

    Returns
    -------
    pandas.DataFrame
        The merged frame. Its size is printed via ``memory_usage``.

    Notes
    -----
    The metadata frames passed in are not modified: the positional renaming is
    applied to copies, so callers keep their original column names.
    """
    # Rename on a copy instead of overwriting the caller's column labels.
    questions = questions_df.copy()
    questions.columns = ['content_id', 'bundle_id', 'correct_answer', 'part', 'tags']
    if lectures_df is None:
        full_df = (
            data[data['content_type_id'] != 1]
            .merge(questions, on='content_id', how='left')
        )
    else:
        lectures = lectures_df.copy()
        lectures.columns = ['content_id', 'lecture_tag', 'lecture_part', 'type_of_lecture']
        # NOTE(review): both joins use content_id alone; if question ids and lecture ids
        # can share values, rows would pick up metadata of the other kind — confirm.
        full_df = (
            data
            .merge(questions, on='content_id', how='left')
            .merge(lectures, on='content_id', how='left')
        )
        # Shrink the frame by folding the lecture columns into the question ones
        # wherever the question value is missing.
        part_missing = full_df['part'].isnull()
        full_df.loc[part_missing, 'part'] = full_df.loc[part_missing, 'lecture_part']
        tags_missing = full_df['tags'].isnull()
        full_df.loc[tags_missing, 'tags'] = full_df.loc[tags_missing, 'lecture_tag'].astype(str)
        full_df = full_df.drop(columns=['lecture_part', 'lecture_tag'])
        # Rows without a correct answer get the sentinel -1.
        full_df.loc[full_df['correct_answer'].isnull(), 'correct_answer'] = -1

    # Downcast types.
    full_df['part'] = full_df['part'].astype('int8')

    # For training data the recorded correctness can be checked against the answer key.
    if data_type == 'train':
        full_df['correct_answer'] = full_df['correct_answer'].astype('int8')
        full_df['answered_correctly_really'] = (
            full_df['user_answer'] == full_df['correct_answer']
        ).astype('int8')
        # Rows whose recorded flag disagrees with the key (rows flagged -1 are skipped).
        index_dev = (
            (full_df['answered_correctly_really'] != full_df['answered_correctly'])
            & (full_df['answered_correctly'] != -1)
        )
        deviations_count = index_dev.sum()
        if deviations_count > 0:
            print('Wrong answer_correctly in {} rows'.format(deviations_count))
            full_df.loc[index_dev, 'answered_correctly'] = full_df.loc[index_dev, 'answered_correctly_really']
        full_df = full_df.drop(columns=['answered_correctly_really', 'correct_answer'])

    memory_usage(full_df, 'merged_df')

    return full_df

In [6]:
#merged_df = merge_data(data, questions_df)

In [7]:
def current_question_time(full_df):
    """Add ``question_elapsed_time``: the time spent on the row's own bundle.

    The input only carries ``prior_question_elapsed_time`` (time spent on the
    previous bundle), so for every distinct (timestamp, user, bundle) record the
    value is taken from the same user's next record in timestamp order. A user's
    last record therefore gets NaN.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Must contain ``timestamp``, ``user_id``, ``bundle_id`` and
        ``prior_question_elapsed_time``.

    Returns
    -------
    pandas.DataFrame
        ``full_df`` left-joined with the new ``question_elapsed_time`` column.
    """
    key_cols = ['timestamp', 'user_id', 'bundle_id']

    # All questions of a bundle share the same timing values, so keep one record per bundle.
    bundles = full_df[key_cols + ['prior_question_elapsed_time']].drop_duplicates()

    # Within each user, look one record ahead in time.
    chronological = bundles.sort_values(by=['timestamp'], ascending=True)
    next_prior_time = chronological.groupby(['user_id'])['prior_question_elapsed_time'].shift(-1)
    bundles['question_elapsed_time'] = next_prior_time.dropna()

    # Attach the per-bundle value back to every row of the full frame.
    bundle_times = bundles[key_cols + ['question_elapsed_time']]
    return full_df.merge(bundle_times, on=['user_id', 'bundle_id', 'timestamp'], how='left')

In [8]:
#full_df = current_question_time(merged_df)

In [9]:
#full_df.head(5)

# Сбор статистик

In [10]:
def get_questions_characteristics(full_df, questions_characteristics=None):
    """Build or update per-question statistics.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Must contain ``bundle_id`` and ``question_elapsed_time``; when no previous
        statistics are given also ``content_id`` and ``answered_correctly``.
    questions_characteristics : pandas.DataFrame, optional
        Statistics returned by an earlier call. When given, the per-bundle time
        sum and count found in ``full_df`` are added to the stored values; the
        stored ``question_answered_correctly_mean`` is kept as is and no new
        questions are added (left join from the stored statistics).

    Returns
    -------
    pandas.DataFrame
        Columns ``content_id, bundle_id, question_answered_correctly_mean,
        question_bundle_time_sum, question_bundle_count, question_bundle_time_mean``.
        Its size is printed via ``memory_usage``.

    Notes
    -----
    The frame passed as ``questions_characteristics`` is not modified, so the same
    object can be passed again safely.
    """
    # Elapsed time per bundle: sum and number of non-null values.
    bundles = (
        full_df.groupby('bundle_id')
        .agg({'question_elapsed_time': ['sum', 'count']})
        .reset_index()
    )
    bundles.columns = ['bundle_id', 'question_bundle_time_sum', 'question_bundle_count']

    if questions_characteristics is None:
        # Share of correct answers per question.
        questions_characteristics = (
            full_df
            .groupby(['content_id', 'bundle_id'])['answered_correctly']
            .mean()
            .reset_index()
        )
        questions_characteristics.columns = ['content_id', 'bundle_id', 'question_answered_correctly_mean']
        # Final per-question statistics.
        questions_characteristics = questions_characteristics.merge(bundles, on='bundle_id', how='left')
    else:
        columns = list(questions_characteristics.columns)
        columns_update = ['question_bundle_time_sum', 'question_bundle_count']
        # rename() returns a new frame, so the caller's column labels stay untouched.
        previous = questions_characteristics.rename(
            columns={col: col + '_prev' for col in columns_update}
        )
        updated = (
            previous
            .merge(bundles, on=['bundle_id'], how='left')
            .fillna(0.)
        )
        # Accumulate the new sums/counts on top of the stored ones.
        for col in columns_update:
            updated[col] = updated[col] + updated[col + '_prev']

        # Explicit copy: a derived column is added to this selection below.
        questions_characteristics = updated[columns].copy()

    questions_characteristics['question_bundle_time_mean'] = (
        questions_characteristics['question_bundle_time_sum']
        / questions_characteristics['question_bundle_count']
    )
    memory_usage(questions_characteristics, 'questions_characteristics')

    return questions_characteristics

In [11]:
#questions_characteristics = get_questions_characteristics(full_df)

In [12]:
#questions_characteristics.head(5)

In [13]:
def get_users_characteristics(full_df, users_characteristics=None):
    """Build or update per-user statistics.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Must contain ``user_id``, ``content_type_id`` and
        ``prior_question_elapsed_time``; when no previous statistics are given
        also ``answered_correctly``.
    users_characteristics : pandas.DataFrame, optional
        Statistics returned by an earlier call. When given, the time sum and
        count found in ``full_df`` are added to the stored values. Users that
        appear only in ``full_df`` are added (outer join) with a NaN
        ``user_answered_correctly_mean``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id, user_answered_correctly_mean, user_elapsed_time_sum,
        user_answers_count, user_elapsed_time_mean``. Missing
        ``user_elapsed_time_mean`` values are replaced by the mean over all users.
        Its size is printed via ``memory_usage``.

    Notes
    -----
    The frame passed as ``users_characteristics`` is not modified, so the same
    object can be passed again safely.
    """
    # Time statistics over non-lecture rows; 'count' only counts non-null elapsed times.
    users_time_stats = (
        full_df[full_df['content_type_id'] != 1]
        .groupby('user_id')
        .agg({'prior_question_elapsed_time': ['sum', 'count']})
        .reset_index()
    )
    users_time_stats.columns = ['user_id', 'user_elapsed_time_sum',
                                'user_answers_count']
    if users_characteristics is None:
        # Share of correct answers per user.
        users_characteristics = (
            full_df
            .groupby(['user_id'])['answered_correctly']
            .mean()
            .reset_index()
        )
        users_characteristics.columns = ['user_id', 'user_answered_correctly_mean']
        users_characteristics = users_characteristics.merge(users_time_stats, on='user_id', how='left')
    else:
        columns = list(users_characteristics.columns)
        columns_update = ['user_elapsed_time_sum', 'user_answers_count']
        # rename() returns a new frame, so the caller's column labels stay untouched.
        previous = users_characteristics.rename(
            columns={col: col + '_prev' for col in columns_update}
        )
        merged = previous.merge(users_time_stats, on=['user_id'], how='outer')
        # Remember which users have no known accuracy before the blanket fill below.
        index_answers_null = merged['user_answered_correctly_mean'].isnull()
        merged = merged.fillna(0.)

        # Accumulate the new sums/counts on top of the stored ones.
        for col in columns_update:
            merged[col] = merged[col] + merged[col + '_prev']

        # Explicit copy: values are assigned into this selection below.
        users_characteristics = merged[columns].copy()
        # Restore the missing accuracies that fillna() turned into 0.
        users_characteristics.loc[index_answers_null, 'user_answered_correctly_mean'] = np.nan

    users_characteristics['user_elapsed_time_mean'] = (
        users_characteristics['user_elapsed_time_sum']
        / users_characteristics['user_answers_count']
    )

    # Replace missing per-user means with the mean over all users.
    index_null = users_characteristics['user_elapsed_time_mean'].isnull()
    users_mean = users_characteristics['user_elapsed_time_mean'].mean()
    users_characteristics.loc[index_null, 'user_elapsed_time_mean'] = users_mean

    memory_usage(users_characteristics, 'users_characteristics')

    return users_characteristics

In [14]:
#users_characteristics = get_users_characteristics(full_df)

In [15]:
#users_characteristics

In [16]:
def get_combined_df(full_df, users_characteristics, questions_characteristics):
    """Attach question and user statistics to every row and fill the gaps.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Interaction rows with ``content_id``, ``user_id``,
        ``prior_question_elapsed_time`` and ``prior_question_had_explanation``.
    users_characteristics : pandas.DataFrame
        Per-user statistics (``user_answered_correctly_mean``,
        ``user_elapsed_time_mean``), joined on ``user_id``.
    questions_characteristics : pandas.DataFrame
        Per-question statistics (``question_answered_correctly_mean``,
        ``question_bundle_time_mean``), joined on ``content_id``.

    Returns
    -------
    pandas.DataFrame
        The combined frame; its size is printed via ``memory_usage``.
    """
    question_stats = questions_characteristics[
        ['content_id', 'question_answered_correctly_mean', 'question_bundle_time_mean']
    ]
    user_stats = users_characteristics[
        ['user_id', 'user_answered_correctly_mean', 'user_elapsed_time_mean']
    ]
    output = (
        full_df
        .merge(question_stats, on='content_id', how='left')
        .merge(user_stats, on='user_id', how='left')
    )

    def fill_from_column(target, source):
        # Copy `source` into `target` where `target` is missing; return the mask that was used.
        missing = output[target].isnull()
        output.loc[missing, target] = output.loc[missing, source]
        return missing

    def fill_with_value(target, value):
        # Put one constant into every missing cell of `target`.
        output.loc[output[target].isnull(), target] = value

    # Rows without a prior elapsed time: use the user's mean time and mark "no explanation";
    # whatever is still missing afterwards falls back to the question bundle's mean time.
    no_prior_time = fill_from_column('prior_question_elapsed_time', 'user_elapsed_time_mean')
    output.loc[no_prior_time, 'prior_question_had_explanation'] = False
    fill_from_column('prior_question_elapsed_time', 'question_bundle_time_mean')

    # Missing statistics are replaced by the mean of the corresponding statistics table.
    fill_with_value('question_answered_correctly_mean',
                    questions_characteristics['question_answered_correctly_mean'].mean())
    fill_with_value('question_bundle_time_mean',
                    questions_characteristics['question_bundle_time_mean'].mean())
    fill_with_value('user_elapsed_time_mean',
                    users_characteristics['user_elapsed_time_mean'].mean())

    # Users without a known accuracy inherit the accuracy of the question they are answering.
    fill_from_column('user_answered_correctly_mean', 'question_answered_correctly_mean')

    memory_usage(output, 'combined_df')

    return output

In [17]:
#final_df = get_combined_df(full_df, users_characteristics, questions_characteristics)

# Подготовка обучающей выборки

In [18]:
def prepare_train(train_data, questions_df):
    """Turn raw training interactions into a model-ready feature frame.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Raw interaction rows (passed to ``merge_data`` with its default
        ``data_type='train'``).
    questions_df : pandas.DataFrame
        Question metadata (see ``merge_data``).

    Returns
    -------
    tuple
        ``(train_df, questions_characteristics, users_characteristics)`` where
        ``train_df`` is indexed by ``row_id`` and holds the feature columns plus
        the ``answered_correctly`` label.
    """
    # Input processing.
    full_df = merge_data(train_data, questions_df)
    full_df = current_question_time(full_df)
    # Statistics.
    questions_characteristics = get_questions_characteristics(full_df)
    users_characteristics = get_users_characteristics(full_df)
    # Combined frame with statistics attached.
    full_df = get_combined_df(full_df, users_characteristics, questions_characteristics)
    # Training set.
    train_df = (
        full_df[['row_id', 'prior_question_elapsed_time', 'prior_question_had_explanation',
                 'answered_correctly', 'question_answered_correctly_mean', 'question_bundle_time_mean',
                 'user_answered_correctly_mean', 'user_elapsed_time_mean']]
        .set_index('row_id')
    )
    # Plain column assignment replaces the column, so the dtype really becomes int8.
    # `.loc[:, col] = ...` writes into the existing column on recent pandas and would
    # leave the nullable boolean dtype in place.
    train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype('int8')
    memory_usage(train_df, 'train_df')

    return train_df, questions_characteristics, users_characteristics

In [19]:
#train_df, questions_characteristics, users_characteristics = prepare_train(data, questions_df)

In [20]:
#train_df.isnull().sum()

# Подготовка тестовой выборки

In [21]:
def get_users_last_answers(data):
    """Return each user's most recent question rows, flagged with ``old_data=True``.

    For every user the largest ``timestamp`` among rows with
    ``content_type_id == 0`` is found, and all rows of ``data`` matching that
    (user_id, timestamp) pair are returned.
    """
    merge_cols = ['user_id', 'timestamp']

    # Latest question timestamp per user.
    question_rows = data[data['content_type_id'] == 0]
    users_last_timestamps = question_rows.groupby('user_id')['timestamp'].max().reset_index()
    users_last_timestamps.columns = merge_cols

    # Keep only the rows sitting on those timestamps and mark them as previously seen data.
    latest_rows = data.merge(users_last_timestamps, on=merge_cols, how='inner')
    return latest_rows.assign(old_data=True)

In [22]:
#last_users_answers = get_users_last_answers(data)

In [23]:
def prepare_test(test_data, last_users_answers, questions_df, users_characteristics, questions_characteristics):
    """Turn a batch of unseen interactions into a model-ready feature frame.

    Parameters
    ----------
    test_data : pandas.DataFrame
        New interaction rows. The frame is not modified.
    last_users_answers : pandas.DataFrame
        Each user's latest known rows (see ``get_users_last_answers``); they are
        prepended so ``current_question_time`` can look across the batch boundary
        and are dropped again afterwards.
    questions_df : pandas.DataFrame
        Question metadata (see ``merge_data``).
    users_characteristics, questions_characteristics : pandas.DataFrame
        Statistics from earlier data; updated with this batch.

    Returns
    -------
    tuple
        ``(test_df, questions_characteristics, users_characteristics,
        last_users_answers)``; ``test_df`` is indexed by ``row_id`` and holds the
        feature columns only.
    """
    # Input processing. assign() builds a flagged copy, so the caller's frame (often a
    # slice of a bigger frame) is neither mutated nor a source of SettingWithCopyWarning.
    test_data = pd.concat([last_users_answers, test_data.assign(old_data=False)], axis=0)
    full_df = merge_data(test_data, questions_df, data_type='test')
    full_df = current_question_time(full_df)
    # Keep only the rows of the new batch.
    full_df = full_df[~full_df['old_data']]
    # Update the statistics.
    questions_characteristics = get_questions_characteristics(full_df, questions_characteristics)
    users_characteristics = get_users_characteristics(full_df, users_characteristics)
    # Combined frame with statistics attached.
    full_df = get_combined_df(full_df, users_characteristics, questions_characteristics)
    # Test set.
    test_df = (
        full_df.loc[:,
                    ['row_id', 'prior_question_elapsed_time', 'prior_question_had_explanation',
                     'question_answered_correctly_mean', 'question_bundle_time_mean',
                     'user_answered_correctly_mean', 'user_elapsed_time_mean']]
        .set_index('row_id')
    )
    # Plain column assignment replaces the column, so the dtype really becomes int8.
    # `.loc[:, col] = ...` writes into the existing column on recent pandas and would
    # leave the nullable boolean dtype in place.
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].astype('int8')
    memory_usage(test_df, 'test_df')

    # NOTE(review): this is derived from the fully merged frame, so it already carries the
    # question-metadata and statistic columns; verify that feeding it back into
    # prepare_test() does not produce suffixed duplicate columns inside merge_data().
    last_users_answers = get_users_last_answers(full_df)
    memory_usage(last_users_answers, 'last_users_answers')

    return test_df, questions_characteristics, users_characteristics, last_users_answers

In [24]:
#test_df, questions_characteristics, users_characteristics, last_users_answers = prepare_test(test, last_users_answers, questions_df, users_characteristics, questions_characteristics)

# Обучение модели

In [25]:
# Prepare the source data: keep question interactions only (content_type_id == 0).
data = data.loc[data['content_type_id'] == 0]

# The merged frame is needed only to extract each user's latest answers.
merged_df = merge_data(data, questions_df)
last_users_answers = get_users_last_answers(merged_df)

# Release the large intermediate frame right away.
del merged_df
gc.collect()

merged_df use 1627.16 mb in memory


0

In [26]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn import metrics
import lightgbm as lgbm
from sklearn import metrics
import gc
import pickle

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [27]:
# Shared parameters
# -----------
# Seed handed to train_test_split below.
random_state=13
# Share of rows that goes to the hold-out split.
test_size=0.3
# Number of cross-validation folds; also the divisor when fold predictions are averaged.
n_splits = 2

In [28]:
# Hold out the last `test_size` share of rows; shuffle=False keeps the rows in their original order.
# NOTE(review): random_state presumably has no effect while shuffle=False — confirm against scikit-learn docs.
train_data, test_data = train_test_split(data, test_size=test_size, shuffle=False, random_state=random_state)

In [29]:
# Fit the statistics on the training part, then build the hold-out features from them.
train_df, q_chars, u_chars = prepare_train(train_data, questions_df)
last_answers = get_users_last_answers(train_data)
test_df, q_chars, u_chars, last_answers = prepare_test(
    test_data, last_answers, questions_df, u_chars, q_chars
)

# Attach the true labels to the hold-out features (matched on row_id) for scoring later on.
test_df = (
    test_df
    .reset_index()
    .merge(test_data.loc[:, ['row_id', 'answered_correctly']], on='row_id', how='inner')
    .set_index('row_id')
)

# The statistics were only needed to build test_df.
del q_chars, u_chars, last_answers
gc.collect()

merged_df use 1139.01 mb in memory
questions_characteristics use 0.62 mb in memory
users_characteristics use 3.13 mb in memory
combined_df use 1688.88 mb in memory
train_df use 746.25 mb in memory


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


merged_df use 572.82 mb in memory
questions_characteristics use 0.67 mb in memory
users_characteristics use 4.92 mb in memory
combined_df use 866.88 mb in memory
test_df use 378.73 mb in memory
last_users_answers use 5.86 mb in memory


0

In [30]:
# Stratified folds over the training part, seeded from the shared `random_state` constant
# instead of a second hard-coded copy of the same number.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Buffers for out-of-fold predictions (training part) and fold-averaged hold-out predictions.
oof = np.zeros(len(train_data))
predictions = np.zeros(len(test_data))

# NOTE: split() yields the folds lazily, so `skf_split` can be iterated only once.
skf_split = skf.split(X=train_data, y=train_data['answered_correctly'])

In [31]:
# Parameters handed to lgbm.train() for the binary `answered_correctly` model.
param = {
        # Tree shape and histogram binning
        'num_leaves': 80,
        'max_bin': 250,
        'min_data_in_leaf': 11,
        'learning_rate': 0.01,
        'min_sum_hessian_in_leaf': 0.00245,
        # Row / feature subsampling
        # NOTE(review): with bagging_fraction at 1.0 no rows are subsampled, so bagging_freq
        # presumably has no effect — confirm against the LightGBM parameter docs.
        'bagging_fraction': 1.0, 
        'bagging_freq': 5, 
        'feature_fraction': 0.05,
        # Regularisation
        'lambda_l1': 4.972,
        'lambda_l2': 2.276,
        'min_gain_to_split': 0.65,
        'max_depth': 14,
        'save_binary': True,
        # Seeds
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
        'data_random_seed': 1337,
        # Task definition and evaluation metric
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'auc',
        'is_unbalance': True,
        'boost_from_average': False,
        # Hardware
        # NOTE(review): the gpu_* ids are presumably ignored while device is 'cpu' — confirm.
        'device': 'cpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    }

In [32]:
%%time

# Training loop: for every fold the features are rebuilt from that fold's training rows only,
# a LightGBM model is fitted, its out-of-fold predictions are stored and its hold-out
# predictions are accumulated into a fold average.

label_col = 'answered_correctly'

# Start from clean buffers and a fresh split so the cell can be re-run on its own:
# a previously consumed split generator would silently yield zero folds, and buffers
# left over from an earlier run would be added to again.
oof = np.zeros(len(train_data))
predictions = np.zeros(len(test_data))
fold_indices = skf.split(X=train_data, y=train_data[label_col])

for counter, (train_index, valid_index) in enumerate(fold_indices, start=1):
    fold_train = train_data.iloc[train_index, :]
    fold_valid = train_data.iloc[valid_index, :]

    train_df, q_chars, u_chars = prepare_train(fold_train, questions_df)
    last_answers = get_users_last_answers(fold_train)
    valid_df, q_chars, u_chars, last_answers = prepare_test(fold_valid,
                                                           last_answers, questions_df,
                                                           u_chars, q_chars)
    # Attach the true labels to the validation features (matched on row_id).
    valid_df = (
        valid_df.reset_index()
        .merge(fold_valid[['row_id', label_col]], on='row_id', how='inner')
        .set_index('row_id')
    )
    # The raw fold slices are no longer needed; do not keep them alive during training.
    del fold_train, fold_valid

    print("==== Fold {} ====".format(counter))

    # One explicit feature list for train / validation / hold-out: the label never ends up
    # among the features and every matrix has the same columns in the same order.
    feature_cols = [col for col in train_df.columns if col != label_col]

    lgbm_train = lgbm.Dataset(data = train_df[feature_cols].values,
                              label = train_df[label_col].values,
                              free_raw_data = False)

    lgbm_valid = lgbm.Dataset(data = valid_df[feature_cols].values,
                              label = valid_df[label_col].values,
                              free_raw_data = False)

    lgbm_2 = lgbm.train(params = param, train_set = lgbm_train, valid_sets = [lgbm_valid],
                        early_stopping_rounds = 12, num_boost_round=100, verbose_eval=25)

    # Out-of-fold predictions for this fold's validation rows.
    oof[valid_index] = lgbm_2.predict(valid_df[feature_cols].values,
                                      num_iteration = lgbm_2.best_iteration)
    # Hold-out predictions, averaged over the folds.
    predictions += lgbm_2.predict(test_df[feature_cols].values,
                                  num_iteration = lgbm_2.best_iteration) / n_splits

merged_df use 569.50 mb in memory
questions_characteristics use 0.62 mb in memory
users_characteristics use 3.13 mb in memory
combined_df use 844.44 mb in memory
train_df use 373.12 mb in memory


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


merged_df use 664.55 mb in memory
questions_characteristics use 0.67 mb in memory
users_characteristics use 3.45 mb in memory
combined_df use 1011.36 mb in memory
test_df use 441.86 mb in memory
last_users_answers use 10.26 mb in memory
==== Fold 1 ====
Training until validation scores don't improve for 12 rounds
Early stopping, best iteration is:
[7]	valid_0's auc: 0.745307
merged_df use 569.50 mb in memory
questions_characteristics use 0.62 mb in memory
users_characteristics use 3.13 mb in memory
combined_df use 844.44 mb in memory
train_df use 373.12 mb in memory


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


merged_df use 664.55 mb in memory
questions_characteristics use 0.67 mb in memory
users_characteristics use 3.45 mb in memory
combined_df use 1011.36 mb in memory
test_df use 441.86 mb in memory
last_users_answers use 10.26 mb in memory
==== Fold 2 ====
Training until validation scores don't improve for 12 rounds
Early stopping, best iteration is:
[7]	valid_0's auc: 0.745018
CPU times: user 13min 41s, sys: 2min 11s, total: 15min 53s
Wall time: 3min 39s


In [33]:
# NOTE: despite the "CV" label, this scores the fold-averaged predictions on the hold-out
# split (test_df); the out-of-fold predictions stored in `oof` are not evaluated here.
print("CV ROC: {:<0.2f}".format(metrics.roc_auc_score(test_df['answered_correctly'], predictions)))

CV ROC: 0.72
