### Modeling. Stage 1.

#### Loading and preprocessing data.

In [1]:
import numpy as np
import pandas as pd
import time

from IPython.display import Markdown
def bold(string):
    display(Markdown(string))
    
    
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
%matplotlib inline

sns.set(style="darkgrid")


from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


from matplotlib.ticker import FuncFormatter


from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier

import gc
import sys
pd.set_option('display.max_rows', None)

# The module cannot be used anywhere else, it is meant to be used in kaggle kernels only. 
# riiideducation is specific to kaggle kernels for this competition only.
#import riiideducation

In [2]:
%%time

dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32", 
    "prior_question_had_explanation": "boolean"
}

data = pd.read_csv(r"\Data\train.csv", 
                   dtype=dtypes)

Wall time: 2min 21s


In [3]:
bold('**Shape of our train data:**')
display(data.shape)

**Shape of our train data:**

(101230332, 10)

In [4]:
data.head(10)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,False,1,3,1,,
1,1,56943,115,5716,False,2,2,1,37000.0,False
2,2,118363,115,128,False,0,0,1,55000.0,False
3,3,131167,115,7860,False,3,0,1,19000.0,False
4,4,137965,115,7922,False,4,1,1,11000.0,False
5,5,157063,115,156,False,5,2,1,5000.0,False
6,6,176092,115,51,False,6,0,1,17000.0,False
7,7,194190,115,50,False,7,3,1,17000.0,False
8,8,212463,115,7896,False,8,2,1,16000.0,False
9,9,230983,115,7863,False,9,0,1,16000.0,False


In [5]:
data = data.drop('row_id', 1)
data['prior_question_had_explanation'] = data['prior_question_had_explanation'].astype('boolean')

bold('**Memory usage by data:**')
display(data.memory_usage(deep=True))

data.head()

**Memory usage by data:**

Index                                   128
timestamp                         809842656
user_id                           404921328
content_id                        202460664
content_type_id                   202460664
task_container_id                 202460664
user_answer                       101230332
answered_correctly                101230332
prior_question_elapsed_time       404921328
prior_question_had_explanation    202460664
dtype: int64

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,115,5692,False,1,3,1,,
1,56943,115,5716,False,2,2,1,37000.0,False
2,118363,115,128,False,0,0,1,55000.0,False
3,131167,115,7860,False,3,0,1,19000.0,False
4,137965,115,7922,False,4,1,1,11000.0,False


In [6]:
%%time

questions = pd.read_csv(r"\Data\questions.csv")
lectures = pd.read_csv(r"\Data\lectures.csv")
example_test = pd.read_csv(r"\Data\example_test.csv")
example_sample_submission = pd.read_csv(r"\Data\example_sample_submission.csv")

Wall time: 27.4 ms


In [7]:
questions['tags'] = questions['tags'].astype(str)

tags = [x.split() \
                for x in questions[questions['tags'] != "nan"] \
                                                                ['tags']. \
                                                                values]
tags = [item \
            for elem in tags \
                for item in elem]
tags = set(tags)
tags = list(tags)

In [8]:
questions['tags'] = questions['tags'].astype(str)

questions['tag'] = questions['tags']. \
                                     str. \
                                     split(' ')
questions = questions.explode('tag')
questions = pd.merge(
                     questions, 
                     questions. \
                               groupby('question_id')['tag']. \
                               count(). \
                               reset_index(), 
                     on = 'question_id'
                    )

questions_split = questions.copy()
questions_split.columns = [
                           'question_id', 
                           'bundle_id', 
                           'correct_answer', 
                           'part', 
                           'tags',
                           'splitted_tag', 
                           'number_of_tags'
                          ]
questions_split = questions_split.drop_duplicates()

questions_split.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,splitted_tag,number_of_tags
0,0,0,0,1,51 131 162 38,51,4
1,0,0,0,1,51 131 162 38,131,4
2,0,0,0,1,51 131 162 38,162,4
3,0,0,0,1,51 131 162 38,38,4
4,1,1,1,1,131 36 81,131,3


In [9]:
questions = questions.drop(['tag_x'], axis=1)
questions.columns = [
                     'question_id', 
                     'bundle_id', 
                     'correct_answer', 
                     'part', 
                     'tags', 
                     'number_of_tags'
                    ]
questions = questions.drop_duplicates()

questions.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,number_of_tags
0,0,0,0,1,51 131 162 38,4
4,1,1,1,1,131 36 81,3
7,2,2,0,1,131 101 162 92,4
11,3,3,0,1,131 149 162 29,4
15,4,4,3,1,131 5 162 38,4


In [10]:
tags_list = [x.split() \
                     for x in questions.tags.values]
questions['tags_list'] = tags_list

questions.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,number_of_tags,tags_list
0,0,0,0,1,51 131 162 38,4,"[51, 131, 162, 38]"
4,1,1,1,1,131 36 81,3,"[131, 36, 81]"
7,2,2,0,1,131 101 162 92,4,"[131, 101, 162, 92]"
11,3,3,0,1,131 149 162 29,4,"[131, 149, 162, 29]"
15,4,4,3,1,131 5 162 38,4,"[131, 5, 162, 38]"


In [11]:
correct = data[data['answered_correctly'] != -1]. \
                                                 groupby(['content_id', 
                                                          'answered_correctly'], 
                                                          as_index = False). \
                                                 size()
correct = correct.pivot(index = 'content_id', 
                        columns = 'answered_correctly', 
                        values = 'size')
correct.columns = ['Num_of_Wrong', 'Num_of_Right']
correct = correct.fillna(0)
correct[['Num_of_Wrong', 'Num_of_Right']] = correct[['Num_of_Wrong', 
                                                     'Num_of_Right']]. \
                                                                      astype(int)

questions = questions.merge(correct, 
                            left_on = 'question_id', 
                            right_on = 'content_id',
                            how = "left")

bold('**Answers were wrong and right per question_id (per content_id in train data):**')
questions.head(6)

**Answers were wrong and right per question_id (per content_id in train data):**

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,number_of_tags,tags_list,Num_of_Wrong,Num_of_Right
0,0,0,0,1,51 131 162 38,4,"[51, 131, 162, 38]",637,6266
1,1,1,1,1,131 36 81,3,"[131, 36, 81]",809,6589
2,2,2,0,1,131 101 162 92,4,"[131, 101, 162, 92]",20015,24890
3,3,3,0,1,131 149 162 29,4,"[131, 149, 162, 29]",5067,17906
4,4,4,3,1,131 5 162 38,4,"[131, 5, 162, 38]",12275,19461
5,5,5,2,1,131 149 162 81,4,"[131, 149, 162, 81]",1344,8383


In [12]:
%%time
tags_df = pd.DataFrame()

for x in range(len(tags)):
    df = questions[
                   questions['tags']. \
                                     apply(lambda l: tags[x] in l)
                  ]
    df_tmp = df.agg({'Num_of_Wrong': ['sum'], 
                     'Num_of_Right': ['sum']})
    df_tmp['Total_questions'] = df_tmp['Num_of_Wrong'] + df_tmp['Num_of_Right']
    df_tmp['Num_of_question_ids_with_tag'] = len(df)
    df_tmp['tag'] = tags[x]
    df_tmp = df_tmp.set_index('tag')
    tags_df = tags_df.append(df_tmp)

    
tags_df[['Num_of_Wrong', 
         'Num_of_Right', 
         'Total_questions']] = tags_df[['Num_of_Wrong', 
                                        'Num_of_Right', 
                                        'Total_questions']]. \
                                                             astype(int)
tags_df['Percent_correct'] = tags_df['Num_of_Right'] / \
                                                       tags_df['Total_questions']
tags_df = tags_df.sort_values(by = "Percent_correct")

tags_df.head(6)

Wall time: 915 ms


Unnamed: 0_level_0,Num_of_Wrong,Num_of_Right,Total_questions,Num_of_question_ids_with_tag,Percent_correct
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
151,272267,264913,537180,16,0.493155
24,258085,253221,511306,29,0.495244
23,540798,532011,1072809,40,0.495905
167,170386,170681,341067,11,0.500432
108,234504,238296,472800,20,0.50401
67,983588,1138475,2122063,338,0.536494


In [13]:
batch_lect = data.groupby(["task_container_id", 
                           "answered_correctly"]). \
                  size(). \
                  unstack()
batch_lect.columns = ['Lecture', 
                      'Num_of_Wrong', 
                      'Num_of_Right']
batch_lect['Lecture'] = batch_lect['Lecture'].fillna(0)
batch_lect = batch_lect.astype('Int64')

batch_lect['Percent_correct'] = batch_lect['Num_of_Right'] / \
                                    (batch_lect['Num_of_Wrong'] + \
                                    batch_lect['Num_of_Right'])
batch_lect['Percent_lecture'] = batch_lect['Lecture'] / \
                                    (batch_lect['Lecture'] + \
                                     batch_lect['Num_of_Wrong'] + \
                                     batch_lect['Num_of_Right'])
batch_lect = batch_lect.sort_values(by = "Percent_lecture", 
                                    ascending = False)


bold("**The highest number of lectures watched within a single task_container_id is**")
display(
        batch_lect['Lecture'].max()
)

**The highest number of lectures watched within a single task_container_id is**

5143

In [14]:
batch_lect.head(6)

Unnamed: 0_level_0,Lecture,Num_of_Wrong,Num_of_Right,Percent_correct,Percent_lecture
task_container_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9096,8,57,191,0.770161,0.03125
270,2265,26105,52352,0.66727,0.028059
477,1428,16166,33427,0.674027,0.027988
253,2343,27380,54798,0.666821,0.027721
351,1835,21225,43158,0.670332,0.027711
422,1574,18067,37234,0.673297,0.027675


In [15]:
batch_lect['Has_lecture'] = np.where(
                                     batch_lect['Lecture'] == 0, 
                                     False, 
                                     True
                                    )

batch_lect.head(6)

Unnamed: 0_level_0,Lecture,Num_of_Wrong,Num_of_Right,Percent_correct,Percent_lecture,Has_lecture
task_container_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9096,8,57,191,0.770161,0.03125,True
270,2265,26105,52352,0.66727,0.028059,True
477,1428,16166,33427,0.674027,0.027988,True
253,2343,27380,54798,0.666821,0.027721,True
351,1835,21225,43158,0.670332,0.027711,True
422,1574,18067,37234,0.673297,0.027675,True


In [16]:
batch_lect = batch_lect[['Num_of_Wrong', 
                         'Num_of_Right', 
                         'Has_lecture']]
batch_lect = batch_lect.groupby("Has_lecture"). \
                        sum()
batch_lect['Percent_correct'] = batch_lect['Num_of_Right'] / \
                                    (batch_lect['Num_of_Wrong'] + \
                                     batch_lect['Num_of_Right'])
batch_lect = batch_lect[['Percent_correct']]

batch_lect

Unnamed: 0_level_0,Percent_correct
Has_lecture,Unnamed: 1_level_1
False,0.736412
True,0.657087


---

#### Baseline model.

In [17]:
features_df = data.iloc[ : int(
                               9 / 10 * len(data)
                              )]
train_df = data.iloc[int(
                         9 / 10 * len(data)
                        ) : ]

In [18]:
bold('**Shapes of our datasets:**')
print('data:        ', data.shape, '\n',
      'train_df:    ', train_df.shape, '\n',
      'features_df: ', features_df.shape, '\n',
      sep = '')

**Shapes of our datasets:**

data:        (101230332, 9)
train_df:    (10123034, 9)
features_df: (91107298, 9)



In [19]:
%%time
train_questions_df = features_df[
                                 features_df['answered_correctly'] != -1
                                ]
grouped_by_user_df = train_questions_df.groupby('user_id')

user_answers_df = grouped_by_user_df.agg(
                                         {'answered_correctly': [
                                                                 'count', 
                                                                 'mean', 
                                                                 'median', 
                                                                 'std', 
                                                                 'skew'
                                                                ]}
                                        ). \
                                     copy()
user_answers_df.columns = [
                           'questions_answered', 
                           'mean_user_accuracy', 
                           'median_user_accuracy', 
                           'std_user_accuracy', 
                           'skew_user_accuracy'
                          ]


user_answers_df.head(10)

Wall time: 42.3 s


Unnamed: 0_level_0,questions_answered,mean_user_accuracy,median_user_accuracy,std_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
115,46,0.695652,1.0,0.465215,-0.879359
124,30,0.233333,0.0,0.430183,1.328338
2746,19,0.578947,1.0,0.507257,-0.347892
5382,125,0.672,1.0,0.471374,-0.741648
8623,109,0.642202,1.0,0.481566,-0.601619
8701,17,0.588235,1.0,0.5073,-0.394244
12741,265,0.573585,1.0,0.495491,-0.299277
13134,1243,0.706356,1.0,0.455614,-0.907294
24418,6283,0.690275,1.0,0.462417,-0.823222
24600,50,0.34,0.0,0.478518,0.696595


In [20]:
%%time
grouped_by_content_df = train_questions_df.groupby('content_id')

content_answers_df = grouped_by_content_df.agg(
                                               {'answered_correctly': [
                                                                       'count', 
                                                                       'mean', 
                                                                       'median',
                                                                       'std',  
                                                                       'skew'
                                                                      ]}
                                              ). \
                                           copy()
content_answers_df.columns = [
                              'question_asked', 
                              'mean_accuracy', 
                              'median_accuracy', 
                              'std_accuracy', 
                              'skew_accuracy'
                             ]

content_answers_df.head(10)

Wall time: 14.6 s


Unnamed: 0_level_0,question_asked,mean_accuracy,median_accuracy,std_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,6236,0.908595,1.0,0.288207,-2.836339
1,6684,0.891682,1.0,0.310805,-2.521185
2,40499,0.554656,1.0,0.49701,-0.219949
3,20734,0.779348,1.0,0.414696,-1.347371
4,28549,0.613226,1.0,0.48702,-0.465009
5,8748,0.862711,1.0,0.344171,-2.10822
6,51074,0.47488,0.0,0.499373,0.100612
7,14892,0.86711,1.0,0.339467,-2.163149
8,7700,0.906234,1.0,0.291522,-2.787708
9,42591,0.305511,0.0,0.460629,0.844492


In [21]:
del features_df
del grouped_by_user_df
del grouped_by_content_df

In [22]:
features = [
            'mean_user_accuracy', 
            'questions_answered',
            'std_user_accuracy', 
            'median_user_accuracy',
            'skew_user_accuracy',
            'mean_accuracy', 
            'question_asked',
            'std_accuracy', 
            'median_accuracy',
            'prior_question_elapsed_time', 
            'prior_question_had_explanation',
            'skew_accuracy'
           ]

target = 'answered_correctly'

In [23]:
train_df = train_df[
                    train_df[target] != -1
                   ]

train_df = train_df.merge(user_answers_df, 
                          how = 'left', 
                          on = 'user_id')
train_df = train_df.merge(content_answers_df, 
                          how = 'left', 
                          on = 'content_id')

train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation']. \
                                                                                        fillna(value = False). \
                                                                                        astype(bool)
train_df = train_df.fillna(value = 0.5)

train_df.head(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,questions_answered,mean_user_accuracy,median_user_accuracy,std_user_accuracy,skew_user_accuracy,question_asked,mean_accuracy,median_accuracy,std_accuracy,skew_accuracy
0,8231964660,1933715875,11259,False,3188,2,0,13000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,1291.0,0.532146,1.0,0.499159,-0.128999
1,8232002976,1933715875,4957,False,3189,2,1,44000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,2548.0,0.584772,1.0,0.492858,-0.344273
2,8232096407,1933715875,5113,False,3190,0,1,22000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,1960.0,0.603571,1.0,0.48928,-0.423795
3,8232119872,1933715875,4699,False,3191,1,1,74000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,2504.0,0.694888,1.0,0.460547,-0.847011
4,8232142930,1933715875,11430,False,3192,3,1,9000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,1922.0,0.765869,1.0,0.423565,-1.256695
5,8232271384,1933715875,8217,False,3193,2,0,13000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,1817.0,0.604843,1.0,0.489019,-0.429262
6,8232333990,1933715875,5293,False,3194,3,1,88000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,2474.0,0.67017,1.0,0.470246,-0.724333
7,8232357667,1933715875,3840,False,3195,0,1,21000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,2171.0,0.586366,1.0,0.492598,-0.350977
8,8232374283,1933715875,11436,False,3196,1,1,14000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,1171.0,0.922289,1.0,0.267831,-3.158789
9,8232435220,1933715875,5353,False,3197,1,0,7000.0,True,5219.0,0.779843,1.0,0.414392,-1.351136,4040.0,0.52995,1.0,0.499164,-0.120062


* Переназвать фичи, а то не понятны

In [24]:
train_df = train_df[
                    features + [target]
                   ]
train_df = train_df.replace([np.inf, -np.inf], np.nan)
train_df = train_df.fillna(0.5)

train_df.head(10)

Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,prior_question_elapsed_time,prior_question_had_explanation,skew_accuracy,answered_correctly
0,0.779843,5219.0,0.414392,1.0,-1.351136,0.532146,1291.0,0.499159,1.0,13000.0,True,-0.128999,0
1,0.779843,5219.0,0.414392,1.0,-1.351136,0.584772,2548.0,0.492858,1.0,44000.0,True,-0.344273,1
2,0.779843,5219.0,0.414392,1.0,-1.351136,0.603571,1960.0,0.48928,1.0,22000.0,True,-0.423795,1
3,0.779843,5219.0,0.414392,1.0,-1.351136,0.694888,2504.0,0.460547,1.0,74000.0,True,-0.847011,1
4,0.779843,5219.0,0.414392,1.0,-1.351136,0.765869,1922.0,0.423565,1.0,9000.0,True,-1.256695,1
5,0.779843,5219.0,0.414392,1.0,-1.351136,0.604843,1817.0,0.489019,1.0,13000.0,True,-0.429262,0
6,0.779843,5219.0,0.414392,1.0,-1.351136,0.67017,2474.0,0.470246,1.0,88000.0,True,-0.724333,1
7,0.779843,5219.0,0.414392,1.0,-1.351136,0.586366,2171.0,0.492598,1.0,21000.0,True,-0.350977,1
8,0.779843,5219.0,0.414392,1.0,-1.351136,0.922289,1171.0,0.267831,1.0,14000.0,True,-3.158789,1
9,0.779843,5219.0,0.414392,1.0,-1.351136,0.52995,4040.0,0.499164,1.0,7000.0,True,-0.120062,0


In [25]:
train_df, test_df = train_test_split(
                                     train_df, 
                                     random_state = 666, 
                                     test_size = 0.28
                                    )

In [26]:
def create_model(trial):
    
    num_leaves = trial.suggest_int("num_leaves", 
                                   2, 
                                   31)
    n_estimators = trial.suggest_int("n_estimators", 
                                     50, 
                                     300)
    max_depth = trial.suggest_int('max_depth', 
                                  3, 
                                  8)
    min_child_samples = trial.suggest_int('min_child_samples', 
                                          100, 
                                          1200)
    learning_rate = trial.suggest_uniform('learning_rate', 
                                          0.0001, 
                                          0.99)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 
                                         5, 
                                         90)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 
                                             0.0001, 
                                             1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 
                                             0.0001, 
                                             1.0)
    
    model = LGBMClassifier(
                           num_leaves = num_leaves,
                           n_estimators = n_estimators, 
                           max_depth = max_depth, 
                           min_child_samples = min_child_samples, 
                           min_data_in_leaf = min_data_in_leaf,
                           learning_rate = learning_rate,
                           feature_fraction = feature_fraction,
                           random_state = 666
                          )
    
    return model

In [27]:
def objective(trial):
    
    model = create_model(trial)
    model.fit(train_df[features], 
              train_df[target])
    
    score = roc_auc_score(
                          test_df[target].values, 
                          model.predict_proba(
                                              test_df[features]
                                             )[:, 1]
                        )
    
    return score

In [28]:
%%time
params = {
          'bagging_fraction': 0.5817242323514327,
          'feature_fraction': 0.6884588361650144,
          'learning_rate': 0.42887924851375825, 
          'max_depth': 6,
          'min_child_samples': 946, 
          'min_data_in_leaf': 47, 
          'n_estimators': 169,
          'num_leaves': 29,
          'random_state': 666
        }

model = LGBMClassifier(**params)
model.fit(train_df[features], 
          train_df[target])

print('LGB score: ', 
      roc_auc_score(
                    test_df[target].values, 
                    model.predict_proba(
                                        test_df[features]
                                       )[:, 1]
                   ))

LGB score:  0.7217489630782472
Wall time: 40.9 s


In [29]:
#env = riiideducation.make_env()

In [30]:
%%time
#adding user features
user_df = data[data['answered_correctly'] != -1]. \
                                                 groupby('user_id'). \
                                                 agg(
                                                     {'answered_correctly': ['count', 
                                                                             'mean']}
                                                    ). \
                                                 reset_index()
user_df.columns = ['user_id', 
                   'user_questions', 
                   'user_mean']


user_lect = data.groupby(
                         ["user_id", 
                          "answered_correctly"]
                        ). \
                 size(). \
                 unstack()
user_lect.columns = ['Lecture', 
                     'Num_of_Wrong', 
                     'Num_of_Right']
user_lect['Lecture'] = user_lect['Lecture']. \
                                            fillna(0)
user_lect = user_lect.astype('Int64')
user_lect['watches_lecture'] = np.where(
                                        user_lect['Lecture'] > 0, 
                                        1, 
                                        0
                                       )
user_lect = user_lect.reset_index()
user_lect = user_lect[['user_id', 
                       'watches_lecture']]

user_df = user_df.merge(user_lect, 
                        how = "left",
                        on = "user_id", )
del user_lect

user_df.head(10)

Wall time: 20.4 s


Unnamed: 0,user_id,user_questions,user_mean,watches_lecture
0,115,46,0.695652,0
1,124,30,0.233333,0
2,2746,19,0.578947,1
3,5382,125,0.672,1
4,8623,109,0.642202,1
5,8701,17,0.588235,0
6,12741,265,0.573585,1
7,13134,1243,0.706356,1
8,24418,6283,0.690275,1
9,24600,50,0.34,0


In [31]:
%%time
#adding content features
content_df = data[data.answered_correctly != -1]. \
                                                 groupby('content_id'). \
                                                 agg(
                                                     {'answered_correctly': ['count', 
                                                                             'mean',
                                                                             'median']}
                                                    ). \
                                                 reset_index()
content_df.columns = ['content_id', 
                      'content_questions', 
                      'content_mean',
                      'content_median']

content_df.head(6)

Wall time: 14.5 s


Unnamed: 0,content_id,content_questions,content_mean,content_median
0,0,6903,0.907721,1.0
1,1,7398,0.890646,1.0
2,2,44905,0.554281,1.0
3,3,22973,0.779437,1.0
4,4,31736,0.613215,1.0
5,5,9727,0.861828,1.0


In [None]:
%%time
#using one of the validation sets composed by tito
cv2_train = pd.read_pickle("../input/riiid-cross-validation-files/cv2_train.pickle")['row_id']
cv2_valid = pd.read_pickle("../input/riiid-cross-validation-files/cv2_valid.pickle")['row_id']

---