<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/dataset_for_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dataset for Transformer

In [143]:
%%capture
# Install gcsfs (imported in the next cell; the CSVs below are read from gs:// paths).
# %%capture suppresses pip's output for this cell.
!pip install gcsfs

In [144]:
import gcsfs
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, IterableDataset
from torch.utils.data import DataLoader

In [145]:
# GCS prefix under which train.csv, questions.csv and lectures.csv are read below.
DATA_GCS_PATH = 'gs://kds-f053d4580bc3a669abc506205db4de0fe88fec05714d876ac14309ac'

In [146]:
# Column dtypes for train.csv; passed to pd.read_csv below.
dtypes_train = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
    }

# Column dtypes for questions.csv. The previous "" placeholders are not valid
# dtypes and would raise if handed to pd.read_csv.
# NOTE(review): widths are inferred from the sample rows shown in this
# notebook - confirm against the full file before relying on them.
dtypes_questions = {
    "question_id": "int32",
    "bundle_id": "int32",
    "correct_answer": "int8",
    "part": "int8",
    "tags": "object",  # space-separated tag ids stored as text
}

# Column dtypes for lectures.csv (same caveat as above).
dtypes_lectures = {
    "lecture_id": "int32",
    "part": "int8",
    "tag": "int16",  # sample rows already exceed the int8 range (e.g. 159)
    "type_of": "object",
}

In [147]:
# Load only the first million interactions to keep the notebook fast.
# nrows is documented as an int, so use an int literal instead of the float 1e6.
train_df = pd.read_csv(DATA_GCS_PATH + "/train.csv", dtype=dtypes_train, nrows=1_000_000)
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False


In [148]:
# Question metadata, read with pandas' inferred dtypes.
questions_df = pd.read_csv(f"{DATA_GCS_PATH}/questions.csv")
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [149]:
# Lecture metadata, read with pandas' inferred dtypes.
lectures_df = pd.read_csv(f"{DATA_GCS_PATH}/lectures.csv")
lectures_df.head()

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


Let's remove the lecture interactions for all users from train. We may want to add them back later.

In [150]:
# Rows with answered_correctly == -1 are dropped; everything else is kept.
print(f"DF len: {len(train_df)}")
not_lecture = train_df["answered_correctly"].ne(-1)
train_df = train_df.loc[not_lecture]
print(f"After removing lecture interations df len: {len(train_df)}")

DF len: 1000000
After removing lecture interations df len: 980093


### Cleaning and Merging the Questions Dataframe with Train


*   Let's join `questions_df` to `train_df` using `content_id` = `question_id`.
*   Drop the columns we are not using.
*   `prior_question_elapsed_time` is in ms - let's convert it to minutes.
*   Let's add 1 to all the indicator columns - we need 0 for padding.



In [151]:
# Join on the actual key: content_id (train) == question_id (questions).
# DataFrame.join(on=...) matches against the *index* of the other frame, so
# index questions_df by question_id explicitly instead of relying on its
# default positional index happening to equal question_id.
# drop=False keeps the question_id column so the joined columns are unchanged.
assert questions_df["question_id"].is_unique, "question_id must be unique for a many-to-one join"
questions_by_id = questions_df.set_index("question_id", drop=False)
train_df = train_df.join(questions_by_id, on="content_id")
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags
0,0,0,115,5692,0,1,3,1,,,5692,5692,3,5,151
1,1,56943,115,5716,0,2,2,1,37000.0,False,5716,5716,2,5,168
2,2,118363,115,128,0,0,0,1,55000.0,False,128,128,0,1,131 149 92
3,3,131167,115,7860,0,3,0,1,19000.0,False,7860,7860,0,1,131 104 81
4,4,137965,115,7922,0,4,1,1,11000.0,False,7922,7922,1,1,131 149 92


Column Drops

In [152]:
# Columns dropped from the joined frame.
drop_cols = [
    "row_id",
    "timestamp",
    "content_type_id",
    "user_answer",
    "prior_question_had_explanation",
    "question_id",
    "bundle_id",
    "correct_answer",
    "tags",
]
train_df = train_df.drop(columns=drop_cols)
train_df.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,part
0,115,5692,1,1,,5
1,115,5716,2,1,37000.0,5
2,115,128,0,1,55000.0,1
3,115,7860,3,1,19000.0,1
4,115,7922,4,1,11000.0,1


Indicator columns - add 1 since we use 0 for padding

In [153]:
# Shift every id-like column up by one so that 0 stays free for padding.
# Re-running this cell shifts the ids again, so run it exactly once.
indicator_cols = ["content_id", "task_container_id", "part"]
train_df[indicator_cols] = train_df[indicator_cols] + 1
train_df.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,part
0,115,5693,2,1,,6
1,115,5717,3,1,37000.0,6
2,115,129,1,1,55000.0,2
3,115,7861,4,1,19000.0,2
4,115,7923,5,1,11000.0,2


In [154]:
# Row count per part id (values already include the +1 shift applied above).
train_df["part"].value_counts()

6    403240
3    190731
7    108567
4     82175
5     75997
2     69411
8     49972
Name: part, dtype: int64

Fill NaN with 0, then convert from milliseconds to minutes.

In [155]:
# Missing elapsed times become 0; 60000 ms per minute converts ms -> minutes.
train_df["prior_question_elapsed_time"] = (
    train_df["prior_question_elapsed_time"]
    .fillna(0)
    .astype(np.float32)
    .div(60000)
)
train_df.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,part
0,115,5693,2,1,0.0,6
1,115,5717,3,1,0.616667,6
2,115,129,1,1,0.916667,2
3,115,7861,4,1,0.316667,2
4,115,7923,5,1,0.183333,2


#### Group by User

We are building a sequence dataset in which each user's interactions form one sequence.

In [156]:
# Group the interactions by user; the length of the groupby is the number of users.
user_groups = train_df.groupby("user_id")
len(user_groups)

3824

A sample user and user interactions

In [157]:
# Iterating a groupby yields (key, sub-frame) pairs; take the first one as a sample.
first_user, first_user_df = next(iter(user_groups))
print(f"User: {first_user}")
first_user_df.head()

User: 115


Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,part
0,115,5693,2,1,0.0,6
1,115,5717,3,1,0.616667,6
2,115,129,1,1,0.916667,2
3,115,7861,4,1,0.316667,2
4,115,7923,5,1,0.183333,2


In [158]:
# Number of interactions recorded for the sample user.
len(first_user_df)

46

So this can be considered a sequence of 46 observations, albeit a multivariate one.

### PyTorch IterableDataset

In [159]:
# Sliding windows over the first axis of a 2D array, returned as a 3D view.
# 1D intuition: [1, 2, 3, 4] with w = 2 -> [[1, 2], [2, 3], [3, 4]]; here each
# element is a whole feature row, so (m, n) becomes (m - w + 1, w, n).
def rolling_window(a, w):
    """Return every length-``w`` window of consecutive rows of ``a``.

    Args:
        a: 2D array-like of shape (m, n). Lists and DataFrames are converted
            with ``np.asarray``.
        w: Window length in rows; must satisfy ``1 <= w <= m + 1``.

    Returns:
        Array of shape (m - w + 1, w, n) where ``out[i, j] == a[i + j]``.
        The result is a strided view: windows overlap in memory, so writing
        to one element changes every window that contains that row. Copy the
        result before modifying it in place.

    Raises:
        ValueError: if ``a`` is not 2D, or ``w`` is outside the valid range.
    """
    a = np.asarray(a)
    if a.ndim != 2:
        raise ValueError(f"rolling_window expects a 2D array, got {a.ndim}D")

    m, n = a.shape
    num_windows = m - w + 1
    if w < 1 or num_windows < 0:
        raise ValueError(
            f"window length must be between 1 and {m + 1} for {m} rows, got {w}"
        )

    # Reusing the row stride for both the window axis and the within-window
    # axis is what makes consecutive windows start one row apart.
    s0, s1 = a.strides
    return np.lib.stride_tricks.as_strided(
        a,
        shape=(num_windows, w, n),
        strides=(s0, s0, s1)
    )

def make_timeseries(x, window_length):
  """
  Build one left-zero-padded window per row of x.

  x - shape (seq_len, features)

  Returns an array of shape (seq_len, window_length, features): window i ends
  at row i of x and is filled with zeros on the left where there is no history.
  """
  # window_length - 1 rows of zeros in front give every original row a full window
  left_pad = window_length - 1
  padded = np.pad(x, ((left_pad, 0), (0, 0)), constant_values=0)

  return rolling_window(padded, window_length)

def add_features_to_user_df(user_df):
  """
  Replace answered_correctly with the previous row's answer, shifted by +1.

  The first row has no previous answer and gets the start-of-sequence value 3
  (fill value 2, plus 1). With 0/1 answers the other rows become 1 or 2, so 0
  stays unused. user_df is modified in place and also returned.
  """
  previous_answer = user_df["answered_correctly"].shift(fill_value=2)
  user_df["answered_correctly"] = previous_answer + 1
  return user_df

class RiidDataset(IterableDataset):
  """
  Iterable dataset yielding one (x, y) pair of windowed arrays per user.

  user_groups   - iterable of (user_id, user_df) pairs, e.g. a pandas groupby
  window_length - number of time steps in each window

  For a user with seq_len rows, x has shape (seq_len, window_length, n_columns)
  and is built from every column of user_df after add_features_to_user_df
  (so answered_correctly holds the shifted previous answer); y has shape
  (seq_len, window_length, 1) and holds the unshifted answered_correctly.

  NOTE(review): y is left-padded with 0, the same value as an incorrect
  answer - confirm that padded positions are masked downstream.
  """

  def __init__(self, user_groups, window_length=100):
    self.user_groups = user_groups
    self.window_length = window_length

  def __iter__(self):
    # Local import so the notebook's import cell does not need to change.
    from torch.utils.data import get_worker_info

    # Each DataLoader worker holds its own copy of an IterableDataset. Without
    # sharding, num_workers > 0 would make every worker yield every user.
    # Outside a worker (worker is None) all users are yielded, as before.
    worker = get_worker_info()
    num_shards = 1 if worker is None else worker.num_workers
    shard_id = 0 if worker is None else worker.id

    for group_idx, (user_id, user_df) in enumerate(self.user_groups):
      if group_idx % num_shards != shard_id:
        continue

      # work on a copy: add_features_to_user_df mutates its argument
      user_df = user_df.copy()
      # take the labels before answered_correctly is overwritten with the shifted feature
      y = user_df["answered_correctly"].to_numpy().copy()
      x = add_features_to_user_df(user_df)

      x = make_timeseries(x, self.window_length)
      y = make_timeseries(np.expand_dims(y, axis=1), self.window_length)
      yield x, y

In [160]:
# Build the dataset over the per-user groups with the default window_length of 100.
riid_ds = RiidDataset(user_groups)

In [166]:
# Pull the first user's (x, y) pair from the dataset.
x, y = next(iter(riid_ds))

# x is (seq_len, window_length, n_columns); y has a single label column.
x.shape, y.shape

((46, 100, 6), (46, 100, 1))

In [164]:
# Look at the last rows of the first user's group.
# The key is bound to sample_user_id rather than `id`, which would shadow the
# built-in id() for the rest of the kernel session.
sample_user_id, df = next(iter(user_groups))
df.tail()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,part
41,115,2065,41,1,0.283333,4
42,115,2064,41,0,0.283333,4
43,115,3364,42,0,0.238883,5
44,115,3366,42,0,0.238883,5
45,115,3365,42,1,0.238883,5
