<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/PEBG_for_riiid_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PEBG - Pretraining Question and Skill (Tag) Embeddings

### Original

Paper: https://arxiv.org/pdf/2012.05031v1.pdf  
GitHub: https://github.com/lyf-1/PEBG

In [1]:
%%capture
!pip install gcsfs
!pip install kaggle
!pip install datatable

In [3]:
import os
import itertools
import pandas as pd
import numpy as np
from scipy import sparse

In [4]:
DATA_PATH = 'gs://kds-7cd35ed419a621f754ec32f0c3616d2e9282a698c5eeaabc814bd7f6'

In [5]:
questions_df = pd.read_csv(DATA_PATH + "/questions.csv")
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [6]:
questions_df.describe()

Unnamed: 0,question_id,bundle_id,correct_answer,part
count,13523.0,13523.0,13523.0,13523.0
mean,6761.0,6760.510907,1.455298,4.264956
std,3903.89818,3903.857783,1.149707,1.652553
min,0.0,0.0,0.0,1.0
25%,3380.5,3379.5,0.0,3.0
50%,6761.0,6761.0,1.0,5.0
75%,10141.5,10140.0,3.0,5.0
max,13522.0,13522.0,3.0,7.0


In [7]:
questions_df[questions_df["tags"].isnull()]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
10033,10033,10033,2,6,


In [8]:
# Only `tags` has a missing value (question 10033). Fill just that column with the
# "-1" placeholder tag so no other column can be silently filled with a string.
questions_df["tags"] = questions_df["tags"].fillna("-1")

In [9]:
f"{len(questions_df)} records"

'13523 records'

In [10]:
lectures_df = pd.read_csv(DATA_PATH + "/lectures.csv")
lectures_df.head()

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


In [11]:
lectures_df.describe()

Unnamed: 0,lecture_id,tag,part
count,418.0,418.0,418.0
mean,16983.401914,94.480861,4.267943
std,9426.16466,53.586487,1.872424
min,89.0,0.0,1.0
25%,9026.25,50.25,2.0
50%,17161.5,94.5,5.0
75%,24906.25,140.0,6.0
max,32736.0,187.0,7.0


In [12]:
lectures_df.type_of.value_counts()

concept             222
solving question    186
intention             7
starter               3
Name: type_of, dtype: int64

#### Question - Skill Relationship

Question IDs

In [13]:
# Map each raw question_id to a dense index that starts at 1; index 0 is kept free for padding.
questions = sorted(questions_df["question_id"].values)
question_codes, _ = pd.factorize(questions, sort=True)
question_ids_dict = {
    question: code + 1  # +1 so the first real question gets index 1
    for question, code in zip(questions, question_codes)
}

In [14]:
{k:v for k, v in itertools.islice(question_ids_dict.items(), 5)}

{0: 1, 1: 2, 2: 3, 3: 4, 4: 5}

Skill IDs

In [15]:
from collections import Counter

# Count how many questions carry each tag; `tags` is a space-separated string of tag ids.
skill_count = Counter(
    int(tag)
    for tags in questions_df["tags"]
    for tag in tags.split(" ")
)

skill_count.most_common(5)

[(92, 2269), (38, 2256), (81, 1969), (29, 1707), (136, 1033)]

In [16]:
# Map each tag to a dense index that starts at 1; index 0 is kept free for padding.
skills = sorted(skill_count)
skill_codes, _ = pd.factorize(skills, sort=True)
skill_ids_dict = {
    skill_tag: code + 1
    for skill_tag, code in zip(skills, skill_codes)
}

In [17]:
{k:v for k, v in itertools.islice(skill_ids_dict.items(), 5)}

{-1: 1, 0: 2, 1: 3, 2: 4, 3: 5}

##### Questions - Skill Relationship - Adjacency matrix

In [18]:
# Build both directions of the question <-> skill relation in a single pass.
# Skills stay as the raw tag strings here; they are converted with int() when indexing matrices.
question_skills_map = {}
skill_questions_map = {}

for qid, tags in zip(questions_df["question_id"], questions_df["tags"]):
  skills = tags.split(" ")
  question_skills_map[qid] = skills

  for skill in skills:
    # create the list on first sight of a skill, then append
    skill_questions_map.setdefault(skill, []).append(qid)

In [19]:
# skills to questions
{k:v[:10] for k, v in itertools.islice(skill_questions_map.items(), 2)}

{'131': [0, 1, 2, 3, 4, 5, 8, 10, 11, 12],
 '51': [0, 11, 12, 30, 68, 70, 93, 119, 7870, 7882]}

In [20]:
# questions to skill
{k:v for k, v in itertools.islice(question_skills_map.items(), 2)}

{0: ['51', '131', '162', '38'], 1: ['131', '36', '81']}

In [21]:
question_skill_matrix = np.zeros((len(question_ids_dict)+1, len(skill_ids_dict)+1)) # +1 since, 0th position for both is reserved for padding.
question_skill_matrix.shape

(13524, 190)

In [22]:
# Set entry (question row, skill column) to 1 for every skill attached to a question.
for question, qid in question_ids_dict.items():
  skill_columns = [skill_ids_dict[int(tag)] for tag in question_skills_map[question]]
  question_skill_matrix[qid, skill_columns] = 1

In [23]:
question_skill_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

##### Question - Question Implicit Relationship

In [24]:
question_question_matrix = np.zeros((len(question_ids_dict)+1, len(question_ids_dict)+1)) # +1 since 0th position reserved for padding.
question_question_matrix.shape

(13524, 13524)

In [25]:
# Resolve each skill's question list to matrix indices once, up front.
skill_member_ids = {
    skill: np.array([question_ids_dict[member] for member in members])
    for skill, members in skill_questions_map.items()
}

for question, qid in question_ids_dict.items():
  for skill in question_skills_map[question]:
    # every question sharing this skill (the question itself included) is marked as related
    question_question_matrix[qid, skill_member_ids[skill]] = 1

In [26]:
question_question_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

##### Skill - Skill Implicit relationship

In [27]:
skill_skill_matrix = np.zeros((len(skill_ids_dict)+1, len(skill_ids_dict)+1))
skill_skill_matrix.shape

(190, 190)

In [28]:
for skill, sid in skill_ids_dict.items():
  # Collect the indices of every skill that appears on a question tagged with this skill.
  # skill_questions_map is keyed by the raw tag string, hence str(skill).
  co_occurring = {
      skill_ids_dict[int(tag)]
      for q in skill_questions_map[str(skill)]
      for tag in question_skills_map[q]
  }
  skill_skill_matrix[sid, sorted(co_occurring)] = 1

In [29]:
skill_skill_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

### Training the Question Embedding

In [30]:
import tensorflow as tf
import tensorflow.keras as keras

In [31]:
class PEBG(keras.models.Model):
  """Question and skill embeddings trained to reproduce three relation matrices.

  The model holds one embedding table for questions and one for skills. For a
  batch of question ids it returns sigmoid-squashed dot products between:
    * the batch questions and every skill,
    * the batch questions and every question,
    * every skill and every skill.

  Args:
    num_questions: size of the question vocabulary including the padding
      index 0. Defaults to `len(question_ids_dict) + 1`.
    num_skills: size of the skill vocabulary including the padding index 0.
      Defaults to `len(skill_ids_dict) + 1`.
    embedding_dim: width of both embedding tables.
    **kwargs: forwarded to `keras.models.Model`.
  """

  def __init__(self, num_questions=None, num_skills=None, embedding_dim=512, **kwargs):
    super().__init__(**kwargs)

    # Fall back to the notebook-level vocabularies so a bare `PEBG()` behaves as before.
    # Expected input tokens are [0, len(question_ids_dict)];
    # 0 is used for padding, so actual questions start from 1 (hence the +1).
    if num_questions is None:
      num_questions = len(question_ids_dict) + 1
    if num_skills is None:
      num_skills = len(skill_ids_dict) + 1

    self.question_emb_layer = keras.layers.Embedding(num_questions, embedding_dim)
    self.skill_emb_layer = keras.layers.Embedding(num_skills, embedding_dim)

  def call(self, x):
    """Compute the three similarity outputs.

    Args:
      x: question ids for the batch.

    Returns:
      Tuple `(question_skill_similarity, question_question_similarity,
      skill_skill_similarity)`, each passed through a sigmoid.
    """

    # input question embedding
    question_emb = self.question_emb_layer(x)

    # all skill embeddings; the index range comes from the layer itself so the
    # model does not depend on the module-level relation matrices
    skills_full = tf.range(self.skill_emb_layer.input_dim)
    skill_emb_full = self.skill_emb_layer(skills_full)

    # all question embeddings
    questions_full = tf.range(self.question_emb_layer.input_dim)
    question_emb_full = self.question_emb_layer(questions_full)

    # --- question-skill relationship
    question_skill_similarity = tf.matmul(question_emb, tf.transpose(skill_emb_full))
    question_skill_similarity = tf.sigmoid(question_skill_similarity)

    # --- question-question relationship
    question_question_similarity = tf.sigmoid(tf.matmul(question_emb, tf.transpose(question_emb_full)))

    # --- skill-skill relationship
    skill_skill_similarity = tf.sigmoid(tf.matmul(skill_emb_full, tf.transpose(skill_emb_full)))

    return question_skill_similarity, question_question_similarity, skill_skill_similarity

In [32]:
model = PEBG()

In [33]:
loss_qs = keras.losses.BinaryCrossentropy(name="qs_loss")
loss_qq = keras.losses.BinaryCrossentropy(name="qq_loss")
loss_ss = keras.losses.BinaryCrossentropy(name="ss_loss")

In [34]:
optimizer = keras.optimizers.Adam(learning_rate=3e-3)

In [35]:
# One loss per model output, listed in the order PEBG.call returns them:
# question-skill, question-question, skill-skill.
model.compile(loss=[loss_qs, loss_qq, loss_ss], optimizer=optimizer)

In [36]:
def data_gen(batch_size):
  """Yield one pass over all matrix rows as (question ids, targets) batches.

  Rows are taken in order starting from index 0, `batch_size` at a time; the
  final batch is shorter when the row count is not a multiple of `batch_size`.
  Each item is `(x, (y1, y2, y3))` where `x` holds the row indices, `y1`/`y2`
  are the matching question-skill and question-question rows with an extra
  axis inserted at position 1, and `y3` is the full skill-skill matrix.
  """
  num_rows = question_skill_matrix.shape[0]
  start = 0
  while start < num_rows:
    end = min(start + batch_size, num_rows)  # clamp the last batch to the matrix size

    x = np.arange(start, end) # batch of question ids.
    qs_rows = question_skill_matrix[start:end, :] # batch of question-skill relationship matrix
    qq_rows = question_question_matrix[start:end, :] # batch of question-question relationship matrix

    targets = (
        np.expand_dims(qs_rows, axis=1),
        np.expand_dims(qq_rows, axis=1),
        skill_skill_matrix, # full skill-skill relationship matrix
    )
    yield x, targets
    start += batch_size

def data_gen_outer(batch_size=32):
  """Repeat `data_gen` passes forever; the consumer decides when to stop."""
  while True:
    yield from data_gen(batch_size)

In [38]:
for x, y in itertools.islice(data_gen_outer(32), 1):
  print(x.shape, y[0].shape, y[1].shape, y[2].shape)

(32,) (32, 1, 190) (32, 1, 13524) (190, 190)


In [39]:
batch_size = 256
# Number of batches in one full pass of data_gen at this batch size.
steps_per_epoch = sum(1 for _ in data_gen(batch_size))

In [40]:
# data_gen_outer never stops on its own, so steps_per_epoch marks where each epoch ends.
model.fit(x=data_gen_outer(batch_size), steps_per_epoch=steps_per_epoch, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f192ab8b0f0>

Trained Embeddings

In [41]:
# Question Embeddings
model.question_emb_layer.embeddings

<tf.Variable 'pebg/embedding/embeddings:0' shape=(13524, 512) dtype=float32, numpy=
array([[ 0.11638232,  0.06535552, -0.01314952, ..., -0.13246727,
        -0.05797041,  0.01014372],
       [ 0.00109777,  0.19277222, -0.13173836, ...,  0.28658658,
        -0.01569505,  0.33641967],
       [-0.19911356,  0.03192   ,  0.395069  , ...,  0.6991206 ,
         1.0284982 ,  0.7063647 ],
       ...,
       [-0.1155467 , -0.24189898,  0.35891122, ...,  0.12221462,
        -0.3937341 ,  0.20390603],
       [ 0.15150973, -0.18828931,  0.16208261, ..., -0.2477136 ,
         0.1107517 , -0.10507666],
       [-0.39865467,  0.3005379 ,  0.14781755, ..., -0.45457536,
        -0.12065182, -0.5957024 ]], dtype=float32)>

In [42]:
# Skills Embeddings
model.skill_emb_layer.embeddings

<tf.Variable 'pebg/embedding_1/embeddings:0' shape=(190, 512) dtype=float32, numpy=
array([[ 0.03362526,  0.00359148, -0.05108867, ..., -0.07942886,
        -0.09174554,  0.01776172],
       [ 0.02678815,  0.30969268, -0.24670598, ...,  0.22584054,
         0.06760801, -0.570033  ],
       [-1.3883893 ,  1.3455019 , -1.3073989 , ..., -0.00886322,
        -2.1125605 , -0.46222714],
       ...,
       [-0.10803529,  0.49158198,  0.52322584, ...,  0.18236262,
         0.42145532, -1.1476061 ],
       [ 0.05665646,  1.4300889 , -0.25593925, ...,  0.01547858,
         0.9443566 ,  0.48409668],
       [-0.1239078 , -0.5829788 , -0.06567568, ..., -0.29379362,
         0.82124805, -0.4986084 ]], dtype=float32)>

##### Loading the trained embedding.

In [43]:
# Snapshot the trained weights as a plain NumPy array rather than keeping a
# reference to the live tf.Variable; the array is what gets saved and reloaded.
embedding_matrix = model.question_emb_layer.embeddings.numpy()

# Rebuild an Embedding layer initialised from the snapshot; the vocabulary size
# and width are read from the array so they always match it.
question_emb_layer = keras.layers.Embedding(
    embedding_matrix.shape[0],
    embedding_matrix.shape[1],
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
)

##### Upload to Kaggle

In [46]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [55]:
# Copy Kaggle API key
!mkdir -p ~/.kaggle && cp /content/drive/My\ Drive/Projects/Kaggle/api_key/kaggle.json ~/.kaggle/

In [47]:
!mkdir embedding

In [48]:
np.save("embedding/question_emb.npy", embedding_matrix)

In [52]:
np.load("embedding/question_emb.npy")

array([[ 0.11638232,  0.06535552, -0.01314952, ..., -0.13246727,
        -0.05797041,  0.01014372],
       [ 0.00109777,  0.19277222, -0.13173836, ...,  0.28658658,
        -0.01569505,  0.33641967],
       [-0.19911356,  0.03192   ,  0.395069  , ...,  0.6991206 ,
         1.0284982 ,  0.7063647 ],
       ...,
       [-0.1155467 , -0.24189898,  0.35891122, ...,  0.12221462,
        -0.3937341 ,  0.20390603],
       [ 0.15150973, -0.18828931,  0.16208261, ..., -0.2477136 ,
         0.1107517 , -0.10507666],
       [-0.39865467,  0.3005379 ,  0.14781755, ..., -0.45457536,
        -0.12065182, -0.5957024 ]], dtype=float32)

In [56]:
!kaggle datasets init -p embedding/

Data package template written to: embedding/dataset-metadata.json


In [57]:
# Kaggle dataset metadata. Per the original note, id and title may contain only
# alphanumeric characters and "-".
meta = """
{
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ], 
  "id": "nisarahamedk/embedding-0-1",
  "title": "embedding-0-1"
}
"""
# Overwrite the template that `kaggle datasets init` wrote into embedding/.
with open("embedding/dataset-metadata.json", "w") as f:
  f.write(meta)

In [59]:
# create
!kaggle datasets create -p embedding/ --dir-mode tar -u

Starting upload for file question_emb.npy
100% 26.4M/26.4M [00:01<00:00, 14.8MB/s]
Upload successful: question_emb.npy (26MB)
Your public Dataset is being created. Please check progress at https://www.kaggle.com/nisarahamedk/embedding-0-1
