<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/PEBG_for_riiid_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PEBG - Pre-training Question and Skill (Tag) Embeddings

### Original

Paper: https://arxiv.org/pdf/2012.05031v1.pdf  
Github: https://github.com/lyf-1/PEBG

In [1]:
%%capture
# gcsfs is what lets pandas read the gs:// path used for questions.csv below.
!pip install gcsfs
# NOTE(review): kaggle and datatable are not referenced by any cell visible in
# this notebook - confirm they are still needed.
!pip install kaggle
!pip install datatable

In [2]:
# from google.colab import drive
# drive.mount("/content/drive")

In [3]:
import os 
import pandas as pd
import numpy as np
from scipy import sparse

In [4]:
# Google Cloud Storage location that questions.csv is read from.
DATA_PATH = 'gs://kds-7cd35ed419a621f754ec32f0c3616d2e9282a698c5eeaabc814bd7f6'

In [5]:
# Read the question metadata table from the bucket and preview the first rows.
questions_df = pd.read_csv(f"{DATA_PATH}/questions.csv")
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [6]:
questions_df.describe()

Unnamed: 0,question_id,bundle_id,correct_answer,part
count,13523.0,13523.0,13523.0,13523.0
mean,6761.0,6760.510907,1.455298,4.264956
std,3903.89818,3903.857783,1.149707,1.652553
min,0.0,0.0,0.0,1.0
25%,3380.5,3379.5,0.0,3.0
50%,6761.0,6761.0,1.0,5.0
75%,10141.5,10140.0,3.0,5.0
max,13522.0,13522.0,3.0,7.0


In [7]:
questions_df[questions_df["tags"].isnull()]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
10033,10033,10033,2,6,


In [8]:
# Only `tags` has a missing value (the single question shown above); give it the
# placeholder tag "-1" so it can be split and parsed like every other row.
# The fill is restricted to `tags` so the string "-1" can never be written into
# a numeric column, and the assignment form is safe to re-run.
questions_df["tags"] = questions_df["tags"].fillna("-1")

In [9]:
f"{len(questions_df)} records"

'13523 records'

#### Question - Skill Relationship

Question IDs

In [10]:
# Map every raw question_id to a contiguous 0-based index (ordered by id value).
questions = questions_df["question_id"].values
question_codes = pd.factorize(questions, sort=True)[0]
question_ids_dict = dict(zip(questions, question_codes))

In [11]:
# question_ids_dict

Skill IDs

In [12]:
from collections import Counter

# Count how many questions carry each tag. The `tags` column holds
# space-separated integers, so every token is parsed with int().
skill_count = Counter(
  int(tag)
  for tags in questions_df["tags"]
  for tag in tags.split(" ")
)

skill_count.most_common()[:5]

[(92, 2269), (38, 2256), (81, 1969), (29, 1707), (136, 1033)]

In [13]:
# Give each distinct tag a contiguous 0-based index (ordered by tag value).
# Note: the dict itself stays in first-seen tag order, not index order.
skills = list(skill_count)
skill_codes = pd.factorize(skills, sort=True)[0]
skill_ids_dict = dict(zip(skills, skill_codes))

In [14]:
# skill_ids_dict

##### Question - Skill Relationship - Adjacency Matrix

In [15]:
# Build both directions of the question <-> tag relation.
# Tags are kept as the raw strings produced by splitting the `tags` column,
# so both maps use string tags (not ints).
question_skills_map = {}
skill_questions_map = {}

for qid, tag_string in zip(questions_df["question_id"], questions_df["tags"]):
  skills = tag_string.split(" ")
  question_skills_map[qid] = skills

  for skill in skills:
    skill_questions_map.setdefault(skill, []).append(qid)

In [16]:
# skill_questions_map

In [17]:
# Question x skill matrix, initialised to zeros: one row per question index,
# one column per skill index. The following cell sets the related pairs to 1.
question_skill_matrix = np.zeros((len(question_ids_dict), len(skill_ids_dict)))
question_skill_matrix.shape

(13523, 189)

In [18]:
# Mark every (question, skill) pair that occurs in the data.
for question, qid in question_ids_dict.items():
  # tags are stored as strings; skill_ids_dict is keyed by int
  skill_columns = [skill_ids_dict[int(skill)] for skill in question_skills_map[question]]
  question_skill_matrix[qid, skill_columns] = 1

In [19]:
question_skill_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

##### Question - Question Implicit Relationship

In [20]:
# Question x question matrix, initialised to zeros (filled in by the following cell).
# np.zeros defaults to float64, so at 13523 x 13523 this dense array needs
# roughly 1.5 GB of RAM.
question_question_matrix = np.zeros((len(question_ids_dict), len(question_ids_dict)))
question_question_matrix.shape

(13523, 13523)

In [21]:
# Two questions are related when they share at least one tag
# (which also makes every question related to itself).
for question, qid in question_ids_dict.items():
  for skill in question_skills_map[question]:
    # every question carrying this tag becomes a neighbour of `question`
    related_columns = [question_ids_dict[oq] for oq in skill_questions_map[skill]]
    question_question_matrix[qid, related_columns] = 1

In [22]:
question_question_matrix

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

##### Skill - Skill Implicit relationship

In [23]:
# Skill x skill matrix, initialised to zeros; rows and columns are skill indices.
# The following cell sets the related pairs to 1.
skill_skill_matrix = np.zeros((len(skill_ids_dict), len(skill_ids_dict)))
skill_skill_matrix.shape

(189, 189)

In [24]:
# Two skills are related when at least one question is tagged with both
# (which also makes every skill related to itself).
for skill, sid in skill_ids_dict.items():
  # skill_questions_map is keyed by the tag as a string
  for q in skill_questions_map[str(skill)]:
    co_skill_columns = [skill_ids_dict[int(s)] for s in question_skills_map[q]]
    skill_skill_matrix[sid, co_skill_columns] = 1

In [25]:
skill_skill_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

### Training the Question Embedding

In [26]:
import tensorflow as tf
import tensorflow.keras as keras

In [27]:
class PEBG(keras.models.Model):
  """
  Learns question and skill embeddings from three 0/1 relation matrices.

  For a batch of question indices the model returns, as sigmoids of
  embedding inner products:
    1. batch questions vs. every skill
    2. batch questions vs. every question
    3. every skill vs. every skill
  """

  def __init__(self, num_questions=None, num_skills=None, embedding_dim=512, **kwargs):
    """
    num_questions - rows of the question embedding table
                    (default: len(question_ids_dict))
    num_skills    - rows of the skill embedding table
                    (default: len(skill_ids_dict))
    embedding_dim - width of both embedding tables
    kwargs        - forwarded to keras.models.Model
    """
    super().__init__(**kwargs)

    self.num_questions = len(question_ids_dict) if num_questions is None else num_questions
    self.num_skills = len(skill_ids_dict) if num_skills is None else num_skills

    self.question_emb_layer = keras.layers.Embedding(self.num_questions, embedding_dim)
    self.skill_emb_layer = keras.layers.Embedding(self.num_skills, embedding_dim)

  def call(self, x):
    """
    x - question ids for the batch

    Returns (question_skill_similarity, question_question_similarity,
    skill_skill_similarity).
    """

    # input question embedding
    question_emb = self.question_emb_layer(x)

    # Full tables, looked up in index order 0..n-1 so that row/column j of each
    # similarity output is the entity with index j, i.e. the same layout as the
    # target matrices. Iterating skill_ids_dict.values() instead yields the
    # indices in first-seen tag order, which silently permutes the skill axis
    # and leaves the trained skill rows misaligned with skill_ids_dict.
    skill_emb_full = self.skill_emb_layer(tf.range(self.num_skills))
    question_emb_full = self.question_emb_layer(tf.range(self.num_questions))

    # --- question-skill relationship
    question_skill_similarity = tf.matmul(question_emb, tf.transpose(skill_emb_full))
    question_skill_similarity = tf.sigmoid(question_skill_similarity)

    # --- question-question relationship
    question_question_similarity = tf.sigmoid(tf.matmul(question_emb, tf.transpose(question_emb_full)))

    # --- skill-skill relationship
    skill_skill_similarity = tf.sigmoid(tf.matmul(skill_emb_full, tf.transpose(skill_emb_full)))

    return question_skill_similarity, question_question_similarity, skill_skill_similarity

In [28]:
model = PEBG()

In [29]:
# One binary cross-entropy per model output, named after the relation it scores:
# question-skill (qs), question-question (qq) and skill-skill (ss).
loss_qs = keras.losses.BinaryCrossentropy(name="qs_loss")
loss_qq = keras.losses.BinaryCrossentropy(name="qq_loss")
loss_ss = keras.losses.BinaryCrossentropy(name="ss_loss")

In [30]:
optimizer = keras.optimizers.Adam(learning_rate=3e-3)

In [31]:
# The losses are listed in the same order as the three outputs returned by the model.
model.compile(loss=[loss_qs, loss_qq, loss_ss], optimizer=optimizer)

In [32]:
def data_gen(batch_size):
  """
  Yield one pass over all questions in consecutive batches.

  Each item is (x, (y1, y2, y3)):
    x  - question indices of the batch
    y1 - matching rows of question_skill_matrix, with a length-1 axis inserted at position 1
    y2 - matching rows of question_question_matrix, with the same extra axis
    y3 - the full skill_skill_matrix (the same for every batch)

  The final batch is shorter when batch_size does not divide the question count.
  """
  total = len(question_ids_dict)
  start = 0
  while start < total:
    stop = min(start + batch_size, total)
    rows = slice(start, stop)

    question_batch = np.arange(start, stop)
    qs_targets = np.expand_dims(question_skill_matrix[rows, :], axis=1)
    qq_targets = np.expand_dims(question_question_matrix[rows, :], axis=1)

    yield question_batch, (qs_targets, qq_targets, skill_skill_matrix)
    start += batch_size

def data_gen_outer(batch_size=32):
  """
  Endless batch stream: replays data_gen(batch_size) from the first question
  again each time a full pass is exhausted. Never terminates on its own.
  """
  while True:
    yield from data_gen(batch_size)

In [33]:
from itertools import islice
# Sanity check: pull a single batch and print the shapes of x and the three targets.
for x, y in islice(data_gen_outer(32), 1):
  print(x.shape, y[0].shape, y[1].shape, y[2].shape)

(32,) (32, 1, 189) (32, 1, 13523) (189, 189)


In [34]:
batch_size = 256
# One training step per batch produced by a single pass of data_gen.
steps_per_epoch = sum(1 for _ in data_gen(batch_size))

In [35]:
# data_gen_outer never stops yielding, so steps_per_epoch is what bounds each epoch.
model.fit(x=data_gen_outer(batch_size), steps_per_epoch=steps_per_epoch, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f2b83693438>

Trained Embeddings

In [36]:
# Question Embeddings
model.question_emb_layer.embeddings

<tf.Variable 'pebg/embedding/embeddings:0' shape=(13523, 512) dtype=float32, numpy=
array([[ 0.33978233, -0.59295726,  0.904474  , ..., -0.65274256,
         0.7578639 , -0.19723678],
       [ 0.392786  , -0.32606828,  1.156926  , ..., -0.3684354 ,
        -0.00149528,  0.07227462],
       [ 0.4286755 ,  1.2806342 , -0.24467272, ..., -0.20866449,
         0.46077085,  0.19128309],
       ...,
       [ 0.4637387 , -0.00836393, -0.15827219, ...,  0.23365061,
        -0.2586248 ,  0.14397731],
       [ 0.07100088,  0.08849046, -0.05075152, ...,  0.07161791,
        -0.19464932,  0.09356661],
       [-0.7603003 , -0.4634242 ,  0.24669372, ..., -0.3820537 ,
         0.18035999, -0.26784134]], dtype=float32)>

In [37]:
# Skills Embeddings
model.skill_emb_layer.embeddings

<tf.Variable 'pebg/embedding_1/embeddings:0' shape=(189, 512) dtype=float32, numpy=
array([[ 1.4750328 , -0.41053993, -0.25789976, ..., -1.1023577 ,
        -0.5152434 ,  0.7768604 ],
       [-0.01956799,  0.27172834, -0.2937895 , ...,  0.5130188 ,
         0.06713983, -0.53285366],
       [-0.41730702,  0.41648334, -0.10307369, ..., -0.8179862 ,
        -0.35453105, -0.04955959],
       ...,
       [-0.6866998 ,  0.55968124,  0.17957912, ..., -0.32058078,
         0.16454488,  0.471418  ],
       [ 0.05433251, -0.6627156 , -0.31531078, ...,  0.7144671 ,
         0.82057947,  0.4648847 ],
       [ 0.45230004,  0.08398312, -0.43649882, ..., -0.12252785,
        -0.374263  ,  0.94085974]], dtype=float32)>