<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/PEBG_for_riiid_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PEBG - Pretraining Questions and Skill (Tags) Embedding

### Original

Paper: https://arxiv.org/pdf/2012.05031v1.pdf  
Github: https://github.com/lyf-1/PEBG

In [1]:
%%capture
!pip install gcsfs
!pip install kaggle
!pip install datatable

In [2]:
# from google.colab import drive
# drive.mount("/content/drive")

In [3]:
import os 
import pandas as pd
import numpy as np
from scipy import sparse

In [4]:
DATA_PATH = 'gs://kds-7cd35ed419a621f754ec32f0c3616d2e9282a698c5eeaabc814bd7f6'

In [5]:
questions_df = pd.read_csv(DATA_PATH + "/questions.csv")
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [6]:
questions_df.describe()

Unnamed: 0,question_id,bundle_id,correct_answer,part
count,13523.0,13523.0,13523.0,13523.0
mean,6761.0,6760.510907,1.455298,4.264956
std,3903.89818,3903.857783,1.149707,1.652553
min,0.0,0.0,0.0,1.0
25%,3380.5,3379.5,0.0,3.0
50%,6761.0,6761.0,1.0,5.0
75%,10141.5,10140.0,3.0,5.0
max,13522.0,13522.0,3.0,7.0


In [7]:
questions_df[questions_df["tags"].isnull()]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
10033,10033,10033,2,6,


In [8]:
questions_df.fillna("-1", inplace=True) # tags Nan filled with -1

In [9]:
f"{len(questions_df)} records"

'13523 records'

#### Question - Skill Relationship

Questions IDs

In [10]:
# ids from 0 for the questions.
questions = questions_df["question_id"].values
question_ids_dict = dict(zip(questions, pd.factorize(questions, sort=True)[0]))

In [11]:
question_ids_dict

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 82: 82,
 83: 83,
 84: 84,
 85: 85,
 86: 86,
 87: 87,
 88: 88,
 89: 89,
 90: 90,
 91: 91,
 92: 92,
 93: 93,
 94: 94,
 95: 95,
 96: 96,
 97: 97,
 98: 98,
 99: 99,
 100: 100,
 101: 101,
 102: 102,
 103: 103,
 104: 104,
 105: 105,
 106: 106,
 107: 107,
 108: 108,
 109: 109,
 110: 110,

Skill IDs

In [12]:
from collections import Counter
skill_count = Counter()

for idx, row in questions_df.iterrows():
  skill_count.update([int(x) for x in row["tags"].split(" ")])

skill_count.most_common()[:5]

[(92, 2269), (38, 2256), (81, 1969), (29, 1707), (136, 1033)]

In [13]:
skills = list(skill_count.keys())
skill_ids_dict = dict(zip(skills, pd.factorize(skills, sort=True)[0]))

In [14]:
skill_ids_dict

{-1: 0,
 0: 1,
 1: 2,
 2: 3,
 3: 4,
 4: 5,
 5: 6,
 6: 7,
 7: 8,
 8: 9,
 9: 10,
 10: 11,
 11: 12,
 12: 13,
 13: 14,
 14: 15,
 15: 16,
 16: 17,
 17: 18,
 18: 19,
 19: 20,
 20: 21,
 21: 22,
 22: 23,
 23: 24,
 24: 25,
 25: 26,
 26: 27,
 27: 28,
 28: 29,
 29: 30,
 30: 31,
 31: 32,
 32: 33,
 33: 34,
 34: 35,
 35: 36,
 36: 37,
 37: 38,
 38: 39,
 39: 40,
 40: 41,
 41: 42,
 42: 43,
 43: 44,
 44: 45,
 45: 46,
 46: 47,
 47: 48,
 48: 49,
 49: 50,
 50: 51,
 51: 52,
 52: 53,
 53: 54,
 54: 55,
 55: 56,
 56: 57,
 57: 58,
 58: 59,
 59: 60,
 60: 61,
 61: 62,
 62: 63,
 63: 64,
 64: 65,
 65: 66,
 66: 67,
 67: 68,
 68: 69,
 69: 70,
 70: 71,
 71: 72,
 72: 73,
 73: 74,
 74: 75,
 75: 76,
 76: 77,
 77: 78,
 78: 79,
 79: 80,
 80: 81,
 81: 82,
 82: 83,
 83: 84,
 84: 85,
 85: 86,
 86: 87,
 87: 88,
 88: 89,
 89: 90,
 90: 91,
 91: 92,
 92: 93,
 93: 94,
 94: 95,
 95: 96,
 96: 97,
 97: 98,
 98: 99,
 99: 100,
 100: 101,
 101: 102,
 102: 103,
 103: 104,
 104: 105,
 105: 106,
 106: 107,
 107: 108,
 108: 109,
 109: 110,


##### Questions - Skill Relationship - Adjacency matrix

In [15]:
question_skills_map = {}
skill_questions_map = {}

for idx, row in questions_df.iterrows():
  qid = row["question_id"]
  skills = row["tags"].split(" ")
  question_skills_map[qid] = skills
  
  for skill in skills:
    if skill_questions_map.get(skill, False):
      skill_questions_map[skill].append(qid)
    else:
      skill_questions_map[skill] = [qid]

In [16]:
# skill_questions_map

In [17]:
question_skill_matrix = np.zeros((len(question_ids_dict), len(skill_ids_dict)))
question_skill_matrix.shape

(13523, 189)

In [18]:
for question, qid in question_ids_dict.items():

  # get its skills
  skills = question_skills_map[question]
  for skill in skills:
    question_skill_matrix[qid, skill_ids_dict[int(skill)]] = 1

In [19]:
question_skill_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

##### Question - Question Implicit Relationship

In [20]:
question_question_matrix = np.zeros((len(question_ids_dict), len(question_ids_dict)))
question_question_matrix.shape

(13523, 13523)

In [21]:
for question, qid in question_ids_dict.items():
  # get its skills
  skills = question_skills_map[question]
  for skill in skills:
    # get all the questions related to this skill
    other_questions = skill_questions_map[skill]
    for oq in other_questions:
      question_question_matrix[qid, question_ids_dict[oq]] = 1

In [22]:
question_question_matrix

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

##### Skill - Skill Implicit relationship

In [23]:
skill_skill_matrix = np.zeros((len(skill_ids_dict), len(skill_ids_dict)))
skill_skill_matrix.shape

(189, 189)

In [24]:
for skill, sid in skill_ids_dict.items():

  # get the questiions for this skill
  questions = skill_questions_map[str(skill)]

  for q in questions:
    skills_for_q = question_skills_map[q]
    for s in skills_for_q:
      skill_skill_matrix[sid, skill_ids_dict[int(s)]] = 1

In [25]:
skill_skill_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

### Training the Question Embedding

In [26]:
import tensorflow as tf
import tensorflow.keras as keras

In [27]:
class PEBG(keras.models.Model):

  def __init__(self, **kwargs):
    super().__init__(**kwargs)

    self.question_emb_layer = keras.layers.Embedding(len(question_ids_dict), 512)
    self.skill_emb_layer = keras.layers.Embedding(len(skill_ids_dict), 512)

  def call(self, x):
    """
    x - question ids for the batch
    """
  
    # input question embedding
    question_emb = self.question_emb_layer(x)

    # all skills embeddings
    skills_full = np.array(list(skill_ids_dict.values()))
    skill_emb_full = self.skill_emb_layer(skills_full)
    
    # all question emebddings
    questions_full = np.array(list(question_ids_dict.values()))
    question_emb_full = self.question_emb_layer(questions_full)
    
    # --- question-skill relationship
    question_skill_similarity = tf.matmul(question_emb, tf.transpose(skill_emb_full))
    question_skill_similarity = tf.sigmoid(question_skill_similarity)

    # --- question-question relationsip
    question_question_similarity = tf.sigmoid(tf.matmul(question_emb, tf.transpose(question_emb_full)))

    # --- skill-skill relationship
    skill_skill_similarity = tf.sigmoid(tf.matmul(skill_emb_full, tf.transpose(skill_emb_full)))

    return question_skill_similarity, question_question_similarity, skill_skill_similarity

In [28]:
model = PEBG()

In [29]:
loss_qs = keras.losses.BinaryCrossentropy()
loss_qq = keras.losses.BinaryCrossentropy()
loss_ss = keras.losses.BinaryCrossentropy()

In [30]:
optimizer = keras.optimizers.Adam(learning_rate=3e-3)

In [31]:
model.compile(loss=[loss_qs, loss_qq, loss_ss], optimizer=optimizer)

In [32]:
def data_gen(batch_size=32):
  i = 0
  while i < len(question_ids_dict):
    if i + batch_size < len(question_ids_dict):
      start, end = i, i+batch_size
    else:
      start, end = i, len(question_ids_dict)
      
    x = np.arange(start, end) # batch of question ids.
    y1 = question_skill_matrix[start:end, :] # batch of question-skill relationship matrix
    y2 = question_question_matrix[start:end, :] # batch of question-question relationship matrix
    y3 = skill_skill_matrix # full skill-skill relatioship matrix

    yield x, (np.expand_dims(y1, axis=1), np.expand_dims(y2, axis=1), y3)
    i += batch_size


In [33]:
from itertools import islice
for x, y in islice(data_gen(32), 1):
  print(x.shape, y[0].shape, y[1].shape, y[2].shape)

(32,) (32, 1, 189) (32, 1, 13523) (189, 189)


In [34]:
for _ in range(10):
  model.fit(x=data_gen(128), epochs=1)



Trained Embeddings

In [35]:
# Question Embeddings
model.question_emb_layer.embeddings

<tf.Variable 'pebg/embedding/embeddings:0' shape=(13523, 512) dtype=float32, numpy=
array([[-0.00365639,  0.2988135 ,  0.17040603, ...,  0.07400862,
        -0.12307945, -0.00178798],
       [ 0.06814319,  0.26209313,  0.19111599, ...,  0.10183334,
        -0.28896707, -0.25837797],
       [ 0.04186353,  0.23322333,  0.19733417, ...,  0.11320984,
        -0.25597596,  0.07904615],
       ...,
       [-0.08763962,  0.06016776,  0.02563026, ..., -0.18814285,
        -0.08655133, -0.08723984],
       [ 0.00757714,  0.04028472, -0.05345757, ..., -0.10208762,
        -0.03454952, -0.00641418],
       [-0.01242409, -0.01136532, -0.0514461 , ..., -0.1256702 ,
        -0.00061035,  0.07680251]], dtype=float32)>

In [36]:
# Skills Embeddings
model.skill_emb_layer.embeddings

<tf.Variable 'pebg/embedding_1/embeddings:0' shape=(189, 512) dtype=float32, numpy=
array([[ 0.64790183, -0.26791418, -0.33244902, ..., -0.2878187 ,
         0.31640232, -0.33435816],
       [-0.17443947, -0.17821613,  0.13871816, ...,  0.07126206,
         0.1717649 , -0.41512597],
       [-0.30480915, -0.03605134, -0.10470226, ...,  0.03156327,
         0.22707224, -0.5446049 ],
       ...,
       [ 0.54134625, -0.12525888, -0.14509578, ...,  0.00317702,
        -0.12655552,  1.3593935 ],
       [-1.1740892 ,  0.3349037 , -0.07533068, ...,  0.17620645,
        -0.47476417, -0.05523997],
       [-0.45444292, -0.2547972 ,  0.08116692, ...,  0.23073377,
         0.49855912,  0.33982593]], dtype=float32)>