<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/PEBG_original_data_explore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PEBG Paper Original Data Explore.

Paper: https://arxiv.org/pdf/2012.05031v1.pdf  
Github: https://github.com/lyf-1/PEBG

In [1]:
%%capture
!pip install gcsfs
!pip install kaggle
!pip install datatable

In [None]:
from google.colab import drive
drive.mount("/content/drive")

In [2]:
import os 
import pandas as pd
import numpy as np
from scipy import sparse

In [3]:
DATA_PATH = 'gs://kds-7cd35ed419a621f754ec32f0c3616d2e9282a698c5eeaabc814bd7f6'

In [4]:
questions_df = pd.read_csv(DATA_PATH + "/questions.csv")
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [5]:
len(questions_df)

13523

In [6]:
q_df = questions_df.dropna() # one nan

In [7]:
pro_id_dict = {}
pro_ans_dict = {}
skill_id_dict = {}
pro_skill_dict = {}
pro_skill_adj = []
skill_cnt = 0
for i, row in q_df.iterrows():
    # print(i, len(df), row['tags'])
    pro_id_dict[row['question_id']] = i
    pro_ans_dict[row['question_id']] = row['correct_answer']
    tmp_skills = row['tags']
    pro_skill_dict[row['question_id']] = tmp_skills
    for s in tmp_skills.split(' '):
        if s not in skill_id_dict:
            skill_id_dict[s] = skill_cnt
            skill_cnt += 1
        pro_skill_adj.append([i, skill_id_dict[s], 1])

question_id to an id we give

In [9]:
len(pro_id_dict)

13522

correct answers to all questions

In [10]:
len(pro_ans_dict)

13522

skill id (tags) to an id we give (here same as skill id)

In [11]:
len(skill_id_dict)

188

question_id to skills dict

In [19]:
print(pro_skill_dict[0])
len(pro_skill_dict)

51 131 162 38


13522

adjacency list of the bipartite graph - [q_id, s_id, is_edge]

In [26]:
pro_skill_adj[:5] 

[[0, 0, 1], [0, 1, 1], [0, 2, 1], [0, 3, 1], [1, 1, 1]]

In [16]:
pro_skill_adj = np.array(pro_skill_adj).astype(np.int32)

In [17]:
pro_skill_adj

array([[    0,     0,     1],
       [    0,     1,     1],
       [    0,     2,     1],
       ...,
       [13520,    83,     1],
       [13521,    98,     1],
       [13522,    95,     1]], dtype=int32)

In [18]:
pro_skill_adj.shape

(30992, 3)

In [31]:
pro_num = np.max(pro_skill_adj[:, 0]) + 1
skill_num = np.max(pro_skill_adj[:, 1]) + 1
print('problem number %d, skill number %d' % (pro_num, skill_num), i)

problem number 13523, skill number 188 13522


In [13]:
 # take joint skill as a new skill, treating a uniqie set of skills appeared for a question in the data as a new skill.
skills = q_df['tags'].unique()
for s in skills:
    if ' ' in s:
        skill_id_dict[s] = skill_cnt
        skill_cnt += 1 

In [14]:
len(skill_id_dict)

1631

#### Looking at the processed data in the PEBG repo

In [20]:
!git clone https://github.com/lyf-1/PEBG.git

Cloning into 'PEBG'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 50 (delta 0), reused 1 (delta 0), pack-reused 47[K
Unpacking objects: 100% (50/50), done.


In [21]:
from scipy import sparse

In [22]:
data_folder = "PEBG/ednet"

##### Explicit relationship between questions and skills

In [23]:
pro_skill_coo = sparse.load_npz(os.path.join(data_folder, 'pro_skill_sparse.npz'))

In [24]:
pro_skill_coo

<12372x188 sparse matrix of type '<class 'numpy.float32'>'
	with 28158 stored elements in COOrdinate format>

In [25]:
pro_skill_dense = pro_skill_coo.toarray()

In [27]:
pro_skill_dense

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

row index - question_ids, column index - skill ids, cell = 1 where there is a relationship

##### Implicit relationship between skills.

In [28]:
skill_skill_coo = sparse.load_npz(os.path.join(data_folder, 'skill_skill_sparse.npz'))

In [29]:
skill_skill_dense = skill_skill_coo.toarray()

In [30]:
skill_skill_dense

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [31]:
skill_skill_dense.shape

(188, 188)

##### Implicit relationship between questions.

In [32]:
pro_pro_coo = sparse.load_npz(os.path.join(data_folder, 'pro_pro_sparse.npz'))

In [33]:
pro_pro_dense = pro_pro_coo.toarray()

In [34]:
pro_pro_dense.shape

(12372, 12372)

##### Other extractable features from user interactions about the questions.

In [35]:
pro_feat = np.load(os.path.join(data_folder, 'pro_feat.npz'))['pro_feat']    # [pro_diff_feat, auxiliary_target]
print('problem feature shape', pro_feat.shape)

problem feature shape (12372, 2)


In [36]:
pro_feat

array([[0.03353432, 0.9285648 ],
       [0.03931295, 0.94117093],
       [0.04576905, 0.5405403 ],
       ...,
       [0.        , 0.        ],
       [0.02945486, 0.9999    ],
       [0.        , 0.        ]], dtype=float32)

In [37]:
pro_feat.shape

(12372, 2)

features: 

- 0 - avg time it took to answer that question, 
- 1 - percentage of correct user interaction for that question