In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict
import operator
import time, datetime
from utils import *

from IPython.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

# import plotly.express as px
# from plotly.subplots import make_subplots
# import plotly.graph_objs as go 

from tqdm import tqdm

# 0. load data

In [2]:
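# Dataset description (this wording appears to describe the EdNet-KT1 release):
# There are 784309 tables in the data set.
# Each table describes one student's question-solving log.
# All tables share the same columns:
# timestamp, solving_id, question_id, user_answer and elapsed_time,
# as described in the Columns Description section of the dataset documentation.
# NOTE: this notebook reads the KT3 directory, whose tables instead have the columns
# timestamp, action_type, item_id, source, user_answer and platform.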

In [3]:
# A major property of EdNet is that the questions come in bundles,
# that is, collections of questions sharing a common passage,
# picture or listening material.
# For example, the questions with IDs q2319, q2320 and q2321 may share the same reading passage.
# In this case, the questions are said to form a bundle and are given to the student
# together with the shared material.
# When a bundle is given, the student has access to all of its problems and has to respond
# to all of them in order to complete the bundle.

In [4]:
import os

# Directory that is listed below; each entry is later read as one user's CSV log.
path = '/mnt/qb/work/mlcolab/hzhou52/kt/EdNet/KT3'

# Accumulators filled by the loading cell:
#   d          -> [user_id, row_count] pairs
#   table_list -> one DataFrame per file
d, table_list = [], []

# Names of all entries found in the data directory.
s = pd.Series(os.listdir(path))

In [5]:
# Load every per-user CSV into one long interaction log.
#
# The accumulators are rebuilt here so that re-running this cell on its own
# does not append every table a second time.
d = []           # [user_id, number_of_rows] for each loaded file
table_list = []  # one DataFrame per user file

file_selected = s  # s.sample(5000).to_numpy()
for file_name in file_selected:
    # The user id is the file name without its extension.
    user_id, extension = os.path.splitext(file_name)
    if extension.lower() != '.csv':
        # Skip stray directory entries that are not user tables.
        continue

    data_raw = pd.read_csv(os.path.join(path, file_name), encoding="ISO-8859-15")
    d.append([user_id, len(data_raw)])

    # Add user_id as the first column and keep the file's own columns in order.
    data = data_raw.assign(user_id=user_id)[['user_id', *data_raw.columns]]
    table_list.append(data)

pd.set_option('display.max_rows', 10)

# ignore_index=True produces a fresh 0..n-1 index in a single step.
df = pd.concat(table_list, ignore_index=True)
df

Unnamed: 0,user_id,timestamp,action_type,item_id,source,user_answer,platform
0,u1,1565096151269,enter,b3544,diagnosis,,mobile
1,u1,1565096187972,respond,q5012,diagnosis,b,mobile
2,u1,1565096194904,submit,b3544,diagnosis,,mobile
3,u1,1565096195001,enter,b3238,diagnosis,,mobile
4,u1,1565096218682,respond,q4706,diagnosis,c,mobile
...,...,...,...,...,...,...,...
89270649,u9998,1568964975390,enter,b3819,sprint,,mobile
89270650,u9998,1568964992921,respond,q5287,sprint,c,mobile
89270651,u9998,1568964996503,submit,b3819,sprint,,mobile
89270652,u9998,1568964996572,enter,e3819,sprint,,mobile


In [8]:
import pickle

# Cache the concatenated log on disk so later sessions can skip the CSV loading.
# The context manager closes the handle even if pickling fails part-way.
with open("/mnt/qb/work/mlcolab/hzhou52/kt/EdNet/kt3.obj", "wb") as filehandler:
    pickle.dump(df, filehandler)

# 1. statistics overview

In [4]:
import pickle

# Reload the cached interaction log written by the pickle-dump cell.
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# caches that were produced by this notebook.
with open("/mnt/qb/work/mlcolab/hzhou52/kt/EdNet/kt3.obj", 'rb') as file:
    object_file = pickle.load(file)

# Question metadata; read here so later cells can look up `correct_answer` by `question_id`.
questions = pd.read_csv('/mnt/qb/work/mlcolab/hzhou52/kt/EdNet/contents/questions.csv')

In [10]:
# Preview the first rows of the unpickled interaction log.
object_file.head()

Unnamed: 0,user_id,timestamp,action_type,item_id,source,user_answer,platform
0,u1,1565096151269,enter,b3544,diagnosis,,mobile
1,u1,1565096187972,respond,q5012,diagnosis,b,mobile
2,u1,1565096194904,submit,b3544,diagnosis,,mobile
3,u1,1565096195001,enter,b3238,diagnosis,,mobile
4,u1,1565096218682,respond,q4706,diagnosis,c,mobile


In [11]:
# Keep a handle on the full log (all action types) for the overview below.
log = object_file

print('The columns of log are ', log.columns)
print('The length of log is {}'.format(len(log)))

print('###########################################')

# count_unique prints its own per-column report and returns None, so it is
# called bare; wrapping it in print() only added a stray "None" line.
count_unique(log, log.columns)

The columns of log are  Index(['user_id', 'timestamp', 'action_type', 'item_id', 'source',
       'user_answer', 'platform'],
      dtype='object')
The length of log is 89270654
###########################################
Number of unique values in user_id: 297915
Number of unique values in timestamp: 89044977
Number of unique values in action_type: 4
Number of unique values in item_id: 29498
Number of unique values in source: 8
Number of unique values in user_answer: 4
Number of unique values in platform: 2
None


In [12]:
# Report the number of NaN values per column of the full log.
# (check_nan is not defined in this notebook; presumably it comes from `utils`.)
check_nan(log)

Number of NaN values in column user_id: 0
Number of NaN values in column timestamp: 0
Number of NaN values in column action_type: 0
Number of NaN values in column item_id: 0
Number of NaN values in column source: 0
Number of NaN values in column user_answer: 65886174
Number of NaN values in column platform: 0


In [5]:
# Keep only the rows that carry a user answer.
has_answer = object_file['user_answer'].notna()
object_file = object_file[has_answer]

In [15]:
# Same overview as for the full log, now on the rows that carry a user answer.
print('The columns of log are ', object_file.columns)
print('The length of log is {}'.format(len(object_file)))

print('###########################################')

# count_unique prints its own per-column report and returns None, so it is
# called bare; wrapping it in print() only added a stray "None" line.
count_unique(object_file, object_file.columns)

The columns of log are  Index(['user_id', 'timestamp', 'action_type', 'item_id', 'source',
       'user_answer', 'platform'],
      dtype='object')
The length of log is 23384480
###########################################
Number of unique values in user_id: 296701
Number of unique values in timestamp: 23298298
Number of unique values in action_type: 1
Number of unique values in item_id: 11555
Number of unique values in source: 7
Number of unique values in user_answer: 4
Number of unique values in platform: 2
None


In [6]:
# Label every answered interaction as correct (1) or incorrect (0).

# question_id -> correct answer option, taken from questions.csv.
mapping = questions.set_index('question_id')['correct_answer'].to_dict()

# Fail fast (as the original per-row dict lookup did) if the log references a
# question that has no entry in the question table.
known_item = object_file['item_id'].isin(questions['question_id'])
if not known_item.all():
    missing_ids = object_file.loc[~known_item, 'item_id'].unique()[:5]
    raise KeyError(
        'item_id values without a correct answer in questions: {}'.format(list(missing_ids))
    )

# Vectorized lookup instead of a Python lambda per row; assign() returns a new
# frame, so nothing is written onto the filtered slice and the cell can be
# re-run safely.
object_file = object_file.assign(
    correct_answer=object_file['item_id'].map(mapping),
    correct=lambda frame: (frame['user_answer'] == frame['correct_answer']).astype(np.int64),
)

In [7]:
# Preview the answered interactions together with the added
# `correct_answer` and `correct` columns.
object_file.head()

Unnamed: 0,user_id,timestamp,action_type,item_id,source,user_answer,platform,correct_answer,correct
1,u1,1565096187972,respond,q5012,diagnosis,b,mobile,c,0
4,u1,1565096218682,respond,q4706,diagnosis,c,mobile,c,1
7,u1,1565096290094,respond,q4366,diagnosis,b,mobile,b,1
10,u1,1565096337361,respond,q4829,diagnosis,a,mobile,c,0
13,u1,1565096395328,respond,q6528,diagnosis,b,mobile,d,0


In [21]:
# Share of answered interactions whose answer matches the correct one.
object_file['correct'].mean()

0.5682932013027444

# 2. remove learner logs with threshold

In [8]:
# Columns used to identify the learner and the item in the raw log.
user_index, skill_index = 'user_id', 'item_id'

# (column in the raw log, column name in the exported table), in export order.
_column_pairs = [
    ('user_id', 'user_id'),
    ('item_id', 'skill_id'),
    ('timestamp', 'timestamp'),
    ('correct', 'correct'),
    ('correct_answer', 'correct_answer'),
    ('user_answer', 'answer'),
]

interested_col = [raw_name for raw_name, _ in _column_pairs]
invert_col = [export_name for _, export_name in _column_pairs]

In [9]:
# Report how much data survives each removal threshold.
remove_thres = [50, 100, 150, 200]

# The thresholds are applied cumulatively: every pass filters the result of
# the previous one, because `log` is rebound inside the loop.
log = object_file
for thres in remove_thres:
    print(f'remove threshold is {thres}')

    has_item = log[skill_index].notna()
    log = remove_log(log[has_item], user_index, num=thres)

    count_unique(log, [user_index, skill_index])

    # rows / users / items remaining after this pass
    n_users = log[user_index].nunique()
    n_items = log[skill_index].nunique()
    print(len(log), '/', n_users, '/', n_items)

remove threshold is 50
Number of unique values in user_id: 45621
Number of unique values in item_id: 11554
20594724 / 45621 / 11554
remove threshold is 100
Number of unique values in user_id: 31201
Number of unique values in item_id: 11551
19576684 / 31201 / 11551
remove threshold is 150
Number of unique values in user_id: 24410
Number of unique values in item_id: 11551
18745938 / 24410 / 11551
remove threshold is 200
Number of unique values in user_id: 20170
Number of unique values in item_id: 11551
18012542 / 20170 / 11551


In [10]:
# Column labels of the labelled interaction log.
object_file.columns

Index(['user_id', 'timestamp', 'action_type', 'item_id', 'source',
       'user_answer', 'platform', 'correct_answer', 'correct'],
      dtype='object')

In [14]:
# Build the re-indexed interaction table for one removal threshold and save it.
base_log = object_file
remove_thres = [50, 100, 150, 200]  # candidate thresholds; only `thres` is exported here
thres = 100
# for thres in remove_thres:
print(f'Remove threshold is {thres}')

# Remove rows with NaN item ids (same filter as in the threshold sweep), then
# remove users who appear less than thres times.
base_log = base_log[base_log[skill_index].notna()]
base_log = remove_log(base_log, user_index, num=thres)

# Create new DataFrame with desired columns, renamed to the export schema.
df = base_log[interested_col].copy()
df.columns = invert_col

# Re-index ids as consecutive integers in order of first appearance.
# pd.factorize yields these codes directly, without building a Categorical
# from a separately computed unique() array.
df['original_skill_id'] = df['skill_id']
df['skill_id'] = pd.factorize(df['skill_id'])[0]
df['problem_id'] = df['skill_id']
df['user_id'] = pd.factorize(df['user_id'])[0]
df = df.astype({
    'timestamp': np.float64,
    'correct': np.float64,
    'problem_id': np.int64,
    'user_id': np.int64,
    'skill_id': np.int64,
})

# Save cleaned DataFrame to a tab-separated file.
df.to_csv(f'/mnt/qb/work/mlcolab/hzhou52/ednet_kt3/multi_skill/interactions_{thres}.csv', sep='\t', index=False)

print(f'Remove threshold {thres} is done!')
# count_unique prints its own report and returns None, so it is called bare;
# wrapping it in print() only added a stray "None" line.
count_unique(df, df.columns.tolist())
print(f'Length of DataFrame: {len(df)}')

Remove threshold is 100
Remove threshold 100 is done!
Number of unique values in user_id: 31201
Number of unique values in skill_id: 11551
Number of unique values in timestamp: 19499677
Number of unique values in correct: 2
Number of unique values in correct_answer: 4
Number of unique values in answer: 4
Number of unique values in original_skill_id: 11551
Number of unique values in problem_id: 11551
None
Length of DataFrame: 19576684


In [13]:
# Display the exported interaction table.
# NOTE(review): the saved execution count of this cell is lower than that of the
# cell that builds `df`, so the stored output may come from an earlier run with a
# different threshold — re-run to refresh.
df

Unnamed: 0,user_id,skill_id,timestamp,correct,correct_answer,answer,original_skill_id,problem_id
1,0,0,1.565096e+12,0.0,c,b,q5012,0
4,0,1,1.565096e+12,1.0,c,c,q4706,1
7,0,2,1.565096e+12,1.0,b,b,q4366,2
10,0,3,1.565096e+12,0.0,c,a,q4829,3
13,0,4,1.565096e+12,0.0,d,b,q6528,4
...,...,...,...,...,...,...,...,...
89270630,45620,3359,1.568964e+12,0.0,c,d,q3972,3359
89270635,45620,1954,1.568964e+12,1.0,c,c,q4500,1954
89270640,45620,7877,1.568965e+12,1.0,d,d,q17142,7877
89270645,45620,3804,1.568965e+12,1.0,b,b,q4511,3804
