In [2]:
# Install the third-party packages this notebook imports (`datasets`, `polars`);
# --quiet suppresses pip's progress output.
%pip install datasets polars --quiet

Note: you may need to restart the kernel to use updated packages.


In [8]:
import polars as pl
from datasets import load_dataset  
import os
import pandas as pd

In [4]:
# Directory that receives the processed output written later in the notebook.
processed_folder = "data/processed"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(name=processed_folder, exist_ok=True)

In [5]:
# Open the "kt4" configuration of the EDNet dataset in streaming mode, so
# records are iterated on demand instead of being downloaded up front.
print("Loading EDNet KT4 in streaming mode...")
dataset_stream = load_dataset(
    path="mgor/EDNet",
    name="kt4",
    split="train",
    streaming=True,
)
print("Connection successful!")

Loading EDNet KT4 in streaming mode...


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Connection successful!


In [6]:
# Number of leading records to pull from the stream.
sample_size = 50_000
print(f"Fetching first {sample_size} records...")
# Materialise the first `sample_size` streamed records as an in-memory list.
sample_data = [record for record in dataset_stream.take(sample_size)]

Fetching first 50000 records...


In [9]:
# Build a DataFrame from the sampled records, then show the first five rows.
df = pd.DataFrame(data=sample_data)
print("Sample loaded into Pandas DataFrame.")
print(df.iloc[:5])

Sample loaded into Pandas DataFrame.
       timestamp action_type item_id  cursor_time     source user_answer  \
0  1555036427032       enter   b2386          NaN  diagnosis        None   
1  1555036443509     respond   q3854          NaN  diagnosis           d   
2  1555036444747      submit   b2386          NaN  diagnosis        None   
3  1555036447367       enter     b20          NaN  diagnosis        None   
4  1555036449850  play_audio     b20          0.0  diagnosis        None   

  platform subject_id correct_answer  is_correct  
0      web    u564440           None       False  
1      web    u564440              d        True  
2      web    u564440           None       False  
3      web    u564440           None       False  
4      web    u564440           None       False  


In [10]:
# Keep only rows whose action_type is 'respond'; every other action type
# (e.g. 'enter', 'submit', 'play_audio') is dropped.
is_response = df['action_type'].eq('respond')
df_filtered = df.loc[is_response]

In [11]:
# Narrow the frame to the four columns carried through the rest of the pipeline.
kept_columns = ['timestamp', 'subject_id', 'item_id', 'is_correct']
df_filtered = df_filtered.loc[:, kept_columns]

In [12]:
# Order rows by student first, then by timestamp within each student.
sort_keys = ['subject_id', 'timestamp']
df_filtered = df_filtered.sort_values(by=sort_keys)
print("Sorted interactions by Student and timestamp.")

Sorted interactions by Student and timestamp.


In [13]:
# Per-student count of non-null item_id values, returned as a two-column frame
# (subject_id, n_interactions).
student_counts = (
    df_filtered
    .groupby('subject_id')['item_id']
    .count()
    .reset_index(name='n_interactions')
)

In [14]:
# Minimum number of 'respond' interactions a student needs to be kept.
# Defined once so the filter and the log message below cannot drift apart.
min_interactions = 5

# IDs of the students that meet the threshold.
has_enough = student_counts['n_interactions'] >= min_interactions
valid_students = student_counts.loc[has_enough, 'subject_id'].tolist()

# Drop every interaction belonging to a student below the threshold.
df_filtered = df_filtered[df_filtered['subject_id'].isin(valid_students)]
print(f"Students with >={min_interactions} interactions: {len(valid_students)} students remain.")

Students with >=5 interactions: 147 students remain.


In [15]:
# Write the filtered interactions to a CSV file inside the processed folder;
# index=False leaves the DataFrame index out of the file.
processed_csv = os.path.join(processed_folder, "ednet_sequences.csv")
df_filtered.to_csv(path_or_buf=processed_csv, index=False)
print(f"Processed sequences saved to: {processed_csv}")

Processed sequences saved to: data/processed\ednet_sequences.csv


In [16]:
# Show the first ten rows of the filtered, sorted interactions.
print(df_filtered.iloc[:10])

           timestamp subject_id item_id  is_correct
4995   1567413540117     u12531   q3605       False
4998   1567413573276     u12531   q4895       False
5001   1567413619332     u12531   q5365       False
5004   1567413640139     u12531   q5577       False
5009   1567413670061     u12531    q869       False
5014   1567413692822     u12531    q743        True
37931  1566719300213     u13940  q11767        True
37932  1566719358769     u13940  q11769        True
37933  1566719373065     u13940  q11770       False
37934  1566719394350     u13940  q11768        True
