In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict
import operator, os
import time, datetime
from utils import *
from tqdm import tqdm

Original data: https://pslcdatashop.web.cmu.edu/KDDCup/downloads.jsp  
Edu data analysis: https://edudata.readthedocs.io/en/latest/build/blitz/KDD%20Cup%202010.html  
Data format: https://pslcdatashop.web.cmu.edu/KDDCup/rules_data_format.jsp

In [None]:
# Row: the row number. Update (04-20-2010): for challenge data sets, the row number in each file (train, test, and submission) is no longer taken from the original data set file. Instead, rows are renumbered within each file. So instead of 1...n rows for the training file and n+1..m rows for the test/submission file, it is now 1...n for the training file and 1...n for the test/submission file.
# Anon Student Id: unique, anonymous identifier for a student

# Problem Hierarchy: the hierarchy of curriculum levels containing the problem.
# Problem Name: unique identifier for a problem
# Problem View: the total number of times the student encountered the problem so far.

# Step Name: each problem consists of one or more steps (e.g., "find the area of rectangle ABCD" or "divide both sides of the equation by x"). The step name is unique within each problem, but there may be collisions between different problems, so the only unique identifier for a step is the pair of problem_name and step_name.
# Step Start Time: the starting time of the step. Can be null.
# First Transaction Time: the time of the first transaction toward the step.
# Correct Transaction Time: the time of the correct attempt toward the step, if there was one.
# Step End Time: the time of the last transaction toward the step.
# Step Duration (sec): the elapsed time of the step in seconds, calculated by adding all of the durations for transactions that were attributed to the step. Can be null (if step start time is null).

# Correct Step Duration (sec): the step duration if the first attempt for the step was correct.
# Error Step Duration (sec): the step duration if the first attempt for the step was an error (incorrect attempt or hint request).

# Correct First Attempt: the tutor's evaluation of the student's first attempt on the step—1 if correct, 0 if an error.
# Incorrects: total number of incorrect attempts by the student on the step.
# Hints: total number of hints requested by the student for the step.
# Corrects: total correct attempts by the student for the step. (Only increases if the step is encountered more than once.)
# KC(KC Model Name): the identified skills that are used in a problem, where available. A step can have multiple KCs assigned to it. Multiple KCs for a step are separated by ~~ (two tildes). Since opportunity describes practice by knowledge component, the corresponding opportunities are similarly separated by ~~.
# Opportunity(KC Model Name): a count that increases by one each time the student encounters a step with the listed knowledge component. Steps with multiple KCs will have multiple opportunity numbers separated by ~~.
# Additional KC models, which exist for the challenge data sets, will appear as additional pairs of columns (KC and Opportunity columns for each model).

# 1. statistics overview

In [5]:
# Location of the KDD Cup 2010 algebra 2008-2009 training file.
bath_path = '/mnt/qb/work/mlcolab/hzhou52/kt/algebra08'
file_name = 'algebra_2008_2009_train.txt'

# The file is tab-separated text encoded as ISO-8859-15; low_memory=False
# makes pandas parse each column in one pass instead of chunk by chunk.
log = pd.read_csv(
    os.path.join(bath_path, file_name),
    sep='\t',
    encoding="ISO-8859-15",
    low_memory=False,
)

In [6]:
# Preview the first five rows of the raw log.
log.head()

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,...,Correct First Attempt,Incorrects,Hints,Corrects,KC(SubSkills),Opportunity(SubSkills),KC(KTracedSkills),Opportunity(KTracedSkills),KC(Rules),Opportunity(Rules)
0,1,stu_de2777346f,"Unit CTA1_01, Section CTA1_01-3",REAL20B,1,R2C1,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,...,0,3,1,1,Identifying units,1,,,UNIT-HELP,1
1,2,stu_de2777346f,"Unit CTA1_01, Section CTA1_01-3",REAL20B,1,R3C1,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,...,1,0,0,1,Define Variable,1,,,VARIABLE-HELP,1
2,3,stu_de2777346f,"Unit CTA1_01, Section CTA1_01-3",REAL20B,1,R3C2,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,...,1,0,0,1,"Write expression, any form~~Using simple numbe...",1~~1~~1~~1~~1~~1,Using simple numbers-1~~Using large numbers-1~...,1~~1~~1,STANDARD-MX+B-FORMULA-HELP,1
3,4,stu_de2777346f,"Unit CTA1_01, Section CTA1_01-3",REAL20B,1,R4C1,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,...,1,1,0,1,"Entering a given~~Enter given, reading words~~...",1~~1~~1,Entering a given-1,1,GIVEN-HELP-NON-NUMERIC-PHRASE,1
4,5,stu_de2777346f,"Unit CTA1_01, Section CTA1_01-3",REAL20B,1,R4C2,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,2008-09-19 13:30:46.0,...,1,0,0,1,"Using simple numbers~~Find Y, any form~~Using ...",2~~1~~2~~1,Using simple numbers-1~~Using large numbers-1~...,2~~2~~1,CALCULATED-VALUE-HELP-MX+B-GIVEN-X-ZERO,1


In [7]:
# Column names and total row count of the raw log.
n_rows = len(log)
print('The columns of log are ', log.columns)
print(f'The length of log is {n_rows}')

print('###########################################')

# Per-column number of distinct values.
# NOTE(review): count_unique (from utils) appears to print its own report --
# the bare call in the threshold loop further down produces output by itself --
# so this outer print probably only adds its return value. Confirm and drop it.
print(count_unique(log, log.columns))

The columns of log are  Index(['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name',
       'Problem View', 'Step Name', 'Step Start Time',
       'First Transaction Time', 'Correct Transaction Time', 'Step End Time',
       'Step Duration (sec)', 'Correct Step Duration (sec)',
       'Error Step Duration (sec)', 'Correct First Attempt', 'Incorrects',
       'Hints', 'Corrects', 'KC(SubSkills)', 'Opportunity(SubSkills)',
       'KC(KTracedSkills)', 'Opportunity(KTracedSkills)', 'KC(Rules)',
       'Opportunity(Rules)'],
      dtype='object')
The length of log is 8918054
###########################################
Number of unique values in Row: 8918054
Number of unique values in Anon Student Id: 3310
Number of unique values in Problem Hierarchy: 165
Number of unique values in Problem Name: 188368
Number of unique values in Problem View: 18
Number of unique values in Step Name: 700635
Number of unique values in Step Start Time: 814722
Number of unique values in First Transactio

In [None]:
# Identifier columns of interest: learner, curriculum level, problem, and step.
interested_col = ['Anon Student Id', 'Problem Hierarchy', 'Problem Name', 'Step Name']

In [37]:
# check_nan is provided by the star import from utils; judging by the output
# below it reports the number of NaN values in every column of the log.
check_nan(log)

Number of NaN values in column Row: 0
Number of NaN values in column Anon Student Id: 0
Number of NaN values in column Problem Hierarchy: 0
Number of NaN values in column Problem Name: 0
Number of NaN values in column Problem View: 0
Number of NaN values in column Step Name: 0
Number of NaN values in column Step Start Time: 265516
Number of NaN values in column First Transaction Time: 0
Number of NaN values in column Correct Transaction Time: 238090
Number of NaN values in column Step End Time: 0
Number of NaN values in column Step Duration (sec): 442921
Number of NaN values in column Correct Step Duration (sec): 1641028
Number of NaN values in column Error Step Duration (sec): 7719947
Number of NaN values in column Correct First Attempt: 0
Number of NaN values in column Incorrects: 0
Number of NaN values in column Hints: 0
Number of NaN values in column Corrects: 0
Number of NaN values in column KC(SubSkills): 2475917
Number of NaN values in column Opportunity(SubSkills): 2475917
Numb

In [None]:
# Show the first five values of every column, one column at a time.
for column_name in log.columns:
    first_values = log[column_name].iloc[:5]
    print(column_name, first_values)

In [56]:
# Number of rows that carry a SubSkills KC label (total rows minus rows whose
# KC is NaN). Computed from the data rather than retyping the counts printed
# by the cells above, so it cannot go stale when the input changes.
int(log['KC(SubSkills)'].notna().sum())

6442137

## 1.1 find unique KCs

In [21]:
# Peek at the first KC labels only: turning the whole column into a Python list
# would write one entry per log row (about 8.9 million) into the cell output.
log['KC(SubSkills)'].head(20).tolist()

['Identifying units',
 'Define Variable',
 'Write expression, any form~~Using simple numbers~~Using large numbers~~Write expression, positive intercept~~Write expression, negative slope~~Write Expression, mx+b',
 'Entering a given~~Enter given, reading words~~Enter given, implied amount',
 'Using simple numbers~~Find Y, any form~~Using large numbers~~Find Y, negative slope',
 'Enter given, reading numerals~~Entering a given',
 'Using simple numbers~~Find Y, any form~~Using large numbers~~Find Y, negative slope',
 'Entering a given~~Enter given, reading words',
 'Using simple numbers~~Find Y, any form~~Using large numbers~~Find Y, negative slope',
 nan,
 nan,
 'Identifying units',
 'Identifying units',
 'Define Variable',
 'Entering a given~~Enter given, reading words',
 'Entering a given~~Enter given, reading words',
 'Write expression, any form~~Using simple numbers~~Using large numbers~~Write expression, positive intercept~~Write expression, negative slope~~Write Expression, mx+b',
 

In [39]:
# Collect every distinct knowledge component (KC) of the SubSkills model.
# A step can list several KCs joined by '~~'. Only the distinct combined
# strings are split, which yields the same set as splitting every row but
# avoids the wide intermediate frame built by str.split(expand=True).stack()
# (that version had to be interrupted).
kcs_series = log['KC(SubSkills)']

unique_kcs = {
    kc
    for kc_combo in kcs_series.dropna().unique()
    for kc in kc_combo.split('~~')
}

KeyboardInterrupt: 

In [42]:
# Number of distinct KCs collected in unique_kcs.
len(unique_kcs)

541

In [41]:
# Display the distinct KC names.
unique_kcs

{'Bogus skill',
 'Calculate part in proportion with fractions',
 'Calculate product of means or extremes',
 'Calculate ratio denominator from given',
 'Calculate ratio numerator from given',
 'Calculate total in proportion with fractions',
 'Calculate unit rate',
 'Changing axis bounds',
 'Changing axis intervals',
 'Choose Graphical a in A problem',
 'Choose Graphical a in G problem',
 'Choose Graphical a in N problem',
 'Choose Graphical a in V problem',
 'Choose Graphical h in A problem',
 'Choose Graphical h in G problem',
 'Choose Graphical h in N problem',
 'Choose Graphical h in V problem',
 'Choose Graphical k in A problem',
 'Choose Graphical k in G problem',
 'Choose Graphical k in N problem',
 'Choose Graphical k in V problem',
 'Choose Graphical refl-v in A problem',
 'Choose Graphical refl-v in G problem',
 'Choose Graphical refl-v in N problem',
 'Choose Graphical refl-v in V problem',
 'Choose form of compound inequality',
 'Choose mean',
 'Choose median',
 'Choose mode'

## 1.2 correctness

In [54]:
# Share of correct attempts among all recorded correct and incorrect attempts.
total_corrects = log['Corrects'].sum()
total_incorrects = log['Incorrects'].sum()
total_corrects / (total_corrects + total_incorrects)

0.7439331517464881

In [52]:
# Fraction of the summed step duration that comes from the
# 'Correct Step Duration (sec)' column.
correct_duration_sec = log['Correct Step Duration (sec)'].sum()
overall_duration_sec = log['Step Duration (sec)'].sum()
correct_duration_sec / overall_duration_sec

0.5109444814833888

# 2. remove learner logs with threshold

In [58]:
# Report how many learners / KC labels / problems / steps remain for each
# minimum-activity threshold.
#
# The cell reads the raw `log` loaded in section 1 and never reassigns it, so
# it can be re-run safely, needs no reload of the input file, and every
# threshold is evaluated independently of the order of `remove_thres`.
remove_thres = [50, 100, 150, 200]

# Column names shared with the export cell below.
user_index = 'Anon Student Id'
skill_index = 'KC(SubSkills)'
problem_index = 'Problem Name'
step_index = 'Step Name'

# Rows without a KC label are dropped for every threshold, so filter them once.
log_with_skill = log[log[skill_index].notna()]

for thres in remove_thres:
    print('remove threshold is {}'.format(thres))
    # remove_log comes from utils; presumably it drops learners with fewer than
    # `num` rows -- TODO confirm against its definition.
    log_kept = remove_log(log_with_skill, user_index, num=thres)
    count_unique(log_kept, [user_index,skill_index,problem_index,step_index])
    print(len(log_kept))
    print(len(log_kept), '/', log_kept[user_index].nunique(), '/')

remove threshold is 50
Number of unique values in Anon Student Id: 2955
Number of unique values in KC(SubSkills): 1828
Number of unique values in Problem Name: 188144
Number of unique values in Step Name: 500504
6436108
6436108 / 2955 /
remove threshold is 100
Number of unique values in Anon Student Id: 2828
Number of unique values in KC(SubSkills): 1828
Number of unique values in Problem Name: 188088
Number of unique values in Step Name: 500330
6427098
6427098 / 2828 /
remove threshold is 150
Number of unique values in Anon Student Id: 2729
Number of unique values in KC(SubSkills): 1828
Number of unique values in Problem Name: 188011
Number of unique values in Step Name: 500097
6414680
6414680 / 2729 /
remove threshold is 200
Number of unique values in Anon Student Id: 2630
Number of unique values in KC(SubSkills): 1828
Number of unique values in Problem Name: 187738
Number of unique values in Step Name: 499477
6397406
6397406 / 2630 /


In [None]:
# Export one cleaned interaction table per minimum-activity threshold.
#
# Relies on remove_thres, user_index, skill_index and problem_index from the
# threshold-statistics cell above, and on remove_log / count_unique from utils.
bath_path = '/mnt/qb/work/mlcolab/hzhou52/kt/algebra08'
file_name = 'algebra_2008_2009_train.txt'
# Reload so the export always starts from the raw file, even if an earlier cell
# has replaced `log` with a filtered frame.
log = pd.read_table(os.path.join(bath_path, file_name), encoding="ISO-8859-15", low_memory=False)

# Raw column -> output column. Spelled out here because the log has no
# startTime / endTime attributes and the cell must not depend on a renaming
# list defined elsewhere.
export_columns = {
    user_index: 'user_id',
    problem_index: 'problem_id',
    skill_index: 'skill_text',
    'Correct First Attempt': 'correct',
}


def _epoch_seconds(raw_times):
    """Parse timestamp strings into float seconds since 1970-01-01.

    Missing values become NaN. The timestamps are parsed as naive datetimes;
    no timezone conversion is applied.
    """
    parsed = pd.to_datetime(raw_times)
    return (parsed - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)


output_dir = os.path.join(bath_path, 'multi_skill')
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories

# Drop rows without a KC label and parse the step times once, outside the loop.
# NOTE(review): 'Step Start Time' can be null, so those rows end up with NaN
# timestamp and dwell_time -- decide whether to drop or impute them.
log_timed = log[log[skill_index].notna()].assign(
    start_sec=lambda frame: _epoch_seconds(frame['Step Start Time']),
    end_sec=lambda frame: _epoch_seconds(frame['Step End Time']),
)

for thres in remove_thres:
    print(f'Remove threshold is {thres}')

    # Filter from the untouched frame so each threshold is independent.
    # remove_log presumably drops learners with fewer than `num` rows -- TODO confirm.
    log_kept = remove_log(log_timed, user_index, num=thres)

    # Keep the exported columns, rename them, and derive the time features:
    # timestamp is the midpoint of the step, dwell_time its length in seconds.
    df = (
        log_kept[list(export_columns) + ['start_sec', 'end_sec']]
        .rename(columns=export_columns)
        .assign(
            timestamp=lambda frame: (frame['start_sec'] + frame['end_sec']) / 2,
            dwell_time=lambda frame: frame['end_sec'] - frame['start_sec'],
        )
        .drop(columns=['start_sec', 'end_sec'])
    )

    # Re-index to consecutive integer codes in order of first appearance.
    # skill_id encodes the full (possibly multi-KC, '~~'-joined) skill string.
    df['skill_id'] = pd.factorize(df['skill_text'])[0]
    df['problem_id'] = pd.factorize(df['problem_id'])[0]
    df['user_id'] = pd.factorize(df['user_id'])[0]
    df = df.astype({'timestamp': np.float64, 'dwell_time': np.float64, 'correct': np.float64})
    df = df.astype({'problem_id': np.int64, 'user_id': np.int64, 'skill_id': np.int64})

    # Save cleaned DataFrame to a tab-separated file
    df.to_csv(os.path.join(output_dir, f'interactions_{thres}.csv'), sep='\t', index=False)

    print(f'Remove threshold {thres} is done!')
    # NOTE(review): count_unique appears to print its own report, so the outer
    # print likely only adds its return value -- confirm and drop it.
    print(count_unique(df, df.columns.tolist()))
    print(f'Length of DataFrame: {len(df)}')