# Preprocess data for use with AKT, SAKT, DKT, and DKVMN

In [1]:
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from random import shuffle

In [2]:
# Load the input CSV into a DataFrame and print its column labels so the
# schema can be checked before any columns are dropped or renamed.
df = pd.read_csv('processed_data.csv')
print(df.columns)

Index(['Unnamed: 0', 'Anon Student Id', 'Problem Name', 'Step Name', 'Outcome',
       'KC (WPI-Apr-2005)', 'Step Num'],
      dtype='object')


In [3]:
# Number of distinct knowledge components (KCs) in the data set.
# dropna=False keeps a missing value counted as its own category.
df['KC (WPI-Apr-2005)'].nunique(dropna=False)

85

In [4]:
# df = df.drop(columns=['Unnamed: 0','Step Name','Step Num'])
# Keep only student id, outcome, KC and step number. The original
# 'Problem Name' column is discarded and 'Step Num' takes over its name, so
# every later cell that reads 'Problem Name' is actually reading step numbers.
df = df.drop(columns=['Unnamed: 0', 'Step Name', 'Problem Name'])
# Rename by label rather than by position so a change in the CSV's column
# order cannot silently attach the wrong name to a column.
df = df.rename(columns={'Step Num': 'Problem Name'})

In [5]:
# Display the frame to inspect the columns left after the drop/rename cell.
df

Unnamed: 0,Anon Student Id,Outcome,KC (WPI-Apr-2005),Problem Name
0,1,0,1,1
1,1,0,1,1
2,1,1,2,2
3,1,1,2,2
4,1,0,2,2
...,...,...,...,...
323383,2833,1,36,593
323384,2833,0,36,594
323385,2833,0,36,595
323386,2833,0,36,596


## Prepare 5-fold cross-validation: split the data into train, test, and validation sets.

#### First we need to put the data into the correct format. We need to get all the problems, KCs, and outcomes for each student and write them, together with the student id, on 4 separate lines, each a comma-separated list.
student id

problems

KCs

outcomes

#### Each student will take up 4 lines

In [6]:
# Write one 4-line record per student:
#   line 1: student id
#   line 2: comma-separated problem ids
#   line 3: comma-separated KC ids
#   line 4: comma-separated outcomes
students = df['Anon Student Id'].unique().tolist()
# Randomise student order so the consecutive chunks cut from this file later
# are random folds. NOTE(review): no seed is set in the visible cells, so the
# fold membership will differ between runs.
shuffle(students)

# Group the frame once instead of re-filtering every row for each student and
# each column. sort=False plus get_group keeps each student's rows in their
# original order.
rows_by_student = df.groupby('Anon Student Id', sort=False)
sequence_columns = ('Problem Name', 'KC (WPI-Apr-2005)', 'Outcome')

with open('akt_data/akt_processed_data.csv', "w") as f:
    for student in students:
        rows = rows_by_student.get_group(student)
        f.write(str(student) + '\n')
        for column in sequence_columns:
            f.write(','.join(str(element) for element in rows[column]) + '\n')

#### Now we break up the data into 5 folds (called chunks here)

In [7]:
# Cut the per-student file into 5 consecutive chunks (the cross-validation
# folds). The number of students is taken from the file itself rather than
# hard-coded, so no student is dropped if the data set changes size.
N_FOLDS = 5
LINES_PER_STUDENT = 4  # student id, problems, KCs, outcomes

with open('akt_data/akt_processed_data.csv', 'r') as f:
    lines = f.readlines()

# Regroup the flat list of lines into one 4-line record per student.
records = [
    lines[start:start + LINES_PER_STUDENT]
    for start in range(0, len(lines), LINES_PER_STUDENT)
]

# Spread students as evenly as possible; when the count is not a multiple of
# N_FOLDS the first `extra` chunks receive one additional student.
base, extra = divmod(len(records), N_FOLDS)
start = 0
for chunk_idx in range(N_FOLDS):
    stop = start + base + (1 if chunk_idx < extra else 0)
    chunk_name = 'akt_data/akt_chunk' + str(chunk_idx + 1) + '.csv'
    with open(chunk_name, 'w') as f2:
        for record in records[start:stop]:
            f2.writelines(record)
    start = stop

#### Next we create our 5 training datasets using the chunks

In [8]:
# Build the 5 training files. Each one is the concatenation of three chunks,
# written in the order listed; the two chunks left out of a fold are the ones
# used for that fold's test and validation files.
TRAIN_CHUNKS = (
    (1, 2, 3),  # train1
    (4, 2, 3),  # train2
    (4, 5, 3),  # train3
    (1, 4, 5),  # train4
    (1, 2, 5),  # train5
)

for fold, chunk_ids in enumerate(TRAIN_CHUNKS, start=1):
    train_name = 'akt_data/final/assist2009_pid_train' + str(fold) + '.csv'
    # `with` guarantees the output is flushed and closed even if a chunk
    # file is missing or unreadable.
    with open(train_name, 'w') as f:
        for chunk_id in chunk_ids:
            chunk_name = 'akt_data/akt_chunk' + str(chunk_id) + '.csv'
            with open(chunk_name, 'r') as f2:
                f.writelines(f2)

#### Lastly we create the 5 testing and 5 validation sets.

In [9]:
# Build the 5 test and 5 validation files. Each is a straight copy of one
# chunk, chosen by rotating through the chunks:
#   fold 1 -> test chunk 4, valid chunk 5
#   fold 2 -> test chunk 5, valid chunk 1
#   fold 3 -> test chunk 1, valid chunk 2
#   fold 4 -> test chunk 2, valid chunk 3
#   fold 5 -> test chunk 3, valid chunk 4
for i in range(5):
    fold = str(i + 1)
    test_chunk = str(((i + 3) % 5) + 1)
    valid_chunk = str(((i + 4) % 5) + 1)
    # (source chunk, destination file) pairs for this fold.
    copies = (
        ('akt_data/akt_chunk' + test_chunk + '.csv',
         'akt_data/final/assist2009_pid_test' + fold + '.csv'),
        ('akt_data/akt_chunk' + valid_chunk + '.csv',
         'akt_data/final/assist2009_pid_valid' + fold + '.csv'),
    )
    for chunk_name, out_name in copies:
        # Open the destination first (as the original did) and let `with`
        # close both handles even if the copy fails part-way.
        with open(out_name, 'w') as f, open(chunk_name, 'r') as f2:
            f.writelines(f2)