# Preprocessing the dataset for DKT

In [151]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

In [152]:
# Load the processed interaction log and list its columns.
csv_path = 'processed_data.csv'
df = pd.read_csv(csv_path)
print(df.columns)

Index(['Unnamed: 0', 'Anon Student Id', 'Problem Name', 'Step Name', 'Outcome',
       'KC (WPI-Apr-2005)', 'Step Num'],
      dtype='object')


## Drop the columns we don't need and rename the remaining ones
### Note: To use Step Num instead of Problem Name as the exercise id, switch to the commented-out alternative in the cell below.

In [153]:
# Map each source column we keep to the name used downstream. Renaming by
# explicit mapping (instead of assigning df.columns positionally) means a change
# in the CSV's column order cannot silently mislabel a column.
# Alternative: to use step numbers as exercise ids, replace the
# 'Problem Name' key with 'Step Num'.
column_names = {
    'Anon Student Id': 'user_id',
    'Problem Name': 'exer_id',
    'Outcome': 'score',
    'KC (WPI-Apr-2005)': 'knowledge_code',
}
# Selecting the renamed columns drops every other column ('Unnamed: 0',
# 'Step Name', and the unused id column) and raises KeyError if an expected
# column is missing. Re-running the cell is a no-op because rename ignores
# keys that are no longer present.
df = df.rename(columns=column_names)[list(column_names.values())]

In [154]:
# Preview the frame after dropping and renaming columns.
df

Unnamed: 0,user_id,exer_id,score,knowledge_code
0,1,1,0,1
1,1,1,0,1
2,1,2,1,2
3,1,2,1,2
4,1,2,0,2
...,...,...,...,...
323383,2833,214,1,36
323384,2833,214,0,36
323385,2833,214,0,36
323386,2833,214,0,36


In [155]:
# Check the dtype of each remaining column.
df.dtypes

user_id           int64
exer_id           int64
score             int64
knowledge_code    int64
dtype: object

# Analyze sample datasets
Inspect the sample datasets to get an idea of the train/test split sizes and of how our data differs from them.

In [None]:
# Gather the exercise id of every log entry in the old dataset.
with open('dkt/data_old/log_data.json') as f:
    data = json.load(f)

# NOTE: despite its name, `users` holds exercise ids, one per log entry.
users = [log['exer_id'] for user in data for log in user['logs']]
# To inspect knowledge codes instead, collect every kc in
# log['knowledge_code'] for each log, then print(set(users)) to list the
# distinct values.

In [10]:
# Total number of log entries across all users in the full log file.
with open('dkt/data/log_data.json') as f:
    data = json.load(f)

total_rows = sum(user['log_num'] for user in data)
print(total_rows)

278868


In [11]:
# Total number of log entries across all users in the test set.
with open('dkt/data/test_set.json') as f:
    data = json.load(f)

total_rows = sum(user['log_num'] for user in data)
print(total_rows)

55760


In [12]:
# Total number of log entries across all users in the per-user training slice.
with open('dkt/data/train_slice.json') as f:
    data = json.load(f)

total_rows = sum(user['log_num'] for user in data)
print(total_rows)

186049


In [13]:
# Number of top-level entries in the training set file; each entry counts as
# one row here (no 'log_num' field is read).
with open('dkt/data/train_set.json') as f:
    data = json.load(f)

total_rows = len(data)
print(total_rows)

186049


In [14]:
# Total number of log entries across all users in the validation set.
with open('dkt/data/val_set.json') as f:
    data = json.load(f)

total_rows = sum(user['log_num'] for user in data)
print(total_rows)

25606


# Split data and convert to json format

In [156]:
# Store scores as floats so they are serialized as 0.0 / 1.0 in the JSON files.
df = df.astype({'score': float})

In [157]:
# Preview the frame after casting score to float.
df

Unnamed: 0,user_id,exer_id,score,knowledge_code
0,1,1,0.0,1
1,1,1,0.0,1
2,1,2,1.0,2
3,1,2,1.0,2
4,1,2,0.0,2
...,...,...,...,...
323383,2833,214,1.0,36
323384,2833,214,0.0,36
323385,2833,214,0.0,36
323386,2833,214,0.0,36


In [165]:
# Number of distinct exercises, knowledge codes, and users, in that order.
for column in ('exer_id', 'knowledge_code', 'user_id'):
    # dropna=False counts a missing value as its own category, like len(unique()).
    print(df[column].nunique(dropna=False))

385
85
2833


In [159]:
# 60/20/20 train/test/validation split at the row level.
# train_test_split shuffles rows by default, so no separate df.sample(frac=1)
# pass is needed (and one whose result is not assigned has no effect).
# First carve off 40% as a holdout, then halve the holdout into test and validation.
df_train, df_holdout = train_test_split(df, test_size=0.4, random_state=1)
df_test, df_valid = train_test_split(df_holdout, test_size=0.5, random_state=1)

# Every row must land in exactly one of the three subsets.
assert len(df_train) + len(df_test) + len(df_valid) == len(df)

# NOTE(review): the subsets come back in shuffled row order, so each user's logs
# are written out of their original order. If the downstream model depends on
# interaction order, consider .sort_index() on each subset -- TODO confirm.

In [160]:
# Write the training rows as a flat list of records, with each record's
# knowledge code wrapped in a single-element list.
parsed = [
    {**record, 'knowledge_code': [record['knowledge_code']]}
    for record in json.loads(df_train.to_json(orient="records"))
]
with open('dkt/data/train_set.json', 'w') as outfile:
    json.dump(parsed, outfile, indent=4)

In [161]:
# Group the test rows by user: one entry per user holding that user's logs
# (without the redundant user_id) and the number of logs.
parsed = json.loads(df_test.to_json(orient="records"))

# Single pass over the records instead of rescanning all records per user.
# Dicts keep insertion order, so users appear in order of first occurrence
# and each user's logs keep their row order.
logs_by_user = {}
for record in parsed:
    user_id = record['user_id']
    if user_id not in logs_by_user:
        logs_by_user[user_id] = {"user_id": user_id, "log_num": 0, "logs": []}
    log = {key: val for key, val in record.items() if key != 'user_id'}
    # Knowledge codes are stored as a list, here always with a single element.
    log['knowledge_code'] = [log['knowledge_code']]
    logs_by_user[user_id]['logs'].append(log)
    logs_by_user[user_id]['log_num'] += 1
res = list(logs_by_user.values())

with open('dkt/data/test_set.json', 'w') as outfile:
    json.dump(res, outfile, indent=4)

In [162]:
# Group the validation rows by user: one entry per user holding that user's
# logs (without the redundant user_id) and the number of logs.
parsed = json.loads(df_valid.to_json(orient="records"))

# Single pass over the records instead of rescanning all records per user.
# Dicts keep insertion order, so users appear in order of first occurrence
# and each user's logs keep their row order.
logs_by_user = {}
for record in parsed:
    user_id = record['user_id']
    if user_id not in logs_by_user:
        logs_by_user[user_id] = {"user_id": user_id, "log_num": 0, "logs": []}
    log = {key: val for key, val in record.items() if key != 'user_id'}
    # Knowledge codes are stored as a list, here always with a single element.
    log['knowledge_code'] = [log['knowledge_code']]
    logs_by_user[user_id]['logs'].append(log)
    logs_by_user[user_id]['log_num'] += 1
res = list(logs_by_user.values())

with open('dkt/data/val_set.json', 'w') as outfile:
    json.dump(res, outfile, indent=4)

In [163]:
# Group all rows by user: one entry per user holding that user's logs
# (without the redundant user_id) and the number of logs.
parsed = json.loads(df.to_json(orient="records"))

# Single pass over the records instead of rescanning all records per user.
# Dicts keep insertion order, so users appear in order of first occurrence
# and each user's logs keep their row order.
logs_by_user = {}
for record in parsed:
    user_id = record['user_id']
    if user_id not in logs_by_user:
        logs_by_user[user_id] = {"user_id": user_id, "log_num": 0, "logs": []}
    log = {key: val for key, val in record.items() if key != 'user_id'}
    # Knowledge codes are stored as a list, here always with a single element.
    log['knowledge_code'] = [log['knowledge_code']]
    logs_by_user[user_id]['logs'].append(log)
    logs_by_user[user_id]['log_num'] += 1
res = list(logs_by_user.values())

with open('dkt/data/log_data.json', 'w') as outfile:
    json.dump(res, outfile, indent=4)

In [164]:
# Group the training rows by user: one entry per user holding that user's
# logs (without the redundant user_id) and the number of logs.
parsed = json.loads(df_train.to_json(orient="records"))

# Single pass over the records instead of rescanning all records per user.
# Dicts keep insertion order, so users appear in order of first occurrence
# and each user's logs keep their row order.
logs_by_user = {}
for record in parsed:
    user_id = record['user_id']
    if user_id not in logs_by_user:
        logs_by_user[user_id] = {"user_id": user_id, "log_num": 0, "logs": []}
    log = {key: val for key, val in record.items() if key != 'user_id'}
    # Knowledge codes are stored as a list, here always with a single element.
    log['knowledge_code'] = [log['knowledge_code']]
    logs_by_user[user_id]['logs'].append(log)
    logs_by_user[user_id]['log_num'] += 1
res = list(logs_by_user.values())

with open('dkt/data/train_slice.json', 'w') as outfile:
    json.dump(res, outfile, indent=4)