# Preprocess data for use with AKT

In [9]:
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from random import shuffle

In [10]:
# Load the pre-processed interaction log from processed_data.csv and print
# its column labels so the columns handled in later cells can be checked.
df = pd.read_csv('processed_data.csv')
print(df.columns)

Index(['Unnamed: 0', 'Anon Student Id', 'Problem Name', 'Step Name', 'Outcome',
       'KC (WPI-Apr-2005)', 'Step Num'],
      dtype='object')


In [11]:
# Number of distinct knowledge components (KCs) in the data set.
# dropna=False keeps a missing value counted as its own entry, matching
# what len(Series.unique()) reports.
df['KC (WPI-Apr-2005)'].nunique(dropna=False)

85

In [12]:
# Discard the saved index column, the step name and the original problem
# name, then relabel the remaining four columns by position.
# NOTE(review): with the column order printed when the CSV was loaded, the
# last surviving column is 'Step Num', so this relabel exposes the step
# number under the name 'Problem Name' -- confirm that is intended.
df = df.drop(labels=['Unnamed: 0', 'Step Name', 'Problem Name'], axis='columns')
df.columns = [
    'Anon Student Id',
    'Outcome',
    'KC (WPI-Apr-2005)',
    'Problem Name',
]

In [13]:
# Display the frame to inspect the four remaining columns.
df

Unnamed: 0,Anon Student Id,Outcome,KC (WPI-Apr-2005),Problem Name
0,1,0,1,1
1,1,0,1,1
2,1,1,2,2
3,1,1,2,2
4,1,0,2,2
...,...,...,...,...
323383,2833,1,36,593
323384,2833,0,36,594
323385,2833,0,36,595
323386,2833,0,36,596


## Perform 5-fold cross-validation, splitting the data into train, validation, and test sets.

#### First we need to put the data into the correct format. For each student we collect all of their problems, KCs, and outcomes and write them, together with the student id, on 4 separate lines as comma-separated lists:
student id

problems

KCs

outcomes

#### Each student will take up 4 lines

In [18]:
# Serialise every student's interaction history into a 4-line record:
#   line 1: student id
#   line 2: comma-separated problem ids
#   line 3: comma-separated KC ids
#   line 4: comma-separated outcomes
# Records are written in a random student order.
students = df['Anon Student Id'].unique().tolist()
shuffle(students)

# Group the frame once instead of re-filtering it three times per student.
# sort=False avoids a needless key sort; rows keep their original order
# inside each group, so each sequence matches a boolean-mask selection.
# NOTE(review): assumes every row has a non-null student id (groupby drops
# NaN keys by default) -- confirm against the input data.
rows_by_student = df.groupby('Anon Student Id', sort=False)

# The three sequence lines, in the order they must appear in a record.
sequence_columns = ('Problem Name', 'KC (WPI-Apr-2005)', 'Outcome')

# The context manager closes the file even if a write fails part-way.
with open('akt_data/akt_processed_data.csv', "w") as f:
    for student in students:
        student_rows = rows_by_student.get_group(student)
        f.write(str(student) + '\n')
        for column in sequence_columns:
            values = [str(element) for element in student_rows[column]]
            f.write(','.join(values) + '\n')

#### Now we break up the data into 5 folds (called chunks here)

In [21]:
# Split the per-student file into 5 contiguous chunks (folds) of whole
# student records.  The student order in the input file decides which
# students land in which chunk.
lines_per_student = 4   # id, problems, KCs, outcomes
num_chunks = 5

with open('akt_data/akt_processed_data.csv', 'r') as f:
    lines = f.readlines()

# Derive the student count from the file itself rather than hard-coding it,
# so no student is silently dropped when the data set changes.  A trailing
# incomplete record (fewer than 4 lines) is ignored.
num_students = len(lines) // lines_per_student

for chunk in range(num_chunks):
    # Integer boundaries spread any remainder across the chunks; for 2830
    # students this gives 566 students per chunk.
    first_student = chunk * num_students // num_chunks
    last_student = (chunk + 1) * num_students // num_chunks
    # Chunks are written under akt_data/ because that is where the cells
    # that assemble the train/test/validation sets read them from.
    chunk_name = 'akt_data/akt_chunk' + str(chunk + 1) + '.csv'
    with open(chunk_name, 'w') as f2:
        f2.writelines(lines[first_student * lines_per_student:
                            last_student * lines_per_student])

#### Next we create our 5 training datasets using the chunks

In [76]:
######################################################################
# Build the 5 training sets.  Each fold's training file is the
# concatenation of three chunks, copied in the order listed here; the
# remaining two chunks of each fold are left out of training.
train_chunks = (
    (1, (1, 2, 3)),
    (2, (4, 2, 3)),
    (3, (4, 5, 3)),
    (4, (1, 4, 5)),
    (5, (1, 2, 5)),
)

for fold, chunks in train_chunks:
    # Every fold is written to akt_data/final/ under the same naming scheme.
    train_name = 'akt_data/final/assist2009_pid_train' + str(fold) + '.csv'
    with open(train_name, 'w') as f:
        for chunk in chunks:
            with open('akt_data/akt_chunk' + str(chunk) + '.csv', 'r') as f2:
                for line in f2:
                    f.write(line)

#### Lastly we create the 5 testing and 5 validation sets.

In [None]:
# Build the test and validation file for each of the 5 folds.
# Fold k (k = i + 1) trains on chunks k, k+1, k+2 (wrapping around after 5),
# so the two chunks held out of training are k+3 (test) and k+4 (validation).
# Files are numbered by fold so that train{k}, test{k} and valid{k} belong
# together and share no students.
for i in range(5):
    fold = str(i + 1)
    test_chunk = str(((i + 3) % 5) + 1)
    valid_chunk = str(((i + 4) % 5) + 1)
    for split, chunk in (('test', test_chunk), ('valid', valid_chunk)):
        split_name = 'akt_data/final/assist2009_pid_' + split + fold + '.csv'
        # Copy the held-out chunk itself; the context managers close both
        # files once the copy is done.
        with open(split_name, 'w') as f:
            with open('akt_data/akt_chunk' + chunk + '.csv', 'r') as f2:
                for line in f2:
                    f.write(line)

test_auc         0.691943971912472
test_accuracy    0.6387380438136377
test_loss        0.6308175406280249

In [82]:
a = [7359,11345,11356,11425,12108,12055,12116,11867,11757,11743,11765,11749,11763,11746,11764,11751,11766,11752,11745,11748,11762,7143,7062,11755,11754,5927,6145,5985,5997,6121,5996,5957,5955,5966,6709,6464,6379,6379,7714,7723,7741,7713,7749,7781,7771,7735,7774,7737,7711,7780,7702,7776,7699,7729,7775,7764,7778,7761,12375,12400,12500,12461,12549,12426,12593,12561,12577,7747,7767,7700,7879,7882,7441,7515,7436,7516,7440,7466,7442,13839,13889,13890,13808,13798,13773,13774,13775,13757,7471,14050,10286,13871,13908,13909,13814,13763,13796,5614,5702,5639,5598,5652,5566,5559,5557,5465,11827,11824,11840,11823,11861,8943,9029,5535,11891,11993,11995,15072,15102,16572,16720,16505,16538,16626,15129,10815,10618,10759,10816,8250,8250,8344,8344,8345,8345,8253,8253,8298,8298,8299,8299,8342,8342,8377,8377,8378,8378,8343,8343,8413,8413,8414,8414,8252,8252,8363,8363,8364,8364,8360,8360,10773,10773,4508,4538,4491,4529,4576,4462,4512,4488,4498,4455,4511,4416,4563,4482,13142,13143,13144,13145,13146,13147,13148,13149,13087,13201,13202,13203,13204,13205,13206,13207,13200,13270,13271,13272,13273,13274,13275,13276,13068,13279,13280,13281,13282,13283,13284,13285,13209,13210,13211,13212,13213,13214,13215,13216,13093,13150,13151,13152,13153,13267,13268,13269,13154,13157,13158,13159,13160,13161,13162,13163,4324,4324,4876,8639,8654,14494,14501,15040,14262,14241,14585,15006,14280,14553,5263,5210,5179,4999,5137,5187,5082,5351,5295,4720,8640,8665,8627,8649,8644,8668,8652,8118,8050,8051,8110,9552,9500,9535,9523,9550,14799,14800,14801,14154,14470,14458,14422,14283,14150,14160,14354,14600,14626,14231,14899,14900,14901,14291,14722,14723,14724,14368,14822,14823,14824,14215,14906,14907,14908,14132,14133,14134,14135,14178,14726,14727,14728,14225,14226,14227,14228,14230,14325,14326,14327,14292,15086,15087,15088,14266,14502,14435,14179,14855,14856,14857,14280,5352,5301,5198,5128,5023,5058,5222,4822,4645,4823,14171,14401,15244,15171,15199,16721,16599,16510,16731,16606,16462,8305,8305,8307,8307,8350,8350,8351,83
51,8326,8326,8242,8242,14568,14576,15026,14988,14213,14720,15711,15701,15712,15687,15686,15842,15826,15827,15806,15822,15858,15903,15904,15905,15906,15897,5175,5159,5024,5191,4996,5139,4572,4531,4552,4577,4504,4518,4405,4439,13155,13260,13261,13262,13263,13264,13265,13266,13102,13103,13104,13105,13106,13107,13108,13109,13086,13293,13294,13295,13296,13297,13298,13299,13259,13300,13301,13302,13303,13304,13305,13306,13085,13286,13287,13288,13289,13290,13291,13292,13208,12868,12877,12878,12879,12886,12887,12888,12889,12885,12858,12949,12984,13169,13014,13339,4793,4632,4824,4821,4631,4657,4719,5209,5244,15258,15213,15147,4825,4766,4826,15919,15920,15921,15978,15988,15989,15990,15911,15887,15888,15889,15890,5245,8672,8660,8684,8648,9491,9612,9526,9626,9618,14281,14207,14393,14345,14316,14219,14173,14177,8254,8254,8318,8318,8381,8381,8382,8382,8301,8301,8246,8246,8249,8249,8354,8354,15685,15708,15695,15902,15979,15980,15981,15896,15940,15941,15942,15943,15922,15923,15924,15925,15892,15893,15894,15895,15928,15929,15930,15931,15948,15961,15962,15963,15927,13075,13314,13315,13316,13317,13318,13319,13320,13277,13224,13237,5121,5119,5194,5174,5120,4396,4396,4397,4397,4391,4391,4385,4385,4372,4372,4381,4381,4383,4383,8306,8306,8375,8375,8376,8376,8347,8347,8319,8319,8320,8320,8256,8256,8422,8422,8423,8423,13448,13407,13425,13412,13429,13390,13442,13402,13354,13387,13427,13430,13406,13432,13373,13566,13493,13587,13656,15991,15992,15993,15994,15898,15899,15900,15901,15974,15975,15976,15977,15907,15908,15909,15910,15967,15968,15969,15970,15926,15985,15986,15987,15957,15971,15972,15973,15915,2038,2104,1982,1979,2115,2055,2096,15936,1962,2092,1911,2018,2065,1975,1965,2029,2026,15937,15938,15939,15944,15935,1998,2029,2037,2037,1942,1942,1943,1943,1930,1930,2042,2042,2056,2056,2119,2119,1927,1927,1933,1933,2116,2116,1978,1978,1994,1994,1922,1922,1959,1959,1978,1978,1227,1227,1190,1190,1182,1182,961,985,510,558,472,466,553,540,1377,1377,1434,1434,1585,1585,1371,1371,1498,1498,1586,1586,
1501,1501,1467,1467,2288,2174,2322,2171,2187,1809,1809,8320,8320,8401,8401,8402,8402,8347,8347,8366,8366,8367,8367,8326,8326,2768,2781,2815,2774,2718,2573,11785,1810,1810,1524,1524,8445,8445,1743,1743,1638,1638,8435,8435,8405,8405,8305,8305,8318,8318,8242,8242,8393,8393,8394,8394,8307,8307,8387,8387,8388,8388,2783,2782,2767,12599,55,55,48,48,7,7,2,2,11,11,136,136,380,380,380,11791,11777,11794,11771,11767,11789,11787,11779,11796,11768,12604,12639,12620,12623,12585,12618,12628,12600,12621,12646,11246,11301,11302,11270,11303,11304,11252,7544,7988,7998,8003,181,181,8254,8254,8300,8300,8250,8250,8344,8344,8345,8345,49,49,69,69,70,70,75,75,27,27,87,87,58,58,291,291,291,305,305,305,177,177,315,315,440,484,546,2968,2959,2981,2996,2964,2994,490,500,518,161,161,161,228,228,228,155,155,361,361,361,268,268,268,159,159,297,297,297,371,371,360,360,377,377,381,381,338,338,338,330,330,200,200,200,313,313,313,8253,8253,8342,8342,8377,8377,8378,8378,8319,8319,8249,8249,8250,8250,8344,8344,8345,8345,8321,8321,8350,8350,8351,8351,11786,11769,11774,11781,12602,12610,11297,7619,7537,7556,7628,7629,7594,7566,7630,7535,7612,7984,8015,8007,8016,8017,7991,7938,7956,8012,8004,9098,8976,8986,8992,8981,8979,9095,8985,8978,9019,5643,5474,5444,5593,5525,8687,8686,8870,8127,8631,8700,8076,8176,303,303,8139,8639,8648,8651,8691,8626,8629,8658,8668,8630,8681,1546,1546,1659,1659,1535,1535,1529,1529,2748,2751,2761,2835,2762,2838,2784,2478,2478,2461,2461,2460,2460,2493,2493,2457,2457,374,374,374,10110,10113,10114,10125,10128,10103,10127,10126,8254,8254,277,277,2504,2504,2488,2488,2465,2465,2459,2459,2477,2477,8253,8253,8359,8359,8395,8395,8396,8396,8251,8251,8300,8300,8246,8246,8657,7596,7620,7582,7542,7602,2508,2508,2523,2523,8346,8346,8320,8320,9020,8977,9065,8704,302,302,359,359,8632,8072,8654,8644,8627,8649,8637,8659,8646,8664,8650,8652,10126,10105,3575,3540,4126,942,957,984,989,951,1006,936,1325,1325,1127,1127,1291,1291,1240,1240,1145,1145,1324,1324,1142,1142,67,67,25,25,56,56,53,53,61,61,59,59,50,
50,111,111,3985,4038,4089,3877,4026,2463,2463,3028,3068,1804,1804,10946,10939,4049,3892,3967,3974,3966,3889,3859,3960,3945,3894,4047,4061,3986,3911,3963,4023,3932,3862,3865,4086,3936,3875,3979,3908,3928,3929,4076,3964,3921,3866,3867,3956,4019,3925,3313,3564,3382,3382,3305,3305,3383,3383,3330,3330,3229,3229,12799,12782,12807,12786,12836,12790,12792,5130,5032,5273,5038,5016,5053,5044,5129,5006,5110,12854,12778,1726,1726,3067,3073,3119,3026,3078,1704,1704,3092,5076,5049,4983,5072,5066,5046,5105,5115,5073,5070,4987,4992,5007,5008,12809,465,461,110,110,22,22,100,100,67,67,23,23]

In [83]:
len(a)

1261