In [1]:
import json  # used by the JSON save/load cells; previously not imported explicitly
import os
import pandas as pd
from utils.metrics import get_model_metrics
from utils.data_loader import *
import pickle
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import random
from utils.utils import set_seed
set_seed(0)
import numpy as np
import math
from tqdm import tqdm_notebook

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Load data

In [2]:
# Training release: load_raw_data returns six objects, unpacked here in the order it yields them.
# `early` and `late` are used as DataFrames by the cells below.
train_data_dir = "data/csedm_2021/datashop/F19_Release_Train_06-28-21/Train/"
early,late,main_table,code_state,subject,metadata = load_raw_data(train_data_dir)

# Test release, loaded the same way; every name carries a `_test` suffix.
test_data_dir = "data/csedm_2021/datashop/F19_Release_Test_06-28-21/Test"
early_test,late_test,main_table_test,code_state_test,subject_test,metadata_test = load_raw_data(test_data_dir)

In [3]:
# Shuffle the rows of the test-release early table; the fixed random_state makes the order reproducible.
# NOTE(review): this rebinds `early_test`, so re-running the cell shuffles the already-shuffled frame.
early_test = early_test.sample(frac=1,random_state=0)

In [4]:
# Stack the early tables of both releases; the per-student features below are computed over this frame.
all_early = pd.concat([early,early_test])
all_early.shape

(14218, 6)

### Get basic information about the questions

In [5]:
# Spreadsheet of problem prompts with one indicator column per programming concept.
df_questions = pd.read_excel(
    "data/csedm_2021/datashop/2nd CSEDM Data Challenge - Problem Prompts & Concepts Used.xlsx")
# Blank cells become 0, so a concept counts as used only where the sheet holds a 1.
df_questions = df_questions.fillna(0)
# Concept names; a concept's position in this list is its integer id in concept_map.
concept_list = ['If/Else', 'NestedIf', 'While', 'For', 'NestedFor', 'Math+-*/', 'Math%', 'LogicAndNotOr', 'LogicCompareNum',
                'LogicBoolean', 'StringFormat', 'StringConcat', 'StringIndex', 'StringLen', 'StringEqual', 'CharEqual',
                'DefFunction', 'ArrayIndex']
concept_map = dict(zip(concept_list, range(len(concept_list))))
len(concept_list)

18

In [6]:
# Preview the first row of the question table.
df_questions.head(1)

Unnamed: 0,AssignmentID,ProblemID,Requirement,If/Else,NestedIf,While,For,NestedFor,Math+-*/,Math%,...,LogicCompareNum,LogicBoolean,StringFormat,StringConcat,StringIndex,StringLen,StringEqual,CharEqual,ArrayIndex,DefFunction
0,439.0,1.0,Write a function in Java that implements the f...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Build a per-problem lookup: the concepts a problem exercises, their integer ids,
# the prompt text and the assignment the problem belongs to.
# Concept columns are picked by name (every sheet column that appears in concept_map)
# instead of by a positional offset, so an extra or reordered leading column in the
# spreadsheet cannot shift the selection. Sheet column order is kept, so the stored
# lists come out in the same order as before.
concept_columns = [col for col in df_questions.columns if col in concept_map]
que_info_dict = {}
for _, row in df_questions.iterrows():
    flags = row[concept_columns]
    concepts = flags[flags == 1].index.tolist()
    que_info_dict[int(row['ProblemID'])] = {"concepts": concepts,
                                            "concepts_id": [concept_map[x] for x in concepts],
                                            "Requirement": row['Requirement'],
                                            "AssignmentID": row['AssignmentID']
                                            }

In [9]:
# Inspect the per-problem lookup.
# NOTE(review): this echoes the entire dict; displaying a few entries would keep the output short.
que_info_dict

{1: {'concepts': ['If/Else', 'Math+-*/', 'LogicAndNotOr', 'LogicCompareNum'],
  'concepts_id': [0, 5, 7, 8],
  'Requirement': 'Write a function in Java that implements the following logic: Given 2 ints, a and b, return their sum. However, sums in the range 10..19 inclusive, are forbidden, so in that case just return 20.',
  'AssignmentID': 439.0},
 3: {'concepts': ['If/Else',
   'NestedIf',
   'LogicAndNotOr',
   'LogicCompareNum',
   'LogicBoolean'],
  'concepts_id': [0, 1, 7, 8, 9],
  'Requirement': 'Write a function in Java that implements the following logic: Given a number n, return true if n is in the range 1..10, inclusive. Unless "outsideMode" is true, in which case return true if the number is less or equal to 1, or greater or equal to 10.',
  'AssignmentID': 439.0},
 5: {'concepts': ['If/Else', 'NestedIf', 'LogicBoolean'],
  'concepts_id': [0, 1, 9],
  'Requirement': 'Write a function in Java that implements the following logic: Your cell phone rings. Return true if you shoul

In [10]:
# Persist the per-problem lookup as JSON. JSON object keys are strings, so the integer
# ProblemID keys come back as str when this file is loaded again.
with open('data/csedm_2021/que_info_dict.json','w') as f:
    json.dump(que_info_dict,f)

In [11]:
# Persist the ordered concept names; list position is the concept id.
with open('data/csedm_2021/concept_list.json','w') as f:
    json.dump(concept_list,f)

### Get features

In [12]:
# Two disjoint groups of ProblemIDs (30 and 20 ids) plus their concatenation.
# NOTE(review): presumably `problem_ids` are the problems of the early tables and
# `test_problem_ids` the later problems whose outcome is predicted — confirm against the data.
problem_ids = [1,   3,   5,  12,  13,  17,  20,  21,  22,  24,  25,  28,  31,
               32,  33,  34,  36,  37,  38,  39,  40, 100, 101, 102, 128, 232,
               233, 234, 235, 236]
test_problem_ids = [41,  43,  44,  45,  46,  48,  49,  51,  56,  57,  64,  67,  70,
                    71, 104, 106, 107, 108, 112, 118]
all_problem_ids = problem_ids+test_problem_ids

In [13]:
# Reload both artefacts from disk so this section can run from the saved files.
# After the JSON round trip the keys of que_info_dict are strings, not ints.
# Context managers close the files deterministically (the bare open() calls leaked the handles).
with open('data/csedm_2021/que_info_dict.json') as f:
    que_info_dict = json.load(f)
with open('data/csedm_2021/concept_list.json') as f:
    concept_list = json.load(f)
len(concept_list)

18

In [14]:
def get_same_que_dict(que_info=None, train_ids=None, test_ids=None, verbose=True):
    """Map every problem id to the ids of the problems considered similar to it.

    Two distinct problems i and j are similar when
    ``len(concepts_i & concepts_j) + 1 >= min(len(concepts_i), len(concepts_j))``,
    i.e. the smaller concept set is shared except for at most one concept.

    NOTE(review): the original code tested two extra branches first
    (``n_i <= 2 and inter >= 1`` and ``n_i <= 3 and inter >= 2``). Both imply the
    condition above, so they never changed the result and are folded into it here.
    If they were meant to be stricter, exclusive thresholds for small concept sets,
    the rule needs revisiting; as written, a problem with a single concept (or none)
    is similar to every other problem.

    Args:
        que_info: mapping of problem id -> dict with a ``'concepts_id'`` list. Keys may
            be ``str`` (as after a JSON round trip) or the raw ids. Defaults to the
            notebook-level ``que_info_dict``.
        train_ids: ids of the first problem group. Defaults to ``problem_ids``.
        test_ids: ids of the second problem group. Defaults to ``test_problem_ids``.
        verbose: when true, print for each id in ``test_ids`` the similar problems
            that belong to ``train_ids`` and how many there are.

    Returns:
        dict: problem id -> list of similar problem ids, for every id in
        ``train_ids + test_ids`` (lists keep that order).

    Raises:
        KeyError: if an id has no entry in ``que_info``.
    """
    if que_info is None:
        que_info = que_info_dict
    if train_ids is None:
        train_ids = problem_ids
    if test_ids is None:
        test_ids = test_problem_ids
    all_ids = list(train_ids) + list(test_ids)

    def concept_set(pid):
        # Prefer the string key (JSON-loaded dict); fall back to the raw id.
        key = str(pid) if str(pid) in que_info else pid
        return set(que_info[key]['concepts_id'])

    # Build each concept set once instead of once per (i, j) pair.
    concept_sets = {pid: concept_set(pid) for pid in all_ids}

    same_que_dict = {}
    for i in all_ids:
        set_i = concept_sets[i]
        same_que_dict[i] = [
            j for j in all_ids
            if j != i
            and len(set_i & concept_sets[j]) + 1 >= min(len(set_i), len(concept_sets[j]))
        ]

    if verbose:
        train_set = set(train_ids)
        for que_id in test_ids:
            print(f"que_id is {que_id}")
            inter_ques = set(same_que_dict[que_id]) & train_set
            print(inter_ques)
            print(len(inter_ques))
            print('-------------')
    return same_que_dict

In [15]:
# Compute the similarity lookup; the call also prints, for every id in test_problem_ids,
# which ids from problem_ids are similar to it.
same_que_dict = get_same_que_dict()

que_id is 41
{36}
1
-------------
que_id is 43
{40, 235, 236}
3
-------------
que_id is 44
{1, 100, 233, 235, 236, 24, 25}
7
-------------
que_id is 45
{1, 101, 233, 235, 236, 17, 20, 21, 24, 25}
10
-------------
que_id is 46
{1, 40, 233, 235, 236, 17, 24, 25}
8
-------------
que_id is 48
{40, 235, 236}
3
-------------
que_id is 49
{1, 233, 235, 236, 24, 25}
6
-------------
que_id is 51
{235, 236}
2
-------------
que_id is 56
{1, 101, 233, 235, 236, 17, 20, 21, 24, 25}
10
-------------
que_id is 57
{128, 34, 36, 38, 39, 40, 31}
7
-------------
que_id is 64
{235, 236}
2
-------------
que_id is 67
{1, 101, 233, 235, 236, 17, 20, 21, 24, 25}
10
-------------
que_id is 70
{1, 40, 233, 235, 236, 17, 24, 25}
8
-------------
que_id is 71
{1, 233, 235, 236, 24, 25}
6
-------------
que_id is 104
{1, 233, 235, 236, 17, 24, 25}
7
-------------
que_id is 106
{1, 233, 235, 236, 17, 24, 25}
7
-------------
que_id is 107
{1, 101, 233, 235, 236, 17, 20, 21, 24, 25}
10
-------------
que_id is 108
{1, 4

#### Save question features

In [16]:
def _attempt_summary(rows):
    """Summarise a slice of attempt rows as [n_true, n_false, success_rate, mean_attempts].

    ``success_rate`` is the mean of ``Label`` rounded to 4 places and ``mean_attempts`` is
    the mean of ``Attempts`` rounded up. An empty slice yields the neutral values
    [0, 0, 0.5, 0] instead of NaN (which ``math.ceil`` cannot convert).
    """
    labels = rows['Label'].tolist()
    if not labels:
        return [0, 0, 0.5, 0]
    return [
        labels.count(True),
        labels.count(False),
        round(np.mean(labels), 4),
        math.ceil(rows['Attempts'].mean()),
    ]


# For every (student, problem) pair store eight features under the key "<SubjectID>_<ProblemID>":
# four over the student's rows for problems similar to the target, then the same four over
# all of the student's rows except the target problem itself.
stu_question_dict = {}
for subject_id, subject_group in all_early.groupby("SubjectID"):
    for que_id in all_problem_ids:
        # Leave out the problem being predicted so its own label cannot leak into the features.
        other_rows = subject_group[subject_group['ProblemID'] != que_id]
        # Rows for problems similar to the target.
        similar_rows = subject_group[subject_group['ProblemID'].isin(
            same_que_dict[que_id])]
        stu_question_dict[f"{subject_id}_{que_id}"] = (
            _attempt_summary(similar_rows) + _attempt_summary(other_rows)
        )

In [17]:
# Persist the per-(student, problem) feature lists as JSON.
with open('data/csedm_2021/stu_question_dict.json','w') as f:
    json.dump(stu_question_dict,f)

### Load features and attach them to the data

In [18]:
# Reload the saved features; the context manager closes the file (the bare open() leaked it).
with open('data/csedm_2021/stu_question_dict.json') as f:
    stu_question_dict = json.load(f)
# Column names for the eight values stored per key, in storage order:
# the first four describe similar problems, the last four all other problems.
feature_cols = ['same_success_num', 'same_fail_num', 'same_s_rate', 'same_avg_attempt',
                'success_num', 'fail_num', 's_rate', 'avg_attempt']

In [19]:
# Inspect the feature lookup.
# NOTE(review): this echoes the entire dict; displaying a few entries would keep the output short.
stu_question_dict

{'00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd4670206aa82503a575dc_1': [16,
  2,
  0.8889,
  3,
  25,
  4,
  0.8621,
  5],
 '00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd4670206aa82503a575dc_3': [13,
  2,
  0.8667,
  4,
  25,
  4,
  0.8621,
  5],
 '00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd4670206aa82503a575dc_5': [9,
  0,
  1.0,
  3,
  25,
  4,
  0.8621,
  5],
 '00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd4670206aa82503a575dc_12': [13,
  2,
  0.8667,
  4,
  25,
  4,
  0.8621,
  5],
 '00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd4670206aa82503a575dc_13': [14,
  2,
  0.875,
  3,
  25,
  4,
  0.8621,
  5],
 '00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd4670206aa82503a575dc_17': [15,
  2,
  0.8824,
  3,
  25,
  4,
  0.8621,
  5],
 '00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd4670206aa82503a575dc_20': [14,
  2,
  0.875,
  4,
  25,
  4,
  0.8621,
  5],
 '00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd4670206aa82503a575dc_21': [14,
  2,
  0.875,
  4,
  25,
  4,
  0.8621,
  5],
 '00358c94503a8d9e

In [20]:
def df_add_feature(df, feature_dict=None, columns=None):
    """Add the per-(student, problem) feature columns to ``df`` in place.

    Each row is looked up under the key ``"<SubjectID>_<ProblemID>"``; the i-th value of
    the stored list becomes the column named ``columns[i]``. Keys are built once for the
    whole frame rather than once per feature column.

    Args:
        df: DataFrame with ``SubjectID`` and ``ProblemID`` columns; modified in place.
        feature_dict: mapping key -> list of feature values. Defaults to the
            notebook-level ``stu_question_dict``.
        columns: names for the stored values, in storage order. Defaults to the
            notebook-level ``feature_cols``.

    Returns:
        None.

    Raises:
        KeyError: if a row's key is missing from ``feature_dict``.
    """
    if feature_dict is None:
        feature_dict = stu_question_dict
    if columns is None:
        columns = feature_cols
    keys = [f"{subject}_{problem}" for subject, problem in zip(df['SubjectID'], df['ProblemID'])]
    feature_rows = [feature_dict[key] for key in keys]
    for position, feature_name in enumerate(columns):
        df[feature_name] = [values[position] for values in feature_rows]

In [21]:
# Attach the feature columns to all four tables (each frame is modified in place).
for frame in (early, late, early_test, late_test):
    df_add_feature(frame)

In [22]:
# Preview the early table with the new feature columns.
early.head()

Unnamed: 0,SubjectID,AssignmentID,ProblemID,Attempts,CorrectEventually,Label,same_success_num,same_fail_num,same_s_rate,same_avg_attempt,success_num,fail_num,s_rate,avg_attempt
0,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439,1,1,True,True,17,1,0.9444,4,24,5,0.8276,6
1,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439,3,1,True,True,15,0,1.0,3,24,5,0.8276,6
2,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439,5,1,True,True,9,0,1.0,3,24,5,0.8276,6
3,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439,12,1,True,True,15,0,1.0,3,24,5,0.8276,6
4,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439,13,1,True,True,16,0,1.0,3,24,5,0.8276,6


### Encode

In [23]:
def get_lbe_dict(df, features):
    """Fit one LabelEncoder per listed column and return them keyed by column name.

    Each listed column of ``df`` is first converted to strings in place (the caller's
    frame is modified), and its encoder is fitted on the converted values.

    Args:
        df: DataFrame holding the columns to encode.
        features: iterable of column names.

    Returns:
        dict: column name -> fitted LabelEncoder.
    """
    encoders = {}
    for column in features:
        as_text = df[column].apply(str)
        df[column] = as_text
        encoders[column] = LabelEncoder().fit(df[column])
    return encoders

In [24]:
# Stack all four tables so the encoders fitted on this frame see every value that occurs in any of them.
df_merge = pd.concat([early,late,early_test,late_test])

In [25]:
# Columns treated as categorical: each gets its own LabelEncoder fitted on the merged frame.
# get_lbe_dict converts these columns of df_merge to strings in place.
sparse_features = ['SubjectID', 'ProblemID', 'AssignmentID', 'same_success_num',
                   'same_fail_num', 'same_avg_attempt', 'success_num', 'fail_num','avg_attempt']
lbe_dict = get_lbe_dict(df_merge, sparse_features)
len(sparse_features)

9

In [26]:
# Feature name -> field group ("user", "item", "concept", "same", "all").
# NOTE(review): nothing in the cells shown here reads this dict, and there is no "concept"
# column in the frames displayed below — presumably it is consumed by the model code; confirm.
field_info_map = {'SubjectID': "user",
                  'ProblemID': "item",
                  'AssignmentID': "concept",
                  "concept": "concept",
                  "same_success_num": "same",
                  "same_fail_num": "same",
                  "same_s_rate": "same",
                  "same_avg_attempt": "same",
                  "success_num": "all",
                  "fail_num": "all",
                  "s_rate": "all",
                  "avg_attempt": "all"
                  }
len(field_info_map)

12

In [27]:
# List the columns of the early table after the features were added.
early.columns

Index(['SubjectID', 'AssignmentID', 'ProblemID', 'Attempts',
       'CorrectEventually', 'Label', 'same_success_num', 'same_fail_num',
       'same_s_rate', 'same_avg_attempt', 'success_num', 'fail_num', 's_rate',
       'avg_attempt'],
      dtype='object')

In [28]:
# Seed used for the K-fold split and embedded in the output directory name.
random_state = 0

In [29]:
# Output directory, one per seed; created if it does not exist yet.
save_dir = f'data/csedm_2021/processed_seed-{random_state}'
os.makedirs(save_dir, exist_ok=True)

In [30]:
# Save the fitted encoders next to the processed folds.
with open(os.path.join(save_dir,'lbe_dict.pkl'),'wb') as f:
    pickle.dump(lbe_dict,f)

In [31]:
from sklearn.model_selection import KFold

In [32]:
# Five shuffled folds; the fixed random_state makes the split reproducible.
kf = KFold(n_splits=5,shuffle=True,random_state=random_state)

In [33]:
# Distinct students of the training-release early table; folds are split over students, not rows.
stu_ids = early['SubjectID'].unique()

In [34]:
# fold number -> [training student ids, held-out student ids].
# KFold yields positional indices into stu_ids; they are mapped back to the ids themselves.
split_info = {
    fold_no: [stu_ids[train_index], stu_ids[test_index]]
    for fold_no, (train_index, test_index) in enumerate(kf.split(stu_ids))
}

In [35]:
# Numeric columns that are copied to their "_encoded" counterpart unchanged (no label encoding).
dense_features = ["s_rate","same_s_rate"]

In [36]:
# Build and pickle [train, dev, test] for every fold recorded in split_info.
for fold in split_info:
    # Students held out for validation during training in this fold.
    test_stu_ids = split_info[fold][1]
    print(len(test_stu_ids))

    early_held_out = early['SubjectID'].isin(test_stu_ids)
    late_held_out = late['SubjectID'].isin(test_stu_ids)

    early_train = early[~early_held_out]
    late_train = late[~late_held_out]

    early_dev = early[early_held_out]
    late_dev = late[late_held_out]

    # Training data: all rows of the training students, plus the early rows of the
    # held-out students and of the test release. Only late rows are used for dev/test.
    train_data = pd.concat([early_train,late_train,early_dev,early_test])
    dev_data = late_dev.copy()
    test_data = late_test.copy()

    for frame in (train_data, dev_data, test_data):
        # Categorical columns go through the encoders fitted on the merged frame;
        # values are stringified the same way as when the encoders were fitted.
        for feat, encoder in lbe_dict.items():
            frame[feat+"_encoded"] = encoder.transform(frame[feat].apply(str))
        # Dense columns are passed through as they are.
        for feat in dense_features:
            frame[feat+"_encoded"] = frame[feat]

    data_list = [train_data, dev_data, test_data]
    with open(os.path.join(save_dir, f'data_list_{fold}.pkl'), 'wb') as f:
        pickle.dump(data_list, f)

74
74
73
73
73


In [37]:
# Shape of the dev frame left over from the last fold of the loop.
dev_data.shape

(1406, 23)

In [38]:
# Tag every row of the last fold's dev and train frames with a per-(student, problem) id.
# NOTE(review): SubjectID appears twice in the id; harmless for the overlap check that
# follows, but probably unintended.
for frame in (dev_data, train_data):
    frame['new_id'] = frame.apply(lambda x:f"{x['SubjectID']}_{x['SubjectID']}_{x['ProblemID']}",axis=1)

In [39]:
# Leakage check: ids present in both dev and train; an empty set means no (student, problem) row is shared.
set(dev_data['new_id']) & set(train_data['new_id'])

set()