In [1]:
import os
import json
import pickle
import torch
import numpy as np
import pandas as pd
from utils.utils import set_seed
from utils.data_loader import *
from torch.optim import Adam
from deepctr_torch.inputs import SparseFeat, DenseFeat,VarLenSparseFeat, get_feature_names
from deepctr_torch.models import *
from utils.task1_helps import *

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Directory that holds the test split.
test_data_dir = "../csedm_2021/data/csedm_2021/datashop/F19_Release_Test_06-28-21/Test"
# Read the raw tables; load_raw_data returns six objects, unpacked here in order.
(
    early_test,
    late_test,
    main_table_test,
    code_state_test,
    subject_test,
    metadata_test,
) = load_raw_data(test_data_dir)

In [None]:
# Directory with the saved task-1 artifacts: lbe_dict.pkl is read from it and
# it is passed to load_model_config for every model id.
# NOTE(review): this cell has no execution count, yet later cells depend on
# model_dir — run it before them on a fresh kernel.
model_dir = 'data/models/task1'

In [3]:
# Encode the test features with the encoders stored next to the models.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load lbe_dict.pkl from a trusted location.
with open(os.path.join(model_dir, 'lbe_dict.pkl'), 'rb') as lbe_file:
    # The context manager closes the file handle even if unpickling fails.
    lbe_dict = pickle.load(lbe_file)

# Work on a copy so the raw late_test frame stays untouched.
test_data = late_test.copy()
for feat, lbe in lbe_dict.items():
    # Values are stringified before being passed to the encoder's transform.
    test_data[feat + "_encoded"] = lbe.transform(test_data[feat].apply(str))

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
def infer_one_model(test_data, args, lbe_dict):
    """Score the test set with one saved model.

    Uses the module-level ``device`` for both model construction and
    checkpoint loading.

    Args:
        test_data: DataFrame of test rows. It is copied first, so the caller's
            frame is not modified. It must contain the SubjectID,
            AssignmentID and ProblemID columns selected at the end.
        args: Model configuration object. The attributes read here are
            sparse_emb_dim, model_name, seed, lr, model_path and batch_size.
        lbe_dict: Encoder mapping, forwarded unchanged to get_input_data.

    Returns:
        DataFrame with the columns SubjectID, AssignmentID, ProblemID and
        Label, where Label holds the output of ``model.predict``.
    """
    test_data = test_data.copy()
    config = {"dense_features": [],
              "sparse_features": ['SubjectID', 'ProblemID', 'AssignmentID'],
              'sparse_emb_dim': args.sparse_emb_dim}
    # Build the feature columns and the model input from the test frame.
    dnn_feature_columns, linear_feature_columns, test_model_input = get_input_data(
        test_data, config, lbe_dict)

    # NOTE(review): eval() executes whatever string is stored in
    # args.model_name; only use configs from a trusted source.
    model = eval(args.model_name)(linear_feature_columns, dnn_feature_columns,
                                  task='binary', device=device, seed=args.seed)
    model.compile(Adam(model.parameters(), args.lr),
                  'binary_crossentropy', metrics=['binary_crossentropy'])
    # map_location remaps the stored tensors onto the selected device, so a
    # checkpoint saved from a GPU still loads when only the CPU is available.
    model.load_state_dict(torch.load(args.model_path, map_location=device))

    # Predict and keep only the identifier columns plus the score.
    test_data['Label'] = model.predict(
        test_model_input, batch_size=args.batch_size)
    keep_cols = ['SubjectID', 'AssignmentID', 'ProblemID', 'Label']
    return test_data[keep_cols]

In [6]:
# Ids of the saved models whose predictions are averaged further down.
# Original note: "we use more than 3 models" — only three ids are listed here.
model_ids = [
    '9qqg9n7b',
    'k22nsoiv',
    'nq8jqdld',
]

In [7]:
# Score the test set once per model id and collect each model's Label values.
pred_list = []
for model_id in model_ids:
    # load_model_config is a project helper; presumably it returns the saved
    # run configuration for this id — confirm against its definition.
    args = load_model_config(model_id,model_dir)
    # df_submit is rebound on every pass; the frame from the LAST model is
    # reused by the later cells as the submission table.
    df_submit = infer_one_model(test_data,args,lbe_dict)
    pred_list.append(df_submit['Label'].values)

In [8]:
# Ensemble by plain averaging: element-wise mean across the per-model
# prediction vectors (axis 0 runs over the models).
y_pred = np.mean(np.array(pred_list), axis=0)

In [9]:
# df_submit is the frame returned for the last model in model_ids; its Label
# column is overwritten with the averaged prediction to form the submission.
df_submit['Label'] = y_pred

In [10]:
# Ensure the submission folder exists; exist_ok avoids an error on re-runs.
os.makedirs(name="data/submit/track1/", exist_ok=True)

In [11]:
# Write the submission table without the DataFrame index column.
df_submit.to_csv(path_or_buf='data/submit/track1/predictions.csv', index=False)

In [12]:
# Display the submission frame as a final visual check of the written values.
df_submit

Unnamed: 0,SubjectID,AssignmentID,ProblemID,Label
0,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,41,0.855945
1,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,43,0.883483
2,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,44,0.898083
3,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,46,0.875365
4,00358c94503a8d9e6869efc6e5cdb0e1c8e9eb39b1fd46...,494,49,0.875623
...,...,...,...,...
2360,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,64,0.221462
2361,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,70,0.125746
2362,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,71,0.205003
2363,fa7805c1d46ef49851de43750a665a993eef750b560159...,502,112,0.203444
