In [1]:
import sys

# Make the repository root (one level up) importable so that the `cles`
# package can be found. Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
from omegaconf import DictConfig, OmegaConf
import pandas as pd
import os
import numpy as np
import random
from transformers import AutoTokenizer, AutoModel
import torch
import yaml
import multiprocessing
import torch.nn as nn

In [3]:
from cles.predict import get_model
from cles.predict import get_dataloader
from cles.predict import predict

In [4]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus every CUDA device), and configures cuDNN so that it
    picks deterministic kernels and does not auto-tune between runs.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this assignment only
    # influences Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one; silently ignored when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN choose algorithms by timing them, which can
    # differ from run to run; disable it alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False

In [5]:
# Fix all RNG seeds up front so the rest of the notebook is repeatable.
seed_everything(seed=42)

In [6]:
# Root folders, relative to the directory this notebook is executed from.
base_path = '..'
base_data_path = '../data_raw'

# Directory handed to the tokenizer loader, and the backbone config.json
# that sits inside that same directory.
tokenizer_path = base_path + '/models_config/db_l_gru_pool/microsoft/deberta-v3-large'
config_backbone_path = tokenizer_path + '/config.json'

# Training YAML (as a path at this point) and the folder of per-fold weights.
train_config = base_path + '/config/db_large_gru_pooling.yaml'
model_weights_directory = base_path + '/model_weights/db_l_gru_pool'

# Raw CSV inputs.
prompts_path = base_data_path + '/prompts_train.csv'
summaries_path = base_data_path + '/summaries_train.csv'

In [7]:
# Parse the training YAML file.
# NOTE: `train_config` holds the file path when this cell starts and the
# parsed content when it finishes, so this cell can only be re-run after the
# path-definition cell has been run again.
with open(train_config, "r") as stream:
    try:
        train_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parser message, then stop: continuing would leave
        # `train_config` as a path string and fail later in a confusing way.
        print(exc)
        raise

In [8]:
# Wrap the parsed YAML content in an OmegaConf DictConfig.
train_config = DictConfig(content=train_config)
# Point the backbone section at the locally stored config.json.
train_config['backbone']['config_path'] = config_backbone_path

In [9]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Tokenizer loaded from the local model directory.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# Worker count passed to the dataloader factory: one per reported CPU.
num_workers = multiprocessing.cpu_count()

In [10]:
# Show which checkpoint files are present in the weights directory.
os.listdir(path=model_weights_directory)

['microsoft_deberta_v3_large_3b9047.pth',
 'microsoft_deberta_v3_large_39c16e.pth',
 'microsoft_deberta_v3_large_814d6b.pth',
 'microsoft_deberta_v3_large_ebad26.pth']

In [11]:
# Accumulator for one prediction DataFrame per fold.
oof_preds = list()

In [12]:
# Start from an empty accumulator so that re-running this cell replaces the
# previous results instead of appending a second copy of every fold.
oof_preds = []

# For each fold id: read that fold's test.csv, load the matching checkpoint,
# run prediction, and keep the targets next to the predictions.
for fold in ["3b9047", "39c16e", "814d6b", "ebad26"]:
    df = pd.read_csv(f'../data/{fold}/test.csv')
    model_w_file = f'{model_weights_directory}/microsoft_deberta_v3_large_{fold}.pth'

    dataloader = get_dataloader(dataframe=df, num_workers=num_workers, tokenizer=tokenizer, cfg=train_config)
    model = get_model(train_config)
    # Load the checkpoint onto the CPU first so it also works on machines
    # without the device it was saved from; the model is moved to `device`
    # right after the weights are in place.
    state_dict = torch.load(model_w_file, map_location='cpu')
    model.load_state_dict(state_dict)
    del state_dict
    model.to(device)
    model.eval()
    predictions = predict(model, dataloader, device)

    # Drop the references and release cached GPU memory before the next fold.
    del model
    del dataloader
    torch.cuda.empty_cache()

    # assumes `predictions` has one row per input row and two columns in
    # (content, wording) order — TODO confirm against cles.predict.predict
    df.loc[:, ['pred_content', 'pred_wording']] = predictions
    df = df[['prompt_id', 'student_id', 'content', 'wording', 'pred_content', 'pred_wording']].copy()
    oof_preds.append(df)

100%|██████████| 503/503 [00:30<00:00, 16.36it/s]
100%|██████████| 515/515 [00:23<00:00, 21.99it/s]
100%|██████████| 276/276 [00:13<00:00, 20.04it/s]
100%|██████████| 499/499 [00:27<00:00, 18.11it/s]


In [13]:
from cles.metrics.metric import mcrmse

In [14]:
# Stack the per-fold frames row-wise and renumber the index from zero.
oof_df = pd.concat(objs=oof_preds, axis='index', ignore_index=True)

In [15]:
# Display the combined frame of targets and predictions for a visual check.
oof_df

Unnamed: 0,prompt_id,student_id,content,wording,pred_content,pred_wording
0,3b9047,004e978e639e,3.128928,4.231226,2.457785,2.490094
1,3b9047,005ab0199905,-0.210614,-0.471415,-1.089983,-1.100757
2,3b9047,0072b649a88c,0.205683,0.380538,-0.049564,0.565939
3,3b9047,0089072f5fea,-1.547163,-1.461245,-1.453158,-1.505344
4,3b9047,008efab52a76,-0.066112,-0.715083,0.108171,-0.263506
...,...,...,...,...,...,...
7160,ebad26,ff37545b2805,1.520355,-0.292990,1.510806,-0.091507
7161,ebad26,ff4ed38ef099,-1.204574,-1.169784,-0.847583,-1.228878
7162,ebad26,ff53b94f7ce0,0.328739,-1.053294,0.596798,-0.291178
7163,ebad26,ff7c7e70df07,0.205683,0.380538,-0.193301,-0.214642


In [16]:
# Ground-truth scores and model predictions as plain arrays, in matching
# (content, wording) column order.
y_true = oof_df[['content', 'wording']].values
y_pred = oof_df[['pred_content', 'pred_wording']].values

# The displayed result is a pair: one overall value followed by a list with
# one value per target column.
mcrmse(y_true, y_pred)

(0.5392426184601709, [0.45534196020864137, 0.6231432767117003])

In [17]:
# Persist the combined predictions. The output folder is created first so
# that a missing directory does not throw away the finished inference run.
oof_csv_path = '../oof_predictions/microsoft_deberta_v3_large_gru_pool.csv'
os.makedirs(os.path.dirname(oof_csv_path), exist_ok=True)
oof_df.to_csv(oof_csv_path, index=False)