In [1]:
import sys
sys.path.append('..')

In [2]:
from omegaconf import DictConfig, OmegaConf
import pandas as pd
import os
import numpy as np
import random
from transformers import AutoTokenizer, AutoModel
import torch
import yaml
import multiprocessing
import torch.nn as nn

In [3]:
from cles.predict import get_model
from cles.predict import get_dataloader
from cles.predict import predict

In [4]:
def seed_everything(seed=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # Python processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one; this is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuned kernel selection may pick different algorithms between runs,
    # so pin it off explicitly in case something enabled it earlier.
    torch.backends.cudnn.benchmark = False

In [5]:
# Fix all RNG seeds before any model or dataloader is created.
seed_everything(seed=42)

In [6]:
# Root locations; both are relative to the current working directory.
base_path = '..'
base_data_path = '../data_raw'

# Input CSV files.
prompts_path = f'{base_data_path}/prompts_train.csv'
summaries_path = f'{base_data_path}/summaries_train.csv'

# Backbone assets: the tokenizer directory also holds the backbone config.json.
tokenizer_path = f'{base_path}/models_config/db_l_conc_pool/microsoft/deberta-v3-large'
config_backbone_path = f'{tokenizer_path}/config.json'

# Training configuration (YAML path at this point; the name is rebound to the
# parsed config by the cells that follow) and the directory of checkpoints.
train_config = f'{base_path}/config/db_large_conc_pool.yaml'
model_weights_directory = f'{base_path}/model_weights/db_l_conc_pool'

In [7]:
# Parse the training YAML. `train_config` holds the file path on entry and is
# rebound to the parsed content on success.
with open(train_config, "r") as stream:
    try:
        train_config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Fail loudly: if the error were only printed, `train_config` would
        # still be the path string and the following cells would break with
        # an unrelated-looking error.
        raise RuntimeError(
            f"Could not parse training config {stream.name!r}"
        ) from exc

In [8]:
# Wrap the parsed YAML content in an OmegaConf DictConfig.
train_config = DictConfig(train_config)

# Override the backbone's config path with the local copy defined above.
backbone_section = train_config['backbone']
backbone_section['config_path'] = config_backbone_path

In [9]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer is loaded from the local backbone directory.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Worker count handed to get_dataloader: one per CPU reported by the OS.
num_workers = multiprocessing.cpu_count()

In [10]:
# Read both input tables; they are joined on `prompt_id` in the next cell.
prompts = pd.read_csv(prompts_path)
summaries = pd.read_csv(summaries_path)

In [11]:
# Left join keeps every prompt row and attaches its summaries by `prompt_id`.
df = pd.merge(prompts, summaries, how='left', on='prompt_id')

In [12]:
# Show what is in the weights directory that the ensemble loop below reads from.
# Note: os.listdir returns entries in arbitrary order.
os.listdir(model_weights_directory)

['microsoft_deberta_v3_large_3b9047.pth',
 'microsoft_deberta_v3_large_39c16e.pth',
 'microsoft_deberta_v3_large_814d6b.pth',
 'microsoft_deberta_v3_large_ebad26.pth']

In [13]:
# Ensemble inference: run each checkpoint found in the weights directory over
# the same dataframe and collect one set of predictions per checkpoint.
#
# Only *.pth entries are used so stray files or folders (for example
# .ipynb_checkpoints) are never handed to torch.load, and the names are sorted
# because os.listdir returns entries in arbitrary order.
checkpoint_files = sorted(
    name for name in os.listdir(model_weights_directory) if name.endswith('.pth')
)
if not checkpoint_files:
    # Without this guard the averaging step would silently produce NaN.
    raise FileNotFoundError(f'No .pth checkpoints found in {model_weights_directory}')

preds = []
for model_w_file in checkpoint_files:
    # NOTE(review): the dataloader is rebuilt for every checkpoint, as in the
    # original; it looks loop-invariant, so confirm get_dataloader is stateless
    # before hoisting it out of the loop.
    dataloader = get_dataloader(dataframe=df, num_workers=num_workers, tokenizer=tokenizer, cfg=train_config)
    model = get_model(train_config)
    # Deserialize onto the CPU first so checkpoints saved from a GPU also load
    # on CPU-only machines; model.to(device) then moves the weights.
    state_dict = torch.load(f'{model_weights_directory}/{model_w_file}', map_location='cpu')
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    predictions = predict(model, dataloader, device)
    preds.append(predictions)
    # Drop this checkpoint and release cached GPU memory before loading the next.
    del model, state_dict
    torch.cuda.empty_cache()

100%|██████████| 1792/1792 [01:46<00:00, 16.75it/s]
100%|██████████| 1792/1792 [01:48<00:00, 16.58it/s]
100%|██████████| 1792/1792 [01:48<00:00, 16.55it/s]
100%|██████████| 1792/1792 [01:48<00:00, 16.58it/s]


In [14]:
# Average element-wise across checkpoints (axis 0 indexes the entries of `preds`).
predictions_agg = np.asanyarray(preds).mean(axis=0)

In [15]:
# Write the averaged predictions into the two target columns, row by position.
# NOTE(review): assumes predictions_agg has one row per df row and two
# columns ordered (content, wording) — confirm against cles.predict.
target_columns = ['content', 'wording']
df.loc[:, target_columns] = predictions_agg

In [16]:
# Preview the first five rows of the columns that go into the submission.
preview_columns = ['student_id', 'content', 'wording']
df[preview_columns].head()

Unnamed: 0,student_id,content,wording
0,00791789cc1f,0.054285,-0.259413
1,0086ef22de8f,-0.946032,-1.054013
2,0094589c7a22,-0.024926,-0.538148
3,00cd5736026a,0.221033,0.005303
4,00d98b8ff756,-0.465475,-0.529504


In [17]:
# Persist the id and the two predicted columns, without the index column.
submission = df[['student_id', 'content', 'wording']]
submission.to_csv('submission.csv', index=False)