In [1]:
import pandas as pd;
pdf_test = pd.read_csv('./input/prompts_train.csv')
sdf_test = pd.read_csv('./input/summaries_train.csv')
df_test = pdf_test.merge(sdf_test, on ='prompt_id')

In [2]:
df_test.head(3)

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181


In [3]:
df_test

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.594710
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886
...,...,...,...,...,...,...,...,...
7160,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff37545b2805,"In paragraph two, they would use pickle meat a...",1.520355,-0.292990
7161,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff4ed38ef099,"in the first paragraph it says ""either can it...",-1.204574,-1.169784
7162,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff53b94f7ce0,They would have piles of filthy meat on the fl...,0.328739,-1.053294
7163,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff7c7e70df07,They used all sorts of chemical concoctions to...,0.205683,0.380538


In [4]:
import os
import logging
import warnings
from dataclasses import dataclass, field
from typing import Optional
import torch

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    set_seed,
    Trainer,
    TrainingArguments,
    HfArgumentParser,
    DataCollatorWithPadding,
)
from datasets import Dataset, disable_progress_bar
import pandas as pd
import numpy as np

warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['WANDB_PROJECT'] = 'kaggle-commonlit-eval-student-summaries'

disable_progress_bar()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

@dataclass
class Config:
    model_name_or_path: Optional[str] = field(
        default="microsoft/deberta-v3-base",
        metadata={"help": "Model name or path"},
    )

    data_dir: Optional[str] = field(
        default="/kaggle/input/commonlit-evaluate-student-summaries",
        metadata={"help": "Data directory"},
    )

    max_seq_length: Optional[int] = field(
        default=1600,
        metadata={"help": "Max sequence length"},
    )

    add_prompt_question: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt question into input"},
    )

    add_prompt_text: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt text into input"},
    )

    fold: Optional[int] = field(
        default=0,
        metadata={"help": "Fold"},
    )

    num_proc: Optional[int] = field(
        default=4,
        metadata={"help": "Number of processes"},
    )

    dropout: Optional[float] = field(
        default=0.,
        metadata={"help": "Amount of dropout to apply"},
    )
    max_position_embeddings: Optional[int] = field(
        default=1600,
        metadata={"help": "Amount of dropout to apply"},
    )


def tokenize(example, tokenizer, config):
    sep = tokenizer.sep_token

    # if config.add_prompt_question:
    #     text = sep.join(
    #         [example["prompt_question"], example["prompt_text"], example["text"]]
    #     )
    # elif config.add_prompt_text:
    #     text = sep.join([example["prompt_text"], example["text"]])
    # else:
    #     text = example["text"]
    prompt = sep.join([example["prompt_title"], example["prompt_text"], example["prompt_question"]])
    labels = [example["content"], example["wording"]]

    tokenized = tokenizer(
        prompt,
        example["text"],
        padding=False,
        truncation=False,
        max_length=config.max_seq_length,
    )

    return {
        **tokenized,
        "labels": labels,
    }

def tokenize_inf(example, tokenizer, config):
    sep = tokenizer.sep_token;
    prompt = sep.join([example["prompt_title"], example["prompt_text"], example["prompt_question"]])
    tokenized = tokenizer(
        prompt,
        example["text"],
        padding=False,
        truncation=False,
        max_length=config.max_seq_length,
    )

    return {
        **tokenized
    }


def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }


In [5]:
model_name_or_path = './output_fold3_seed42_2108'

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model_config = AutoConfig.from_pretrained(model_name_or_path)

model_config.update({
    "hidden_dropout_prob": 0,
    "attention_probs_dropout_prob": 0,
    "num_labels": 2,
    "problem_type": "regression",
    "max_position_embeddings":1600,
})

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=16,
)
# Do not use pretrained model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path, config=model_config
)
model.to(device)

config = Config()

In [15]:
config.max_seq_length

1600

In [6]:
import torch
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df;
        self.tokenizer = tokenizer;
        self.max_len = config.max_seq_length;
    
    def __len__(self):
        return len(self.df);
    
    def __getitem__(self, index):
        data_row = self.df.iloc[index, :]
        #print(data_row)
        text_input = self.tokenizer.sep_token.join([data_row["prompt_title"], data_row["prompt_text"], data_row["prompt_question"]])
        text_input_paired = data_row['text']
        inputs = self.tokenizer(
            text_input_paired,
            text_input,
            padding=False,
            truncation=False,
            max_length=self.max_len,
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        
        return {
            'input_ids': torch.tensor(ids, dtype = torch.long),
            'attention_mask': torch.tensor(mask, dtype = torch.long)
        }


In [22]:
import numpy as np
from tqdm import tqdm
@torch.no_grad()

def test_run(model, loader):
    model.eval()
    preds = []
    for idx, data in tqdm(enumerate(loader), total = len(loader)):
        ids = data['input_ids'].to(device, dtype = torch.long);
        mask = data['attention_mask'].to(device, dtype = torch.long);
        y_preds = model(ids, mask);
        preds.append(y_preds[0].to('cpu').numpy());
        del ids, mask
    torch.cuda.empty_cache()
    predictions = np.concatenate(preds)
    
    return predictions
        

In [23]:
preds = test_run(model, test_loader)
preds

100%|██████████| 3583/3583 [16:05<00:00,  3.71it/s]


array([[ 0.12350778, -0.23918925],
       [-0.9611728 , -1.0675625 ],
       [ 0.14993002, -0.7006547 ],
       ...,
       [ 0.5615608 , -0.11869518],
       [-0.17722498, -0.08349483],
       [ 0.8947087 ,  0.68251175]], dtype=float32)

In [21]:
preds[0].to('cpu').numpy()

array([[ 0.12350778, -0.23918925],
       [-0.9611728 , -1.0675625 ]], dtype=float32)

In [15]:
test_ds = Dataset.from_pandas(df_test)
tokenized_test_ds = test_ds.map(tokenize_inf, batched=False, num_proc=4, fn_kwargs={"tokenizer": tokenizer, "config": config})

In [16]:
from transformers import TrainingArguments
test_args = TrainingArguments(
    output_dir = './',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 1,   
    dataloader_drop_last = False,
    eval_accumulation_steps=1,
)

# init trainer
trainer = Trainer(
    model = model, 
    args = test_args,
    data_collator = data_collator,
    tokenizer = tokenizer
)

test_results = trainer.predict(tokenized_test_ds)

In [18]:
test_results

PredictionOutput(predictions=array([[-0.33315864, -0.40877536],
       [-0.34470144, -0.41816443],
       [-0.32910156, -0.41634947],
       [-0.34233406, -0.42136148]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.2444, 'test_samples_per_second': 16.363, 'test_steps_per_second': 16.363})

In [24]:
# submission
submission_df = pd.DataFrame()
submission_df['student_id'] = df_test['student_id']
submission_df['content'] = test_results[0][:, 0]
submission_df['wording'] = test_results[0][:, 1]
submission_df.to_csv('submission.csv')