In [1]:
import pandas as pd

# Read the prompt table and the summary table, then join them on `prompt_id`
# so each summary row carries the columns of its prompt.
pdf_test = pd.read_csv('./input/prompts_test.csv')
sdf_test = pd.read_csv('./input/summaries_test.csv')
df_test = pd.merge(pdf_test, sdf_test, on='prompt_id')

In [7]:
import os
import logging
import warnings
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    set_seed,
    Trainer,
    TrainingArguments,
    HfArgumentParser,
    DataCollatorWithPadding,
)
from datasets import Dataset, disable_progress_bar
import pandas as pd
import numpy as np

# Quiet the run: ignore Python warnings and disable log records at ERROR
# severity and below.
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)

# Environment switches read by the tokenizers / TensorFlow / Weights & Biases
# integrations.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "TF_CPP_MIN_LOG_LEVEL": "3",
        "WANDB_PROJECT": "kaggle-commonlit-eval-student-summaries",
    }
)

# Turn off the `datasets` progress bars.
disable_progress_bar()


def tokenize(example, add_prompt_question, add_prompt_text, tokenizer=None):
    """
    Tokenize the space-joined prompt question, prompt text and summary text,
    without padding or truncation, to see how long the combined input is.

    NOTE(review): a second ``tokenize`` with a different signature is defined
    later in this same cell and rebinds the name, so this version is only
    reachable before that definition runs.

    Args:
        example: Mapping with a "text" entry and, when the matching flag is
            set, "prompt_question" / "prompt_text" entries (all strings).
        add_prompt_question: If truthy, put ``example["prompt_question"]``
            first in the joined string.
        add_prompt_text: If truthy, put ``example["prompt_text"]`` before the
            summary text.
        tokenizer: Callable used to encode the joined string. When omitted,
            the notebook-level name ``tok`` is used, as before.

    Returns:
        Whatever the tokenizer returns for the joined string.

    Raises:
        NameError: If ``tokenizer`` is omitted and no global ``tok`` exists.
    """
    columns = []
    if add_prompt_question:
        columns.append("prompt_question")
    if add_prompt_text:
        columns.append("prompt_text")
    columns.append("text")

    # Previously the body always relied on a global `tok` that this notebook
    # never defines; accept the tokenizer explicitly and keep `tok` as the
    # backward-compatible fallback.
    encode = tok if tokenizer is None else tokenizer

    return encode(
        " ".join(example[name] for name in columns),
        padding=False,
        truncation=False,
    )


@dataclass
class Config:
    """Settings for the notebook, one dataclass field per option.

    Every field has a default and a ``help`` string stored in its field
    metadata. In the code visible here only ``max_seq_length`` is read (by the
    tokenize helpers); the remaining fields are declared but their consumers
    are not shown.
    """

    # Hub id or local path of the model.
    model_name_or_path: Optional[str] = field(
        default="microsoft/deberta-v3-base",
        metadata={"help": "Model name or path"},
    )

    # Directory containing the competition CSV files.
    data_dir: Optional[str] = field(
        default="/kaggle/input/commonlit-evaluate-student-summaries",
        metadata={"help": "Data directory"},
    )

    # Passed to the tokenizer as `max_length`.
    max_seq_length: Optional[int] = field(
        default=1600,
        metadata={"help": "Max sequence length"},
    )

    add_prompt_question: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt question into input"},
    )

    add_prompt_text: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt text into input"},
    )

    fold: Optional[int] = field(
        default=0,
        metadata={"help": "Fold"},
    )

    num_proc: Optional[int] = field(
        default=4,
        metadata={"help": "Number of processes"},
    )

    dropout: Optional[float] = field(
        default=0.,
        metadata={"help": "Amount of dropout to apply"},
    )

    # The help text here used to be a copy of the dropout help string.
    max_position_embeddings: Optional[int] = field(
        default=1600,
        metadata={"help": "Max position embeddings of the model"},
    )


def tokenize(example, tokenizer, config):
    """Encode one labelled row as a (prompt, summary) pair and attach targets.

    The first segment is the prompt title, prompt text and prompt question
    joined with the tokenizer's separator token; the second segment is the
    student's summary text. ``config.add_prompt_question`` and
    ``config.add_prompt_text`` are not consulted: all three prompt fields are
    always included.

    Args:
        example: Mapping with "prompt_title", "prompt_text",
            "prompt_question", "text", "content" and "wording" entries.
        tokenizer: Callable tokenizer exposing ``sep_token``.
        config: Object providing ``max_seq_length``.

    Returns:
        dict: the tokenizer's output fields plus ``"labels"``, a two-item list
        ``[content, wording]``.
    """
    separator = tokenizer.sep_token

    prompt_parts = (
        example["prompt_title"],
        example["prompt_text"],
        example["prompt_question"],
    )
    first_segment = separator.join(prompt_parts)
    targets = [example["content"], example["wording"]]

    # NOTE(review): truncation is switched off, so `max_length` presumably has
    # no effect on the encoding — confirm against the tokenizer documentation.
    encoded = tokenizer(
        first_segment,
        example["text"],
        padding=False,
        truncation=False,
        max_length=config.max_seq_length,
    )

    return {**encoded, "labels": targets}

def tokenize_inf(example, tokenizer, config):
    """Encode one unlabelled row as a (prompt, summary) pair for inference.

    The first segment is the prompt title, prompt text and prompt question
    joined with the tokenizer's separator token; the second segment is the
    student's summary text. No labels are attached.

    Args:
        example: Mapping with "prompt_title", "prompt_text",
            "prompt_question" and "text" entries.
        tokenizer: Callable tokenizer exposing ``sep_token``.
        config: Object providing ``max_seq_length``.

    Returns:
        dict: a plain-dict copy of the tokenizer's output fields.
    """
    separator = tokenizer.sep_token
    prompt_parts = (
        example["prompt_title"],
        example["prompt_text"],
        example["prompt_question"],
    )
    first_segment = separator.join(prompt_parts)

    # NOTE(review): truncation is switched off, so `max_length` presumably has
    # no effect on the encoding — confirm against the tokenizer documentation.
    encoded = tokenizer(
        first_segment,
        example["text"],
        padding=False,
        truncation=False,
        max_length=config.max_seq_length,
    )

    return {**encoded}


def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (MCRMSE).

    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Args:
        eval_pred: Two-item iterable ``(predictions, labels)``; the RMSE is
            taken over axis 0, and columns 0 and 1 are reported as content
            and wording respectively.

    Returns:
        dict with "content_rmse", "wording_rmse" and their mean "mcrmse".
    """
    predictions, targets = eval_pred

    squared_error = (predictions - targets) ** 2
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=0))

    metrics = {
        "content_rmse": per_column_rmse[0],
        "wording_rmse": per_column_rmse[1],
    }
    metrics["mcrmse"] = np.mean(per_column_rmse)
    return metrics


In [8]:
# Directory from which the tokenizer, model config and model weights are loaded.
model_name_or_path = './output_fold3_seed42_1908'

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model_config = AutoConfig.from_pretrained(model_name_or_path)

# Override the stored config: no dropout, two regression outputs, and 1600
# position embeddings.
model_config.update(
    dict(
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        num_labels=2,
        problem_type="regression",
        max_position_embeddings=1600,
    )
)

# Pads each batch dynamically, rounding the padded length up to a multiple of 16.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=16,
)

# Load the model weights from the same directory, using the overridden config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    config=model_config,
)

config = Config()

In [9]:
# NOTE(review): in the visible notebook `test_ds` is first assigned in the
# following cell (In [10]); run top-to-bottom on a fresh kernel, this cell
# would raise NameError — confirm and reorder if so.
test_ds

Dataset({
    features: ['prompt_id', 'prompt_question', 'prompt_title', 'prompt_text', 'student_id', 'text'],
    num_rows: 4
})

In [10]:
# Wrap the merged test frame as a Dataset, then tokenize it row by row
# (batched=False) with `tokenize_inf`.
test_ds = Dataset.from_pandas(df_test)
tokenized_test_ds = test_ds.map(
    tokenize_inf,
    batched=False,
    num_proc=4,
    fn_kwargs=dict(tokenizer=tokenizer, config=config),
)

In [11]:
from transformers import TrainingArguments
# Prediction-only arguments: no training, one example per device per step,
# and keep the final partial batch.
test_args = TrainingArguments(
    output_dir='./',
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=1,
    dataloader_drop_last=False,
    eval_accumulation_steps=1,
)

# The Trainer is used here only as an inference harness around the model.
trainer = Trainer(
    model=model,
    args=test_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

test_results = trainer.predict(tokenized_test_ds)

In [24]:
# submission
# Prediction column 0 is written as `content` and column 1 as `wording`,
# row-aligned with `df_test`.
predictions = test_results.predictions
submission_df = pd.DataFrame(
    {
        "student_id": df_test["student_id"],
        "content": predictions[:, 0],
        "wording": predictions[:, 1],
    }
)
# index=False: without it the DataFrame index is written as an extra unnamed
# first column, so the file would not contain just student_id/content/wording.
submission_df.to_csv('submission.csv', index=False)