### Generate new training data

In [1]:
import os
import logging
import warnings
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    set_seed,
    Trainer,
    TrainingArguments,
    HfArgumentParser,
    DataCollatorWithPadding,
)
from datasets import Dataset, disable_progress_bar
import datasets
import pandas as pd
import numpy as np

warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['WANDB_PROJECT'] = 'kaggle-commonlit-eval-student-summaries'

disable_progress_bar()


def tokenize(example, add_prompt_question, add_prompt_text):
    """
    Tokenize one example with no padding and no truncation, so the full
    token length of the combined input can be inspected.

    The input string is built by joining, with a single space, the
    optional "prompt_question" and "prompt_text" fields (in that order,
    each only when its flag is truthy) followed by the "text" field.

    Relies on a module-level tokenizer named `tok`.

    NOTE(review): a second `tokenize(example, tokenizer, config)` is defined
    further down in this notebook and replaces this one once that cell runs.
    """

    optional_columns = (
        ("prompt_question", add_prompt_question),
        ("prompt_text", add_prompt_text),
    )

    selected = [name for name, wanted in optional_columns if wanted]
    selected.append("text")

    joined = " ".join(example[name] for name in selected)

    return tok(joined, padding=False, truncation=False)


@dataclass
class Config:
    """
    Run settings for this notebook.

    Every field carries a human-readable ``help`` string in its dataclass
    metadata. NOTE(review): presumably meant to be parsed with
    ``HfArgumentParser`` (imported at the top of the notebook) -- confirm.
    """

    # Hugging Face model identifier or local checkpoint path.
    model_name_or_path: Optional[str] = field(
        default="microsoft/deberta-v3-base",
        metadata={"help": "Model name or path"},
    )

    # Directory holding the competition data files.
    data_dir: Optional[str] = field(
        default="/kaggle/input/commonlit-evaluate-student-summaries",
        metadata={"help": "Data directory"},
    )

    # Passed as `max_length` to the tokenizer by tokenize / tokenize_inf.
    max_seq_length: Optional[int] = field(
        default=1600,
        metadata={"help": "Max sequence length"},
    )

    # NOTE(review): the tokenize(example, tokenizer, config) defined in this
    # notebook does not consult the two add_prompt_* flags.
    add_prompt_question: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt question into input"},
    )

    add_prompt_text: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt text into input"},
    )

    fold: Optional[int] = field(
        default=0,
        metadata={"help": "Fold"},
    )

    num_proc: Optional[int] = field(
        default=4,
        metadata={"help": "Number of processes"},
    )

    dropout: Optional[float] = field(
        default=0.,
        metadata={"help": "Amount of dropout to apply"},
    )

    # The help text here used to be a copy of the dropout help string.
    # NOTE(review): presumably overrides the model config's
    # max_position_embeddings -- confirm where this is consumed.
    max_position_embeddings: Optional[int] = field(
        default=1600,
        metadata={"help": "Max position embeddings"},
    )


def tokenize(example, tokenizer, config):
    """
    Encode one training example as a two-segment input and attach its labels.

    The first segment is the prompt title, prompt text and prompt question
    joined with the tokenizer's separator token; the second segment is the
    example's "text" field. The labels are ``[content, wording]``.

    Args:
        example: mapping with "prompt_title", "prompt_text",
            "prompt_question", "text", "content" and "wording" entries.
        tokenizer: callable exposing ``sep_token``.
        config: object exposing ``max_seq_length``.

    Returns:
        A dict with every entry produced by the tokenizer plus "labels".

    Notes:
        ``config.add_prompt_question`` / ``config.add_prompt_text`` are not
        consulted; the full prompt is always included.
        NOTE(review): ``max_length`` is passed while ``truncation=False`` --
        confirm whether truncation was meant to be enabled.
    """
    prompt_fields = ("prompt_title", "prompt_text", "prompt_question")
    prompt = tokenizer.sep_token.join([example[name] for name in prompt_fields])
    targets = [example["content"], example["wording"]]

    encoded = tokenizer(
        prompt,
        example["text"],
        padding=False,
        truncation=False,
        max_length=config.max_seq_length,
    )

    features = dict(encoded)
    features["labels"] = targets
    return features

def tokenize_inf(example, tokenizer, config):
    """
    Encode one unlabeled example as a two-segment input for inference.

    The first segment is the example's "summary" field and the second is
    its "input_text" field. No padding or truncation is applied.

    Args:
        example: mapping with "summary" and "input_text" entries.
        tokenizer: callable tokenizer.
        config: object exposing ``max_seq_length``.

    Returns:
        A plain dict copy of the tokenizer's output.

    NOTE(review): the segment order here is (summary, input_text), whereas
    the training-time ``tokenize`` in this notebook passes (prompt, text).
    Confirm this matches how the model used for inference was trained.
    NOTE(review): ``max_length`` is passed while ``truncation=False`` --
    confirm whether truncation was meant to be enabled.
    """
    encoded = tokenizer(
        example["summary"],
        example["input_text"],
        padding=False,
        truncation=False,
        max_length=config.max_seq_length,
    )

    return dict(encoded)


def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (MCRMSE).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    `eval_pred` unpacks into (predictions, labels). The RMSE is taken per
    column (over axis 0) and then averaged across columns. Column 0 is
    reported as the content score and column 1 as the wording score.
    """
    predictions, targets = eval_pred

    residuals = predictions - targets
    per_column_rmse = np.sqrt(np.mean(residuals * residuals, axis=0))

    metrics = {
        "content_rmse": per_column_rmse[0],
        "wording_rmse": per_column_rmse[1],
    }
    metrics["mcrmse"] = np.mean(per_column_rmse)
    return metrics


### Read in data

In [5]:
# Load the generated data, drop rows with any missing value, and keep only
# the model input and the summary. The model input is the title, text and
# question columns concatenated directly, with no separator added.
df = (
    pd.read_csv('./input/new_train_data.csv')
    .dropna()
    .reset_index(drop=True)
    .assign(input_text=lambda frame: frame['title'] + frame['text'] + frame['question'])
    [['input_text', 'summary']]
)
df

Unnamed: 0,input_text,summary
0,Leaving Hobbiton Leaving Hobbiton by Jeff Hitc...,The text talks about a common pattern in many ...
1,Leaving Hobbiton Leaving Hobbiton by Jeff Hitc...,The text talks about a common story structure ...
2,Leaving Hobbiton Leaving Hobbiton by Jeff Hitc...,The author's main purpose in the text is to ex...
3,Untitled Untitled by Tiago Vasconcelos is lice...,"**Grade 3-5:**\n\nThe poem ""I felt a Funeral, ..."
4,Untitled Untitled by Tiago Vasconcelos is lice...,"**Grade 3-5:**\n\nThe poem ""I felt a Funeral, ..."
...,...,...
426,Man Wearing Distressed Black Denim Man Wearing...,The detail that best supports the idea that th...
427,Man Wearing Distressed Black Denim Man Wearing...,The end of paragraph 10 contributes to the moo...
428,Man Wearing Distressed Black Denim Man Wearing...,"In the story, the boy's feelings change throug..."
429,Untitled Untitled by Kirsten Winegeart is lice...,The researchers conducted an experiment with R...


In [9]:
# Rows whose summary contains a literal '**'; regex is disabled so the
# asterisks are matched as plain characters.
df[df['summary'].str.contains('**', regex=False)]

Unnamed: 0,input_text,summary
3,Untitled Untitled by Tiago Vasconcelos is lice...,"**Grade 3-5:**\n\nThe poem ""I felt a Funeral, ..."
4,Untitled Untitled by Tiago Vasconcelos is lice...,"**Grade 3-5:**\n\nThe poem ""I felt a Funeral, ..."
5,Untitled Untitled by Tiago Vasconcelos is lice...,**Grade 3-5:**\n\nIn the fifth part of the poe...
6,Untitled Untitled by Tiago Vasconcelos is lice...,"**Grade 3-5 Level:**\n\nThe poem ""I felt a Fun..."
7,Footprints in the sands of time… Footprints in...,"**Grade 3-5:**\n\nThe poem ""A Psalm of Life"" b..."
...,...,...
418,The sirens and Odysseus - John Waterhouse The ...,"**Grade 3-5:**\n\nIn the story, Odysseus and h..."
419,Untitled Untitled by Ryan Loughlin @rylomedia ...,**Grade 3-5:**\n\nThis poem is about never giv...
420,Untitled Untitled by Ryan Loughlin @rylomedia ...,**Grade 3-5 Level:**\n\nStanza 2 and stanza 5 ...
421,Untitled Untitled by Ryan Loughlin @rylomedia ...,**Grade 3-5:**\n\nIn the sixth part of the poe...
