In [2]:
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import logging
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs  import BaseModelOutput,SequenceClassifierOutput
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from datasets import load_metric, disable_progress_bar
import datasets
# imports the torch_xla package
import wandb
from torch.nn.parameter import Parameter
#os.environ["WANDB_DISABLED"] = "true"
from tqdm import tqdm
from spellchecker import SpellChecker
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import spacy
import lightgbm as lgb
import warnings
from dataclasses import dataclass, field
from typing import Optional

# Silence warnings and log records up to ERROR so cell output stays readable.
warnings.simplefilter('ignore')
logging.disable(logging.ERROR)
# Hugging Face tokenizers reads TOKENIZERS_PARALLELISM (plural); the previously
# used key 'TOKENIZER_PARALLELISM' is not consulted, so the setting had no effect.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
disable_progress_bar()
# Registers `progress_apply` on pandas objects; the preprocessing class uses it.
tqdm.pandas()

In [3]:
# Display the version of the installed `spellchecker` module so the
# spelling feature can be reproduced with the same release.
import spellchecker
spellchecker.__version__

'0.7.2'

In [4]:
# set random seed
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), records the seed in ``PYTHONHASHSEED`` and
    configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; the call is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it must be switched off.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

## Config

In [5]:
class CFG:
    """Plain namespace of notebook-wide settings, read as class attributes."""

    # --- run mode -----------------------------------------------------------
    pretraining = False
    load_pretrained = False

    # --- data / model -------------------------------------------------------
    input_path = './input/'
    input_type = '2'
    # Alternatives tried: nghuyong/ernie-2.0-large-en, studio-ousia/luke-large
    model_path = 'microsoft/deberta-v3-large'
    model_type = 'custom'

    # --- learning-rate schedule ---------------------------------------------
    scheduler = 'cosine'  # one of ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # alternative: 1.5
    num_warmup_steps = 0

    # --- sequence limits ----------------------------------------------------
    max_input_length = 512
    max_position_embeddings = 512

    # --- cross-validation / epochs ------------------------------------------
    n_folds = 4
    folds = [1]
    epochs = 4  # alternative: 5

    # --- layer-wise learning rate -------------------------------------------
    discriminative_learning_rate = False
    discriminative_learning_rate_num_groups = 1
    discriminative_learning_rate_decay_rate = 0.99

    # --- layer re-initialisation --------------------------------------------
    reinit_layers = 0

    # --- optimiser (an earlier setting used 5e-6 for both learning rates) ---
    encoder_lr = 2e-5
    head_lr = 1e-4
    min_lr = 1e-7
    eps = 1e-7
    betas = (0.9, 0.999)
    weight_decay = 0
    dropout = 0

    # --- misc ---------------------------------------------------------------
    # NOTE(review): `num_fold` (5) and `n_folds` (4) disagree; confirm which
    # one downstream code reads.
    num_fold = 5
    batch_size = 8
    seed = 42
    OUTPUT_DIR = './pretrain/'
    num_workers = 2
    device = 'cuda'
    print_freq = 100

    
@dataclass
class Config:
    """Settings for the DeBERTa inference step.

    Every field carries a default and a ``help`` string in its metadata.
    """

    model_name_or_path: Optional[str] = field(
        default="microsoft/deberta-v3-base",
        metadata={"help": "Model name or path"},
    )

    data_dir: Optional[str] = field(
        default="/kaggle/input/commonlit-evaluate-student-summaries",
        metadata={"help": "Data directory"},
    )

    max_seq_length: Optional[int] = field(
        default=1600,
        metadata={"help": "Max sequence length"},
    )

    add_prompt_question: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt question into input"},
    )

    add_prompt_text: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt text into input"},
    )

    fold: Optional[int] = field(
        default=0,
        metadata={"help": "Fold"},
    )

    num_proc: Optional[int] = field(
        default=4,
        metadata={"help": "Number of processes"},
    )

    dropout: Optional[float] = field(
        default=0.,
        metadata={"help": "Amount of dropout to apply"},
    )

    # The help text used to be a copy of the `dropout` description.
    max_position_embeddings: Optional[int] = field(
        default=1600,
        metadata={"help": "Max position embeddings"},
    )
    

In [6]:
# Read the training prompts and summaries from CFG.input_path.
# (To run on the test split, read prompts_test.csv / summaries_test.csv instead.)
pdf = pd.read_csv(f'{CFG.input_path}/prompts_train.csv')
sdf = pd.read_csv(f'{CFG.input_path}/summaries_train.csv')

# Inner-join prompts with their summaries on the shared prompt id.
df = pd.merge(pdf, sdf, on="prompt_id")

# Optional prompt-based fold assignment (4 prompt ids, 4 folds), currently disabled:
# id2fold = {
#     "814d6b": 0,
#     "39c16e": 1,
#     "3b9047": 2,
#     "ebad26": 3,
# }
# df["fold"] = df["prompt_id"].map(id2fold)

In [7]:
# Extra search location for NLTK resources.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
NLTK_DATA_DIR = '/root/workspace/commonlit/nltk'
# Guarded so that re-running this cell does not append the same entry again.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

In [8]:
class Preprocessor:
    """Computes hand-crafted text features for (prompt, summary) pairs.

    The constructor loads a tokenizer from ``CFG.model_path``, the NLTK English
    stop-word list, the spaCy ``en_core_web_sm`` pipeline and a ``SpellChecker``.
    ``run`` is the entry point; the remaining methods compute single features.
    """

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
        self.STOP_WORDS = set(stopwords.words('english'))

        self.spacy_ner_model = spacy.load('en_core_web_sm', )
        self.speller = SpellChecker()
        # Entity labels reported by ner_overlap_count in "test" mode. Nothing in
        # this class assigns it while the NER step in run() stays commented out.
        self.ner_keys = None

    def count_text_length(self, df: pd.DataFrame, col: str) -> pd.Series:
        """Return, per row, the number of ids the tokenizer produces for ``df[col]``."""
        tokenizer = self.tokenizer
        return df[col].progress_apply(lambda x: len(tokenizer.encode(x)))

    def word_overlap_count(self, row):
        """Count distinct tokens shared by the prompt and the summary.

        When ``STOP_WORDS`` is non-empty, both token lists are first reduced to
        the tokens that ARE members of ``STOP_WORDS``.

        NOTE(review): keeping only stop words looks inverted (one would expect
        them to be removed). The behaviour is left untouched on purpose: the
        LightGBM boosters loaded later in this notebook were presumably fit on
        this exact feature -- confirm before changing it.
        """
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = [w for w in prompt_words if w in self.STOP_WORDS]
            summary_words = [w for w in summary_words if w in self.STOP_WORDS]
        return len(set(prompt_words) & set(summary_words))

    def ngrams(self, token, n):
        """Return the space-joined n-grams of the token sequence ``token``."""
        # zip over n shifted views of the sequence yields consecutive n-tuples.
        shifted = [token[i:] for i in range(n)]
        return [" ".join(gram) for gram in zip(*shifted)]

    def ngram_co_occurrence(self, row, n: int):
        """Count distinct n-grams present in both prompt and summary tokens."""
        original_ngrams = set(self.ngrams(row['prompt_tokens'], n))
        summary_ngrams = set(self.ngrams(row['summary_tokens'], n))
        return len(original_ngrams & summary_ngrams)

    def ner_overlap_count(self, row, mode: str):
        """Count named entities shared by prompt and summary, grouped by label.

        Entities are compared as ``(lower-cased text, label)`` pairs.

        Args:
            row: Mapping with ``'prompt_text'`` and ``'text'`` entries.
            mode: ``"train"`` returns a dict with only the labels that occur;
                ``"test"`` returns one entry per label in ``self.ner_keys``
                (0 when a label has no shared entity).

        Raises:
            ValueError: ``mode`` is neither ``"train"`` nor ``"test"``, or
                ``"test"`` is requested while ``self.ner_keys`` is unset.
            Exception: the NER model is neither spaCy nor stanza.
        """
        if mode not in ("train", "test"):
            raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")
        # getattr keeps this working on instances created without __init__.
        ner_keys = getattr(self, "ner_keys", None)
        if mode == "test" and ner_keys is None:
            raise ValueError(
                "ner_keys is not set; assign the label list before using mode='test'"
            )

        model = self.spacy_ner_model
        model_repr = str(model)
        # spaCy exposes the entity label as `.label_`, stanza as `.type`.
        if "spacy" in model_repr:
            label_attr = "label_"
        elif "stanza" in model_repr:
            label_attr = "type"
        else:
            raise Exception("Model not supported")

        def entity_set(text):
            return {
                (ent.text.lower(), getattr(ent, label_attr))
                for ent in model(text).ents
            }

        shared = entity_set(row['prompt_text']) & entity_set(row['text'])
        ner_dict = dict(Counter(label for _, label in shared))

        if mode == "train":
            return ner_dict
        return {key: ner_dict.get(key, 0) for key in ner_keys}

    def quotes_count(self, row):
        """Count double-quoted spans of the summary that occur verbatim in the prompt."""
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        return sum(1 for quote in quotes_from_summary if quote in text)

    def spelling(self, text):
        """Return how many distinct whitespace-separated words the speller reports as unknown."""
        wordlist = text.split()
        return len(list(self.speller.unknown(wordlist)))

    def _length_and_tokens(self, texts: pd.Series):
        """Encode each text once; return (id counts, tokens without special tokens)."""
        ids = texts.apply(lambda x: self.tokenizer.encode(x))
        lengths = ids.apply(len)
        tokens = ids.apply(
            lambda seq: self.tokenizer.convert_ids_to_tokens(
                seq,
                skip_special_tokens=True
            )
        )
        return lengths, tokens

    def run(self,
            prompts: pd.DataFrame,
            summaries: pd.DataFrame,
            mode: str
        ) -> pd.DataFrame:
        """Build the feature table: one row per summary, joined with its prompt.

        Args:
            prompts: Frame with ``prompt_id`` and ``prompt_text`` columns.
            summaries: Frame with ``prompt_id`` and ``text`` columns.
            mode: Only referenced by the NER step, which is commented out.

        Returns:
            ``summaries`` left-joined with ``prompts`` plus the length, spelling,
            overlap and quote features; the token-list columns are dropped.
            The input frames are not modified.
        """
        # Work on copies so the caller's frames do not silently gain columns.
        prompts = prompts.copy()
        summaries = summaries.copy()

        # before merge preprocess (each text is encoded once, not twice)
        prompt_length, prompt_tokens = self._length_and_tokens(prompts["prompt_text"])
        prompts["prompt_length"] = prompt_length
        prompts["prompt_tokens"] = prompt_tokens

        summary_length, summary_tokens = self._length_and_tokens(summaries["text"])
        summaries["summary_length"] = summary_length
        summaries["summary_tokens"] = summary_tokens
        # The column name (including its spelling) is kept as-is: it is part of
        # the feature table handed to the models.
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)

        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']

        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(2,), axis=1
        )
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )

        # Create a dataframe with the count of overlapping NERs per category for all summaries.
        # Disabled because this feature takes too long to compute.
#         ners_count_df  = input_df.progress_apply(
#             lambda row: pd.Series(self.ner_overlap_count(row, mode=mode), dtype='float64'), axis=1
#         ).fillna(0)
#         self.ner_keys = ners_count_df.columns
#         ners_count_df['sum'] = ners_count_df.sum(axis=1)
#         ners_count_df.columns = ['NER_' + col for col in ners_count_df.columns]
#         # join ner count dataframe with train dataframe
#         input_df = pd.concat([input_df, ners_count_df], axis=1)

        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)

        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
# Shared instance; constructing it loads the tokenizer, the stop-word list,
# the spaCy pipeline and the spell checker (see Preprocessor.__init__).
preprocessor = Preprocessor()

In [9]:
# Build the feature table from the prompts/summaries loaded above.
# `mode` is only referenced by the NER step, which is commented out inside run().
#train = preprocessor.run(pdf, sdf, mode="train")
data = preprocessor.run(pdf, sdf, mode='test')
#test = preprocessor.run(prompts_test, summaries_test, mode="test")

#test.head()

100%|██████████| 4/4 [00:00<00:00, 8966.98it/s]
100%|██████████| 4/4 [00:00<00:00, 4321.80it/s]
100%|██████████| 4/4 [00:00<00:00, 5149.54it/s]
100%|██████████| 4/4 [00:00<00:00, 5412.01it/s]
100%|██████████| 4/4 [00:00<00:00, 4106.02it/s]


In [10]:
# Preview the first three rows of the feature table returned by preprocessor.run.
data.head(3)

Unnamed: 0,student_id,prompt_id,text,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,length_ratio,word_overlap_count,bigram_overlap_count,trigram_overlap_count,quotes_count
0,000000ffffff,abc123,Example text 1,5,0,Summarize...,Example Title 1,Heading\nText...,7,0.714286,0,0,0,0
1,111111eeeeee,def789,Example text 2,5,0,Summarize...,Example Title 2,Heading\nText...,7,0.714286,0,0,0,0
2,222222cccccc,abc123,Example text 3,5,0,Summarize...,Example Title 1,Heading\nText...,7,0.714286,0,0,0,0


## DeBERTa Model

In [11]:
def tokenize_inf(example, tokenizer, config):
    """Tokenize one row as a (summary, prompt) pair for inference.

    The second segment is the prompt title, prompt text and prompt question
    joined with the tokenizer's separator token.

    Args:
        example: Mapping with ``text``, ``prompt_title``, ``prompt_text`` and
            ``prompt_question`` entries.
        tokenizer: Callable tokenizer exposing ``sep_token``.
        config: Object exposing ``max_seq_length``.

    Returns:
        A plain dict holding the tokenizer's output.
    """
    prompt_parts = (
        example["prompt_title"],
        example["prompt_text"],
        example["prompt_question"],
    )
    second_segment = tokenizer.sep_token.join(prompt_parts)

    # NOTE(review): with truncation=False the max_length argument presumably
    # has no effect -- confirm whether truncation was intended here.
    encoding = tokenizer(
        example["text"],
        second_segment,
        padding=False,
        truncation=False,
        max_length=config.max_seq_length,
    )
    return dict(encoding)

# Fine-tuned checkpoints to ensemble; the fold 0-2 entries are currently disabled.
model_paths = [
    #'./pretrain/2308_model/output_fold0_seed42_2308/output_fold0_seed42_2308',
    #'./pretrain/2308_model/output_fold1_seed42_2308/output_fold1_seed42_2308',
    #'./pretrain/2308_model/output_fold2_seed42_2308/output_fold2_seed42_2308',
    './pretrain/2308_model/output_fold3_seed42_2308/output_fold3_seed42_2308'
]

# Tokenizer and base model config are both read from the first checkpoint.
first_checkpoint = model_paths[0]
tokenizer = AutoTokenizer.from_pretrained(first_checkpoint)
model_config = AutoConfig.from_pretrained(first_checkpoint)

# Overrides applied on top of the stored config: no dropout, two regression outputs.
config_overrides = {
    'hidden_dropout_prob': 0,
    'attention_probs_dropout_prob': 0,
    'num_labels':2,
    'problem_type': 'regression',
    'max_position_embeddings': 1600,
}
model_config.update(config_overrides)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=16)

# Load one sequence-classification model per checkpoint path.
# `model` and `model_path` stay bound after the loop; later cells may rely on `model`.
models = []
for model_path in model_paths:
    model = AutoModelForSequenceClassification.from_pretrained(model_path, config=model_config)
    models.append(model)

config = Config()

In [12]:
# Wrap the feature table in a Dataset and tokenize it row by row (batched=False).
data_ds = datasets.Dataset.from_pandas(data)
tokenized_data_ds = data_ds.map(
    tokenize_inf,
    batched=False,
    num_proc=4,
    fn_kwargs=dict(tokenizer=tokenizer, config=config),
)

In [13]:
# Inference-only arguments: no training, prediction enabled, keep the last partial batch.
infer_args = TrainingArguments(
    output_dir = './',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 16,
    dataloader_drop_last = False,
    eval_accumulation_steps=1,
)

# A Trainer keeps the model it was constructed with, so rebinding the loop
# variable `model` does not change what an existing Trainer predicts with.
# Build one Trainer per model; otherwise every ensemble member would return
# the predictions of the same (last loaded) model.
deberta_res = []
for model in models:
    trainer = Trainer(
        model = model,
        args = infer_args,
        data_collator = data_collator,
        tokenizer = tokenizer
    )
    # Element 0 of the prediction output holds the predictions array.
    deberta_res.append(trainer.predict(tokenized_data_ds)[0])

In [14]:
# Average the per-model prediction arrays element-wise across the ensemble.
deberta_res_mean = np.stack(deberta_res).mean(axis=0)

In [15]:
# Store the averaged DeBERTa outputs on the feature table: column 0 as the
# content prediction, column 1 as the wording prediction. These columns are
# not listed in `drop_columns`, so they are part of what the LightGBM cell receives.
data['pred_content_score'] = deberta_res_mean[:,0]
data['pred_wording_score'] = deberta_res_mean[:,1]
#train.to_csv('./deberta_infer.csv', index = False)

In [16]:
#train = pd.read_csv('./train_2.csv')

In [18]:
def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error over two target columns.
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    ``eval_pred`` is a ``(predictions, labels)`` pair of equally shaped arrays.
    Returns a dict with the RMSE of column 0 (``content_rmse``), the RMSE of
    column 1 (``wording_rmse``) and their mean (``mcrmse``).
    """
    predictions, references = eval_pred

    squared_error = np.square(predictions - references)
    per_column_rmse = np.sqrt(squared_error.mean(axis=0))

    return {
        "content_rmse": per_column_rmse[0],
        "wording_rmse": per_column_rmse[1],
        "mcrmse": np.mean(per_column_rmse),
    }


# mcrmse = compute_mcrmse((train.loc[:, ['pred_content_score', 'pred_wording_score']].values, 
#                train.loc[:, ['content', 'wording']].values))
# mcrmse

### LightGBM Model

In [25]:
# Names of the two regression targets.
targets = ["content", "wording"]

# Identifier and raw-text columns removed from the frame before LightGBM prediction.
drop_columns = [
    "student_id",
    "prompt_id",
    "text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
]

In [20]:
### test load
model_targets = ['content', 'wording']

lgb_model_dict = {}
for model_target in model_targets:
    lgb_model_dict[model_target] = [];
    for idx in range(4):
        lgb_model_dict[model_target].append(lgb.Booster(model_file = f'./lgbt_{model_target}_{idx}.txt'))

In [21]:
# Pick the first booster (index 0) of each target; the other three loaded per
# target are not referenced by the cells shown here.
lgb_model_content_0 = lgb_model_dict['content'][0]
lgb_model_wording_0 = lgb_model_dict['wording'][0]

In [26]:
# Feature matrix: every column of `data` except the identifier/raw-text ones.
lgb_features = data.drop(columns = drop_columns)

# One column vector of predictions per target, placed side by side.
train_content_res = lgb_model_content_0.predict(lgb_features).reshape(-1,1)
train_wording_res = lgb_model_wording_0.predict(lgb_features).reshape(-1,1)
preds = np.hstack([train_content_res, train_wording_res])
preds

array([[-0.92559399, -0.11320166],
       [-0.89140067, -0.10740623],
       [-0.92559399, -0.11320166],
       [-0.92559399, -0.11320166]])

In [None]:
# Store the LightGBM outputs (column 0 = content, column 1 = wording) on `data`.
# NOTE(review): these columns are not in `drop_columns`, so re-running the
# prediction cell after this one would pass them to the boosters as extra
# features -- confirm the intended cell order.
data['content_score'] =  preds[:, 0]
data['wording_score'] =  preds[:, 1]