In [None]:
from typing import List
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import logging
import os
import shutil
import json
import yaml
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from torch.nn.parameter import Parameter
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm
import re

import random
import nltk
from text_unidecode import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
import lightgbm as lgb
from pathlib import Path
import gc
import glob
from typing import Tuple
import codecs

# Notebook-wide noise suppression (warnings, logging, progress bars).

# Hide every Python warning raised from this point on.
warnings.simplefilter("ignore")
# Suppress all logging calls of severity ERROR and below.
logging.disable(logging.ERROR)
# NOTE(review): presumably read by the tokenizers / TensorFlow libraries — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Turn off the progress bars of the `datasets` package.
disable_progress_bar()
# Register tqdm's pandas integration (e.g. `progress_apply`).
tqdm.pandas()

In [None]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

## c1: content model (exp1-content-model checkpoints)

In [None]:
import pandas as pd
import numpy as np
import gc
from tqdm.auto import tqdm
import json, yaml
import random
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from text_unidecode import unidecode
import codecs
from sklearn.model_selection import StratifiedKFold
from typing import Tuple
import os

import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# NOTE(review): presumably read by the Hugging Face tokenizers library — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
import numpy as np


def get_last_hidden_state(backbone_outputs):
    """Return the element stored at index 0 of the backbone outputs."""
    return backbone_outputs[0]


def get_all_hidden_states(backbone_outputs):
    """Stack the tensors stored at index 1 of the backbone outputs into one tensor."""
    per_layer_states = backbone_outputs[1]
    return torch.stack(per_layer_states)


def get_input_ids(inputs):
    """Look up the ``"input_ids"`` entry of the model inputs."""
    input_ids = inputs["input_ids"]
    return input_ids


def get_attention_mask(inputs):
    """Look up the ``"attention_mask"`` entry of the model inputs."""
    attention_mask = inputs["attention_mask"]
    return attention_mask


class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of the last hidden state over dimension 1.

    ``output_dim`` mirrors ``backbone_config.hidden_size`` so downstream layers
    can size themselves from the pooling module.
    """

    def __init__(self, backbone_config):
        super().__init__()
        self.output_dim = backbone_config.hidden_size

    def forward(self, inputs, backbone_outputs):
        """Average the hidden states of the positions whose mask value is non-zero.

        Args:
            inputs: mapping that provides ``"attention_mask"``.
            backbone_outputs: backbone result whose element 0 is the last hidden state.

        Returns:
            The masked sum over dimension 1 divided by the (clamped) mask sum.
        """
        attention_mask = get_attention_mask(inputs)
        last_hidden_state = get_last_hidden_state(backbone_outputs)

        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        )
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Clamp so a row whose mask is all zeros does not divide by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        # The original ended with a stray "\" line continuation; it is removed here.
        return sum_embeddings / sum_mask

class GeMText(nn.Module):
    """Generalized-mean pooling of the last hidden state with a learnable exponent ``p``.

    The reduction dimension, the clamping epsilon and the initial value of
    ``p`` are read from ``cfg["pooling"]``.
    """

    def __init__(self, backbone_config, cfg):
        super().__init__()

        pooling_cfg = cfg["pooling"]
        self.dim = pooling_cfg["dim"]
        self.eps = pooling_cfg["eps"]
        self.feat_mult = 1

        # Registered as a Parameter so the exponent is part of the state dict.
        self.p = Parameter(torch.ones(1) * pooling_cfg["p"])

        self.output_dim = backbone_config.hidden_size

    def forward(self, inputs, backbone_output):
        """Return ``(sum(masked_hidden ** p) / mask_count) ** (1 / p)`` along ``self.dim``."""
        mask = get_attention_mask(inputs)
        hidden = get_last_hidden_state(backbone_output)

        mask_expanded = mask.unsqueeze(-1).expand(hidden.shape)
        # Hidden values are floored at eps before masking, so masked positions contribute 0.
        masked_hidden = hidden.clamp(min=self.eps) * mask_expanded
        powered_sum = masked_hidden.pow(self.p).sum(self.dim)
        mask_count = mask_expanded.sum(self.dim).clip(min=self.eps)
        pooled = (powered_sum / mask_count).pow(1 / self.p)
        return pooled

class CLSPooling(nn.Module):
    """Pooling that keeps only position 0 along dimension 1 of the last hidden state."""

    def __init__(self, backbone_config):
        super().__init__()
        self.output_dim = backbone_config.hidden_size

    def forward(self, inputs, backbone_outputs):
        """Return ``last_hidden_state[:, 0]``; ``inputs`` is accepted but not used."""
        hidden = get_last_hidden_state(backbone_outputs)
        first_position = hidden[:, 0]
        return first_position


def get_pooling_layer(cfg, backbone_config):
    """Build the pooling module selected by ``cfg["pooling"]["type"]``.

    Supported values are ``"mean"`` (MeanPooling), ``"gem"`` (GeMText) and
    ``"cls"`` (CLSPooling, which was defined but previously unreachable).

    Raises:
        NotImplementedError: for any other pooling type; the message names it.
    """
    pooling_type = cfg["pooling"]["type"]
    if pooling_type == "mean":
        return MeanPooling(backbone_config)
    if pooling_type == "gem":
        return GeMText(backbone_config, cfg)
    if pooling_type == "cls":
        return CLSPooling(backbone_config)
    raise NotImplementedError(f"Unsupported pooling type: {pooling_type!r}")

In [None]:
class CustomModel(nn.Module):
    """Transformer backbone followed by a pooling layer and a linear head.

    Args:
        cfg: experiment config mapping. The code reads ``cfg["tokenizer"]``,
            ``cfg["pooling"]``, ``cfg["general"]["target_columns"]`` and, when
            no backbone config is given, ``cfg["model"]["backbone_path"]``.
        backbone_cfg: optional backbone config. When given, the backbone is
            built with ``AutoModel.from_config``; otherwise both the config
            and the model are loaded from ``cfg["model"]["backbone_path"]``.
    """

    def __init__(self, cfg, backbone_cfg=None):
        super().__init__()
        self.cfg = cfg
        self.backbone_cfg = backbone_cfg
        if backbone_cfg is not None:
            self.backbone = AutoModel.from_config(config=self.backbone_cfg)
        else:
            self.backbone_cfg = AutoConfig.from_pretrained(cfg["model"]["backbone_path"])
            self.backbone = AutoModel.from_pretrained(cfg["model"]["backbone_path"])

        # Match the embedding matrix to the tokenizer length.
        self.backbone.resize_token_embeddings(len(cfg["tokenizer"]))
        self.pool = get_pooling_layer(self.cfg, self.backbone_cfg)
        self.fc = nn.Linear(self.pool.output_dim, len(cfg["general"]["target_columns"]))

    def forward(self, input):
        """Run backbone -> pooling -> linear head on a mapping of input tensors."""
        outputs = self.backbone(**input)
        feature = self.pool(input, outputs)
        output = self.fc(feature)
        return output

    def _init_weights(self, module):
        """Re-initialise one Linear, Embedding or LayerNorm module in place.

        Modules of any other type are left untouched. The normal std comes
        from ``self.backbone_cfg.initializer_range``; the original read the
        nonexistent attribute ``self.backbone_config`` and raised AttributeError.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.backbone_cfg.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.backbone_cfg.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


def freeze(module):
    """Set ``requires_grad = False`` on every parameter of ``module``."""
    for param in module.parameters():
        param.requires_grad = False


def get_model(cfg, check_point_path=None, backbone_cfg_path=None, train=True):
    """Build a CustomModel and optionally load a checkpoint and apply training tweaks.

    Args:
        cfg: experiment config mapping (see CustomModel).
        check_point_path: optional checkpoint file holding a ``"model"`` state dict.
        backbone_cfg_path: optional file with a saved backbone config; when
            None the config comes from ``get_backbone_config(cfg)``.
        train: when True, apply the freeze / re-initialise options of ``cfg["model"]``.

    Returns:
        The constructed model.
    """
    # NOTE: torch.load unpickles its input; only point these paths at trusted files.
    backbone_cfg = get_backbone_config(cfg) if backbone_cfg_path is None else torch.load(backbone_cfg_path)
    model = CustomModel(cfg, backbone_cfg=backbone_cfg)

    if check_point_path is not None:
        state = torch.load(check_point_path, map_location="cpu")
        # Checkpoints whose keys use the old "model." prefix are renamed first.
        if 'model.embeddings.position_ids' in state['model'].keys():
            state = update_old_state(state)
        # Non-strict: missing and unexpected keys are tolerated.
        model.load_state_dict(state["model"], strict=False)

    if cfg["model"]["gradient_checkpointing"]:
        if model.backbone.supports_gradient_checkpointing:
            model.backbone.gradient_checkpointing_enable()
        else:
            print("Gradient checkpointing is not supported by the model")

    if train:
        if cfg["model"]["freeze_embeddings"]:
            freeze(model.backbone.embeddings)
        if cfg["model"]["freeze_n_layers"] > 0:
            freeze(model.backbone.encoder.layer[: cfg["model"]["freeze_n_layers"]])
        n_reinit = cfg['model']['reinitialize_n_layers']
        if n_reinit > 0:
            for layer in model.backbone.encoder.layer[-n_reinit:]:
                # apply() visits every submodule. Calling _init_weights on the
                # layer itself matched none of its isinstance branches, so the
                # original re-initialised nothing.
                layer.apply(model._init_weights)

    return model

def get_backbone_config(config):
    """Return the backbone config described by ``config["model"]``.

    A non-empty ``backbone_config_path`` is loaded with ``torch.load``;
    an empty string falls back to ``AutoConfig.from_pretrained`` on
    ``backbone_path`` with ``output_hidden_states=True``.
    """
    model_section = config["model"]
    if model_section["backbone_config_path"] != '':
        return torch.load(model_section["backbone_config_path"])
    return AutoConfig.from_pretrained(model_section["backbone_path"], output_hidden_states=True)

def update_old_state(state):
    """Rename the leading ``model.`` of checkpoint keys to ``backbone.``.

    Only the prefix is rewritten. The original used ``str.replace``, which
    also rewrote any later occurrence of "model" inside a key.

    Args:
        state: checkpoint mapping with a ``"model"`` state dict and,
            optionally, a ``"predictions"`` entry.

    Returns:
        A new mapping with the renamed ``"model"`` state dict; ``"predictions"``
        is carried over when the input has it.
    """
    old_prefix = 'model.'
    new_state = {}
    for key, value in state['model'].items():
        if key.startswith(old_prefix):
            key = 'backbone.' + key[len(old_prefix):]
        new_state[key] = value

    updated_state = {'model': new_state}
    if 'predictions' in state:
        updated_state['predictions'] = state['predictions']
    return updated_state

In [None]:
def seed_everything(seed=42) -> None:
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
def get_additional_special_tokens() -> dict:
    """Return the raw-substring -> special-token replacement table."""
    # NOTE(review): "\n" precedes "\r\n", so a consumer that applies the entries
    # in insertion order turns "\r\n" into "\r[BR]" and never matches the second
    # key. Left as is because the checkpoints were presumably trained with this
    # exact preprocessing — confirm before reordering.
    return {
        "\n": "[BR]",
        "\r\n": "[BR]",
    }


def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the offending slice as UTF-8 bytes and resume at ``error.end``."""
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 and resume at ``error.end``."""
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end


# Register both callbacks under the names that resolve_encodings_and_normalize
# passes as ``errors=`` to str.encode / bytes.decode.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Round-trip ``text`` through UTF-8 / cp1252 with the custom error handlers, then ``unidecode`` it."""
    raw_bytes = text.encode("raw_unicode_escape")
    decoded = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = decoded.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)


def replace_special_tokens(text: str) -> str:
    """Apply every replacement from ``get_additional_special_tokens`` to ``text``, in table order."""
    for raw_substring, token in get_additional_special_tokens().items():
        text = text.replace(raw_substring, token)
    return text


def preprocess_text(text):
    """Normalise encodings first, then substitute the special tokens."""
    normalized = resolve_encodings_and_normalize(text)
    return replace_special_tokens(normalized)

In [None]:
class CommonLitDataset(Dataset):
    """Dataset that tokenizes the ``full_text`` column of a DataFrame on access.

    Args:
        df: frame with a ``full_text`` column.
        cfg: config mapping; ``cfg["tokenizer"]`` must provide ``encode_plus``.

    ``labels`` is initialised to None; when it is set to a sequence,
    ``__getitem__`` returns ``(inputs, label)`` instead of ``inputs``.
    """

    def __init__(self, df, cfg):
        self.df = df
        self.cfg = cfg
        self.text = df["full_text"].values
        self.labels = None

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize row ``item`` (padded/truncated to 1024 tokens) into long tensors."""
        text = self.text[item]
        encoded = self.cfg["tokenizer"].encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=1024,
            padding="max_length",
            truncation=True,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        # Explicit None check: the truthiness of a multi-element array raises ValueError.
        if self.labels is not None:
            label = torch.tensor(self.labels[item], dtype=torch.float)
            return encoded, label
        return encoded

def get_test_dataloader(cfg, df):
    """Wrap ``df`` in a CommonLitDataset and return an ordered (unshuffled) DataLoader over it."""
    loader_options = dict(
        batch_size=4,
        num_workers=1,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    return DataLoader(CommonLitDataset(df, cfg), **loader_options)

In [None]:
def collate(inputs):
    """Cut every tensor of the batch to the largest row-sum of the attention mask.

    The mapping is modified in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for name, tensor in inputs.items():
        inputs[name] = tensor[:, :longest]
    return inputs

In [None]:
def get_config(path: str) -> dict:
    """
    Parse the YAML file at ``path`` with ``yaml.safe_load`` and return the result
    """
    with open(path, "r") as config_file:
        return yaml.safe_load(config_file)

In [None]:
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    Each batch is trimmed by ``collate`` and moved to ``device``; the
    per-batch outputs are concatenated into one NumPy array.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for batch in progress:
        batch = collate(batch)
        for name, tensor in batch.items():
            batch[name] = tensor.to(device)
        with torch.no_grad():
            y_preds = model(batch)
        batch_outputs.append(y_preds.to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Load the competition's test prompts and student summaries.
prompts_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
summary_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

# Attach each summary's prompt fields through the shared prompt_id column.
snorf_df = prompts_test.merge(summary_test, on="prompt_id")
pq = snorf_df["prompt_question"].values
text = snorf_df["text"].values
# Model input: prompt question and summary text joined by a literal "[SEP]".
full_text = pq + "[SEP]" + text
snorf_df["full_text"] = full_text
# Bare expression so the notebook displays the merged frame.
snorf_df

In [None]:
# Use CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Experiment settings stored next to the checkpoints.
config = get_config("/kaggle/input/exp1-content-model/config.yaml")
seed_everything()

# The tokenizer travels inside the config dict: CommonLitDataset and
# CustomModel both read config["tokenizer"].
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/exp1-content-model")
config["tokenizer"] = tokenizer

In [None]:
# Normalise encodings and substitute special tokens in every model input.
# Overwrites the column in place, so re-running this cell preprocesses the text again.
snorf_df["full_text"] = snorf_df["full_text"].apply(preprocess_text)

# NOTE(review): not read by any code visible in this section — confirm it is still needed.
target_columns = ["content"]

# Receives one prediction array per fold from the inference loop below.
predictions = []

test_dataloader = get_test_dataloader(config, snorf_df)

In [None]:
# Run inference once per fold checkpoint and collect the per-fold predictions.
for i in range(4):
    model = get_model(config,
                      check_point_path=f"/kaggle/input/exp1-content-model/fold{i}_best.pt",
                      backbone_cfg_path="/kaggle/input/exp1-content-model/config.pt",
                      train=False)
    prediction = inference_fn(test_dataloader, model, device)
    predictions.append(prediction)
    # Drop the reference before collecting; otherwise the fold's model stays
    # alive and gc / empty_cache cannot release its memory.
    del model
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Average the per-fold prediction arrays element-wise (axis 0 runs over folds).
content_c1 = np.mean(predictions, axis=0)

In [None]:
content_c1

## w2: wording model (exp3-wording-model checkpoints)

In [None]:
class CFG:
    """Static settings for the w2 wording model; attributes are read straight off the class."""

    competition = "CommonLit"
    num_workers = 2
    # Starts as a path; a later cell replaces it with the loaded tokenizer object.
    tokenizer = "/kaggle/input/microsoft-deberta-v3-large"
    model = "microsoft/deberta-v3-large"
    ckpt_name = "microsoft/deberta-v3-large"
    betas = (0.9, 0.999)
    batch_size = 12
    # NOTE(review): not referenced by the code visible here; the test DataLoader uses batch_size.
    infer_batch_size = 8
    # Token length every input is padded / truncated to in prepare_input.
    max_len = 512
    max_grad_norm = 1000
    gradient_checkpointing = True
    mlm_ratio = False
    layer_reinitialize_n = 0
    freeze_n_layers = 0
    # One regression output per entry.
    target_cols = [
        "wording",
    ]
    seed = 42
    n_fold = 4
    # Folds whose checkpoints are loaded by the inference loop.
    trn_fold = [0, 1, 2, 3]
    multi_sample_dropouts = None
    # Read by collate(); it was missing, so any call to collate() raised AttributeError.
    debug = False

In [None]:
def prepare_input(text: str) -> dict[str, torch.Tensor]:
    """Tokenize ``text`` with ``CFG.tokenizer`` into fixed-length long tensors.

    The text is truncated and padded to ``CFG.max_len`` tokens. Uses
    ``padding="max_length"`` in place of the deprecated
    ``pad_to_max_length=True``, matching the c1 dataset above.
    """
    inputs = CFG.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=CFG.max_len,
        padding="max_length",
        truncation=True,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Dataset over the ``full_text`` column; each item is the output of ``prepare_input``."""

    def __init__(self, df: pd.DataFrame):
        self.texts = df["full_text"].values
        self.cfg = CFG

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, item):
        return prepare_input(self.texts[item])


def collate(inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """Dynamic padding: trim every tensor to the longest attention-mask row sum.

    When ``CFG.debug`` is truthy the batch is returned untouched. A missing
    ``debug`` attribute is treated as False instead of raising AttributeError.
    The mapping is modified in place and also returned.
    """
    if getattr(CFG, "debug", False):
        return inputs

    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = v[:, :mask_len]
    return inputs

# ====================================================
# Model
# ====================================================
def freeze(module):
    """
    Turn off ``requires_grad`` for every parameter of ``module``.
    """
    for weight in module.parameters():
        weight.requires_grad = False


def get_freezed_parameters(module):
    """
    List the names of the module's parameters whose ``requires_grad`` is False.
    """
    return [
        name
        for name, parameter in module.named_parameters()
        if not parameter.requires_grad
    ]


class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of ``last_hidden_state`` over dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(
        self, last_hidden_state: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # Clamp so an all-zero mask row does not divide by zero.
        mask_total = mask.sum(1).clamp(min=1e-9)
        return masked_sum / mask_total


class CustomModel(nn.Module):
    """Transformer encoder with a linear head on the position-0 hidden state.

    Args:
        cfg: settings object; the code reads ``target_cols``, ``model``,
            ``tokenizer``, ``gradient_checkpointing``, ``multi_sample_dropouts``,
            ``layer_reinitialize_n`` and ``freeze_n_layers`` from it.
        config_path: optional file holding a saved backbone config. When None,
            the config is fetched with ``AutoConfig.from_pretrained(cfg.model)``
            and all dropout settings are zeroed.
        pretrained: load pretrained weights (True) or only build the
            architecture from the config (False).
    """

    def __init__(
        self, cfg: CFG, config_path: Path | None = None, pretrained: bool = False
    ):
        super().__init__()
        self.cfg = cfg
        self.n_target = len(cfg.target_cols)

        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.model, output_hidden_states=True
            )
            self.config.hidden_dropout = 0.0
            self.config.hidden_dropout_prob = 0.0
            self.config.attention_dropout = 0.0
            self.config.attention_probs_dropout_prob = 0.0
            # The original logged through LOGGER, which is not defined in the
            # visible notebook and raised NameError on this branch.
            logging.getLogger(__name__).info(self.config)
        else:
            # NOTE: torch.load unpickles its input; only load trusted files.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        # Match the embedding matrix to the tokenizer length (extra tokens are added to it).
        self.model.resize_token_embeddings(len(cfg.tokenizer))

        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        if cfg.multi_sample_dropouts is not None:
            self.dropouts = nn.ModuleList(
                [nn.Dropout(p) for p in cfg.multi_sample_dropouts]
            )
        else:
            self.dropouts = None

        self.fc = nn.Linear(self.config.hidden_size, self.n_target)

        self._re_init_layers(self.cfg.layer_reinitialize_n)
        if 1 <= self.cfg.freeze_n_layers:
            freeze(self.model.embeddings)
            freeze(
                self.model.encoder.layer[: self.cfg.freeze_n_layers]
            )

    def _re_init_layers(self, n_layers: int):
        """Re-initialise every submodule of the last ``n_layers`` encoder layers."""
        if n_layers >= 1:
            for layer in self.model.encoder.layer[-n_layers:]:
                # deberta-v3: visit each module in the layer and re-initialise its direct children.
                for module in layer.modules():
                    for name, child in module.named_children():
                        init_type_name = self._init_weights(child)
                        if init_type_name is not None:
                            print(
                                f"{name} is re-initialized, type: {init_type_name}, {module.__class__}"
                            )

    def _init_weights(self, module: nn.Module):
        """Re-initialise a Linear / Embedding / LayerNorm module in place.

        Returns:
            The handled type as a string, or None when the module type is not handled.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return "nn.Linear"
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return "nn.Embedding"
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return "nn.LayerNorm"
        return None

    def forward(self, inputs: dict[str, torch.Tensor]) -> torch.Tensor:
        """Encode ``inputs`` and apply the linear head to the hidden state at position 0."""
        outputs = self.model(**inputs)
        cls_hidden_state = outputs.last_hidden_state[:, 0, :]
        output = self.fc(cls_hidden_state)
        return output

In [None]:
@torch.inference_mode()
def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    for inputs in tqdm(test_loader, total=len(test_loader)):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [None]:
def remove_short_quotes(text):
    """Strip the [QUOTE]/[ENDQUOTE] markers from spans whose content is 30 characters or fewer."""
    def unwrap_if_short(match):
        inner = match.group(1)
        return match.group() if len(inner) > 30 else inner

    return re.sub(r'\[QUOTE\](.*?)\[ENDQUOTE\]', unwrap_if_short, text)


def replace_quotes_with_passage(text):
    """Replace each [QUOTE]...[ENDQUOTE] span with repeated ' [PASSAGE] ' tokens.

    One token is emitted per full 50 characters of quoted content, so spans
    shorter than 50 characters are removed entirely.

    The original branched on nearby double quotes, but both branches returned
    the same [PASSAGE] string (no [PLAGIARISM] token was ever produced). The
    dead branch and its unused locals are removed; output is unchanged.
    """
    pattern = r'\[QUOTE\](.*?)\[ENDQUOTE\]'

    def passage_tokens(match):
        return ' [PASSAGE] ' * (len(match.group(1)) // 50)

    return re.sub(pattern, passage_tokens, text)


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def plagiarism_checker(input_text, source_text, n=3, similarity_threshold=0.85):
    """Wrap word n-grams of ``input_text`` that closely match ``source_text`` in [QUOTE] markers.

    Both texts are split on whitespace into word n-grams. A TF-IDF vectorizer
    is fitted on all n-grams, and an input n-gram counts as copied when its
    best cosine similarity against any source n-gram reaches
    ``similarity_threshold``. Copied n-grams are wrapped as
    ``[QUOTE]...[ENDQUOTE]``, adjacent markers are merged, and short spans are
    unwrapped again by ``remove_short_quotes``.

    Args:
        input_text: text to annotate.
        source_text: reference text.
        n: number of words per n-gram.
        similarity_threshold: minimum best cosine similarity for a match.

    Returns:
        The annotated text, or ``input_text`` unchanged if any step raises.
    """
    def generate_ngrams(text, n):
        words = text.split()
        return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

    try:
        source_ngrams = generate_ngrams(source_text, n)
        input_ngrams = generate_ngrams(input_text, n)

        # Calculate TF-IDF vectors over source n-grams followed by input n-grams.
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(source_ngrams + input_ngrams)
        source_tfidf = tfidf_matrix[:len(source_ngrams)]
        input_tfidf = tfidf_matrix[len(source_ngrams):]

        # Row i holds the similarity of input n-gram i to every source n-gram.
        similarity_matrix = cosine_similarity(input_tfidf, source_tfidf)
        best_similarity = similarity_matrix.max(axis=1)

        # Mark copied n-grams. Only occurrences with a space on both sides are
        # replaced, so an n-gram at the very start or end of the text is never marked.
        marked_input_text = input_text
        for input_ngram, score in zip(input_ngrams, best_similarity):
            if score >= similarity_threshold:
                marked_input_text = marked_input_text.replace(f" {input_ngram} ", f" [QUOTE]{input_ngram}[ENDQUOTE] ")
        marked_input_text = marked_input_text.replace("[ENDQUOTE] [QUOTE]", " ")

        return remove_short_quotes(marked_input_text)
    except Exception:
        # Deliberate best effort: texts that cannot be vectorised (e.g. fewer
        # than n words) are passed through unannotated.
        return input_text

In [None]:
# Reload the test prompts and summaries for the w2 pipeline.
prompts_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
summary_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

test = prompts_test.merge(summary_test, on="prompt_id")
# Model input: the summary with spans matching the prompt text replaced by [PASSAGE] tokens.
test["full_text"] = [replace_quotes_with_passage(plagiarism_checker(text, prompt_text, n =3)) for text, prompt_text in zip(test["text"].values,test["prompt_text"].values)]
# Bare expression so the notebook displays the frame.
test

In [None]:
# CFG.tokenizer holds a path at this point; after this cell it holds the tokenizer object.
# NOTE(review): re-running the cell would therefore pass the object, not the path, to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer)
# Register the two marker tokens so they are not split into sub-words.
tokenizer.add_tokens(['[PASSAGE]','[PLAGIARISM]'])
CFG.tokenizer = tokenizer
CFG.tokenizer

In [None]:
test_dataset = TestDataset(test)
# NOTE(review): uses CFG.batch_size (12) although CFG also defines infer_batch_size (8) — confirm which is intended.
# prepare_input already pads every item to CFG.max_len, so padding='longest' has nothing left to pad.
test_loader = DataLoader(test_dataset,
  batch_size=CFG.batch_size,
  shuffle=False,
  collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

In [None]:
# Use CUDA when it is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Per-fold predictions get their own accumulator, so wording_w2 is only ever
# the averaged array rather than first a list and then an array.
fold_predictions = []
for fold in CFG.trn_fold:
    model = CustomModel(CFG, config_path='/kaggle/input/exp3-wording-model/config.pth', pretrained=False)
    state = torch.load(f'/kaggle/input/exp3-wording-model/microsoft_deberta-v3-large_fold{fold}_best.pth',
                      map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    fold_predictions.append(prediction)
    # Release this fold's model before the next one is loaded.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over folds.
wording_w2 = np.mean(fold_predictions, axis=0)

## w4

In [None]:
# Competition data directory; the file names below are appended to it.
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
# Attach each summary's prompt fields through the shared prompt_id column.
test = prompts_test.merge(summaries_test, on="prompt_id")
test.head()

In [None]:
def remove_short_quotes(text):
    """Strip the [QUOTE]/[ENDQUOTE] markers from spans whose content is 30 characters or fewer."""
    marker_pattern = r'\[QUOTE\](.*?)\[ENDQUOTE\]'

    def keep_or_unwrap(match):
        inner = match.group(1)
        if len(inner) > 30:
            return match.group()
        return inner

    return re.sub(marker_pattern, keep_or_unwrap, text)


def replace_quotes_with_passage(text):
    """Replace each [QUOTE]...[ENDQUOTE] span with one ' [PASSAGE] ' token per quoted word.

    Words are counted by splitting the quoted content on whitespace.

    The original branched on nearby double quotes, but both branches returned
    the same [PASSAGE] string (no [PLAGIARISM] token was ever produced). The
    dead branch and its unused locals are removed; output is unchanged.
    """
    pattern = r'\[QUOTE\](.*?)\[ENDQUOTE\]'

    def passage_tokens(match):
        return ' [PASSAGE] ' * len(match.group(1).split())

    return re.sub(pattern, passage_tokens, text)


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def plagiarism_checker(input_text, source_text, n=3, similarity_threshold=0.9):
    """Wrap word n-grams of ``input_text`` that closely match ``source_text`` in [QUOTE] markers.

    Both texts are split on whitespace into word n-grams. A TF-IDF vectorizer
    is fitted on all n-grams, and an input n-gram counts as copied when its
    best cosine similarity against any source n-gram reaches
    ``similarity_threshold``. Copied n-grams are wrapped as
    ``[QUOTE]...[ENDQUOTE]``, adjacent markers are merged, and short spans are
    unwrapped again by ``remove_short_quotes``.

    Args:
        input_text: text to annotate.
        source_text: reference text.
        n: number of words per n-gram.
        similarity_threshold: minimum best cosine similarity for a match.

    Returns:
        The annotated text, or ``input_text`` unchanged if any step raises.
    """
    def generate_ngrams(text, n):
        words = text.split()
        return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

    try:
        source_ngrams = generate_ngrams(source_text, n)
        input_ngrams = generate_ngrams(input_text, n)

        # Calculate TF-IDF vectors over source n-grams followed by input n-grams.
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(source_ngrams + input_ngrams)
        source_tfidf = tfidf_matrix[:len(source_ngrams)]
        input_tfidf = tfidf_matrix[len(source_ngrams):]

        # Row i holds the similarity of input n-gram i to every source n-gram.
        similarity_matrix = cosine_similarity(input_tfidf, source_tfidf)
        best_similarity = similarity_matrix.max(axis=1)

        # Mark copied n-grams. Only occurrences with a space on both sides are
        # replaced, so an n-gram at the very start or end of the text is never marked.
        marked_input_text = input_text
        for input_ngram, score in zip(input_ngrams, best_similarity):
            if score >= similarity_threshold:
                marked_input_text = marked_input_text.replace(f" {input_ngram} ", f" [QUOTE]{input_ngram}[ENDQUOTE] ")
        marked_input_text = marked_input_text.replace("[ENDQUOTE] [QUOTE]", " ")

        return remove_short_quotes(marked_input_text)
    except Exception:
        # Deliberate best effort: texts that cannot be vectorised (e.g. fewer
        # than n words) are passed through unannotated.
        return input_text

In [None]:
# Summary text with spans matching the prompt text replaced by [PASSAGE] tokens.
test["fixed_text"] = [replace_quotes_with_passage(plagiarism_checker(text, prompt_text, n=3)) for text, prompt_text in zip(test["text"].values,test["prompt_text"].values)]

In [None]:
# The w4 model input is the annotated summary alone (no prompt fields are prepended here).
test["full_text"] = test["fixed_text"]
test.head()

In [None]:
def seed_everything(seed=42) -> None:
    """Make runs repeatable: seed ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    torch_seeders = (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all)
    random.seed(seed)
    np.random.seed(seed)
    for apply_seed in torch_seeders:
        apply_seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
def get_additional_special_tokens() -> dict:
    """Return the raw-substring -> special-token replacement table."""
    # NOTE(review): "\n" precedes "\r\n", so a consumer that applies the entries
    # in insertion order turns "\r\n" into "\r[BR]". Left as is because the
    # checkpoints were presumably trained with this preprocessing — confirm.
    # The "[PASSAGE]" entry maps the token to itself, so replacing it is a no-op.
    return {
        "\n": "[BR]",
        "\r\n": "[BR]",
        "[PASSAGE]": "[PASSAGE]",
    }


def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: substitute the UTF-8 bytes of the failing slice and continue after it."""
    failing_slice = error.object[error.start : error.end]
    replacement = failing_slice.encode("utf-8")
    return replacement, error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: substitute the cp1252 decoding of the failing bytes and continue after them."""
    failing_slice = error.object[error.start : error.end]
    replacement = failing_slice.decode("cp1252")
    return replacement, error.end


# Register both callbacks under the names that resolve_encodings_and_normalize
# passes as ``errors=`` to str.encode / bytes.decode.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Round-trip ``text`` through UTF-8 / cp1252 with the custom error handlers, then ``unidecode`` it."""
    handler_decode = "replace_decoding_with_cp1252"
    handler_encode = "replace_encoding_with_utf8"
    as_bytes = text.encode("raw_unicode_escape")
    as_text = as_bytes.decode("utf-8", errors=handler_decode)
    as_bytes = as_text.encode("cp1252", errors=handler_encode)
    as_text = as_bytes.decode("utf-8", errors=handler_decode)
    return unidecode(as_text)


def replace_special_tokens(text: str) -> str:
    """Substitute each key of the special-token table with its value, in table order."""
    replacements = get_additional_special_tokens()
    for raw_substring in replacements:
        text = text.replace(raw_substring, replacements[raw_substring])
    return text


def preprocess_text(text):
    """Normalise encodings first, then substitute the special tokens."""
    return replace_special_tokens(resolve_encodings_and_normalize(text))

In [None]:
def get_last_hidden_state(backbone_outputs):
    """Return the element stored at index 0 of the backbone outputs."""
    return backbone_outputs[0]


def get_all_hidden_states(backbone_outputs):
    """Stack the tensors stored at index 1 of the backbone outputs into one tensor."""
    return torch.stack(backbone_outputs[1])


def get_input_ids(inputs):
    """Look up the ``"input_ids"`` entry of the model inputs."""
    input_ids = inputs["input_ids"]
    return input_ids


def get_attention_mask(inputs):
    """Look up the ``"attention_mask"`` entry of the model inputs."""
    attention_mask = inputs["attention_mask"]
    return attention_mask


class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of the last hidden state over dimension 1.

    ``output_dim`` mirrors ``backbone_config.hidden_size`` so downstream layers
    can size themselves from the pooling module.
    """

    def __init__(self, backbone_config):
        super().__init__()
        self.output_dim = backbone_config.hidden_size

    def forward(self, inputs, backbone_outputs):
        """Average the hidden states of the positions whose mask value is non-zero.

        Args:
            inputs: mapping that provides ``"attention_mask"``.
            backbone_outputs: backbone result whose element 0 is the last hidden state.

        Returns:
            The masked sum over dimension 1 divided by the (clamped) mask sum.
        """
        attention_mask = get_attention_mask(inputs)
        last_hidden_state = get_last_hidden_state(backbone_outputs)

        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        )
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Clamp so a row whose mask is all zeros does not divide by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        # The original ended with a stray "\" line continuation; it is removed here.
        return sum_embeddings / sum_mask

class GeMText(nn.Module):
    """Generalized-mean pooling of the last hidden state with a learnable exponent ``p``.

    The reduction dimension, the clamping epsilon and the initial value of
    ``p`` are read from ``cfg["pooling"]``.
    """

    def __init__(self, backbone_config, cfg):
        super().__init__()

        pooling_cfg = cfg["pooling"]
        self.dim = pooling_cfg["dim"]
        self.eps = pooling_cfg["eps"]
        self.feat_mult = 1

        # Registered as a Parameter so the exponent is part of the state dict.
        self.p = Parameter(torch.ones(1) * pooling_cfg["p"])

        self.output_dim = backbone_config.hidden_size

    def forward(self, inputs, backbone_output):
        """Return ``(sum(masked_hidden ** p) / mask_count) ** (1 / p)`` along ``self.dim``."""
        mask = get_attention_mask(inputs)
        hidden = get_last_hidden_state(backbone_output)

        mask_expanded = mask.unsqueeze(-1).expand(hidden.shape)
        # Hidden values are floored at eps before masking, so masked positions contribute 0.
        masked_hidden = hidden.clamp(min=self.eps) * mask_expanded
        powered_sum = masked_hidden.pow(self.p).sum(self.dim)
        mask_count = mask_expanded.sum(self.dim).clip(min=self.eps)
        pooled = (powered_sum / mask_count).pow(1 / self.p)
        return pooled

def get_pooling_layer(cfg, backbone_config):
    """Build the pooling module selected by ``cfg["pooling"]["type"]``.

    "mean" gives MeanPooling, "gem" gives GeMText; any other value raises NotImplementedError.
    The class names are resolved when this function is called, so it picks up whichever
    MeanPooling / GeMText definitions are current in the kernel at that moment.
    """
    pooling_type = cfg["pooling"]["type"]
    if pooling_type == "mean":
        return MeanPooling(backbone_config)
    if pooling_type == "gem":
        return GeMText(backbone_config, cfg)
    raise NotImplementedError

In [None]:
class CustomModel(nn.Module):
    """Transformer backbone, followed by a pooling layer and a linear regression head.

    Args:
        cfg: configuration mapping; this class reads ``cfg["model"]["backbone_path"]``,
            ``cfg["tokenizer"]`` and ``cfg["general"]["target_columns"]``.
        backbone_cfg: optional backbone configuration object. When given, the backbone is
            built from it without loading weights; otherwise both the configuration and the
            pretrained weights are loaded from ``cfg["model"]["backbone_path"]``.
    """

    def __init__(self, cfg, backbone_cfg=None):
        super().__init__()
        self.cfg = cfg
        self.backbone_cfg = backbone_cfg
        if backbone_cfg is not None:
            self.backbone = AutoModel.from_config(config=self.backbone_cfg)
        else:
            self.backbone_cfg = AutoConfig.from_pretrained(cfg["model"]["backbone_path"])
            self.backbone = AutoModel.from_pretrained(cfg["model"]["backbone_path"])

        # Match the embedding matrix to the tokenizer's vocabulary size.
        self.backbone.resize_token_embeddings(len(cfg["tokenizer"]))
        self.pool = get_pooling_layer(self.cfg, self.backbone_cfg)
        self.fc = nn.Linear(self.pool.output_dim, len(cfg["general"]["target_columns"]))

    def forward(self, input):
        """Run the backbone on the tokenized batch, pool it, and apply the linear head."""
        outputs = self.backbone(**input)
        feature = self.pool(input, outputs)
        output = self.fc(feature)
        return output

    def _init_weights(self, module):
        """Re-initialise a single Linear, Embedding or LayerNorm module in place.

        Other module types are left untouched. The standard deviation comes from
        ``self.backbone_cfg.initializer_range``; the previous code read a non-existent
        ``self.backbone_config`` attribute and raised AttributeError.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.backbone_cfg.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.backbone_cfg.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


def freeze(module):
    """Disable gradient tracking for every parameter yielded by ``module.parameters()``."""
    for weight in module.parameters():
        weight.requires_grad = False


def get_model(cfg, check_point_path=None, backbone_cfg_path=None, train=True):
    """Build a CustomModel and optionally load a checkpoint into it.

    Args:
        cfg: configuration mapping; the "model" section is read here.
        check_point_path: optional path to a checkpoint whose "model" entry is a state dict.
        backbone_cfg_path: optional path to a serialized backbone configuration; when None
            the configuration comes from ``get_backbone_config(cfg)``.
        train: when True, the freeze / re-initialise options from the config are applied.

    Returns:
        The constructed model.
    """
    if backbone_cfg_path is None:
        backbone_cfg = get_backbone_config(cfg)
    else:
        backbone_cfg = torch.load(backbone_cfg_path)
    model = CustomModel(cfg, backbone_cfg=backbone_cfg)

    if check_point_path is not None:
        state = torch.load(check_point_path, map_location="cpu")
        # Checkpoints that still carry this key use the old 'model.' prefix and are renamed first.
        if 'model.embeddings.position_ids' in state['model']:
            state = update_old_state(state)
        model.load_state_dict(state["model"])

    model_cfg = cfg["model"]
    if model_cfg["gradient_checkpointing"]:
        if model.backbone.supports_gradient_checkpointing:
            model.backbone.gradient_checkpointing_enable()
        else:
            print("Gradient checkpointing is not supported by the model")

    if not train:
        return model

    if model_cfg["freeze_embeddings"]:
        freeze(model.backbone.embeddings)
    n_frozen = model_cfg["freeze_n_layers"]
    if n_frozen > 0:
        freeze(model.backbone.encoder.layer[:n_frozen])
    n_reinit = cfg['model']['reinitialize_n_layers']
    if n_reinit > 0:
        # NOTE(review): _init_weights only acts on Linear/Embedding/LayerNorm instances and is
        # called here on each encoder layer object itself, so this looks like a no-op - confirm.
        for layer in model.backbone.encoder.layer[-n_reinit:]:
            model._init_weights(layer)

    return model

def get_backbone_config(config):
    """Return the backbone configuration described by ``config["model"]``.

    A non-empty "backbone_config_path" is loaded with ``torch.load``; an empty string means the
    configuration is fetched from "backbone_path" with ``output_hidden_states=True``.
    """
    model_section = config["model"]
    if model_section["backbone_config_path"] != '':
        return torch.load(config["model"]["backbone_config_path"])
    return AutoConfig.from_pretrained(config["model"]["backbone_path"], output_hidden_states=True)

def update_old_state(state):
    """Rename the leading 'model.' prefix of checkpoint keys to 'backbone.'.

    Only the prefix is rewritten; later occurrences of the substring "model" inside a key are
    preserved (the previous ``str.replace`` call rewrote every occurrence).

    Args:
        state: checkpoint dict with a 'model' state dict and a 'predictions' entry.

    Returns:
        A new dict with the renamed 'model' state dict and the original 'predictions' value.
    """
    old_prefix = 'model.'
    new_prefix = 'backbone.'
    new_state = {}
    for key, value in state['model'].items():
        if key.startswith(old_prefix):
            key = new_prefix + key[len(old_prefix):]
        new_state[key] = value

    return {'model': new_state, 'predictions': state['predictions']}

In [None]:
class CommonLitDataset(Dataset):
    """Dataset that tokenizes the "full_text" column of a DataFrame on access.

    Args:
        df: DataFrame with a "full_text" column.
        cfg: configuration mapping; ``cfg["tokenizer"]`` must provide ``encode_plus``.

    ``labels`` is initialised to None; items are then the tokenized input alone. When ``labels``
    is set to an indexable collection, items are ``(input, label)`` pairs.
    """

    def __init__(self, df, cfg):
        self.df = df
        self.cfg = cfg
        self.text = df["full_text"].values
        self.labels = None

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = self.text[item]
        # Every sample is padded/truncated to 768 tokens.
        input = self.cfg["tokenizer"].encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=768,
            padding="max_length",
            truncation=True,
        )
        for k, v in input.items():
            input[k] = torch.tensor(v, dtype=torch.long)
        # Explicit None check: truth-testing a multi-element array raises ValueError.
        if self.labels is not None:
            label = torch.tensor(self.labels[item], dtype=torch.float)
            return input, label
        return input

def get_test_dataloader(cfg, df):
    """Wrap `df` in a CommonLitDataset and return an ordered DataLoader over it.

    The loader uses batch size 4 and two workers, keeps the incomplete final batch and does not
    shuffle, so outputs stay aligned with the rows of `df`.
    """
    loader_options = dict(
        batch_size=4,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    return DataLoader(CommonLitDataset(df, cfg), **loader_options)

In [None]:
def collate(inputs):
    """Trim every tensor in the batch (in place) to the longest attention-mask sum in the batch."""
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for name in list(inputs.keys()):
        inputs[name] = inputs[name][:, :longest]
    return inputs

In [None]:
def get_config(path: str) -> dict:
    """Parse the YAML file at `path` with ``yaml.safe_load`` and return the result."""
    with open(path, "r") as handle:
        return yaml.safe_load(handle)

In [None]:
def inference_fn(test_loader, model, device):
    """Run `model` over `test_loader` in eval mode and return all predictions as one array.

    Each batch is trimmed with ``collate`` and moved to `device`; outputs are computed without
    gradient tracking, copied to the CPU and concatenated in loader order.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        batch = collate(batch)
        for name, tensor in batch.items():
            batch[name] = tensor.to(device)
        with torch.no_grad():
            batch_pred = model(batch)
        batch_outputs.append(batch_pred.to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Configuration stored alongside the exp4 wording checkpoints.
config = get_config("/kaggle/input/exp4-wording-model/config.yaml")
seed_everything()

# The tokenizer is stored on the config dict: CustomModel and CommonLitDataset read cfg["tokenizer"].
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/exp4-wording-model")
config["tokenizer"] = tokenizer

In [None]:
# NOTE(review): `test` and `preprocess_text` come from earlier cells; this overwrites
# test["full_text"] in place, so re-running the cell preprocesses already-preprocessed text.
test["full_text"] = test["full_text"].apply(preprocess_text)

target_columns = ["wording"]

# Collects one prediction array per fold checkpoint.
predictions1 = []

test_dataloader = get_test_dataloader(config, test)

In [None]:
# Predict with each of the four fold checkpoints; the per-fold arrays are averaged in a later cell.
for i in range(4):
    model = get_model(config,
                      check_point_path=f"/kaggle/input/exp4-wording-model/fold{i}_best.pt", 
                      backbone_cfg_path="/kaggle/input/exp4-wording-model/config.pt",
                      train=False)
    prediction = inference_fn(test_dataloader, model, device)
    # NOTE(review): `model` is still referenced at this point, so its memory is presumably only
    # released once the name is rebound on the next iteration - confirm.
    gc.collect()
    torch.cuda.empty_cache()
    
    predictions1.append(prediction)

In [None]:
# Average the per-fold prediction arrays element-wise (axis 0 indexes the folds).
preds = np.mean(predictions1, axis=0)
preds

## w6

In [None]:
class CFGw6:
    """Settings for the w6 model section, kept as plain class attributes."""

    competition = "CommonLit"

    # Tokenizer / backbone identifiers. `tokenizer` starts as a path; a later cell
    # overwrites it with the loaded tokenizer object.
    tokenizer = "/kaggle/input/debertav3base"
    model = "microsoft/deberta-v3-base"
    ckpt_name = "microsoft/deberta-v3-base"

    # Data loading.
    num_workers = 2
    batch_size = 12
    infer_batch_size = 8
    max_len = 640

    # Optimisation-related values.
    betas = (0.9, 0.999)
    max_grad_norm = 1000
    gradient_checkpointing = True
    mlm_ratio = False
    layer_reinitialize_n = 0
    freeze_n_layers = 0
    multi_sample_dropouts = None

    # Targets and cross-validation layout.
    target_cols = ["wording"]
    seed = 42
    n_fold = 4
    trn_fold = list(range(n_fold))

In [None]:
# Load the competition test prompts and summaries and join them on prompt_id.
prompts_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
summary_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

test = prompts_test.merge(summary_test, on="prompt_id")
# NOTE(review): replace_quotes_with_passage and plagiarism_checker are defined in the cell that
# follows this one. Unless an earlier cell also defines them, this line raises NameError on a
# fresh Restart & Run All; the same assignment is repeated after those definitions.
test["full_text"] = [replace_quotes_with_passage(plagiarism_checker(text, prompt_text, n =3)) for text, prompt_text in zip(test["text"].values,test["prompt_text"].values)]
test

In [None]:
import re
import numpy as np

def remove_short_quotes(text, max_len=20):
    """Strip the [QUOTE]/[ENDQUOTE] markers from short quoted spans.

    Args:
        text: string that may contain ``[QUOTE]...[ENDQUOTE]`` spans.
        max_len: spans whose inner content has at most this many characters are unwrapped
            (markers removed, content kept). Defaults to 20, the value previously hard-coded.

    Returns:
        The text with short spans unwrapped; longer spans are left untouched.
    """
    # No DOTALL flag: `.` does not cross a newline, so a span containing one is not matched.
    pattern = r'\[QUOTE\](.*?)\[ENDQUOTE\]'

    def replace_quotes(match):
        content = match.group(1)
        return content if len(content) <= max_len else match.group()

    return re.sub(pattern, replace_quotes, text)

def remove_chars_by_indices(input_string, indices_to_remove):
    """Return `input_string` without the characters at the given positions.

    Args:
        input_string: source string.
        indices_to_remove: iterable of integer positions to drop; duplicates and positions
            outside the string are ignored.

    The indices are put in a set and the result is built with one join, which avoids the
    quadratic cost of list membership tests and repeated string concatenation.
    """
    doomed = set(indices_to_remove)
    return "".join(ch for pos, ch in enumerate(input_string) if pos not in doomed)

def combine_nearby_quotes(text, lookahead=20):
    """Merge [QUOTE]...[ENDQUOTE] spans that sit close together into one span.

    A closing marker is dropped, together with the opening marker that follows it, when that
    opening marker starts fewer than `lookahead` characters after the start of the closing
    marker. An opening marker found while a span is already open is dropped as well.

    Args:
        text: string containing the markers.
        lookahead: size of the search window, counted from the first character of
            "[ENDQUOTE]". Defaults to 20, the value previously hard-coded.

    Returns:
        The text with the redundant markers removed.
    """
    open_tag, close_tag = "[QUOTE]", "[ENDQUOTE]"
    inside_quote = False
    doomed = set()  # character positions of markers to delete

    for pos in range(len(text)):
        if text.startswith(open_tag, pos):
            if inside_quote:
                # Nested or re-opened quote: this opening marker is redundant.
                doomed.update(range(pos, pos + len(open_tag)))
            inside_quote = True
        elif inside_quote and text.startswith(close_tag, pos):
            inside_quote = False
            for offset in range(lookahead):
                if text.startswith(open_tag, pos + offset):
                    # Another quote opens nearby: stay open and drop this closing marker.
                    # The opening marker is dropped when the outer loop reaches it.
                    inside_quote = True
                    doomed.update(range(pos, pos + len(close_tag)))
                    break

    return "".join(ch for idx, ch in enumerate(text) if idx not in doomed)



def replace_quotes_with_passage(text):
    """Rewrite each [QUOTE]...[ENDQUOTE] span as reference, passage or plagiarism tokens.

    * content shorter than 40 characters is kept, wrapped in [REFERENCE] ... [ENDREFERENCE];
    * longer content is replaced by ``len(content) // 20`` repetitions of ' [PASSAGE] ' when
      it contains a quotation mark, or when a double quote occurs within 20 characters around
      the whole span;
    * otherwise it is replaced by the same number of ' [PLAGIARISM] ' tokens.
    """
    quote_marks = ('"', '“', '”', "'", '‘', '’', '❝', '❞')

    def _substitute(match):
        inner = match.group(1)
        if len(inner) < 40:
            return f' [REFERENCE] {inner} [ENDREFERENCE]'

        repeats = len(inner) // 20
        # Window covering the span (markers included) plus 20 characters on each side.
        window_start = max(0, match.start(0) - 20)
        window_end = min(len(text), match.end(0) + 20)
        marked_inside = any(mark in inner for mark in quote_marks)
        if marked_inside or '"' in text[window_start:window_end]:
            return ' [PASSAGE] ' * repeats
        return ' [PLAGIARISM] ' * repeats

    return re.sub(r'(?s)\[QUOTE\](.*?)\[ENDQUOTE\]', _substitute, text)

def remove_substrings(strings_list):
    """Keep only the strings that are not contained in any other entry of the list.

    Entries are compared by position, so two identical strings contain each other and both
    are dropped.
    """
    survivors = []
    for idx, candidate in enumerate(strings_list):
        contained = False
        for other_idx, other in enumerate(strings_list):
            if other_idx != idx and candidate in other:
                contained = True
                break
        if not contained:
            survivors.append(candidate)
    return survivors

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def plagiarism_checker(input_text, source_text, n=3, similarity_threshold=0.85):
    """Mark spans of `input_text` that closely match `source_text` with [QUOTE]...[ENDQUOTE].

    Both texts are split into whitespace-delimited word n-grams. Every input n-gram whose best
    TF-IDF cosine similarity against the source n-grams reaches `similarity_threshold` is
    flagged; flagged n-grams are merged, wrapped in markers, and the markers are then tidied by
    combine_nearby_quotes and remove_short_quotes.

    Args:
        input_text: text to annotate.
        source_text: reference text to compare against.
        n: number of words per n-gram.
        similarity_threshold: minimum cosine similarity for an n-gram to be flagged.

    Returns:
        The annotated text. If any exception is raised along the way, `input_text` is returned
        unchanged (best-effort behaviour).
    """
    try:
        # Generate n-grams: sliding window of n whitespace-separated words.
        def generate_ngrams(text, n):
            words = text.split()
            ngrams = []
            for i in range(len(words) - n + 1):
                ngram = ' '.join(words[i:i + n])
                ngrams.append(ngram)
            return ngrams

        source_ngrams = generate_ngrams(source_text, n)
        input_ngrams = generate_ngrams(input_text, n)

        # The n-grams are already single-space joined, so this re-join leaves them unchanged.
        source_ngram_strings = [' '.join(ngram.split()) for ngram in source_ngrams]
        input_ngram_strings = [' '.join(ngram.split()) for ngram in input_ngrams]

        # Calculate TF-IDF vectors; source rows come first, input rows after them.
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(source_ngram_strings + input_ngram_strings)
        source_tfidf = tfidf_matrix[:len(source_ngram_strings)]
        input_tfidf = tfidf_matrix[len(source_ngram_strings):]

        # Calculate similarity: rows are input n-grams, columns are source n-grams.
        similarity_matrix = cosine_similarity(input_tfidf, source_tfidf)

        # Find segments in the input text whose best match reaches the threshold.
        plagiarized_indices = []
        for i, input_ngram in enumerate(input_ngrams):
            most_similar_index = np.argmax(similarity_matrix[i])
            cosine_sim = similarity_matrix[i, most_similar_index]

            if cosine_sim >= similarity_threshold:
                plagiarized_indices.append(i)

        marked_input_text = input_text
        if len(plagiarized_indices) > 0:
            # Merge overlapping flagged n-grams, then drop any that are contained in another.
            input_ngrams = remove_substrings(merge_plagiarized_ngrams(input_ngrams,plagiarized_indices,marked_input_text))
            # Mark plagiarized segments with [QUOTE] and [ENDQUOTE]
            for input_ngram in input_ngrams:
                # NOTE(review): str.replace matches raw substrings (every occurrence), not
                # whole words, and each replacement sees the markers inserted by earlier ones.

                if len(input_ngram) > 5:
                    marked_input_text = marked_input_text.replace(f"{input_ngram}", f"[QUOTE]{input_ngram}[ENDQUOTE]")
                    #print(marked_input_text)
        # Fuse spans that ended up directly adjacent or separated by a single space.
        marked_input_text = marked_input_text.replace("[ENDQUOTE][QUOTE]", " ")
        marked_input_text = marked_input_text.replace("[ENDQUOTE] [QUOTE]", " ")
        #print(marked_input_text)
        #print(combine_nearby_quotes(marked_input_text))
        #print(remove_short_quotes(combine_nearby_quotes(marked_input_text)))
        tokenized_text = remove_short_quotes(combine_nearby_quotes(marked_input_text))

        return  tokenized_text
    except Exception:
        # Best effort: any failure (for example texts too short to form an n-gram) falls back
        # to the unannotated input.
        return input_text

def merge_plagiarized_ngrams(ngrams, plagiarized_indices, text):
    """Chain overlapping flagged n-grams into longer phrases that occur in `text`.

    Args:
        ngrams: all n-grams of the input text, in order.
        plagiarized_indices: positions in `ngrams` that were flagged; must be non-empty.
        text: the text the merged phrases must appear in.

    Returns:
        List of merged phrases, in order of appearance.
    """
    flagged = [ngrams[idx] for idx in plagiarized_indices]
    merged = []
    running = flagged[0]
    for candidate in flagged[1:]:
        words = candidate.split()
        tail_word = running.split()[-1]
        second_word = words[1]
        first_word = words[0]
        # Candidate extensions: the n-gram overlaps the running phrase by two words or by one.
        two_word_overlap = running + " " + " ".join(words[2:])
        one_word_overlap = running + " " + " ".join(words[1:])
        if tail_word == second_word and two_word_overlap in text:
            running = two_word_overlap
        elif tail_word == first_word and one_word_overlap in text:
            running = one_word_overlap
        else:
            # No usable overlap: close the running phrase and start a new one.
            merged.append(running)
            running = candidate

    merged.append(running)
    return merged

In [None]:
# Annotate each summary against its prompt text, then collapse the quote markers into
# [REFERENCE]/[PASSAGE]/[PLAGIARISM] tokens.
test["full_text"] = [replace_quotes_with_passage(plagiarism_checker(text, prompt_text, n =3)) for text, prompt_text in zip(test["text"].values,test["prompt_text"].values)]
test

In [None]:
def prepare_input(text: str, cfg) -> dict[str, torch.Tensor]:
    """Tokenize `text` to exactly ``cfg.max_len`` tokens and return long tensors.

    Args:
        text: raw input string.
        cfg: object exposing ``tokenizer`` (with ``encode_plus``) and ``max_len``.

    Returns:
        The tokenizer output with every value converted to a ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg.max_len,
        # padding="max_length" replaces the deprecated pad_to_max_length=True flag.
        padding="max_length",
        truncation=True,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Inference dataset: tokenizes one "full_text" entry per item via prepare_input."""

    def __init__(self, df: pd.DataFrame, cfg):
        self.texts = df["full_text"].values
        self.cfg = cfg

    def __len__(self) -> int:
        n_texts = len(self.texts)
        return n_texts

    def __getitem__(self, item):
        return prepare_input(self.texts[item], self.cfg)


def collate(inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """Dynamic padding: cut every tensor (in place) to the largest attention-mask sum in the batch."""
    keep = int(inputs["attention_mask"].sum(axis=1).max())
    for field, tensor in inputs.items():
        inputs[field] = tensor[:, :keep]
    return inputs

# ====================================================
# Model
# ====================================================
def freeze(module):
    """Turn off ``requires_grad`` on each parameter of `module`."""
    for tensor in module.parameters():
        tensor.requires_grad = False


def get_freezed_parameters(module):
    """List the names of the parameters of `module` that have ``requires_grad`` disabled."""
    return [
        param_name
        for param_name, param in module.named_parameters()
        if not param.requires_grad
    ]


class MeanPooling(nn.Module):
    """Mean of the hidden states over axis 1, weighted by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(
        self, last_hidden_state: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        # Expand the mask to the hidden-state shape so masked positions add nothing to the sum.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        total = (last_hidden_state * weights).sum(1)
        # Clamp the count so an all-zero mask does not divide by zero.
        count = weights.sum(1).clamp(min=1e-9)
        return total / count


class CustomModel(nn.Module):
    """Transformer backbone with a linear head applied to the hidden state at position 0.

    Args:
        cfg: settings object; this class reads model, tokenizer, target_cols,
            gradient_checkpointing, multi_sample_dropouts, layer_reinitialize_n and
            freeze_n_layers from it.
        config_path: optional path to a serialized backbone configuration. When None, the
            configuration is fetched for ``cfg.model`` and its dropout values are set to 0.
        pretrained: load pretrained backbone weights when True, otherwise build the backbone
            from the configuration alone.
    """

    def __init__(
        self, cfg, config_path: Path | None = None, pretrained: bool = False
    ):
        super().__init__()
        self.cfg = cfg
        self.n_target = len(cfg.target_cols)

        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.model, output_hidden_states=True
            )
            self.config.hidden_dropout = 0.0
            self.config.hidden_dropout_prob = 0.0
            self.config.attention_dropout = 0.0
            self.config.attention_probs_dropout_prob = 0.0
            # NOTE(review): LOGGER is not defined in the visible part of this notebook; this
            # branch needs it to exist - confirm.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        # cfg.tokenizer must already be a tokenizer object here (len() is taken of it).
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # The dropout modules are created here but not applied in forward().
        if cfg.multi_sample_dropouts is not None:
            self.dropouts = nn.ModuleList(
                [nn.Dropout(p) for p in cfg.multi_sample_dropouts]
            )
        else:
            self.dropouts = None

        self.fc = nn.Linear(self.config.hidden_size, self.n_target) 


        self._re_init_layers(self.cfg.layer_reinitialize_n)
        # Freezing any encoder layer also freezes the embeddings.
        if 1 <= self.cfg.freeze_n_layers:
            freeze(self.model.embeddings)
            freeze(
                self.model.encoder.layer[: self.cfg.freeze_n_layers]
            ) 

    def _re_init_layers(self, n_layers: int):
        """Re-initialise the supported sub-modules of the last `n_layers` encoder layers."""
        if n_layers >= 1:
            for layer in self.model.encoder.layer[-n_layers:]:
                # deberta-v3
                if hasattr(layer, "modules"):
                    # Walk every module in the layer and try to re-initialise each direct child.
                    for module in layer.modules():
                        for name, child in module.named_children():
                            init_type_name = self._init_weights(child)
                            if init_type_name is not None:
                                print(
                                    f"{name} is re-initialized, type: {init_type_name}, {module.__class__}"
                                )

    def _init_weights(self, module: nn.Module):
        """Re-initialise a Linear, Embedding or LayerNorm module in place.

        Returns the name of the handled type, or None when the module type is not handled.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return "nn.Linear"
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return "nn.Embedding"
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return "nn.LayerNorm"
        return None


    def forward(self, inputs: dict[str, torch.Tensor]) -> torch.Tensor:
        """Run the backbone and apply the linear head to the hidden state at sequence position 0."""
        outputs = self.model(**inputs)
        cls_hidden_state = outputs.last_hidden_state[:, 0, :] 
        output = self.fc(cls_hidden_state)
        return output

In [None]:
@torch.inference_mode()
def inference_fn(test_loader, model, device):
    """Predict every batch of `test_loader` with `model` on `device`; return one concatenated array.

    The model is switched to eval mode and moved to `device`; each batch tensor is moved to the
    same device before the forward pass, and outputs are copied to the CPU.
    """
    model.eval()
    model.to(device)
    collected = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        for field, tensor in batch.items():
            batch[field] = tensor.to(device)
        collected.append(model(batch).to('cpu').numpy())
    return np.concatenate(collected)

In [None]:
# Load the tokenizer from the path stored on CFGw6, register the marker tokens emitted by
# replace_quotes_with_passage, then store the tokenizer object back on the config class.
# NOTE(review): re-running this cell passes the tokenizer object, not the path, to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(CFGw6.tokenizer)
tokenizer.add_tokens(['[PASSAGE]','[PLAGIARISM]','[REFERENCE]','[ENDREFERENCE]'])
CFGw6.tokenizer = tokenizer

test_dataset = TestDataset(test, CFGw6)
# NOTE(review): this uses CFGw6.batch_size (12) although CFGw6.infer_batch_size (8) exists - confirm intent.
test_loader = DataLoader(test_dataset,
  batch_size=CFGw6.batch_size,
  shuffle=False,
  collate_fn=DataCollatorWithPadding(tokenizer=CFGw6.tokenizer, padding='longest'),
  num_workers=CFGw6.num_workers, pin_memory=True, drop_last=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One prediction array per fold checkpoint, averaged element-wise after the loop.
predictions6 = []
for fold in CFGw6.trn_fold:
    model = CustomModel(CFGw6, config_path='/kaggle/input/commonlit-wording-exp5/config.pth', pretrained=False)
    state = torch.load(f'/kaggle/input/commonlit-wording-exp5/microsoft_deberta-v3-base_fold{fold}_best.pth',
                      map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions6.append(prediction)
    # Drop the references before collecting so the fold's memory can be released.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()
predictions6 = np.mean(predictions6, axis=0)

In [None]:
# Display the fold-averaged w6 predictions.
predictions6

## Shigeria1

In [None]:
from __future__ import annotations

# Settings for the Shigeria1 model section; wrapped into attribute access by Config below.
# Trailing comments on individual entries are kept from the original and are not explained here.
param = {
    'awp_eps': 1e-2,
    'awp_lr': 1e-4,
    'batch_size': 8, # 2
    'betas': (0.9, 0.999),
    'ckpt_name': 'deberta_v3_large',
    'debug': False, # False
    'decoder_lr': 5e-5,
    'encoder_lr': 2e-5,
    'layerwise_lr_decay': 1.0,
    'eps': 1e-6,
    'max_len': 1024,
    'min_lr': 1e-7,
    'model_name': 'microsoft/deberta-v3-large',
    'n_cycles': 0.5,
    'n_epochs': 4, # 12
    'n_eval_steps': 100000,
    'n_folds': 4, # 4
    'n_gradient_accumulation_steps': 1,
    'n_warmup_steps': 0,
    'n_workers': 2,
    'nth_awp_start_epoch': 5, # 4
    'output_dir': './output/',
    'path': '/kaggle/input/cess-deberta-v3-large-exp26/',
    'print_freq': 100,
    'scheduler_name': 'cosine',
    'max_grad_norm': 100000000.0,
    'seed': 42,
    'weight_decay': 0.01,
    'config_path': '/kaggle/input/cess-deberta-v3-large-exp26/config.pth',
    'reinit_layers': 0,
}

In [None]:
class Config:
    """Expose the entries of a dict as instance attributes."""

    def __init__(self, d: dict) -> None:
        for name in d:
            setattr(self, name, d[name])

# Attribute-style view of `param` (cfg.max_len, cfg.path, ...) used by the cells below.
cfg = Config(d=param)

In [None]:
import os

# Create the output directory if needed; exist_ok=True makes this a no-op when the directory
# already exists and avoids the separate check-then-create step.
os.makedirs(cfg.output_dir, exist_ok=True)

In [None]:
import os
import random
import numpy as np
import torch

def seed_everything(seed: int = 42) -> None:
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed: seed value. Defaults to 42, matching the earlier definition of this function in
            the notebook, so existing ``seed_everything()`` calls keep working after this
            redefinition.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


# Re-seed all RNGs with the seed configured for this model section.
seed_everything(seed=cfg.seed)

In [None]:
import pandas as pd
# Widen the pandas display limits for the tables shown below.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Load the test summaries and prompts and join them on prompt_id (prompt columns come first).
test_df = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
prompt_df = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
test_df = prompt_df.merge(test_df, on='prompt_id')
submission = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')
test_df

In [None]:
from transformers import AutoTokenizer

# Tokenizer stored alongside the checkpoints; this rebinds the global `tokenizer` name.
tokenizer = AutoTokenizer.from_pretrained(cfg.path+'tokenizer/')

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array of targets, one column per target.
        y_preds: 2-D array of predictions with the same layout.

    Returns:
        ``(mcrmse_score, scores)``: the mean of the per-column RMSE values and the list of
        per-column RMSE values.

    The RMSE is computed with NumPy directly, so the function no longer depends on the
    ``squared=False`` argument of ``mean_squared_error``.
    """
    scores = []
    for column in range(y_trues.shape[1]):
        residual = np.asarray(y_trues[:, column], dtype=float) - np.asarray(y_preds[:, column], dtype=float)
        scores.append(np.sqrt(np.mean(residual ** 2)))  # RMSE of this column
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Return ``(mcrmse_score, scores)`` exactly as computed by MCRMSE."""
    result = MCRMSE(y_trues, y_preds)
    mcrmse_score, scores = result
    return mcrmse_score, scores

In [None]:
import random
from pandas import DataFrame
from torch import Tensor
from torch.utils.data import Dataset
from transformers.tokenization_utils import PreTrainedTokenizer

class TestDataset(Dataset):
    """Inference dataset that packs prompt and summary into one [SEP]-delimited sequence.

    Args:
        df: DataFrame with the columns prompt_id, prompt_question, prompt_title, prompt_text,
            student_id and text. The four text columns are modified IN PLACE when the newline
            replacement below applies.
        tokenizer: tokenizer providing ``encode``, ``__call__``, ``sep_token`` and
            ``sep_token_id``.
        max_len: length every sample is padded or truncated to.

    Each item is ``(encoded, sep_ids)``: the tokenizer output as long tensors, and a long
    tensor with the positions of the separator tokens, padded with ``max_len`` up to 5 entries.
    """

    # Columns whose newlines are replaced when the tokenizer cannot represent them.
    _TEXT_COLUMNS = ("text", "prompt_question", "prompt_title", "prompt_text")
    # sep_ids is padded up to this many entries.
    _N_SEP_SLOTS = 5

    def __init__(self, df: DataFrame, tokenizer: "PreTrainedTokenizer", max_len: int):
        self.tokenizer = tokenizer
        self.max_len = max_len
        for newline in ("\n\n", "\r\n"):
            # When the newline sequence encodes to only two ids it is replaced by "|" in
            # every text column (presumably those two ids are just the special tokens - confirm).
            if len(self.tokenizer.encode(newline)) == 2:
                for column in self._TEXT_COLUMNS:
                    df[column] = df[column].str.replace(newline, "|", regex=False)
        self.prompt_ids = df['prompt_id'].to_numpy()
        self.prompt_questions = df['prompt_question'].to_numpy()
        self.prompt_titles = df['prompt_title'].to_numpy()
        self.prompt_texts = df['prompt_text'].to_numpy()
        self.ids = df['student_id'].to_numpy()
        self.texts = df['text'].to_numpy()

    def __len__(self) -> int:
        return len(self.ids)

    def __getitem__(self, item: int) -> "tuple[dict, Tensor]":
        # Five segments joined by the separator token, in this fixed order.
        text1 = self.tokenizer.sep_token.join([
            'Content Wording',
            "Instruction : " + self.prompt_questions[item],
            "Title : " + self.prompt_titles[item],
            "Summary : " + self.texts[item],
            "Full Text : " + self.prompt_texts[item],
        ])

        encoded1 = self.tokenizer(
            text1,
            max_length=self.max_len,
            padding='max_length',
            add_special_tokens=True,
            truncation=True
        )

        # Positions of every separator token in the encoded sequence.
        sep_ids = np.nonzero(np.array(encoded1["input_ids"]) == self.tokenizer.sep_token_id)[0].tolist()
        # Pad with max_len so the list has at least _N_SEP_SLOTS entries.
        sep_ids += [self.max_len] * (self._N_SEP_SLOTS - len(sep_ids))

        for k, v in encoded1.items():
            encoded1[k] = torch.tensor(v, dtype=torch.long)

        return encoded1, torch.tensor(sep_ids, dtype=torch.long)

In [None]:
from torch import nn
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import copy


class GeMText(nn.Module):
    """Generalized-mean pooling with a learnable exponent `p`, reduced over `dim`."""

    def __init__(self, dim=1, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x, attention_mask):
        mask_full = attention_mask.unsqueeze(-1).expand(x.shape)
        # Clamp activations from below, zero the masked positions, raise to p and sum.
        powered_sum = (x.clamp(min=self.eps) * mask_full).pow(self.p).sum(self.dim)
        # The clip is applied to the quotient, after dividing by the mask count.
        averaged = (powered_sum / mask_full.sum(self.dim)).clip(min=self.eps)
        return averaged.pow(1 / self.p)
    
class MeanPooling(nn.Module):
    """Average the hidden states over axis 1, weighting each position by the given mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        numerator = torch.sum(last_hidden_state * mask, 1)
        # The lower bound keeps the division finite when the mask sums to zero.
        denominator = torch.clamp(mask.sum(1), min=1e-9)
        return numerator / denominator

In [None]:
class MaskFilledAttentionHead(nn.Module):
    """Additive-attention pooling: scores from V(tanh(LN(W x))), softmax over axis 1.

    Positions whose mask is 0 get a score of -10 before the softmax.
    """

    def __init__(self, input_dim, head_hidden_dim):
        super().__init__()
        if head_hidden_dim is None:
            head_hidden_dim = input_dim
        self.W = nn.Linear(input_dim, head_hidden_dim)
        self.ln = nn.LayerNorm(head_hidden_dim)
        self.V = nn.Linear(head_hidden_dim, 1)

    def forward(self, x, attention_mask):
        scores = self.V(torch.tanh(self.ln(self.W(x))))
        # Fixed low score for masked positions; they keep a small non-zero weight.
        scores[attention_mask == 0] = -10.
        weights = torch.softmax(scores, dim=1)
        return (weights * x).sum(axis=1)

In [None]:
from torch import Tensor
from torch.nn import Module
from transformers import AutoModel, AutoConfig

class CustomModel(Module):
    """Backbone with one whole-sequence feature and five segment features feeding an MLP head.

    Args:
        cfg: settings object, stored on the instance.
        model_name: accepted for interface compatibility; not used by this class.
        config_path: path to the serialized backbone configuration.
        n_vocabs: accepted for interface compatibility; not used by this class.

    The attribute names model, dec0..dec5 and fc define the state-dict keys of the
    checkpoints and must not be renamed.
    """

    # Number of separator-delimited segments pooled individually in forward().
    N_SEGMENTS = 5

    def __init__(self, cfg, model_name, config_path, n_vocabs) -> None:
        super().__init__()
        self.cfg = cfg
        # NOTE(review): torch.load unpickles the file - only use it on trusted config files.
        self.model_config = torch.load(config_path)
        self.model = AutoModel.from_config(self.model_config)
        self.pool = MeanPooling()

        hidden = self.model_config.hidden_size

        def _projection():
            # Linear + LayerNorm block applied to one pooled feature vector.
            return nn.Sequential(nn.Linear(hidden, hidden), nn.LayerNorm(hidden, 1e-7))

        # dec0 projects the whole-sequence feature, dec1..dec5 the five segment features.
        self.dec0 = _projection()
        self.dec1 = _projection()
        self.dec2 = _projection()
        self.dec3 = _projection()
        self.dec4 = _projection()
        self.dec5 = _projection()

        self.fc = nn.Sequential(
            nn.Linear(hidden * 6, hidden * 3),
            nn.LayerNorm(hidden * 3, 1e-7),
            nn.ReLU(),
            nn.Linear(hidden * 3, hidden),
            nn.LayerNorm(hidden, 1e-7),
            nn.ReLU(),
            nn.Linear(hidden, 2)
        )

    def _init_weights(self, module: Module) -> None:
        """Re-initialise a single Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(
                mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(
                mean=0.0, std=self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _segment_mean(encoder_layer, spans):
        """Mean hidden state over ``[start, end)`` for each sample; zeros for spans of length <= 1.

        The zero vectors use the dtype and device of `encoder_layer`, so the stacked result
        always matches the encoder output (the previous code hard-coded float16).
        """
        pooled = []
        for row, (start, end) in enumerate(spans):
            if end - start > 1:
                pooled.append(torch.mean(encoder_layer[row, start:end], dim=0))
            else:
                pooled.append(torch.zeros(encoder_layer.size(2), dtype=encoder_layer.dtype,
                                          device=encoder_layer.device))
        return torch.stack(pooled)  # (bs, h)

    def forward(self, input_ids, attention_mask, idxes) -> Tensor:
        """Return the head output for a batch.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: attention mask passed to the backbone.
            idxes: per-sample boundary positions; the first N_SEGMENTS entries of each row are used.
        """
        encoder_layer = self.model(input_ids=input_ids, attention_mask=attention_mask)[0]

        # NOTE(review): the pooling weights passed here are input_ids, not attention_mask.
        # Kept unchanged because it affects the numeric output and the checkpoints were
        # presumably trained with this behaviour - confirm before changing.
        l0 = self.pool(encoder_layer, input_ids)

        # Convert the boundaries to plain ints once instead of indexing the tensor per sample.
        bounds = idxes.tolist() if hasattr(idxes, "tolist") else idxes

        segment_features = []
        for k in range(self.N_SEGMENTS):
            # Segment k runs from the previous boundary (0 for the first) up to boundary k.
            spans = [(0 if k == 0 else row[k - 1], row[k]) for row in bounds]
            segment_features.append(self._segment_mean(encoder_layer, spans))
        l1, l2, l3, l4, l5 = segment_features

        output = torch.cat([self.dec0(l0), self.dec1(l1), self.dec2(l2), self.dec3(l3), self.dec4(l4), self.dec5(l5)], 1)
        output = self.fc(output)
        return output
    

In [None]:
def collate(inputs):
    """Crop every entry of a batch to the longest attended sequence.

    The crop length is the largest per-row sum of ``inputs["attention_mask"]``.
    Each value of ``inputs`` is sliced to that many columns along axis 1.
    The mapping is modified in place and is also returned.

    NOTE(review): this only strips padding if the mask is 0/1 and the padding
    sits on the right-hand side — confirm against the tokenizer settings.
    """
    keep = int(inputs["attention_mask"].sum(axis=1).max())
    for name, values in inputs.items():
        # Re-assigning an existing key is safe while iterating over items().
        inputs[name] = values[:, :keep]
    return inputs

In [None]:
import os
import gc
import random
import warnings
from functools import reduce
warnings.filterwarnings("ignore")
import numpy as np
from numpy import ndarray
import scipy as sp
import torch
from torch import inference_mode
from torch import nn
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import _LRScheduler
from IPython.display import display
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from sklearn import metrics
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` without gradients and collect the outputs.

    Args:
        test_loader: iterable yielding ``(inputs, idxes)`` pairs, where ``inputs``
            is a mapping holding at least ``'input_ids'`` and ``'attention_mask'``.
            Each mapping is updated in place with its tensors moved to ``device``.
        model: module called as ``model(input_ids, attention_mask, idxes)``; it is
            switched to eval mode and moved to ``device``.
        device: target device for the model and the batch tensors.

    Returns:
        The per-batch outputs converted to NumPy and concatenated along axis 0.
    """
    model.eval()
    model.to(device)

    collected = []
    for inputs, idxes in test_loader:
        for name, tensor in inputs.items():
            inputs[name] = tensor.to(device)
        idxes = idxes.to(device)

        with torch.no_grad():
            batch_out = model(inputs['input_ids'], inputs['attention_mask'], idxes)

        # Bring the result back to host memory before converting to NumPy.
        collected.append(batch_out.to('cpu').numpy())

    return np.concatenate(collected)

# Wrap the test frame in the dataset class consumed by the fold models.
test_dataset = TestDataset(df=test_df, tokenizer=tokenizer, max_len=cfg.max_len)

# shuffle=False and drop_last=False: every sample is visited once, in dataset order,
# so the stacked predictions line up row-for-row with test_df.
test_loader = DataLoader(test_dataset,
                                batch_size=cfg.batch_size,
                                shuffle=False,
                                num_workers=cfg.n_workers, 
                                pin_memory=True, 
                                drop_last=False)

# Score the test set once per fold checkpoint, then average across folds.
predictions = []
for fold in range(cfg.n_folds):
    model = CustomModel(cfg, cfg.model_name, config_path=cfg.config_path, n_vocabs=len(tokenizer))
    # Weights are loaded on CPU; inference_fn moves the model to `device` afterwards.
    # NOTE(review): torch.load unpickles the file — only point this at trusted checkpoints.
    state = torch.load(cfg.path+f"{cfg.ckpt_name}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Drop this fold's model and checkpoint before the next one is built.
    del model, state; gc.collect()
    torch.cuda.empty_cache()
# Element-wise mean over the per-fold prediction arrays.
predictions_shigeria = np.mean(predictions, axis=0)

In [None]:
# Attach the fold-averaged outputs to the test frame as two columns.
# Assumes predictions_shigeria has exactly two columns in (content, wording) order — TODO confirm against the training targets.
test_df[["content", "wording"]] = predictions_shigeria
test_df

In [None]:
# Range slices (0:1, 1:2) rather than a bare index keep the column axis,
# so each result is a 2-D array with a single column.
predictions_shigeria_content = predictions_shigeria[:, 0:1]
predictions_shigeria_wording = predictions_shigeria[:, 1:2]

## Wording Ensemble

In [None]:
# Blend coefficients for the four wording predictors, in the order they are combined in the next cell.
# They sum to ~1.033 rather than 1, so this is not a plain weighted average — presumably fitted coefficients; TODO confirm.
# NOTE(review): the name `weights` is re-assigned later for the content ensemble.
weights = [0.20208678, 0.16789604, 0.23403965, 0.42868295]

In [None]:
# Weighted sum of four wording predictions; `wording_w2`, `preds` and `predictions6` are produced by earlier cells.
# NOTE(review): `preds` and `weights` are both re-assigned further down (content ensemble),
# so re-running this cell after those cells silently blends the wrong inputs.
# NOTE(review): predictions_shigeria_wording is 2-D with one column; assumes the other three terms
# have a matching shape so the sum does not broadcast to a matrix — TODO confirm.
predictions = weights[0]*wording_w2+weights[1]*preds+weights[2]*predictions6 + weights[3]*predictions_shigeria_wording
predictions

In [None]:
# Positional assignment: assumes the rows of `test` are in the same order as the blended predictions — TODO confirm.
test['wording'] = predictions
# Keep only the id and the blended wording score for the final merge.
submission_wording = test[['student_id','wording']]
submission_wording

## Content Ensemble

### c1 lgbm

In [None]:
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

In [None]:
# Directory holding the competition CSV files.
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
# Inner join on prompt_id: each summary row gets its prompt's columns.
# NOTE(review): the result follows the key order of the left frame (prompts_test), which need not be
# summaries_test's row order. Later cells attach model outputs to `y` by position, so verify that
# `y` is ordered exactly like the frame those outputs were computed from.
y = prompts_test.merge(summaries_test, on="prompt_id")

In [None]:
# Feed earlier model outputs into the LightGBM stage as extra feature columns.
# `content_c1` comes from an earlier cell; `.values` strips the index, so the wording scores are attached purely by position.
# NOTE(review): both assignments are only correct if their rows are already in y's order — verify.
y['content_c1'] = content_c1
y['wording_pred'] = submission_wording['wording'].values

In [None]:
y.head()

In [None]:
import re
from spellchecker import SpellChecker
from nltk.corpus import stopwords
import spacy
from collections import Counter
import nltk
from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import GroupKFold
import lightgbm as lgb

from transformers import AutoTokenizer

In [None]:
class Preprocessor:
    """Hand-crafted text features for the LightGBM content model.

    `run` takes a frame with one (prompt, summary) pair per row and adds
    token-length, stop-word, spelling, n-gram-overlap and quotation features
    computed from its `prompt_text` and `text` columns.
    """

    def __init__(self, model_name="/kaggle/input/microsoft-deberta-v3-large") -> None:
        """Load the tokenizer, stop-word list, spaCy pipeline and spell checker.

        Args:
            model_name: path or hub name passed to ``AutoTokenizer.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.STOP_WORDS = set(stopwords.words('english'))

        self.spacy_ner_model = spacy.load('en_core_web_sm')

        self.spellchecker = SpellChecker()

    def word_overlap_count(self, row):
        """Count the distinct tokens shared by `prompt_tokens` and `summary_tokens`.

        NOTE(review): ``filter`` keeps the tokens for which the predicate is true,
        so whenever STOP_WORDS is non-empty only stop-word tokens are compared.
        This is deliberately left as is: the boosters that consume this feature
        were presumably trained on the same definition — verify before changing.
        """
        def check_is_stop_word(word):
            return word in self.STOP_WORDS

        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))

    def ngrams(self, token, n):
        """Return the contiguous n-grams of the token sequence `token` as space-joined strings."""
        # zip over n progressively shifted views of the sequence yields each n-gram.
        shifted = [token[i:] for i in range(n)]
        return [" ".join(gram) for gram in zip(*shifted)]

    def ngram_co_occurrence(self, row, n: int):
        """Count the distinct n-grams that occur in both the prompt and the summary tokens."""
        original_ngrams = set(self.ngrams(row['prompt_tokens'], n))
        summary_ngrams = set(self.ngrams(row['summary_tokens'], n))
        return len(original_ngrams.intersection(summary_ngrams))

    def ner_overlap_count(self, row, mode:str):
        """Count named entities shared by prompt and summary, grouped by entity label.

        Entities are compared as (lower-cased text, label) pairs.

        Args:
            row: mapping with `prompt_text` and `text`.
            mode: "train" returns the raw label -> count dict; "test" returns the
                counts for the labels in ``self.ner_keys`` (None where absent).
                Any other value returns None.

        Raises:
            Exception: if the loaded NER model is neither spaCy nor stanza.

        NOTE(review): ``self.ner_keys`` is only assigned by the NER block that is
        commented out in `run`, so "test" mode fails unless it is set elsewhere.
        """
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)

        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))

        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    def quotes_count(self, row):
        """Count the double-quoted spans of the summary that appear verbatim in the prompt text."""
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        # Summing booleans counts the matches; an empty list gives 0.
        return sum(quote in text for quote in quotes_from_summary)

    def spelling(self, text):
        """Return how many distinct whitespace-separated words the spell checker does not know."""
        wordlist = text.split()
        return len(list(self.spellchecker.unknown(wordlist)))

    def add_spelling_dictionary(self, tokens):
        """Register `tokens` as known words for the spell checker(s).

        The pyspellchecker dictionary is always updated. The autocorrect speller
        is updated only when ``self.speller`` exists: `__init__` never creates
        it, so touching it unconditionally raised AttributeError.
        """
        tokens = list(tokens)  # allow one-shot iterables; tokens are read twice
        self.spellchecker.word_frequency.load_words(tokens)
        speller = getattr(self, "speller", None)
        if speller is not None:
            speller.nlp_data.update({token:1000 for token in tokens})

    def filter_stopwords(self, text):
        """Return the number of tokens in `text` that are English stop words.

        Uses the stop-word set built once in `__init__` (the same NLTK list)
        instead of rebuilding it for every row.
        """
        return sum(1 for token in text if token in self.STOP_WORDS)

    def parse_pos(self, text, pos):
        """Return how many tokens of `text` are tagged by NLTK with one of the tags in `pos`."""
        tag_counts = Counter(tag for _, tag in nltk.pos_tag(text))
        return sum(tag_counts[tag] for tag in pos if tag in tag_counts)

    def run(self, 
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        """Compute all features for a frame of (prompt, summary) rows.

        Args:
            summaries: frame with at least `prompt_text` and `text` columns.
                It is modified in place: the length, token, stop-word and
                spelling columns are added to it before it is copied.
            mode: only referenced by the disabled NER feature; currently unused.

        Returns:
            A copy of `summaries` with the overlap and quote features added and
            the helper columns `summary_tokens` / `prompt_tokens` removed.
        """
        def to_tokens(ids):
            return self.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)

        # Encode each text once and derive every length/token feature from the
        # same ids; the values match re-encoding for each feature, at a fraction of the cost.
        prompt_ids = summaries["prompt_text"].apply(lambda x: self.tokenizer.encode(x))
        summaries["prompt_length"] = prompt_ids.apply(len)
        summaries["prompt_tokens"] = prompt_ids.apply(to_tokens)

        summary_ids = summaries["text"].apply(lambda x: self.tokenizer.encode(x))
        summaries["summary_length"] = summary_ids.apply(len)
        summaries["summary_unique_length"] = summary_ids.apply(lambda ids: len(set(ids)))
        summaries["summary_tokens"] = summary_ids.apply(to_tokens)

        summaries['stopword_count'] = summaries['summary_tokens'].progress_apply(self.filter_stopwords)
        summaries['stopword_ratio'] = summaries['stopword_count'] / summaries['summary_length']

#         summaries['pos_NN_count'] = summaries['summary_tokens'].progress_apply(self.parse_pos,args=(['NN'],))
#         summaries['pos_NNP_count'] = summaries['summary_tokens'].progress_apply(self.parse_pos,args=(['NNP'],))
#         summaries['pos_RB_count'] = summaries['summary_tokens'].progress_apply(self.parse_pos,args=(['RB'],))
#         summaries['pos_RP_count'] = summaries['summary_tokens'].progress_apply(self.parse_pos,args=(['RP'],))
#         summaries['pos_WPWRB_count'] = summaries['summary_tokens'].progress_apply(self.parse_pos,args=(['WP','WRB'],))

        #summaries["fixed_summary_text"] = summaries["text"].progress_apply(lambda x: self.speller(x))

        # Column names keep the original "splling" spelling: they are looked up by name downstream.
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        # summaries["fixed_splling_err_num"] = summaries["fixed_summary_text"].progress_apply(self.spelling)
        summaries["splling_err_ratio"] = summaries["splling_err_num"] / summaries["summary_length"]

        # Everything below is written to a copy, not to the caller's frame.
        input_df = summaries.copy()

        #input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        #input_df['unique_ratio'] = input_df['summary_unique_length'] / input_df['prompt_length']
        input_df['unique_length_ratio'] = input_df['summary_unique_length'] / input_df['summary_length']

        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)

        input_df['bigram_overlap_count'] = input_df.progress_apply(self.ngram_co_occurrence,args=(2,), axis=1)
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)

        input_df['trigram_overlap_count'] = input_df.progress_apply(self.ngram_co_occurrence, args=(3,), axis=1)
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)

        # Create dataframe with count of each category NERs overlap for all the summaries
        # Because it spends too much time for this feature, I don't use this time.
#         ners_count_df  = input_df.progress_apply(
#             lambda row: pd.Series(self.ner_overlap_count(row, mode=mode), dtype='float64'), axis=1
#         ).fillna(0)
#         self.ner_keys = ners_count_df.columns
#         ners_count_df['sum'] = ners_count_df.sum(axis=1)
#         ners_count_df.columns = ['NER_' + col for col in ners_count_df.columns]
#         # join ner count dataframe with train dataframe
#         input_df = pd.concat([input_df, ners_count_df], axis=1)

        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        # input_df['fixed_splling_err_ratio'] = input_df['fixed_splling_err_num'] / input_df['summary_length']
        input_df['quotes_ratio'] = input_df['quotes_count'] / input_df['summary_length']

        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])

In [None]:
# Build the feature frame for the LightGBM stage.
# Note that run() also adds its length/token/stop-word/spelling columns to `y` itself before copying it.
preprocessor = Preprocessor()
y_test = preprocessor.run(y, mode='test')

In [None]:
y_test.head()

In [None]:
# Target handled by this stage.
targets = ["content"]

# Non-feature columns removed from the feature frame before prediction.
# NOTE(review): "prompt_length" is dropped here but is also listed in `ordered_columns`;
# the later reindex therefore re-creates it as an all-NaN column. Confirm this matches
# how the boosters were trained.
drop_columns = ["student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", "prompt_length", "prompt_text"
               ]

In [None]:
# Column order the feature frame is reindexed to right before Booster.predict.
# Presumably the feature order used at training time — TODO confirm against the saved models.
ordered_columns = ['prompt_length', 'summary_length', 'summary_unique_length', 'stopword_count', 'stopword_ratio',
                   'splling_err_num', 'splling_err_ratio', 'wording_pred', 'content_c1', 'unique_length_ratio',
                   'word_overlap_count', 'bigram_overlap_count', 'bigram_overlap_ratio', 'trigram_overlap_count', 'trigram_overlap_ratio', 
                   'quotes_count', 'quotes_ratio'
]

In [None]:
# glob returns files in arbitrary order; sort so the fold order is reproducible.
model_paths = sorted(glob.glob("/kaggle/input/commomlit-lgb-c1-0527wording/*"))
if not model_paths:
    # With no models np.mean([]) is NaN, which would silently turn the whole
    # content blend (and the submission) into NaN. Fail loudly instead.
    raise FileNotFoundError(
        "No LightGBM model files found under /kaggle/input/commomlit-lgb-c1-0527wording/"
    )

# The feature matrix is the same for every booster, so build it once.
# NOTE(review): drop_columns removes "prompt_length" while ordered_columns lists it,
# so reindex re-creates that column filled with NaN — confirm this matches training.
X_eval_cv = y_test.drop(columns=drop_columns)
X_eval_cv = X_eval_cv.reindex(columns=ordered_columns)

predictions = []
for fold, model_path in enumerate(model_paths):
    model = lgb.Booster(model_file=model_path)
    pred = model.predict(X_eval_cv)
    predictions.append(pred)

# Average the per-model predictions row by row.
preds = np.mean(predictions, axis=0)

In [None]:
# Blend coefficients for the content ensemble: [fold-averaged transformer output, LightGBM output].
# They sum to ~1.029 rather than 1 — presumably fitted coefficients; TODO confirm.
# NOTE(review): this overwrites the `weights` list used by the wording ensemble.
weights = [0.6295255599255956, 0.3993599947505763]

In [None]:
# reshape(-1,1) turns the LightGBM output into a single column so it adds element-wise
# to the 2-D, one-column predictions_shigeria_content instead of broadcasting.
# NOTE(review): the two terms are combined by position; `preds` follows y_test's row order and
# predictions_shigeria_content follows test_df's — verify both orders agree.
predictions = weights[0]*predictions_shigeria_content + weights[1]*preds.reshape(-1,1)
predictions

In [None]:
# Positional assignment: assumes the rows of `test` are in the same order as the blended predictions — TODO confirm.
test['content'] = predictions
# Keep only the id and the blended content score for the final merge.
submission_content = test[["student_id","content"]]
submission_content

## Create Submission file

In [None]:
submission_content

In [None]:
submission_wording

In [None]:
# Join the two per-target frames on student_id (default inner join) into one row per student.
submission = submission_content.merge(submission_wording,on="student_id")
submission

In [None]:
# Clamp both scores to [-2, 4.5]; presumably the plausible range of the targets — TODO confirm.
submission["content"] = submission["content"].clip(-2, 4.5)
submission["wording"] = submission["wording"].clip(-2, 4.5)
submission

In [None]:
# Write the final file to the working directory; index=False leaves the row index out of the CSV.
submission.to_csv("submission.csv", index=False)