## Setup

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os

ENV = "kaggle"
if 'COLAB_JUPYTER_TOKEN' in os.environ:
    ENV = "colab"


In [3]:
libraries = [
    'functools',
    'gc',
    'gzip',
    'hashlib',
    'kaggle',
    'matplotlib',
    'numpy',
    'os',
    'pandas',
    'scikit-learn',
    'sentencepiece',
    'torch',
    'tqdm',
    'transformers==4.30.2',
    'lightgbm'
]

if ENV == "colab":
    import subprocess
    for lib in libraries:
        subprocess.run(["pip", "install", lib])
    from google.colab import drive
    drive.mount('/content/drive')


In [4]:
import random
import numpy as np
import torch

def set_seed(seed_value):
    """Set seed for reproducibility."""
    random.seed(seed_value)            # Set seed for python's built-in random
    np.random.seed(seed_value)         # Set seed for numpy
    torch.manual_seed(seed_value)      # Set seed for pytorch

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)  # Use any number as seed


## Load Data

In [5]:
import pandas as pd

FILE_NAMES = ['prompts_train.csv', 'summaries_train.csv', 'prompts_test.csv', 'summaries_test.csv']

def read_files(base_path, file_names=FILE_NAMES):
    """Read multiple CSV files from a given path."""
    dataframes = []
    for file in file_names:
        dataframes.append(pd.read_csv(base_path + file))
    return tuple(dataframes)

def load_data_in_kaggle():
    """Load data when running in a Kaggle environment."""
    base_path = '/kaggle/input/commonlit-evaluate-student-summaries/'
    return read_files(base_path)

def load_data_in_colab():
    """Load data when running in a Google Colab environment."""
    base_path = "/content/drive/MyDrive/kaggle/commonlit-evaluate-student-summaries/"
    return read_files(base_path)

def load_data():
    """Load data based on the environment (Kaggle/Colab)."""
    if ENV == "kaggle":
        return load_data_in_kaggle()
    elif ENV == "colab":
        return load_data_in_colab()
    else:
        raise ValueError("Unknown environment.")

df_train_prompts, df_train_summaries, df_test_prompts, df_test_summaries = load_data()


## Merge set

In [6]:
df_train = df_train_summaries.merge(df_train_prompts, on='prompt_id')
df_test = df_test_summaries.merge(df_test_prompts, on='prompt_id')

df_test["wording"] = 0.0
df_test["content"] = 0.0

import pandas as pd
import os

if ENV == "colab":
    file_path = 'drive/MyDrive/kaggle/colab/oof_predictions.csv'
    if os.path.exists(file_path):
        df_train = pd.read_csv(file_path)

if ENV == "kaggle":
    file_path = '/kaggle/input/summaries-features-2023-08-24-17-06-57/oof_predictions.csv'
    if os.path.exists(file_path):
        df_train = pd.read_csv(file_path)

print(f"df_train.shape: {df_train.shape}")
print(f"df_test.shape: {df_test.shape}")

df_train.head()

df_train.shape: (7165, 16)
df_test.shape: (4, 8)


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,full_text,word_count,lexical_similarity,semantic_similarity,predicted_content,predicted_wording,transformer_content,transformer_wording
0,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",They would rub it up with soda to make the sme...,-0.426385,-0.756804,0.0,-0.941234,-0.216973,-0.941234,-0.216973
1,0071d51dab6d,ebad26,They would use chemicals and substances to cha...,0.205683,0.380538,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",They would use chemicals and substances to cha...,-0.594609,-1.002909,0.0,-0.443029,-0.047777,-0.443029,-0.047777
2,00746c7c79c3,ebad26,"Many times the factories would, according to t...",-0.878889,-0.96633,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...","Many times the factories would, according to t...",-0.538534,-0.812601,0.0,-1.005438,-0.703931,-1.005438,-0.703931
3,008db54e7cbc,ebad26,The factory covered up and used spolied meat i...,1.216547,1.166914,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",The factory covered up and used spolied meat i...,0.620339,-0.410604,0.0,0.866758,0.799542,0.866758,0.799542
4,00b8461e9c37,ebad26,The factory would rub up the spoiled meat with...,-1.236282,-0.285223,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",The factory would rub up the spoiled meat with...,-0.688066,-0.571339,0.0,-0.71329,-0.695766,-0.71329,-0.695766


## Features

In [7]:
# Define a separator token for BERT
separator_token = ' [SEP] '

# Create the "full_text" feature by joining "prompt_text" and "text"
df_train['full_text'] = df_train['text'] + separator_token + df_train['prompt_question']
df_test['full_text'] = df_test['text'] + separator_token + df_test['prompt_question']

df_train['word_count'] = df_train['text'].apply(lambda x: len(str(x).split()))
df_test['word_count'] = df_test['text'].apply(lambda x: len(str(x).split()))

import gzip

def calculate_compressed_size(text):
    text_bytes = text.encode('utf-8')
    compressed_bytes = gzip.compress(text_bytes)
    return len(compressed_bytes)

df_train['lexical_similarity'] = df_train.apply(
    lambda row: 1 - calculate_compressed_size(row['text'] + row['prompt_text']) / (calculate_compressed_size(row['text']) + calculate_compressed_size(row['prompt_text'])),
    axis=1
)
df_test['lexical_similarity'] = df_train.apply(
    lambda row: 1 - calculate_compressed_size(row['text'] + row['prompt_text']) / (calculate_compressed_size(row['text']) + calculate_compressed_size(row['prompt_text'])),
    axis=1
)

if 'semantic_similarity' not in df_train.columns:
    df_train['semantic_similarity'] = 0.0

if 'semantic_similarity' not in df_test.columns:
    df_test['semantic_similarity'] = 0.0


In [8]:
def ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def rouge_n(hypothesis, reference, n=1):
    hyp_ngrams = set(ngrams(hypothesis.split(), n))
    ref_ngrams = set(ngrams(reference.split(), n))
    overlap = len(hyp_ngrams.intersection(ref_ngrams))
    precision = overlap / len(hyp_ngrams) if hyp_ngrams else 0
    recall = overlap / len(ref_ngrams) if ref_ngrams else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return {"f1": f1, "precision": precision, "recall": recall}

def lcs(X, Y):
    m, n = len(X), len(Y)
    L = [[0] * (n + 1) for i in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    return L[m][n]

def rouge_l(hypothesis, reference):
    lcs_length = lcs(hypothesis.split(), reference.split())
    precision = lcs_length / len(hypothesis.split()) if hypothesis else 0
    recall = lcs_length / len(reference.split()) if reference else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return {"f1": f1, "precision": precision, "recall": recall}

def skip_bigrams(tokens):
    return [(tokens[i], tokens[j]) for i in range(len(tokens)) for j in range(i+1, len(tokens))]

def rouge_s(hypothesis, reference):
    hyp_skip_bigrams = set(skip_bigrams(hypothesis.split()))
    ref_skip_bigrams = set(skip_bigrams(reference.split()))
    overlap = len(hyp_skip_bigrams.intersection(ref_skip_bigrams))
    precision = overlap / len(hyp_skip_bigrams) if hyp_skip_bigrams else 0
    recall = overlap / len(ref_skip_bigrams) if ref_skip_bigrams else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return {"f1": f1, "precision": precision, "recall": recall}

def add_scores(row):
    hyp = row['text']
    ref = row['prompt_text']
    scores_1 = rouge_n(hyp, ref, 1)
    scores_2 = rouge_n(hyp, ref, 2)
    scores_l = rouge_l(hyp, ref)
    scores_s = rouge_s(hyp, ref)
    return pd.Series({
        'rouge_1_f1': scores_1['f1'],
        'rouge_2_f1': scores_2['f1'],
        'rouge_l_f1': scores_l['f1'],
        'rouge_s_f1': scores_s['f1'],
        'rouge_1_precision': scores_1['precision'],
        'rouge_2_precision': scores_2['precision'],
        'rouge_l_precision': scores_l['precision'],
        'rouge_s_precision': scores_s['precision'],
        'rouge_1_recall': scores_1['recall'],
        'rouge_2_recall': scores_2['recall'],
        'rouge_l_recall': scores_l['recall'],
        'rouge_s_recall': scores_s['recall'],
    })


ROUGE_COLUMNS = [
    'rouge_1_f1', 'rouge_1_precision', 'rouge_1_recall',
    'rouge_2_f1', 'rouge_2_precision', 'rouge_2_recall',
    'rouge_l_f1', 'rouge_l_precision', 'rouge_l_recall',
    'rouge_s_f1', 'rouge_s_precision', 'rouge_s_recall'
]

df_train = pd.concat([df_train, df_train.apply(add_scores, axis=1)], axis=1)
df_test = pd.concat([df_test, df_test.apply(add_scores, axis=1)], axis=1)


In [9]:
import torch

if torch.cuda.is_available():
    n_gpu = torch.cuda.device_count()
    device_ids = list(range(n_gpu))
    device = torch.device(f'cuda:{device_ids[0]}')  # Choose the first device
    print('There are %d GPU(s) available.' % n_gpu)
    print('We will use the GPUs:', device_ids)
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    device_ids = None

There are 2 GPU(s) available.
We will use the GPUs: [0, 1]


In [10]:

BATCH_SIZE = 4
DROPOUT = 0.01
MAX_GRAD_NORM = 1.0
MAX_LENGTH = 1024
WEIGHT_DECAY = 0.02

MODEL_NAME = 'debertav3base'

In [11]:
from transformers import AutoModel, AutoTokenizer, AutoConfig

def load_pretrained_from_path(model_path, model_name, config_updates):
    """Load pretrained model and tokenizer from a given path."""

    full_path = f"{model_path}/{model_name}"

    tokenizer = AutoTokenizer.from_pretrained(full_path)

    config = AutoConfig.from_pretrained(full_path)
    config.update(config_updates)
    model = AutoModel.from_pretrained(full_path, config=config)

    return model, tokenizer

def load_pretrained(model_name=MODEL_NAME):

    if ENV == "kaggle":
        MODEL_PATH = '/kaggle/input'
    elif ENV == "colab":
        MODEL_PATH = '/content/drive/MyDrive/kaggle'
    else:
        raise EnvironmentError("Unknown environment.")

    config_updates = {
        "hidden_dropout_prob": DROPOUT,
        "max_grad_norm": MAX_GRAD_NORM,
        "weight_decay": WEIGHT_DECAY
    }

    return load_pretrained_from_path(MODEL_PATH, model_name, config_updates)

model, tokenizer = load_pretrained()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']
Some weights of the model checkpoint at /kaggle/input/debertav3base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predi

In [12]:
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
import hashlib
from functools import lru_cache
import torch.nn as nn

semantic_model, tokenizer = load_pretrained()
semantic_model.to(device)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    semantic_model = nn.DataParallel(semantic_model)

maxsize = None if ENV == "colab" else 128
@lru_cache(maxsize=maxsize)
def get_embedding(text):
    inputs = tokenizer(text, max_length=MAX_LENGTH, truncation=True, padding='max_length', return_tensors='pt')

    if device_ids:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}  # move inputs to GPU
    with torch.no_grad():
        outputs = semantic_model(**inputs)

    embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embedding

# Compute embeddings and similarity score
def get_similarity(row):
    prompt_embedding = get_embedding(row['prompt_text'])
    text_embedding = get_embedding(row['text'])
    similarity = cosine_similarity(prompt_embedding, text_embedding)
    return similarity[0][0]

# Add similarity score to DataFrame
df_test['semantic_similarity'] = df_test.apply(get_similarity, axis=1)
df_train['semantic_similarity'] = df_train.apply(get_similarity, axis=1)

print("Done!")

import gc
import torch

del semantic_model
gc.collect()
torch.cuda.empty_cache()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /kaggle/input/debertav3base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT

Using 2 GPUs!
Done!


## Scale

In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Features you want to scale
features_to_scale = ["word_count", "semantic_similarity", "lexical_similarity"]

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler only on the training data for the selected features
df_train[features_to_scale] = scaler.fit_transform(df_train[features_to_scale])

# Transform the test data using the same scaler for the selected features
df_test[features_to_scale] = scaler.transform(df_test[features_to_scale])


## Cross Validation

In [14]:
from sklearn.model_selection import KFold
import pandas as pd

def split_dataframe_by_prompt(df, n_splits=4):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    combined_train_dfs = []
    combined_val_dfs = []

    unique_prompts = df['prompt_id'].unique()

    for train_prompt_ids, val_prompt_ids in kf.split(unique_prompts):
        train_dataframes = [df[df['prompt_id'] == unique_prompts[id]] for id in train_prompt_ids]
        val_dataframes = [df[df['prompt_id'] == unique_prompts[id]] for id in val_prompt_ids]

        combined_train_dfs.append(pd.concat(train_dataframes))
        combined_val_dfs.append(pd.concat(val_dataframes))

    return combined_train_dfs, combined_val_dfs

## Loss

In [15]:
import numpy as np

def compute_mcrmse(df):
    def rmse(actual, predictions):
        return np.sqrt(np.mean((actual - predictions)**2))

    rmse_content = rmse(df['content'], df['predicted_content'])
    rmse_wording = rmse(df['wording'], df['predicted_wording'])

    mcrmse = np.mean([rmse_content, rmse_wording])

    return mcrmse, rmse_content, rmse_wording

## Training

In [16]:
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn.parameter import Parameter

class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text = row['full_text']
        inputs = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        inputs['input_ids'] = inputs['input_ids'].squeeze()
        inputs['attention_mask'] = inputs['attention_mask'].squeeze()
        return inputs, torch.tensor([row['content'], row['wording']], dtype=torch.float)


class GeMText(nn.Module):
    def __init__(self, dim = 1, p=3, eps=1e-6):
        super(GeMText, self).__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, last_hidden_state, attention_mask):
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape)
        x = (last_hidden_state.clamp(min=self.eps) * attention_mask_expanded).pow(self.p).sum(self.dim)
        ret = x / attention_mask_expanded.sum(self.dim).clip(min=self.eps)
        ret = ret.pow(1 / self.p)
        return ret

# https://raphaelb.org/posts/freezing-bert/
def partial_freeze(model, frozen_count=4):
    for param in model.encoder.layer[:frozen_count].parameters():
        param.requires_grad = False
    for param in model.encoder.layer[frozen_count:].parameters():
            param.requires_grad = True

class RegressionModel(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        # Safely retrieve the hidden size
        embedding_size = getattr(base_model.config, 'hidden_size', None)
        if embedding_size is None:
            raise ValueError(f"'hidden_size' not found in config of model {type(base_model)}")
        self.gem_pooler = GeMText()
        self.regressor = nn.Linear(embedding_size, 2)
        self._init_weights(self.regressor)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, **inputs):
        outputs = self.base_model(**inputs)
        sentence_embedding = self.gem_pooler(outputs.last_hidden_state, inputs['attention_mask'])
        return self.regressor(sentence_embedding)


class MCRMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')  # We will handle the mean computation manually

    def forward(self, pred, target):
        squared_errors = self.mse(pred, target)
        rmse_per_column = torch.sqrt(torch.mean(squared_errors, dim=0))
        mcrmse = torch.mean(rmse_per_column)
        return mcrmse

In [17]:
def predict_with_transformer(model, tokenizer, dataframe, batch_size=4):
    """Use the transformer model to make predictions in batches."""

    # Set the model to evaluation mode
    model.eval()

    texts = dataframe['full_text'].tolist()
    total_samples = len(texts)

    predicted_content = []
    predicted_wording = []

    for i in range(0, total_samples, batch_size):
        inputs = tokenizer(texts[i:i+batch_size], truncation=True, padding="max_length", return_tensors="pt", max_length=MAX_LENGTH)

        # Send all inputs to device (e.g., GPU)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        predicted_content.extend(outputs[:, 0].cpu().numpy())
        predicted_wording.extend(outputs[:, 1].cpu().numpy())

    return predicted_content, predicted_wording


In [18]:
def print_loss(model, tokenizer, dataframe):
    dataframe['predicted_content'], dataframe['predicted_wording'] = predict_with_transformer(model, tokenizer, dataframe)
    print(compute_mcrmse(dataframe))


In [19]:
import torch.optim as optim
from tqdm import tqdm
from torch.nn.utils import clip_grad_norm_
from transformers import get_cosine_schedule_with_warmup
import torch.nn as nn


def train_phase(model, dataloader, optimizer, epochs, frozen_count, df_val=None):
    partial_freeze(model.base_model, frozen_count=frozen_count)
    total_steps = len(dataloader) * epochs
    warmup_steps = int(total_steps * 0.1)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    loss_history = []
    running_loss = 0.0
    loss_func = MCRMSELoss()

    for epoch in range(epochs):
        model.train()
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        for i, batch in enumerate(progress_bar):
            optimizer.zero_grad()
            inputs, labels = batch
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            outputs = model(**inputs)
            loss = loss_func(outputs, labels)
            loss.backward()
            clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            optimizer.step()
            current_loss = loss.item()
            running_loss += current_loss
            loss_history.append(current_loss)
            if i % 10 == 9:
                avg_loss = running_loss / 10
                progress_bar.set_description(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
                running_loss = 0.0
            scheduler.step()
        if df_val is not None:
            print_loss(model, tokenizer, df_val)
    return loss_history


def train_model(df_train, df_val=None):

    # Load tokenizer and model
    base_model, tokenizer = load_pretrained()

    # Set dropout probability
    base_model.config.hidden_dropout_prob = DROPOUT
    base_model.config.attention_probs_dropout_prob = DROPOUT

    # Create the regression model
    model = RegressionModel(base_model)
    model.to(device)

    # Prepare the data
    dataset_train = TextDataset(df_train, tokenizer, max_length=MAX_LENGTH)
    dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)

    # Set up the optimizer and loss function
    optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=WEIGHT_DECAY)

    loss_history_phase2 = train_phase(model, dataloader_train, optimizer, epochs=3, frozen_count=4, df_val=df_val)

    loss_history = loss_history_phase2
    return model, loss_history


    return model, loss_history

In [20]:
import torch

if ENV == "colab":
    model, train_loss = train_model(df_train)
    torch.save(model.state_dict(), 'drive/MyDrive/kaggle/colab/tuned_roberta.pth')
else:
    base_model, _ = load_pretrained()
    model = RegressionModel(base_model).to(device)
    model.load_state_dict(torch.load('/kaggle/input/tuned-roberta/tuned_roberta.pth', map_location=device), strict=False) # WARNING: strict false may hide bug!
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model.eval()



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /kaggle/input/debertav3base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT

In [21]:
import json
import os
from datetime import datetime
import shutil

def create_kaggle_dataset(file_path, dataset_name):
    if ENV != "colab":
        print("Environment is not Colab. Exiting function.")
        return

    YOUR_USERNAME = "louissanna"
    now = datetime.now()
    formatted_date = now.strftime("%Y-%m-%d-%H-%M-%S")

    # Upload kaggle.json if not present
    if not os.path.exists('kaggle.json'):
        from google.colab import files
        uploaded = files.upload()

    # List of shell commands for initial setup
    commands = [
        "pip install kaggle",
        "mkdir -p ~/.kaggle",
        "cp kaggle.json ~/.kaggle/",
        "chmod 600 ~/.kaggle/kaggle.json",
    ]

    # Only create 'my_temp_dir' if it doesn't exist
    if os.path.exists('my_temp_dir'):
        shutil.rmtree('my_temp_dir')
    if not os.path.exists('my_temp_dir'):
        commands.append("mkdir my_temp_dir")

    # Only move the file if it's not already in 'my_temp_dir'
    filename = os.path.basename(file_path)
    if not os.path.exists(f'my_temp_dir/{filename}'):
        commands.append(f"cp {file_path} my_temp_dir/")

    # Execute each initial setup command
    for command in commands:
        get_ipython().system(command)

    # Check if 'dataset-metadata.json' doesn't exist, then set it
    if not os.path.exists("my_temp_dir/dataset-metadata.json"):
        metadata = {
            "title": dataset_name + "-" + formatted_date,
            "id": YOUR_USERNAME + "/" + dataset_name + "-" + formatted_date,
            "licenses": [{"name": "CC0-1.0"}]
        }

        with open("my_temp_dir/dataset-metadata.json", "w") as f:
            json.dump(metadata, f, indent=4)

    # Create the dataset on Kaggle
    get_ipython().system("kaggle datasets create -p my_temp_dir/")

In [22]:
if ENV == "colab":
    file_path = 'drive/MyDrive/kaggle/colab/tuned_roberta.pth'
    dataset_name = 'tuned-roberta'

    create_kaggle_dataset(file_path, dataset_name)

In [23]:
import matplotlib.pyplot as plt

if ENV == "colab":
    # Plotting the loss history
    plt.figure(figsize=(10, 5))
    plt.plot(train_loss, label='Training Loss')
    plt.title('Training Loss Over Time')
    plt.xlabel('Every Batche')
    plt.ylabel('Average Loss')
    plt.legend()
    plt.show()

## Inference

In [24]:

_, tokenizer = load_pretrained()

if ENV == "kaggle":
    df_test['transformer_content'], df_test['transformer_wording'] = predict_with_transformer(model, tokenizer, df_test)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /kaggle/input/debertav3base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT

## 4 fold training

In [25]:
def cross_validation_training(df_train, tokenizer):
    train_4_fold_dfs, val_4_fold_dfs = split_dataframe_by_prompt(df_train)
    validation_losses = []
    oof_predictions = pd.DataFrame() # Initialize an empty DataFrame to store OOF predictions

    for fold_number in range(4):
        print(f"Fold {fold_number}")

        df_4_fold_train = train_4_fold_dfs[fold_number]
        df_4_fold_val = val_4_fold_dfs[fold_number]

        model, _ = train_model(df_4_fold_train, df_val=df_4_fold_val)
        df_4_fold_val['predicted_content'], df_4_fold_val['predicted_wording'] = predict_with_transformer(model, tokenizer, df_4_fold_val)

        oof_predictions = pd.concat([oof_predictions, df_4_fold_val]) # Add the predictions to OOF DataFrame

        import gc
        del model ; gc.collect()
        torch.cuda.empty_cache()

        validation_loss = compute_mcrmse(df_4_fold_val)
        validation_losses.append(validation_loss)

        print(f"Validation Loss for Fold {fold_number}: {validation_loss}")

    print("Average loss across all folds:")
    print(np.mean(validation_losses))

    return oof_predictions, validation_losses

if ENV == "colab":
    oof_predictions, validation_losses = cross_validation_training(df_train, tokenizer)
    print(compute_mcrmse(oof_predictions))
    oof_predictions["transformer_content"] = oof_predictions["predicted_content"]
    oof_predictions["transformer_wording"] = oof_predictions["predicted_wording"]
    oof_predictions.to_csv('drive/MyDrive/kaggle/colab/oof_predictions.csv', index=False)


In [26]:
file_path = 'drive/MyDrive/kaggle/colab/oof_predictions.csv'
dataset_name = 'summaries-features'

if ENV == 'colab':
    create_kaggle_dataset(file_path, dataset_name)

## LGBM

In [27]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


def train_models_with_folds(train_folds, val_folds, feature_columns, target_column, init_column):
    models = []
    for train_df, val_df in zip(train_folds, val_folds):
        model = lgb.LGBMRegressor(learning_rate=0.05, max_depth=3, lambda_l2=0.01, n_estimators=10000)

        early_stopping_callback = lgb.early_stopping(stopping_rounds=30, verbose=False)
        
        # Extract the feature values for the initial prediction
        init_score_train = train_df[init_column].values
        init_score_val = val_df[init_column].values

        model.fit(train_df[feature_columns], train_df[target_column],
                  eval_set=[(val_df[feature_columns], val_df[target_column])],
                  callbacks=[early_stopping_callback],
                  init_score=init_score_train,
                  eval_init_score=[init_score_val]
                  )

        models.append(model)


        # Predict on validation set and calculate error
        val_predictions = model.predict(val_df[feature_columns])
        mse = mean_squared_error(val_df[target_column], val_predictions + val_df[init_column])
        print(f"Mean Squared Error for target '{target_column}': {mse}")
        feature_importance = model.feature_importances_
        # Print feature importance
        for name, importance in zip(feature_columns, feature_importance):
            print(f"Feature {name}: {importance}")

    return models


def predict_with_models(models, df, feature_columns, init_column):
    init_score_test = df[init_column].values
    predictions = [model.predict(df[feature_columns]) + init_score_test for model in models]
    return np.mean(predictions, axis=0)


train_4_fold_dfs, val_4_fold_dfs = split_dataframe_by_prompt(df_train)

# RMK: transformer_content col will be exploited by the model as initial pred
content_feature_columns = ['lexical_similarity', 'semantic_similarity'] + ROUGE_COLUMNS
content_models = train_models_with_folds(train_4_fold_dfs, val_4_fold_dfs, content_feature_columns, "content", "transformer_content")
df_test["content"] = predict_with_models(content_models, df_test, content_feature_columns, "transformer_content")

# RMK: transformer_wording col will be exploited by the model as initial pred
wording_feature_columns = ['lexical_similarity', 'semantic_similarity'] + ROUGE_COLUMNS
content_models = train_models_with_folds(train_4_fold_dfs, val_4_fold_dfs, wording_feature_columns, "wording", "transformer_wording")
df_test["wording"] = predict_with_models(content_models, df_test, wording_feature_columns, "transformer_wording")

Mean Squared Error for target 'content': 0.17330497596622332
Feature lexical_similarity: 5
Feature semantic_similarity: 33
Feature rouge_1_f1: 0
Feature rouge_1_precision: 6
Feature rouge_1_recall: 0
Feature rouge_2_f1: 1
Feature rouge_2_precision: 1
Feature rouge_2_recall: 4
Feature rouge_l_f1: 5
Feature rouge_l_precision: 3
Feature rouge_l_recall: 1
Feature rouge_s_f1: 4
Feature rouge_s_precision: 0
Feature rouge_s_recall: 0
Mean Squared Error for target 'content': 0.2741369138845592
Feature lexical_similarity: 0
Feature semantic_similarity: 2
Feature rouge_1_f1: 0
Feature rouge_1_precision: 0
Feature rouge_1_recall: 0
Feature rouge_2_f1: 0
Feature rouge_2_precision: 2
Feature rouge_2_recall: 0
Feature rouge_l_f1: 0
Feature rouge_l_precision: 2
Feature rouge_l_recall: 0
Feature rouge_s_f1: 0
Feature rouge_s_precision: 0
Feature rouge_s_recall: 0
Mean Squared Error for target 'content': 0.1899810680305828
Feature lexical_similarity: 1
Feature semantic_similarity: 1
Feature rouge_1_f1:

## Submit

In [28]:
print(f"df_test.shape: {df_test.shape}")

def is_valid_float(x):
    return isinstance(x, float) and x == x  # This checks that x is not NaN since NaN != NaN in Python.

# Replace non-float values in 'wording' and 'content' with zero
cols_to_check = ['wording', 'content']
df_test[cols_to_check] = df_test[cols_to_check].applymap(lambda x: x if is_valid_float(x) else 0.0)

df_test[['student_id', 'content', 'wording']].to_csv('submission.csv',index=False)
display(pd.read_csv('submission.csv'))

df_test.shape: (4, 26)


Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.182625,-0.988491
1,222222cccccc,-1.174274,-0.978993
2,111111eeeeee,-1.170675,-0.989027
3,333333dddddd,-1.163676,-0.964203
