# Basics

In [None]:
# Run mode. It selects which competition CSVs are read (prompts_{MODE}.csv /
# summaries_{MODE}.csv) and switches between the out-of-fold evaluation
# branches ('train') and fold-averaged inference ('test') further down.
MODE = 'test'
# When True, the cells of the "Post-Process" section are executed.
POSTPROCESS = True

## Imports

In [None]:
# Native
import os
import re
import warnings
import tempfile

# Torch: BSD
import torch                                      
from torch.utils.data import DataLoader, TensorDataset

# Huggingface: Apache 2
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset                      

import numpy as np                                # BSD
import pandas as pd                               # BSD
from tqdm import tqdm                             # MIT
import lightgbm as lgb                            # MIT

# Not needed for inference
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [None]:
# Base
# Read the prompt and summary tables for the selected MODE and join them so
# that every summary row carries the columns of its prompt.
base = '/kaggle/input/commonlit-evaluate-student-summaries/'
prompts = pd.read_csv(base + f'prompts_{MODE}.csv')
summaries = pd.read_csv(base + f'summaries_{MODE}.csv')
df = pd.merge(prompts, summaries, on='prompt_id')

## Mark

In [None]:
#df = df[::10].reset_index(drop=True)

In [None]:
def clean_summary(row):
    """Return ``row['text']`` with whitespace normalised.

    Every run of whitespace is collapsed to a single space and the result is
    stripped of leading and trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', row['text'])
    return collapsed.strip()
def clean_prompt_text(row):
    """Return ``row['prompt_text']`` with whitespace normalised.

    Every run of whitespace is collapsed to a single space and the result is
    stripped of leading and trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', row['prompt_text'])
    return collapsed.strip()

In [None]:
# Add whitespace-normalised copies of the summary and the prompt text,
# computed row by row with the two helpers defined above.
df['clean_summary'] = df.apply(clean_summary, axis=1)
df['clean_prompt_text'] = df.apply(clean_prompt_text, axis=1)

In [None]:
# Function to find common sequences
def find_common_sequences(text, prompt_text):
    """Collect word runs of ``text`` that occur verbatim inside ``prompt_text``.

    Candidate runs are tried from the longest (capped at 512 words) down to
    three words. Matching is a plain substring test on the prompt string, and
    each matched run is deleted from the prompt so it cannot match again.

    NOTE: a function with the same name is defined again later in this file and
    replaces this one once that definition has run.

    Returns:
        ``(sequences, words_copied)``: the matched runs in discovery order and
        the total number of words they contain.
    """
    summary_tokens = text.split()
    longest = min(len(prompt_text.split()), len(summary_tokens), 512)
    matches = []
    copied = 0
    for size in range(longest, 2, -1):
        for begin in range(len(summary_tokens) - size + 1):
            candidate = ' '.join(summary_tokens[begin:begin + size])
            if candidate not in prompt_text:
                continue
            matches.append(candidate)
            # Remove the matched span from the prompt to avoid duplicate matches.
            prompt_text = prompt_text.replace(candidate, '')
            copied += len(candidate.split())
    return matches, copied

def standardize_summary(row):
    """Wrap every copied word run of the cleaned summary in ``< ... >``.

    The copied runs are those reported by ``find_common_sequences`` for the
    row's ``clean_summary`` against its ``clean_prompt_text``.

    Returns:
        ``(marked_summary, words_copied)``.
    """
    marked = row['clean_summary']
    sequences, words_copied = find_common_sequences(marked, row['clean_prompt_text'])
    for sequence in sequences:
        # str.replace touches every occurrence of the run in the summary.
        marked = marked.replace(sequence, f"< {sequence} >")
    return marked, words_copied

In [None]:
# Function to find common sequences
def find_common_sequences(text, prompt_text):
    """Collect word runs of ``text`` that occur verbatim inside ``prompt_text``.

    Candidate runs are tried from the longest (capped at 512 words, bound fixed
    from the initial inputs) down to three words. Matching is a plain substring
    test on the prompt string. A matched run is deleted from ``text`` rather
    than from the prompt.

    Returns:
        ``(sequences, words_copied)``: the matched runs in discovery order, and
        the number of words removed from ``text`` by those deletions.
    """
    total_words = len(text.split())
    upper = min(len(prompt_text.split()), total_words, 512)
    found = []
    for size in range(upper, 2, -1):
        # The summary is re-split once per run length, so deletions made by
        # longer passes are visible here; the window list is not refreshed
        # while a single pass is running.
        tokens = text.split()
        for begin in range(len(tokens) - size + 1):
            candidate = ' '.join(tokens[begin:begin + size])
            if candidate in prompt_text:
                found.append(candidate)
                # Remove the run from the summary to avoid duplicate matches.
                text = text.replace(candidate, '')
    # Whatever is left over after the removals was not copied.
    return found, total_words - len(text.split())

def mark_summary(row):
    """Produce two marked-up variants of the cleaned summary.

    The copied word runs come from ``find_common_sequences`` applied to the
    row's ``clean_summary`` and ``clean_prompt_text``.

    Returns:
        ``(braces_summary, standardized_summary, words_copied)`` where the
        standardized variant wraps each copied run as ``< run >`` and the
        braces variant replaces it with the text described in the note below.
    """
    summary = row['clean_summary']
    sequences, words_copied = find_common_sequences(summary, row['clean_prompt_text'])
    braced, angled = summary, summary
    for sequence in sequences:
        angled = angled.replace(sequence, f"< {sequence} >")
        # NOTE: inside the f-string field, ``{sequence}`` is a one-element set
        # literal, so the inserted text is that set's repr (for example
        # {'copied words'}), not the bare words between braces.
        braced = braced.replace(sequence, f"{ {sequence} }")
    return braced, angled, words_copied

In [None]:
# Run mark_summary on every row; result_type='expand' spreads the returned
# 3-tuple over the three new columns in the order listed on the left.
df[['braces_summary', 'standardized_summary', 'words_copied']] = df.apply(mark_summary, axis=1, result_type='expand') 

In [None]:
# Whitespace-delimited word count of the raw summary, and the share of those
# words that was found verbatim in the prompt.
df['len'] = df['text'].apply(lambda summary: len(summary.split()))
df['rel_copy'] = df['words_copied'] / df['len']

# Transformer

## Functions and Seeding

In [None]:
def seed_everything(seed: int):
    """Seed every random number generator this file relies on.

    Sets ``PYTHONHASHSEED`` and seeds Python's ``random``, NumPy and PyTorch
    (CPU and all CUDA devices), then configures cuDNN for reproducible runs.

    Args:
        seed: Value applied to all generators.
    """
    # Local import: the file's import cell does not bring in ``random``.
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and possibly pick a different kernel
    # on each run, which defeats the determinism requested above.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [None]:
class ContentScoreRegressor:
    """Inference helper for a fine-tuned single-target regression checkpoint.

    The instance holds the tokenizer, a regression-style config and a padding
    collator built from ``model_name``. :meth:`predict` loads the weights
    stored in ``model_dir`` and scores the rows of one fold.
    """

    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        """Load tokenizer and config and remember the inference settings.

        Args:
            model_name: Path or hub id used for the tokenizer and the config.
            model_dir: Directory holding the fine-tuned weights that
                :meth:`predict` loads.
            target: Name of the label column.
            hidden_dropout_prob: Written into the config.
            attention_probs_dropout_prob: Written into the config.
            max_length: Truncation length passed to the tokenizer.
        """
        self.input_col = "input"
        self.text_cols = [self.input_col]
        self.target = target
        self.target_cols = [target]
        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(f"{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"{model_name}")
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,  # one regression output
            "problem_type": "regression",
        })
        seed_everything(seed=42)
        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )

    def tokenize_function(self, examples: pd.DataFrame):
        """Tokenize one example and attach its target value as ``labels``.

        ``examples`` is indexed by column name; the label is wrapped in a
        one-element list.
        """
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }

    def tokenize_function_test(self, examples: pd.DataFrame):
        """Tokenize one example without labels (inference)."""
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized

    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """Score the rows of ``test_df`` whose ``fold`` column equals ``fold``.

        The model input is ``prompt_title``, ``prompt_question`` and the column
        named by the global ``summary_input``, joined with the tokenizer's
        separator token. The evaluation batch size is read from the global
        ``CFG.eval_batch``.

        Side effect: the assembled input is stored in ``test_df["input"]`` for
        all rows. That column is listed in the ``drop_columns`` used by the
        LightGBM stage, so it has to exist on the frame.

        Returns:
            The predictions array produced by ``Trainer.predict``.
        """
        sep = self.tokenizer.sep_token
        test_df[self.input_col] = (
            test_df["prompt_title"] + sep
            + test_df["prompt_question"] + sep
            + test_df[summary_input]
        )
        fold_rows = test_df[[self.input_col]][test_df.fold == fold]
        dataset = Dataset.from_pandas(fold_rows, preserve_index=False)
        tokenized_dataset = dataset.map(self.tokenize_function_test, batched=False)
        model = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model.eval()

        # TrainingArguments needs directories; use self-cleaning temporary ones
        # so repeated predict() calls do not leave directories behind.
        with tempfile.TemporaryDirectory() as output_dir, \
                tempfile.TemporaryDirectory() as logging_dir:
            test_args = TrainingArguments(
                output_dir=output_dir,
                do_train=False,
                do_predict=True,
                per_device_eval_batch_size=CFG.eval_batch,
                dataloader_drop_last=False,
                logging_dir=logging_dir,
                report_to=[],  # Disable all reporting
            )
            infer_content = Trainer(
                model=model,
                tokenizer=self.tokenizer,
                data_collator=self.data_collator,
                args=test_args,
            )
            preds = infer_content.predict(tokenized_dataset)[0]
        return preds

In [None]:
def predict(
    test_df: pd.DataFrame,
    target:str,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """Run the four per-fold single-target checkpoints of one ensemble member.

    Results go to the column ``f"{target}_p{ENSEMBLE}"`` (``ENSEMBLE`` is a
    global). When the global ``MODE`` is ``'train'`` each checkpoint scores
    only its own fold; otherwise every checkpoint scores the rows with fold 0
    and the four outputs are averaged.

    Args:
        test_df: Frame to score; modified in place and returned.
        target: Target name; also part of the checkpoint directory pattern
            ``fold_{fold}_{target}``.
        model_name: Directory that contains the per-fold checkpoint
            directories (also used to load tokenizer and config).
        hidden_dropout_prob: Forwarded to ``ContentScoreRegressor``.
        attention_probs_dropout_prob: Forwarded to ``ContentScoreRegressor``.
        max_length: Tokenizer truncation length.

    Returns:
        ``test_df`` with the prediction column filled.

    Raises:
        FileNotFoundError: If no checkpoint directory matches a fold.
    """
    n_folds = 4
    pred_col = f"{target}_p{ENSEMBLE}"
    if MODE == 'test':
        test_df[pred_col] = 0

    for fold in range(n_folds):
        print(f"fold {fold}:")
        # Reset for every fold: otherwise a missing checkpoint would silently
        # reuse the directory found for the previous fold.
        model_dir = None
        for model_file in os.listdir(f"{model_name}"):
            if f"fold_{fold}_{target}" in model_file:
                model_dir = os.path.join(f"{model_name}", model_file)
        if model_dir is None:
            raise FileNotFoundError(
                f"no entry matching 'fold_{fold}_{target}' found in {model_name}"
            )
        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )
        if MODE == 'train':
            pred = csr.predict(
                test_df=test_df,
                fold=fold
            )
            test_df.loc[test_df.fold == fold, pred_col] = pred
        else:
            pred = csr.predict(
                test_df=test_df,
                fold=0
            )
            test_df[pred_col] += pred.flatten()
    if MODE == 'test':
        test_df[pred_col] /= n_folds

    return test_df

In [None]:
class ContentScoreRegressorBoth:
    """Inference helper for a fine-tuned two-target regression checkpoint.

    The instance holds the tokenizer, a regression-style config with two
    outputs and a padding collator built from ``model_name``. :meth:`predict`
    loads the weights stored in ``model_dir`` and scores the rows of one fold.
    """

    def __init__(self, 
                model_name: str,
                model_dir: str,
                target1: str,
                target2: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        """Load tokenizer and config and remember the inference settings.

        Args:
            model_name: Path or hub id used for the tokenizer and the config.
            model_dir: Directory holding the fine-tuned weights that
                :meth:`predict` loads.
            target1: Name of the first label column.
            target2: Name of the second label column.
            hidden_dropout_prob: Written into the config.
            attention_probs_dropout_prob: Written into the config.
            max_length: Truncation length passed to the tokenizer.
        """
        self.input_col = "input"
        self.text_cols = [self.input_col]
        self.target1 = target1
        self.target2 = target2
        self.target_cols = [target1, target2]
        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(f"{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"{model_name}")
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 2,  # one regression output per target
            "problem_type": "regression",
        })
        seed_everything(seed=42)
        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )

    def tokenize_function(self, examples: pd.DataFrame):
        """Tokenize one example and attach both target values as ``labels``.

        ``examples`` is indexed by column name; the labels are returned as
        ``[target1 value, target2 value]``.
        """
        labels = [examples[self.target1], examples[self.target2]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }

    def tokenize_function_test(self, examples: pd.DataFrame):
        """Tokenize one example without labels (inference)."""
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized

    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """Score the rows of ``test_df`` whose ``fold`` column equals ``fold``.

        The model input is ``prompt_title``, ``prompt_question`` and the column
        named by the global ``summary_input``, joined with the tokenizer's
        separator token. The evaluation batch size is read from the global
        ``CFG.eval_batch``.

        Side effect: the assembled input is stored in ``test_df["input"]`` for
        all rows. That column is listed in the ``drop_columns`` used by the
        LightGBM stage, so it has to exist on the frame.

        Returns:
            The predictions array produced by ``Trainer.predict``; callers in
            this file index it as ``[:, 0]`` and ``[:, 1]``.
        """
        sep = self.tokenizer.sep_token
        test_df[self.input_col] = (
            test_df["prompt_title"] + sep
            + test_df["prompt_question"] + sep
            + test_df[summary_input]
        )
        fold_rows = test_df[[self.input_col]][test_df.fold == fold]
        dataset = Dataset.from_pandas(fold_rows, preserve_index=False)
        tokenized_dataset = dataset.map(self.tokenize_function_test, batched=False)
        model = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model.eval()

        # TrainingArguments needs directories; use self-cleaning temporary ones
        # so repeated predict() calls do not leave directories behind.
        with tempfile.TemporaryDirectory() as output_dir, \
                tempfile.TemporaryDirectory() as logging_dir:
            test_args = TrainingArguments(
                output_dir=output_dir,
                do_train=False,
                do_predict=True,
                per_device_eval_batch_size=CFG.eval_batch,
                dataloader_drop_last=False,
                logging_dir=logging_dir,
                report_to=[],  # Disable all reporting
            )
            infer_content = Trainer(
                model=model,
                tokenizer=self.tokenizer,
                data_collator=self.data_collator,
                args=test_args,
            )
            preds = infer_content.predict(tokenized_dataset)[0]
        return preds

In [None]:
def predict_both(
    test_df: pd.DataFrame,
    target1: str,
    target2: str,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """Run the four per-fold two-target checkpoints of one ensemble member.

    Results go to the columns ``f"{target1}_p{ENSEMBLE}"`` and
    ``f"{target2}_p{ENSEMBLE}"`` (``ENSEMBLE`` is a global), taken from output
    columns 0 and 1 of the model. When the global ``MODE`` is ``'train'`` each
    checkpoint scores only its own fold; otherwise every checkpoint scores the
    rows with fold 0 and the four outputs are averaged.

    Args:
        test_df: Frame to score; modified in place and returned.
        target1: Name used for the first output column.
        target2: Name used for the second output column.
        model_name: Directory that contains the per-fold checkpoint
            directories (matched by ``fold_{fold}_``); also used to load
            tokenizer and config.
        hidden_dropout_prob: Forwarded to ``ContentScoreRegressorBoth``.
        attention_probs_dropout_prob: Forwarded to ``ContentScoreRegressorBoth``.
        max_length: Tokenizer truncation length.

    Returns:
        ``test_df`` with both prediction columns filled.

    Raises:
        FileNotFoundError: If no checkpoint directory matches a fold.
    """
    n_folds = 4
    col1 = f"{target1}_p{ENSEMBLE}"
    col2 = f"{target2}_p{ENSEMBLE}"
    if MODE == 'test':
        test_df[col1] = 0
        test_df[col2] = 0
    for fold in range(n_folds):
        print(f"fold {fold}:")
        # Reset for every fold: otherwise a missing checkpoint would silently
        # reuse the directory found for the previous fold.
        model_dir = None
        for model_file in os.listdir(f"{model_name}"):
            if f"fold_{fold}_" in model_file:
                model_dir = os.path.join(f"{model_name}", model_file)
        if model_dir is None:
            raise FileNotFoundError(
                f"no entry matching 'fold_{fold}_' found in {model_name}"
            )
        csr = ContentScoreRegressorBoth(
            model_name=model_name,
            target1=target1,
            target2=target2,
            model_dir=model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )
        if MODE == 'train':
            pred = csr.predict(
                test_df=test_df,
                fold=fold
            )
            in_fold = test_df.fold == fold
            test_df.loc[in_fold, col1] = pred[:, 0]
            test_df.loc[in_fold, col2] = pred[:, 1]
        else:
            pred = csr.predict(
                test_df=test_df,
                fold=0
            )
            test_df[col1] += pred[:, 0].flatten()
            test_df[col2] += pred[:, 1].flatten()
    if MODE == 'test':
        test_df[col1] /= n_folds
        test_df[col2] /= n_folds

    return test_df

## Predict

In [None]:
# Give every row a fold id.
# train: 4-fold GroupKFold grouped by prompt_id, so all summaries of one
#        prompt land in the same fold.
# other: a single fold 0 for all rows.
if MODE == 'train':
    gkf = GroupKFold(n_splits=4)
    for i, (_, val_index) in enumerate(gkf.split(df, groups=df["prompt_id"])):
        # val_index is used as row labels here — assumes df still has its
        # default 0..n-1 index (TODO confirm).
        df.loc[val_index, "fold"] = i
else:
    df['fold'] = 0

In [None]:
# Settings for the transformer inference cells below.
class CFG:
    max_length=512  # passed as max_length to predict()/predict_both()
    eval_batch=8    # read as per_device_eval_batch_size inside the regressors' predict()
# Column fed to the models as the summary text; read as a global by the
# regressors' predict() methods.
summary_input = 'standardized_summary'

In [None]:
# Ensemble member 0: single-target checkpoints, one pass per target.
# Writes content_p0 / wording_p0. summary_input is 'standardized_summary'
# at this point, so the "< >"-marked summaries are used.
model_name="/kaggle/input/commonlit-big"
ENSEMBLE = 0
for global_target in ['content','wording']:
    df = predict(
        df,
        target=global_target,
        model_name=model_name,
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        max_length=CFG.max_length
    )

In [None]:
# Ensemble member 1: single-target checkpoints, one pass per target.
# Writes content_p1 / wording_p1.
model_name="/kaggle/input/commonlit-v3-large-freeze8clean"
ENSEMBLE = 1
for global_target in ['content','wording']:
    df = predict(
        df,
        target=global_target,
        model_name=model_name,
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        max_length=CFG.max_length
    )

In [None]:
# Ensemble member 2: single-target checkpoints, one pass per target.
# Writes content_p2 / wording_p2.
model_name="/kaggle/input/v3large-freeze8-remove-summary"
ENSEMBLE = 2
for global_target in ['content','wording']:
    df = predict(
        df,
        target=global_target,
        model_name=model_name,
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        max_length=CFG.max_length
    )

In [None]:
# Ensemble member 3: single-target checkpoints, one pass per target.
# Writes content_p3 / wording_p3.
model_name="/kaggle/input/cv3large-more-regularization-freeze68"
ENSEMBLE = 3
for global_target in ['content','wording']:
    df = predict(
        df,
        target=global_target,
        model_name=model_name,
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        max_length=CFG.max_length
    )

In [None]:
# Ensemble member 4: two-target checkpoints fed with the brace-marked
# summaries (the global summary_input is switched here and stays switched).
# Writes content_p4 / wording_p4.
summary_input = 'braces_summary' 
model_name="/kaggle/input/v3l-f5-braces-both"
ENSEMBLE = 4
df = predict_both(
    df,
    target1='content',
    target2='wording',
    model_name=model_name,
    hidden_dropout_prob=0,
    attention_probs_dropout_prob=0,
    max_length=CFG.max_length
)

In [None]:
# Ensemble member 5: two-target checkpoints fed with the brace-marked
# summaries. Writes content_p5 / wording_p5.
summary_input = 'braces_summary' 
model_name="/kaggle/input/v3l-f8-braces-both"
ENSEMBLE = 5
df = predict_both(
    df,
    target1='content',
    target2='wording',
    model_name=model_name,
    hidden_dropout_prob=0,
    attention_probs_dropout_prob=0,
    max_length=CFG.max_length
)

In [None]:
# Ensemble member 6 is switched off: the guard never runs, and index 6 is
# also missing from the `ensembles` list that is averaged later.
if False:
    summary_input = 'braces_summary' 
    model_name="/kaggle/input/v3l-f7-braces-both"
    ENSEMBLE = 6
    df = predict_both(
        df,
        target1='content',
        target2='wording',
        model_name=model_name,
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        max_length=CFG.max_length
    )

In [None]:
# Ensemble member 7: two-target checkpoints fed with the brace-marked
# summaries. Writes content_p7 / wording_p7.
summary_input = 'braces_summary' 
model_name="/kaggle/input/v3l-f12-braces-both"
ENSEMBLE = 7
df = predict_both(
    df,
    target1='content',
    target2='wording',
    model_name=model_name,
    hidden_dropout_prob=0,
    attention_probs_dropout_prob=0,
    max_length=CFG.max_length
)

In [None]:
# Indices of the ensemble members whose *_p{index} columns are averaged.
ensembles = [0,1,2,3,4,5,7]

# Equal-weight mean of the member predictions, accumulated member by member.
df['content_mean'] = 0
df['wording_mean'] = 0
# NOTE: the loop variable reuses, and therefore overwrites, the global
# ENSEMBLE that predict()/predict_both() read.
for ENSEMBLE in ensembles:
    df['content_mean'] += df[f'content_p{ENSEMBLE}'] / len(ensembles)
    df['wording_mean'] += df[f'wording_p{ENSEMBLE}'] / len(ensembles)


if MODE == 'train':
    for ENSEMBLE in ensembles:
        # Per-member RMSEs are computed but not shown (the print is commented
        # out), and the *_dif columns written here are overwritten below.
        content_rmse = np.sqrt(((df['content'] - df[f'content_p{ENSEMBLE}']) ** 2).mean())
        wording_rmse = np.sqrt(((df['wording'] - df[f'wording_p{ENSEMBLE}']) ** 2).mean())

        rmse = (content_rmse + wording_rmse)/2
        #print(f'{content_rmse}\n{wording_rmse}\n{rmse}')

        df['content_dif'] = df['content'] - df[f'content_p{ENSEMBLE}']
        df['wording_dif'] = df['wording'] - df[f'wording_p{ENSEMBLE}']

    # Score of the averaged ensemble: RMSE per target and their mean.
    content_rmse = np.sqrt(((df['content'] - df[f'content_mean']) ** 2).mean())
    wording_rmse = np.sqrt(((df['wording'] - df[f'wording_mean']) ** 2).mean())

    rmse = (content_rmse + wording_rmse)/2
    print(f'{content_rmse}\n{wording_rmse}\n{rmse}')

    # Residual of the ensemble mean (true minus predicted).
    df['content_dif'] = df['content'] - df[f'content_mean']
    df['wording_dif'] = df['wording'] - df[f'wording_mean']

    df::10 with commonlit-big trained with marking and removal from prompt and freeze8clean trained on removal from summary
    with removal from prompt
    0.420812660489145
    0.5436554022594821
    0.48223403137431353
    0.4225474840718467
    0.5390831076476523
    0.4808152958597495
    0.41635675494060786
    0.53341817650472
    0.47488746572266394
    
    With removal from summary:
    0.4145619676413469
    0.5405258671127942
    0.4775439173770705
    0.4209995014473389
    0.5328895652573283
    0.4769445333523336
    0.4117069578676729
    0.5292576582820044
    0.4704823080748387
    
    Full
    0.4800887487439707  0 (single + <>)
    0.4846268136752474  1
    0.4804681110307913  2
    0.4815882008131167  3
    0.4794552627000645  4 (both + braces)
    0.4810434541529361  5 
    
    
    0.4755065724485361  0+1
    0.4729761046873674  0-2
    0.4713179620663737  0-3
    0.4677167678019531  0-4
    0.4666305670723127  0-5 postprocess: 0.4636365941271711 Faulty!
    0.4658944885731245  0,1,2,3,4,5,7 non grouped lgbm: 0.45228009742238695
    
    => Normalized features and mean-removed content and wording differences across folds, LGB:
    mcrmse : 0.46427082773567485, content: 0.40641374126603336, wording: 0.5221279142053163

In [None]:
# Start the final predictions from the ensemble mean; the post-processing
# stage adds its correction to these columns in test mode.
df['content_prediction'] = df.content_mean
df['wording_prediction'] = df.wording_mean

# Post-Process

In [None]:
# In train mode, report the RMSE of each prediction column and their mean.
if MODE == 'train':
    content_error = df['content'] - df['content_prediction']
    wording_error = df['wording'] - df['wording_prediction']
    content_rmse = np.sqrt((content_error ** 2).mean())
    wording_rmse = np.sqrt((wording_error ** 2).mean())
    rmse = (content_rmse + wording_rmse)/2
    print(f'{content_rmse}\n{wording_rmse}\n{rmse}')

In [None]:
if POSTPROCESS:
    prompt_count = len(df.prompt_id.unique())
    # In test mode, re-split so that every prompt forms its own fold.
    # GroupKFold refuses n_splits < 2, so with a single prompt the fold column
    # assigned earlier (all zeros outside train mode) is kept unchanged.
    if MODE == 'test' and prompt_count > 1:
        gkf = GroupKFold(n_splits=prompt_count)
        for i, (_, val_index) in enumerate(gkf.split(df, groups=df["prompt_id"])):
            df.loc[val_index, "fold"] = i

In [None]:
if POSTPROCESS:
    # NOTE: this rebinds the global model_name; the embedding cell that follows
    # loads its model from this path.
    model_name = '/kaggle/input/v3-large-base'
    tokenizer = AutoTokenizer.from_pretrained(model_name) 
    tokenized = tokenizer(df['braces_summary'].tolist())
    # Number of input ids the tokenizer produced for each brace-marked summary.
    token_counts = [len(toks) for toks in tokenized['input_ids']]
    df['token_count'] = token_counts
    # True when the raw summary contains the literal phrase 'take notes on'.
    df['copy_description'] = df.text.apply(lambda x: 'take notes on' in x)

In [None]:
# NOTE(review): this replaces the CFG class defined earlier. The new class has
# no eval_batch attribute, which the regressors' predict() methods read, so
# those methods cannot be called once this cell has run.
class CFG:
    # Only max_length is read by the code visible in this file; the embedding
    # cell loads its model from the global model_name, not from CFG.model_name.
    model_name= '/kaggle/input/deberta-v3-large'
    hidden_dropout_prob=0        
    attention_probs_dropout_prob=0 
    max_length=512
# Not referenced by the visible code, which calls .cuda() directly.
device = 'cuda'

In [None]:
if POSTPROCESS:
    # Stable sort: rows are grouped by fold while keeping their relative order,
    # so the embeddings appended below line up with the rows of df.
    df = df.sort_values(by='fold', kind='mergesort').reset_index(drop=True)
    model = AutoModelForSequenceClassification.from_pretrained(f"{model_name}")
    model.eval().cuda()
    sep = tokenizer.sep_token

    def _mean_pooled_hidden_states(text, max_length):
        """Return one token-averaged vector per hidden-state output for ``text``."""
        tokens = tokenizer(text, padding=False, truncation=True, return_tensors="pt", max_length=max_length)
        with torch.no_grad():
            hidden_states = model(tokens['input_ids'].cuda(), output_hidden_states=True)['hidden_states']
            # Each hidden state is squeezed (dropping the batch axis of size 1)
            # and averaged over axis 0, the token axis.
            return np.array([np.array(h.cpu()).squeeze().mean(axis=0) for h in hidden_states])

    # Summary embeddings, visited fold by fold in row order.
    outputs = []
    for fold in range(prompt_count):
        for text in df[df.fold == fold]['text']:
            outputs.append(_mean_pooled_hidden_states(text, CFG.max_length))
    outputs = np.array(outputs)

    # One prompt embedding per fold, taken from the fold's first row
    # (assumes all rows of a fold share the same prompt — TODO confirm for
    # train mode).
    pt_out = []
    for fold in range(prompt_count):
        first_row = df[df.fold == fold].iloc[0]
        prompt_input = first_row["prompt_title"] + sep + first_row["prompt_text"] + sep + first_row["prompt_question"]
        pt_out.append(_mean_pooled_hidden_states(prompt_input, 2048))
    # Deliberately not squeezed: with a single prompt a squeeze would drop the
    # leading fold axis that the cosine-similarity cell indexes.
    pt_out = np.array(pt_out)

In [None]:
if POSTPROCESS:
    # Same stable sort as in the embedding cell, so row order still matches
    # the order in which `outputs` was filled.
    df = df.sort_values(by='fold', kind='mergesort').reset_index(drop=True)
    NORMALIZE = True
    # One feature per hidden-state index 0..24: similarity between each summary
    # embedding and the embedding of its fold's prompt.
    for LAYER in range(25):
        # Despite the names, no dimensionality reduction or PCA happens here.
        # Basic slicing yields views, so the in-place divisions below also
        # normalise the corresponding slices of `outputs` and `pt_out`.
        reduced_features = outputs[:,LAYER]
        if NORMALIZE:
            reduced_features /= np.linalg.norm(reduced_features, axis=1, keepdims=True)
        prompt_text_pca = pt_out[:,LAYER]
        if NORMALIZE:
            prompt_text_pca /= np.linalg.norm(prompt_text_pca, axis=1, keepdims=True)
        cosine_sims = np.zeros(len(df))
        # Rows are contiguous per fold after the sort; walk them block by block.
        start = 0
        for fold in range(prompt_count):
            l = len(df[df.fold==fold])
            # Dot product with the fold's prompt vector (a cosine similarity
            # when NORMALIZE is on, since both sides then have unit length).
            cosine_fold = (reduced_features[start:start+l] * prompt_text_pca[fold]).sum(axis=1)
            cosine_sims[start:start+l] = cosine_fold
            start += l
        df[f'cos_sim{LAYER}'] = cosine_sims 

In [None]:
if POSTPROCESS:
    # Standardise each feature inside every fold: (x - fold mean) / fold std.
    # The *_normalized columns are created in the order of this list; the
    # LightGBM feature frame is whatever remains after dropping columns, so
    # that order is preserved deliberately.
    to_normalized = ['rel_copy', 'token_count'] + [f'cos_sim{layer}' for layer in range(25)]
    for fold in range(prompt_count):
        fold_mask = df.fold == fold
        for column in to_normalized:
            values = df.loc[fold_mask, column]
            df.loc[fold_mask, f'{column}_normalized'] = (values - values.mean()) / values.std()
    if MODE == 'train':
        # Residuals of the ensemble mean, then centred per fold (mean removed,
        # no scaling) to serve as LightGBM training targets.
        df['content_dif'] = df['content'] - df[f'content_mean']
        df['wording_dif'] = df['wording'] - df[f'wording_mean']
        to_normalized = ['content_dif', 'wording_dif']
        for fold in range(prompt_count):
            fold_mask = df.fold == fold
            for column in to_normalized:
                values = df.loc[fold_mask, column]
                df.loc[fold_mask, f'{column}_normalized'] = values - values.mean()

In [None]:
def train_lgb(target, drop_columns, l1=0.1, l2=0.1, max_depth=3, lr=0.1, display=False, save=False): 
    """Train per-fold LightGBM regressors on the global ``df`` and score them.

    For every training target, one model is fitted per fold (4 folds) on the
    rows outside that fold, with early stopping on the rows inside it. Each
    model is then evaluated on its held-out fold against the un-normalised
    column, i.e. the target name without its ``_normalized`` suffix.

    Args:
        target: Training target column name, or a list of them. The existing
            call in this file passes the list of both ``*_dif_normalized``
            columns.
        drop_columns: Columns removed from ``df`` to obtain the feature frame.
        l1: ``lambda_l1`` regularisation.
        l2: ``lambda_l2`` regularisation.
        max_depth: Tree depth limit.
        lr: Learning rate.
        display: Plot split-based feature importance of every model.
        save: Save each model as ``{eval_target}_model_{fold}.txt``.

    Returns:
        Mean of the per-target RMSEs.
    """
    suffix = "_normalized"
    # Use the argument instead of relying on a global list of targets.
    train_targets = [target] if isinstance(target, str) else list(target)
    n_folds = 4

    # Loop-invariant: features depend on neither the target nor the fold.
    features = df.drop(columns=drop_columns)
    fold_of_row = df["fold"]

    # Training
    model_dict = {}
    for train_target in train_targets:
        models = []
        for fold in range(n_folds):
            train_rows = fold_of_row != fold
            eval_rows = fold_of_row == fold
            dtrain = lgb.Dataset(features[train_rows], label=df[train_rows][train_target])
            dval = lgb.Dataset(features[eval_rows], label=df[eval_rows][train_target])
            params = {
                'boosting_type': 'gbdt',
                'random_state': 42,
                'objective': 'regression',
                'metric': 'rmse',
                'learning_rate': lr,
                'max_depth': max_depth,
                'lambda_l1': l1,
                'lambda_l2': l2,
                'verbose': -1
            }
            evaluation_results = {}
            model = lgb.train(params,
                              num_boost_round=1000,
                              valid_names=['train', 'valid'],
                              train_set=dtrain,
                              valid_sets=[dtrain, dval],
                              callbacks=[
                                  lgb.early_stopping(stopping_rounds=30, verbose=True),
                                  lgb.log_evaluation(100),
                                  lgb.callback.record_evaluation(evaluation_results)
                              ],
                              )
            models.append(model)
        model_dict[train_target] = models

    # Inference on the held-out folds
    rmses = []
    eval_targets = []
    for train_target in train_targets:
        if train_target.endswith(suffix):
            eval_target = train_target[:-len(suffix)]
        else:
            eval_target = train_target
        eval_targets.append(eval_target)
        preds = []
        trues = []
        for fold, model in enumerate(model_dict[train_target]):
            if save:
                model.save_model(f"{eval_target}_model_{fold}.txt")
            eval_rows = fold_of_row == fold
            pred = model.predict(features[eval_rows])
            trues.extend(df[eval_rows][eval_target])
            preds.extend(pred)
        rmse = np.sqrt(((np.array(trues)-np.array(preds))**2).mean())
        print(f"{eval_target}_rmse : {rmse}")
        rmses = rmses + [rmse]
    mean_rmse = sum(rmses) / len(rmses)
    # Label each score with the first part of its column name
    # ('content_dif' -> 'content').
    per_target = ", ".join(
        f"{name.split('_')[0]}: {value}" for name, value in zip(eval_targets, rmses)
    )
    print(f"mcrmse : {mean_rmse}, {per_target}")
    if display:
        for shown_target, models in model_dict.items():
            for i, model in enumerate(models):
                lgb.plot_importance(model, importance_type='split')
                plt.title(f'Model for target {shown_target}, fold {i}')
                plt.show()
    return mean_rmse

In [None]:
if POSTPROCESS:
    targets = ['content_dif_normalized', 'wording_dif_normalized']
    # Identifier, text and bookkeeping columns that are never model features.
    drop_columns = ['prompt_id', 'prompt_question', 'prompt_title', 'prompt_text', 'student_id', 'text',
                    'clean_summary', 'clean_prompt_text', 'braces_summary', 'standardized_summary',
                    'fold', 'input', 'content_prediction', 'wording_prediction']
    if MODE == 'train':
        # Train mode also has label-derived columns, which must be excluded
        # from the features as well.
        drop_columns = drop_columns + [
            'content_dif', 'wording_dif',
            'content', 'wording',
        ] + targets
        train_lgb(targets, drop_columns, l1=0.1, l2=0.25, max_depth=15, lr=0.07, display=False, save=True)

In [None]:
if POSTPROCESS:
    # Load the four saved per-fold residual models for each target.
    model_dict = {}
    for target in ['content_dif','wording_dif']:
        models = []
        for fold in range(4):
            model = lgb.Booster(model_file=f"/kaggle/input/ensemble-lgb-fold-corrected/{target}_model_{fold}.txt")
            models.append(model)
        model_dict[target] = models

    if MODE == 'train':
        # Out-of-fold check: each model predicts the residual of its own fold.
        rmses = []
        for target in ['content_dif', 'wording_dif']:
            models = model_dict[f"{target}"]
            preds = []
            trues = []
            for fold, model in enumerate(models):
                fold_rows = df[df["fold"] == fold]
                pred = model.predict(fold_rows.drop(columns=drop_columns))
                trues.extend(fold_rows[target])
                preds.extend(pred)
            rmse = np.sqrt(((np.array(trues) - np.array(preds)) ** 2).mean())
            print(f"{target}_rmse : {rmse}")
            rmses = rmses + [rmse]
        print(f"mcrmse : {sum(rmses) / len(rmses)}, content: {rmses[0]}, wording: {rmses[1]}")
    else:
        # The feature frame is the same for every model and target: the
        # *_prediction columns updated below are listed in drop_columns, so
        # it can be built once.
        features = df.drop(columns=drop_columns)
        for target in ['content', 'wording']:
            models = model_dict[f"{target}_dif"]
            preds = np.zeros(len(df))
            for model in models:
                # Average the predicted residual over the loaded fold models.
                preds += model.predict(features) / len(models)
            # Add the predicted residual to the ensemble prediction.
            df[f'{target}_prediction'] += preds

In [None]:
# Submission frame: the id plus the two final predictions, renamed to the
# column names 'content' and 'wording'. rename() returns a new frame.
sub_df = df[['student_id', 'content_prediction', 'wording_prediction']].rename(
    columns={'content_prediction': 'content', 'wording_prediction': 'wording'}
)
sub_df

In [None]:
# Write the submission file to the working directory, without the index column.
sub_df.to_csv("submission.csv", index=False)