In [None]:
!pip install coral_pytorch transformers datasets torch scikit-learn accelerate



In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_absolute_error, mean_squared_error
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
from coral_pytorch.losses import CornLoss, CoralLoss
from coral_pytorch.dataset import corn_label_from_logits

In [None]:
# Single seed shared by the NumPy / PyTorch global RNGs and reused below wherever
# a `random_state` is required.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Run configuration used by the cells below.
MODEL_NAME = "xlm-roberta-base"  # checkpoint passed to AutoTokenizer / AutoModel
MAX_LEN = 512                    # every text is padded/truncated to this many tokens
BATCH_SIZE = 8                   # per-device training batch size for the Trainer
EPOCHS_DEEP = 6                  # number of training epochs for the transformer model
NUM_CLASSES = 8                  # number of ordinal classes; labels are clipped to 0..NUM_CLASSES-1

In [None]:
# Read the two input tables; the cells below expect the merged frame to expose
# the 'text' and 'labels' columns plus every column listed in `feature_cols`.
df_text = pd.read_csv("training_data.csv")
df_features = pd.read_csv("training_features.csv")

# The tables are joined purely by row position. With different row counts
# pd.concat(axis=1) would silently pad with NaN and misalign texts and features,
# so fail loudly instead.
if len(df_text) != len(df_features):
    raise ValueError(
        f"Row count mismatch: training_data.csv has {len(df_text)} rows, "
        f"training_features.csv has {len(df_features)} rows"
    )

df = pd.concat([df_text, df_features], axis=1)

In [None]:
# Names of the `df` columns that are standardised and fed to the feature-based model.
# The line grouping is only visual (by column name); the order of the list is unchanged.
feature_cols = [
    'sent_len_tokens_mean', 'sent_len_tokens_max',
    'token_len_chars_mean', 'token_len_chars_max',
    'mattr', 'lexical_density',
    'noun_ratio', 'verb_ratio', 'adj_ratio', 'pron_ratio', 'function_word_ratio',
    'clauses_per_sentence', 'avg_tree_depth',
    'ari', 'fres', 'smog', 'cli',
    'sp_rate', 'gr_rate',
    'ratio_a1', 'ratio_a2', 'ratio_b1', 'ratio_b2', 'ratio_c1',
]

In [None]:
class FeatureCORN(nn.Module):
    """Linear ordinal head over a fixed-size feature vector.

    Maps ``input_dim`` features to ``num_classes - 1`` raw logits, the output
    width that ``CornLoss`` / ``corn_label_from_logits`` are given later in
    this notebook.

    Args:
        input_dim: number of input features per sample.
        num_classes: number of ordinal classes (default 8).
    """

    def __init__(self, input_dim, num_classes=8):
        super().__init__()
        n_outputs = num_classes - 1
        self.linear = nn.Linear(in_features=input_dim, out_features=n_outputs)

    def forward(self, x):
        """Return the un-activated logits of the linear layer for input ``x``."""
        logits = self.linear(x)
        return logits

In [None]:
class XLMR_DeepCORAL(nn.Module):
    """Transformer encoder with a CORAL-style ordinal head.

    The hidden state at sequence position 0 is projected to one scalar score
    (no bias); ``num_classes - 1`` learnable biases are added to that shared
    score to produce one logit per ordinal threshold.

    Args:
        model_checkpoint: name or path handed to ``AutoModel.from_pretrained``.
        num_classes: number of ordinal classes (default 8).
    """

    def __init__(self, model_checkpoint, num_classes=8):
        super().__init__()
        n_thresholds = num_classes - 1
        self.backbone = AutoModel.from_pretrained(model_checkpoint)
        hidden_dim = self.backbone.config.hidden_size
        self.linear = nn.Linear(hidden_dim, 1, bias=False)
        self.biases = nn.Parameter(torch.zeros(n_thresholds))

    def forward(self, input_ids, attention_mask, labels=None, **kwargs):
        """Compute threshold logits and, when labels are given, the summed BCE loss.

        Args:
            input_ids: token ids forwarded to the backbone.
            attention_mask: attention mask forwarded to the backbone.
            labels: optional integer class labels, one per sample.
            **kwargs: accepted and ignored, so extra batch columns do not break the call.

        Returns:
            ``logits`` alone when ``labels`` is None, otherwise ``(loss, logits)``.
        """
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]
        # (batch, 1) score broadcast against the (num_classes - 1,) biases.
        logits = self.linear(first_token) + self.biases

        if labels is None:
            return logits

        # Binary target for threshold k is 1 exactly when label > k.
        thresholds = torch.arange(logits.size(1), device=logits.device)
        targets = (labels.unsqueeze(1) > thresholds).float()

        # The logits-based BCE is used (rather than sigmoid + BCE) for FP16 stability.
        loss = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='sum')
        return loss, logits

In [None]:
def get_coral_score(logits):
    """Return a continuous ordinal score per sample: the sum of sigmoid(logit) over axis 1.

    Args:
        logits: 2-D array-like of threshold logits with shape (n_samples, n_thresholds).

    Returns:
        1-D NumPy array of length n_samples; each value lies in [0, n_thresholds].
    """
    # Accept lists / nested sequences as well as ndarrays.
    logits = np.asarray(logits)
    # sigmoid(x) = exp(-log(1 + exp(-x))). logaddexp evaluates the inner log without
    # overflowing, so very negative logits no longer trigger RuntimeWarnings.
    probs = np.exp(-np.logaddexp(0, -logits))
    return np.sum(probs, axis=1)


In [None]:
# Tokenizer matching the backbone checkpoint, and a shuffled, seeded
# stratified 5-fold splitter for the cross-validation loop.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# One list per tracked quantity; each fold appends exactly one value.
fold_results = {metric: [] for metric in ('qwk', 'mae', 'rmse', 'accuracy', 'best_alpha')}

In [None]:
def tok_func(ex):
    """Tokenize a batch of examples, padding/truncating each text to exactly MAX_LEN tokens."""
    return tokenizer(ex["text"], padding="max_length", truncation=True, max_length=MAX_LEN)


def _fuse_predictions(alpha, deep_scores, feat_preds, num_classes):
    """Blend both models' outputs and snap the result to a valid integer label.

    Args:
        alpha: weight given to ``deep_scores``; ``1 - alpha`` goes to ``feat_preds``.
        deep_scores: per-sample continuous scores from the deep model.
        feat_preds: per-sample label predictions from the feature model.
        num_classes: number of ordinal classes; output is clipped to [0, num_classes - 1].

    Returns:
        Integer array with one fused label per sample.
    """
    blended = (alpha * deep_scores) + ((1 - alpha) * feat_preds)
    return np.clip(np.round(blended).astype(int), 0, num_classes - 1)


# Take the fold count from the splitter so the banner cannot drift from its configuration.
n_folds = skf.get_n_splits()

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['labels'])):
    print(f"\n{'='*20} FOLD {fold+1}/{n_folds} {'='*20}")

    train_df = df.iloc[train_idx].copy()
    val_df = df.iloc[val_idx].copy()
    y_val = val_df['labels'].values

    # ---------------------------
    # STEP 1: Train Feature-CORN
    # ---------------------------
    print("Training Feature-CORN...")
    # Fit the scaler on the training fold only, then apply it to the validation fold.
    scaler = StandardScaler()
    X_train_feats = scaler.fit_transform(train_df[feature_cols].values)
    X_val_feats = scaler.transform(val_df[feature_cols].values)

    # num_classes is passed explicitly so the model width always matches CornLoss.
    feat_model = FeatureCORN(input_dim=len(feature_cols), num_classes=NUM_CLASSES)
    optimizer = torch.optim.Adam(feat_model.parameters(), lr=0.01)
    criterion = CornLoss(num_classes=NUM_CLASSES)

    X_train_t = torch.tensor(X_train_feats, dtype=torch.float32)
    y_train_t = torch.tensor(train_df['labels'].values, dtype=torch.long)

    # Full-batch training: one optimizer step per epoch over the whole training fold.
    for epoch in range(50):
        optimizer.zero_grad()
        logits = feat_model(X_train_t)
        loss = criterion(logits, y_train_t)
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        feat_logits = feat_model(torch.tensor(X_val_feats, dtype=torch.float32))
        preds_feat = corn_label_from_logits(feat_logits).numpy() # Shape [N]

    # ---------------------------
    # STEP 2: Train Deep CORAL
    # ---------------------------
    print("Training Deep CORAL...")

    train_ds = Dataset.from_pandas(train_df[['text', 'labels']]).map(tok_func, batched=True)
    val_ds = Dataset.from_pandas(val_df[['text', 'labels']]).map(tok_func, batched=True)

    deep_model = XLMR_DeepCORAL(MODEL_NAME, num_classes=NUM_CLASSES)

    args = TrainingArguments(
        output_dir=f"results_fusion_fold_{fold}",
        num_train_epochs=EPOCHS_DEEP,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=2e-5,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),
        save_strategy="no",
        remove_unused_columns=False
    )

    trainer = Trainer(model=deep_model, args=args, train_dataset=train_ds, eval_dataset=val_ds)
    trainer.train()

    # Get Predictions: threshold logits -> continuous score (sum of sigmoid probabilities)
    raw_preds_deep = trainer.predict(val_ds).predictions
    preds_deep = get_coral_score(raw_preds_deep)

    # ---------------------------
    # STEP 3: Late fusion
    # ---------------------------
    # NOTE: alpha is selected on the same validation fold the metrics below are
    # reported on, so the per-fold numbers are optimistic estimates.
    print("Optimizing Fusion...")
    best_qwk = -1
    best_alpha = 0.0

    # Grid search for alpha based on QWK (ties keep the smallest alpha)
    for alpha in np.linspace(0, 1, 21):
        combined_int = _fuse_predictions(alpha, preds_deep, preds_feat, NUM_CLASSES)

        score = cohen_kappa_score(y_val, combined_int, weights='quadratic')
        if score > best_qwk:
            best_qwk = score
            best_alpha = alpha

    # Calculating ALL Metrics for Best Alpha
    final_preds = _fuse_predictions(best_alpha, preds_deep, preds_feat, NUM_CLASSES)

    best_mae = mean_absolute_error(y_val, final_preds)
    best_rmse = np.sqrt(mean_squared_error(y_val, final_preds))
    best_acc = accuracy_score(y_val, final_preds)

    print(f"Fold {fold+1} Result: Alpha={best_alpha:.2f} | QWK={best_qwk:.4f} | MAE={best_mae:.4f} | RMSE={best_rmse:.4f}")

    fold_results['qwk'].append(best_qwk)
    fold_results['mae'].append(best_mae)
    fold_results['rmse'].append(best_rmse)
    fold_results['accuracy'].append(best_acc)
    fold_results['best_alpha'].append(best_alpha)

    # Cleanup: drop references to this fold's models before the next fold allocates new ones
    del deep_model, trainer, feat_model
    torch.cuda.empty_cache()


# Cross-validation summary: mean ± standard deviation (NumPy default, ddof=0)
# of every tracked quantity across the folds.
banner = "=" * 40
print("\n" + banner)
print("FINAL LATE FUSION RESULTS")
print(banner)
for metric_name, per_fold in fold_results.items():
    print(f"{metric_name.upper()}: {np.mean(per_fold):.4f} ± {np.std(per_fold):.4f}")


Training Feature-CORN...
Training Deep CORAL...


Map:   0%|          | 0/910 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: xlm-roberta-base
Key                       | Status     |  | 
--------------------------+------------+--+-
lm_head.layer_norm.weight | UNEXPECTED |  | 
lm_head.dense.weight      | UNEXPECTED |  | 
lm_head.layer_norm.bias   | UNEXPECTED |  | 
lm_head.dense.bias        | UNEXPECTED |  | 
lm_head.bias              | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Step,Training Loss
500,30.793576


Optimizing Fusion...
Fold 1 Result: Alpha=0.95 | QWK=0.8558 | MAE=0.5395 | RMSE=0.8455

Training Feature-CORN...
Training Deep CORAL...


Map:   0%|          | 0/910 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: xlm-roberta-base
Key                       | Status     |  | 
--------------------------+------------+--+-
lm_head.layer_norm.weight | UNEXPECTED |  | 
lm_head.dense.weight      | UNEXPECTED |  | 
lm_head.layer_norm.bias   | UNEXPECTED |  | 
lm_head.dense.bias        | UNEXPECTED |  | 
lm_head.bias              | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Step,Training Loss
500,31.06723


Optimizing Fusion...
Fold 2 Result: Alpha=0.95 | QWK=0.8607 | MAE=0.4825 | RMSE=0.7666

Training Feature-CORN...
Training Deep CORAL...


Map:   0%|          | 0/910 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: xlm-roberta-base
Key                       | Status     |  | 
--------------------------+------------+--+-
lm_head.layer_norm.weight | UNEXPECTED |  | 
lm_head.dense.weight      | UNEXPECTED |  | 
lm_head.layer_norm.bias   | UNEXPECTED |  | 
lm_head.dense.bias        | UNEXPECTED |  | 
lm_head.bias              | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Step,Training Loss
500,31.228254


Optimizing Fusion...
Fold 3 Result: Alpha=0.95 | QWK=0.8729 | MAE=0.4693 | RMSE=0.7404

Training Feature-CORN...
Training Deep CORAL...


Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: xlm-roberta-base
Key                       | Status     |  | 
--------------------------+------------+--+-
lm_head.layer_norm.weight | UNEXPECTED |  | 
lm_head.dense.weight      | UNEXPECTED |  | 
lm_head.layer_norm.bias   | UNEXPECTED |  | 
lm_head.dense.bias        | UNEXPECTED |  | 
lm_head.bias              | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Step,Training Loss
500,30.901932


Optimizing Fusion...
Fold 4 Result: Alpha=0.95 | QWK=0.8695 | MAE=0.4802 | RMSE=0.7480

Training Feature-CORN...
Training Deep CORAL...


Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: xlm-roberta-base
Key                       | Status     |  | 
--------------------------+------------+--+-
lm_head.layer_norm.weight | UNEXPECTED |  | 
lm_head.dense.weight      | UNEXPECTED |  | 
lm_head.layer_norm.bias   | UNEXPECTED |  | 
lm_head.dense.bias        | UNEXPECTED |  | 
lm_head.bias              | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Step,Training Loss
500,31.284807


Optimizing Fusion...
Fold 5 Result: Alpha=0.95 | QWK=0.8454 | MAE=0.5903 | RMSE=0.8448

FINAL LATE FUSION RESULTS
QWK: 0.8609 ± 0.0099
MAE: 0.5123 ± 0.0460
RMSE: 0.7891 ± 0.0466
ACCURACY: 0.5395 ± 0.0367
BEST_ALPHA: 0.9500 ± 0.0000
