In [1]:
from pydantic import BaseModel, Field
from typing import Any, Dict, List, Optional, Tuple
import json
from enum import Enum
class HypothesisComponent(str, Enum):
    DataLoadSpec = "DataLoadSpec"
    FeatureEng = "FeatureEng"
    Model = "Model"
    Ensemble = "Ensemble"
    Workflow = "Workflow"

class HypothesisEvaluationReasoningScore(BaseModel):
    reasoning: str = Field(
        description="What is the quality of the hypothesis under this criteria? Answer in 1-2 sentence."
    )
    score: float = Field(description="The score of the hypothesis under this criteria between 1 and 10.")


class HypothesisEvaluation(BaseModel):
    alignment: HypothesisEvaluationReasoningScore = Field(
        description="The alignment of the proposed hypothesis with the identified challenge."
    )
    impact: HypothesisEvaluationReasoningScore = Field(
        description="The expected impact of the proposed hypothesis on the current SOTA implementation."
    )
    novelty: HypothesisEvaluationReasoningScore = Field(
        description="The novelty of the proposed hypothesis compared to existing solutions."
    )
    feasibility: HypothesisEvaluationReasoningScore = Field(
        description="The feasibility of implementing the proposed hypothesis in the current SOTA implementation."
    )
    risk_reward_balance: HypothesisEvaluationReasoningScore = Field(
        description="The risk-reward balance of implementing the proposed hypothesis."
    )


class HypothesisDetail(BaseModel):
    caption: str = Field(description="The caption of the challenge it is based on.")
    challenge: str = Field(
        description="Reaffirm the challenge within the current context (e.g., trace history, domain principles, or competition constraints). It should be no more than 2-3 sentences."
    )
    hypothesis: str = Field(
        description="The statement of the hypothesis. It could be a design of a new component, or a concise, testable statement derived from previous experimental outcomes."
    )
    metric_impact: str = Field(
        description=(
            "Brief explanation (max 2 sentences) of the expected impact of the hypothesis on the target metric."
        )
    )
    component: HypothesisComponent = Field(description="The component tag of the hypothesis.")
    evaluation: HypothesisEvaluation = Field(description="Evaluate the quality of the hypothesis.")


class HypothesisList(BaseModel):
    deduplicated_challenges: List[str] = Field(
        description="A list of deduplicated challenge captions. Each must retain its original wording. If multiple captions are semantically identical, keep the first one."
    )
    hypotheses: List[HypothesisDetail] = Field(
        description="A non-empty list of hypotheses proposed for the next iteration, each corresponding to one challenge. The list length should match the number of challenges."
    )


In [2]:
from rdagent.oai.llm_utils import APIBackend

[32m2025-07-17 09:50:23.922[0m | [1mINFO    [0m | [36mrdagent.oai.backend.litellm[0m:[36m<module>[0m:[36m43[0m - [1mbackend='rdagent.oai.backend.LiteLLMAPIBackend' chat_model='o3' embedding_model='text-embedding-ada-002' reasoning_effort=None enable_response_schema=True reasoning_think_rm=True log_llm_chat_content=True use_azure=False chat_use_azure=False embedding_use_azure=False chat_use_azure_token_provider=False embedding_use_azure_token_provider=False managed_identity_client_id=None max_retry=12000 retry_wait_seconds=5 dump_chat_cache=True use_chat_cache=False dump_embedding_cache=True use_embedding_cache=False prompt_cache_path='./log/prompt_cache.db' max_past_message_include=10 timeout_fail_limit=10 violation_fail_limit=1 use_auto_chat_cache_seed_gen=False init_chat_cache_seed=42 openai_api_key='sk-1234' chat_openai_api_key=None chat_openai_base_url=None chat_azure_api_base='' chat_azure_api_version='' chat_max_tokens=None chat_temperature=1.0 chat_stream=True chat_se

In [3]:
user_prompt = """
# Scenario Description
====== Background ======
You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science. 
Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems.
You are dedicated to producing accurate, efficient, and innovative solutions.

The task type for this competition is **Classification (Ordinal classification of essay scores 1–6)**.
The data type used in this competition is **Text (Natural Language Processing)**.

Briefly, the competition involves: Participants must build an automated essay–scoring model that assigns a holistic score from 1 to 6 to each student-written argumentative essay, reducing the need for manual grading..

The dataset used in this competition is:
Processed data consist of three CSV files.
1. train.csv – 15,576 rows × 3 columns: essay_id (string), full_text (the complete essay), score (integer 1-6).
2. test.csv – 1,731 rows × 2 columns: essay_id, full_text (no score column).
3. sample_submission.csv – 1,731 rows × 2 columns matching the required submission format (essay_id, score) with sample integer scores.
These files come from a larger corpus (~24 000 essays) described in the competition overview. The processed version keeps only the essential columns and is already un-zipped and ready for modeling; no other folders or files are present..

Submission channel number to each sample is: 1.

The evaluation metric of this competition is:
Leaderboard ranking is based on Quadratic Weighted Kappa (QWK), which measures agreement between the true scores and the submitted scores.
 1. Let N = number of distinct score levels (here 6).
 2. Build an N×N observed matrix O where O_{i,j} counts essays with true label i and predicted label j.
 3. Build an N×N weight matrix w with w_{i,j} = ((i−j)^2)/(N−1)^2.
 4. Build an N×N expected matrix E as the outer product of the true-label histogram and the predicted-label histogram, scaled so that sum(E)=sum(O).
 5. QWK = 1 − ( Σ_{i,j} w_{i,j} O_{i,j} ) / ( Σ_{i,j} w_{i,j} E_{i,j} ).
The metric ranges from –∞ to 1; higher values indicate better agreement..

The following is the output of the exploratory data analysis (EDA) performed on the dataset, You should carefully analyze it to better craft your feature engineering and model training strategies.
====== Data Overview (EDA) ======

Train data shape: (15576, 3)
Test data shape: (1731, 2)
Train first 5 rows:
essay_id                                          full_text  score
0  663d2cf  Dear State Senator,\n\nI am arguing in favor o...      3
1  3a20bfb  In " The Challenge of Exploring Venus" The aut...      2
2  6adae64  Teachers can have a hard time telling if their...      3
3  c81ccdf  Using dirverless cars can be very dangrous in ...      3
4  9549d7f  In " The Challenge of Exploring Venus" the aut...      3
Test first 5 rows:
essay_id                                          full_text
0  d550b2d  The face was not created by aliens because the...
1  0c10954  Hello my name is Luke Bomberger and I was seag...
2  ef04816  The technology to read the emotional expressio...
3  88ab8d9  Do you ever wonder how many soomething is or h...
4  cee8c0f  This story is about discovering mars. Witch is...
Train data types:
essay_id     object
full_text    object
score         int64
dtype: object
Test data types:
essay_id     object
full_text    object
dtype: object
Train missing values per column:
essay_id     0
full_text    0
score        0
dtype: int64
Test missing values per column:
essay_id     0
full_text    0
dtype: int64
Train unique values per column:
essay_id: 15576
full_text: 15576
score: 6
Test unique values per column:
essay_id: 1731
full_text: 1731
Target variable (score) distribution:
score
1    1124
2    4249
3    5629
4    3563
5     876
6     135
Name: count, dtype: int64
Train essay lengths (chars): min=712 max=20459 mean=2073.5 std=930.1

====== Submission Format ======
Please ensure your submission adheres to the following specifications:
Create a UTF-8 CSV named submission.csv with a header and exactly two columns:
  • essay_id – copied from test.csv
  • score – your predicted integer score (1–6) for each essay
The rows must appear for every essay_id in the test set; order is not enforced but recommended to match the sample format.

====== Important Guidelines ======
Before submitting your results, please note the following:
- We have numerous tests in place to check your code.
- Ensure your submission is genuine.
- Do not manipulate data or return values solely to pass preliminary tests, as this will not lead to successful final evaluation.

====== Evaluation ======

The primary evaluation metric for this task is: **Quadratic Weighted Kappa**.

This metric is considered better when it is **larger**.

Additional Evaluation Details:
Leaderboard ranking is based on Quadratic Weighted Kappa (QWK), which measures agreement between the true scores and the submitted scores.
 1. Let N = number of distinct score levels (here 6).
 2. Build an N×N observed matrix O where O_{i,j} counts essays with true label i and predicted label j.
 3. Build an N×N weight matrix w with w_{i,j} = ((i−j)^2)/(N−1)^2.
 4. Build an N×N expected matrix E as the outer product of the true-label histogram and the predicted-label histogram, scaled so that sum(E)=sum(O).
 5. QWK = 1 − ( Σ_{i,j} w_{i,j} O_{i,j} ) / ( Σ_{i,j} w_{i,j} E_{i,j} ).
The metric ranges from –∞ to 1; higher values indicate better agreement.

====== Time Limit ======
Your code's execution is limited to ** 5.00 hours**. After this time limit, your code will be terminated. But remember your main target is to achieve the best performance and you have several times to modify your code. So please be bold to make the best use of all the time limit and don't be too conservative.
During this time limit, you have all the resources available to you. Please fully leverage all the computational resources(CPUs and GPUs) to achieve the best performance like choose a powerful model, use a large batch size, enable data sampler with big parallel.

# Previous Experiments and Feedbacks
## Experiment Index: 1
Target Problem: Design an ordinal-aware modelling and inference pipeline (e.g., ordinal regression head or cumulative link loss) that outputs monotonic score probabilities and converts them to integer labels using validation-tuned thresholds.

Proposed Hypothesis: Fine-tune a pretrained DeBERTa-v3-base encoder with a CORAL (cumulative link) ordinal regression head that outputs five monotonic logits P(score>k). Convert these probabilities into the final 1-6 integer scores using validation-optimised thresholds that maximise QWK.
Code Change Summary: Entire pipeline built from scratch: (1) load and validate CSVs, (2) extensive engineered linguistic features with caching, (3) DeBERTa-v3-base encoder combined with CORAL ordinal head that concatenates CLS embedding with engineered features, (4) class-balanced focal CORAL loss, (5) 5-fold infra but only fold-0 trained, (6) validation-based Powell optimisation of continuous thresholds, (7) test inference with same pipeline, (8) generation of scores.csv and submission.csv following required format, (9) mixed-precision, selective layer freezing, LanguageTool integration, and graceful resource handling.
Surpass Previous SOTA: True

Experiment Score: 0.8295875060477553
Experiment Feedback: [Experiment Analysis] Submission format is correct and evaluation matches official QWK; predictions are generated with identical logic for validation and test, avoiding leakage. As the first valid run, the 0.8296 QWK establishes the initial best result to beat.

# Current SOTA Implementation
## Best of previous exploration of the scenario

### Code
Here is the complete code of the solution.

File Path: main.py
```
import os
import sys
import time
import random
import pickle
import argparse
import gc
import re
import importlib
import subprocess
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score

# ============ DEPENDENCY MANAGEMENT ============
def pip_install(package, module=None, version=None):
    pkg_string = f"{package}=={version}" if version else package
    try:
        return importlib.import_module(module or package)
    except ImportError:
        print(f"Installing {pkg_string}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", pkg_string])
        return importlib.import_module(module or package)

# textstat
try:
    import textstat
except ImportError:
    textstat = pip_install("textstat")

# language_tool_python & Java check
def check_java_installed():
    from shutil import which
    if which("java") is None:
        print("WARNING: Java is not installed. Skipping grammar-based features.")
        return False
    return True

language_tool_available = False
try:
    language_tool_python = importlib.import_module("language_tool_python")
    language_tool_available = check_java_installed()
except ImportError:
    try:
        language_tool_python = pip_install("language-tool-python", "language_tool_python")
        language_tool_available = check_java_installed()
    except Exception:
        print("WARNING: language-tool-python could not be installed. Skipping grammar-based features.")
        language_tool_available = False

# transformers (required)
try:
    from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
except ImportError:
    pip_install("transformers", version="4.44.2")
    pip_install("accelerate", version="0.33.0")
    from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

# ============ GLOBAL SEEDS & UTILS ===============
def set_global_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

set_global_seed(42)

def save_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def qwk_metric(y_true, y_pred, min_rating=1, max_rating=6):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# ============ FP16 & CUDA DETECTION ==========
def get_device():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(device)

device = get_device()
print(f"Using device: {device}")

FP16 = torch.cuda.is_available()  # Enable fp16 if possible

# ========== PATHS ==========
DATA_DIR = "./workspace_input/"
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
TEST_PATH = os.path.join(DATA_DIR, "test.csv")
SAMPLE_SUBMISSION_PATH = os.path.join(DATA_DIR, "sample_submission.csv")
ENGINEERED_FEATURES_CACHE = os.path.join(DATA_DIR, "eng_feats_cache.pkl")

# ============ ARGPARSE FOR DEBUG MODE =========
parser = argparse.ArgumentParser()
parser.add_argument('--debug', action='store_true', default=False)
args = parser.parse_args()

# ============ DATA LOADING ===============
def load_dataset():
    print("Loading data...")
    train = pd.read_csv(TRAIN_PATH, encoding='utf-8')
    test = pd.read_csv(TEST_PATH, encoding='utf-8')
    expected_train_cols = ['essay_id', 'full_text', 'score']
    expected_test_cols = ['essay_id', 'full_text']
    # Minimal: ensure presence, don't fail on order
    for col in expected_train_cols:
        if col not in train.columns:
            raise ValueError(f"Train columns missing: expected {col}")
    for col in expected_test_cols:
        if col not in test.columns:
            raise ValueError(f"Test columns missing: expected {col}")
    train['full_text'] = train['full_text'].astype(str)
    test['full_text'] = test['full_text'].astype(str)
    train['score'] = train['score'].astype('int64')
    print(f"Train shape: {train.shape}, Test shape: {test.shape}")
    return train, test

train_df, test_df = load_dataset()

# =========== EDA SECTION ==============
def print_eda(train, test):
    content = []
    content.append(f"Train data shape: {train.shape}")
    content.append(f"Test data shape: {test.shape}")
    content.append("\nTrain first 5 rows:\n" + str(train.head()))
    content.append("\nTest first 5 rows:\n" + str(test.head()))
    content.append("\nTrain data types:\n" + str(train.dtypes))
    content.append("\nTest data types:\n" + str(test.dtypes))
    content.append("\nTrain missing values per column:\n" + str(train.isnull().sum()))
    content.append("\nTest missing values per column:\n" + str(test.isnull().sum()))
    content.append("\nTrain unique values per column:")
    for col in train.columns:
        content.append(f"{col}: {train[col].nunique()}")
    content.append("\nTest unique values per column:")
    for col in test.columns:
        content.append(f"{col}: {test[col].nunique()}")
    if 'score' in train.columns:
        content.append("\nTarget variable (score) distribution:")
        content.append(str(train['score'].value_counts(sort=False).sort_index()))
    else:
        content.append("\nNo target variable found in train set.")
    if 'full_text' in train.columns:
        lens = train['full_text'].str.len()
        content.append(f"\nTrain essay lengths (chars): min={lens.min()} max={lens.max()} mean={lens.mean():.1f} std={lens.std():.1f}")
    return "\n".join(content)

eda_output = print_eda(train_df, test_df)
print("=== Start of EDA part ===")
print(eda_output)
print("=== End of EDA part ===")

# ========== FEATURE ENGINEERING ===================
DISCOURSE_MARKERS = [
    "however", "therefore", "moreover", "furthermore", "for example", "for instance",
    "in addition", "besides", "nonetheless", "consequently", "as a result", "thus",
    "otherwise", "hence", "in conclusion", "in summary", "nevertheless", "on the other hand",
    "similarly", "in contrast", "undoubtedly", "admittedly", "certainly", "obviously",
    "in fact", "of course", "after all", "in short", "for this reason", "as such",
    "accordingly", "and yet", "despite", "to sum up", "even though", "because",
    "since", "although", "though", "meanwhile", "unless", "whereas", "when", "while",
    "before", "after", "until", "whether", "once"
]
PRONOUNS = [
    "i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "he", "him", "his", "himself",
    "she", "her", "hers", "herself", "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves"
]
CONJUNCTIONS = [
    "and", "but", "or", "nor", "for", "yet", "so", "although", "because",
    "since", "unless", "while", "whereas", "even though", "though"
]

def sentence_split(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    num_sentences = sum(1 for s in sentences if s.strip())
    return num_sentences

def count_markers(text, markers):
    t = text.lower()
    total = 0
    for word in markers:
        total += t.count(word)
    return total

def count_words(text, word_list):
    tokens = re.findall(r'\b\w+\b', text.lower())
    return sum(1 for tok in tokens if tok in word_list)

def compute_engineered_features(df_combined, language_tool=None):
    print("Computing engineered features...")
    features = []
    for idx, row in df_combined.iterrows():
        essay = row['full_text']
        feats = {}
        feats['char_count'] = len(essay)
        tokens = re.findall(r'\b\w+\b', essay)
        feats['token_count'] = len(tokens)
        feats['sentence_count'] = sentence_split(essay)
        feats['avg_token_length'] = np.mean([len(tok) for tok in tokens]) if tokens else 0
        feats['avg_sentence_length_tokens'] = feats['token_count'] / feats['sentence_count'] if feats['sentence_count'] > 0 else 0
        feats['discourse_marker_count'] = count_markers(essay, DISCOURSE_MARKERS)
        feats['pronoun_count'] = count_words(essay, PRONOUNS)
        feats['conjunction_count'] = count_words(essay, CONJUNCTIONS)
        try:
            feats['flesch_reading_ease'] = textstat.flesch_reading_ease(essay)
        except Exception:
            feats['flesch_reading_ease'] = 0.0
        try:
            feats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(essay)
        except Exception:
            feats['flesch_kincaid_grade'] = 0.0
        try:
            feats['smog_index'] = textstat.smog_index(essay)
        except Exception:
            feats['smog_index'] = 0.0
        try:
            feats['readability_grade'] = textstat.text_standard(essay, float_output=True)
        except Exception:
            feats['readability_grade'] = 0.0
        grammar_errors = 0
        if language_tool is not None:
            try:
                matches = language_tool.check(essay)
                grammar_errors = len(matches)
            except Exception as e:
                grammar_errors = 0
        feats['grammar_error_count'] = grammar_errors
        features.append(feats)
        if idx % 2500 == 0 and idx != 0:
            print(f"   Processed {idx}/{len(df_combined)} essays.")
    feats_df = pd.DataFrame(features, index=df_combined.index)
    return feats_df.astype(np.float32)

def get_engineered_features(train, test, cache_path=ENGINEERED_FEATURES_CACHE):
    if os.path.exists(cache_path):
        print("Loading engineered features from cache...")
        all_feats = load_pickle(cache_path)
        feats_train = all_feats.iloc[:len(train)].reset_index(drop=True)
        feats_test = all_feats.iloc[len(train):].reset_index(drop=True)
        return feats_train, feats_test
    else:
        lt_tool = None
        if language_tool_available:
            print("Instantiating LanguageTool for grammar checking...")
            try:
                lt_tool = language_tool_python.LanguageTool('en-US')
            except Exception as ex:
                print("Failed to instantiate LanguageTool (Java may be missing). Skipping grammar feature.")
                lt_tool = None
        dfs = [train[['full_text']].copy(), test[['full_text']].copy()]
        combined = pd.concat(dfs, axis=0).reset_index(drop=True)
        feats_df = compute_engineered_features(combined, language_tool=lt_tool)
        if lt_tool is not None:
            lt_tool.close()
        print("Applying z-score normalization on engineered features...")
        scaler = StandardScaler()
        train_vals = feats_df.iloc[:len(train)].values
        scaler.fit(train_vals)
        normed = scaler.transform(feats_df.values)
        feats_normed = pd.DataFrame(normed, columns=feats_df.columns, index=feats_df.index)
        save_pickle(feats_normed, cache_path)
        feats_train = feats_normed.iloc[:len(train)].reset_index(drop=True)
        feats_test = feats_normed.iloc[len(train):].reset_index(drop=True)
        return feats_train, feats_test

eng_feats_train, eng_feats_test = get_engineered_features(train_df, test_df, ENGINEERED_FEATURES_CACHE)
print(f"Engineered feature shape (train): {eng_feats_train.shape}, test: {eng_feats_test.shape}")

# ============ TOKENIZATION ================
TOKENIZER_NAME = "microsoft/deberta-v3-base"
MAX_LEN = 512

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# ============ DATASET CLASS ================
class EssayDataset(Dataset):
    def __init__(self, texts, engineered_feats, targets=None):
        self.texts = texts.reset_index(drop=True)
        self.engineered_feats = engineered_feats.reset_index(drop=True)
        self.targets = targets.reset_index(drop=True) if targets is not None else None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        feats = torch.tensor(self.engineered_feats.iloc[idx].values, dtype=torch.float32)
        item = tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=MAX_LEN,
            return_tensors='pt'
        )
        for k in item.keys():
            item[k] = item[k].squeeze(0)
        item['engineered_feats'] = feats
        if self.targets is not None:
            item['labels'] = int(self.targets[idx])
        return item

# ================ 5-FOLD SPLIT & SAMPLING (DEBUG) ================
N_CLASSES = 6
K_THRESHOLDS = N_CLASSES - 1
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
folds = np.zeros(len(train_df), dtype=int)
for i, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['score'])):
    folds[val_idx] = i

fold_col = "fold"
train_df[fold_col] = folds

def get_debug_indices(idx_list, debug=False):
    if not debug:
        return idx_list
    n = int(0.1 * len(idx_list))
    n = max(n, 1)
    np.random.seed(42)
    idx_list = np.random.choice(idx_list, size=n, replace=False)
    return np.sort(idx_list)

def create_dataloaders(dataset, train_idx, valid_idx, args,
                       batch_size=8):
    train_indices = get_debug_indices(train_idx, args.debug)
    valid_indices = get_debug_indices(valid_idx, args.debug)
    print(f"Training samples: {len(train_indices)}, Validation samples: {len(valid_indices)} (debug={args.debug})")
    train_ds = Subset(dataset, train_indices)
    valid_ds = Subset(dataset, valid_indices)
    drop_last = False
    dl_args = dict(num_workers=2, pin_memory=True, drop_last=drop_last)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, **dl_args)
    valid_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, **dl_args)
    return train_loader, valid_loader, train_indices, valid_indices

# ================ LABEL ORDINAL ENCODING FOR CORAL ===================
def build_coral_targets(y, num_classes=N_CLASSES):
    K = num_classes - 1
    coral_targets = np.zeros((len(y), K), dtype=np.float32)
    for i in range(K):
        coral_targets[:, i] = (y > (i + 1)).astype(np.float32)
    return torch.tensor(coral_targets, dtype=torch.float32)

# ================ EFFECTIVE NUMBER CLASS WEIGHTS =================
def effective_number_weights(y, num_classes=N_CLASSES, beta=0.999):
    # y is 1-based in [1, num_classes]
    # For CORAL, binary classifiers for thresholds k=1,...,K (K=num_classes-1)
    K = num_classes - 1
    threshold_pos_counts = [(y > (k+1)).sum() for k in range(K)]
    threshold_neg_counts = [len(y) - threshold_pos_counts[k] for k in range(K)]
    # Compute effective number for positive and negative samples for each threshold
    eff_num_pos = 1.0 - np.power(beta, threshold_pos_counts)
    eff_num_neg = 1.0 - np.power(beta, threshold_neg_counts)
    weight_pos = (1.0 - beta) / (eff_num_pos + 1e-8)
    weight_neg = (1.0 - beta) / (eff_num_neg + 1e-8)
    # Classifier weight = average of pos/neg, then normalized across all K
    weights = (weight_pos + weight_neg) / 2.0
    weights = weights / weights.sum() * K
    return weights.astype(np.float32)  # shape (K,)

# =========== CORAL ORDINAL HEAD ============
class CoralOrdinalHead(nn.Module):
    def __init__(self, in_dim, engineered_dim, n_layers=1, hidden_dim=768, K=K_THRESHOLDS):
        super().__init__()
        self.fc_eng = nn.Linear(engineered_dim, hidden_dim)
        self.head = nn.Linear(hidden_dim * 2, 512)
        self.dropout = nn.Dropout(0.2)
        self.out = nn.Linear(512, K)
        self.bias = nn.Parameter(torch.zeros(K).float())

    def forward(self, x_cls, feats):
        eng_out = torch.relu(self.fc_eng(feats))
        x = torch.cat([x_cls, eng_out], dim=-1)
        x = self.head(x)
        x = self.dropout(x)
        logits = self.out(x) + torch.cumsum(self.bias, dim=0)
        return logits

# ============ MODEL ============
class OrdinalEssayModel(nn.Module):
    def __init__(self, engineered_dim, hidden_dim=768, K=K_THRESHOLDS):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(TOKENIZER_NAME)
        self.engineered_dim = engineered_dim
        self.head = CoralOrdinalHead(in_dim=hidden_dim, engineered_dim=engineered_dim, hidden_dim=hidden_dim, K=K)
        self.K = K

    def forward(self, input_ids, attention_mask, engineered_feats):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs['last_hidden_state'][:, 0, :]
        logits = self.head(last_hidden, engineered_feats)
        return logits

    def freeze_layers(self, n=6):
        print(f"Freezing bottom {n} transformer layers...")
        for name, param in self.transformer.named_parameters():
            if 'encoder.layer' in name:
                layer_num = int(name.split("encoder.layer.")[1].split(".")[0])
                if layer_num < n:
                    param.requires_grad = False
        for param in self.transformer.embeddings.parameters():
            param.requires_grad = False

    def unfreeze_all(self):
        print("Unfreezing all transformer layers...")
        for param in self.transformer.parameters():
            param.requires_grad = True

# =========== CORAL LOSS (CB Focal) ==========
class CBFocalLoss(nn.Module):
    def __init__(self, class_weights, gamma=2.0):
        super().__init__()
        self.class_weights = torch.tensor(class_weights, dtype=torch.float32)
        self.gamma = gamma

    def forward(self, logits, targets):
        # logits: (bs, K), targets: (bs, K), weights: (K,)
        device = logits.device
        weights = self.class_weights.to(device)
        if logits.shape[-1] != weights.shape[0]:
            raise RuntimeError(f"CBFocalLoss: logits shape {logits.shape}, weights shape {weights.shape}")
        probas = torch.sigmoid(logits)
        pt = torch.where(targets == 1, probas, 1 - probas)
        ce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        focal_weight = (1 - pt) ** self.gamma
        loss = focal_weight * ce_loss
        loss = loss * weights.view(1, -1)
        return loss.mean()

# ========== TRAIN/EVAL UTILITIES ============
def train_one_epoch(model, train_loader, optimizer, scheduler, criterion, scaler,
                    epoch, fp16):
    model.train()
    loss_meter = []
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attn_mask = batch['attention_mask'].to(device)
        eng_feats = batch['engineered_feats'].to(device)
        labels = batch['labels']
        coral_labels = build_coral_targets(labels.cpu().numpy(), num_classes=N_CLASSES).to(device)
        with autocast(enabled=fp16):
            logits = model(input_ids, attn_mask, eng_feats)
            loss = criterion(logits, coral_labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        loss_meter.append(loss.item())
    return float(np.mean(loss_meter))

@torch.no_grad()
def eval_model(model, loader, fp16):
    model.eval()
    all_logits = []
    all_labels = []
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attn_mask = batch['attention_mask'].to(device)
        eng_feats = batch['engineered_feats'].to(device)
        labels = batch['labels'].cpu().numpy()
        with autocast(enabled=fp16):
            logits = model(input_ids, attn_mask, eng_feats)
        all_logits.append(logits.cpu())
        all_labels.append(labels)
    all_logits = torch.cat(all_logits, dim=0)
    all_labels = np.concatenate(all_labels, axis=0)
    return all_logits, all_labels

def coral_logits_to_scores(logits):
    probs = torch.sigmoid(logits)
    score = 1 + (probs > 0.5).sum(dim=-1).cpu().numpy()
    score = score.astype(np.int64)
    return score

def coral_logits_to_expected(logits):
    probs = torch.sigmoid(logits)
    expected = 1 + probs.sum(dim=1)
    return expected.cpu().numpy()

def save_best_checkpoint(model, best_path, only_model_head=False):
    if only_model_head:
        state_dict = {k: v for k, v in model.state_dict().items() if "head" in k}
        torch.save(state_dict, best_path)
    else:
        torch.save(model.state_dict(), best_path)

def load_best_checkpoint(model, best_path, strict=True):
    model.load_state_dict(torch.load(best_path, map_location=device), strict=strict)

# ========== THRESHOLD OPTIMIZATION ==============
def optimize_thresholds(y_true, preds, num_classes=N_CLASSES):
    from scipy.optimize import minimize

    def apply_thresholds(values, thresholds):
        cuts = np.sort(thresholds)
        labels = [np.sum(v > cuts) + 1 for v in values]
        return np.array(labels)

    def neg_qwk(thresholds):
        y_pred = apply_thresholds(preds, thresholds)
        return -qwk_metric(y_true, y_pred)

    initial = np.percentile(preds, np.linspace(5, 95, num_classes-1))
    bounds = []
    minv, maxv = preds.min(), preds.max()
    for _ in range(num_classes-1):
        bounds.append((minv, maxv))
    best = minimize(neg_qwk, initial, method='Powell', bounds=bounds, options={'maxiter':1000})
    thresholds = np.sort(best.x)
    return thresholds

def apply_thresholds_batch(values, thresholds):
    cuts = np.sort(thresholds)
    return (np.add.reduce([values > t for t in cuts], axis=0) + 1).astype(np.int64)

# ========== TRAINING LOOP (SINGLE FOLD) =============
def train_one_fold(fold, train_df, feats_train, args,
                   batch_size=8, epochs=5,
                   patience=2, K=K_THRESHOLDS, lr=2e-5, weight_decay=0.01,
                   beta=0.999, gamma=2.0):
    print(f"Starting fold {fold}...")
    set_global_seed(42)
    train_idx = np.where(train_df[fold_col] != fold)[0]
    valid_idx = np.where(train_df[fold_col] == fold)[0]

    train_targets = train_df.loc[train_idx, 'score'].reset_index(drop=True)
    valid_targets = train_df.loc[valid_idx, 'score'].reset_index(drop=True)

    full_dataset = EssayDataset(
        texts=train_df['full_text'], 
        engineered_feats=feats_train, 
        targets=train_df['score']
    )
    train_loader, valid_loader, real_train_idx, real_valid_idx = create_dataloaders(
        full_dataset, train_idx, valid_idx, args, batch_size=batch_size)

    y_labels = train_targets.values
    class_weights = effective_number_weights(y_labels, num_classes=N_CLASSES, beta=beta)
    print(f"CORAL (threshold) class weights: {class_weights} (len={len(class_weights)})")

    model = OrdinalEssayModel(
        engineered_dim=feats_train.shape[1], hidden_dim=768, K=K
    )
    model = model.to(device)
    model.freeze_layers(n=6)

    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay)
    criterion = CBFocalLoss(class_weights=class_weights, gamma=gamma)
    steps_per_epoch = len(train_loader)
    num_train_steps = steps_per_epoch * epochs
    warmup_steps = int(num_train_steps * 0.1)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_steps
    )
    scaler = GradScaler(enabled=FP16)

    best_qwk = -float('inf')
    best_epoch = -1
    best_path = f"best_fold{fold}.pt"
    patience_counter = 0
    start_time = time.time()
    real_epochs = (1 if args.debug else epochs)

    for ep in range(real_epochs):
        if ep == 1:
            model.unfreeze_all()
        print(f"Epoch {ep+1}/{real_epochs}")
        ep_start = time.time()
        train_loss = train_one_epoch(model, train_loader, optimizer, scheduler, criterion, scaler, ep, FP16)
        print(f"Train loss: {train_loss:.4f}")
        logits, val_labels = eval_model(model, valid_loader, FP16)
        pred_scores = coral_logits_to_scores(logits)
        qwk = qwk_metric(val_labels, pred_scores)
        print(f"Epoch {ep+1} QWK: {qwk:.5f}")
        if qwk > best_qwk:
            best_qwk = qwk
            best_epoch = ep
            save_best_checkpoint(model, best_path)
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= patience and ep + 1 >= min(2, real_epochs):
            print("Early stopping.")
            break
        print(f"Epoch time: {time.time()-ep_start:.1f}s")
        gc.collect()
        torch.cuda.empty_cache()
    print(f"Best valid QWK: {best_qwk:.5f} at epoch {best_epoch+1}.")
    load_best_checkpoint(model, best_path)
    return model, valid_loader, best_qwk, valid_idx, best_path

# =============== THRESHOLD HOOK + EVALUATION/SCORES.CSV =============
def save_scores_csv(model_name, qwk_value):
    df = pd.DataFrame(
        {'Quadratic Weighted Kappa': [qwk_value, qwk_value]},
        index=[model_name, "ensemble"]
    )
    df.to_csv('scores.csv', encoding='utf-8')

# =============== TEST SET PREDICTION AND SUBMISSION ===============
def generate_submission(essay_ids, pred_scores):
    df_sub = pd.DataFrame({'essay_id': essay_ids, 'score': pred_scores})
    df_sub['score'] = df_sub['score'].astype(np.int64)
    df_sub.to_csv('submission.csv', index=False, encoding='utf-8')
    print(f"submission.csv generated ({len(df_sub)} rows).")

# ============== MAIN EXECUTION ======================
if __name__ == "__main__":
    timer0 = time.time()
    FOLD = 0
    batch_size = 8
    epochs = 5
    patience = 2

    model, valid_loader, best_qwk, valid_idx, best_path = train_one_fold(
        FOLD, train_df, eng_feats_train, args=args, batch_size=batch_size, epochs=epochs, patience=patience)

    print("Optimizing thresholds on validation set...")
    model.eval()
    logits, val_labels = eval_model(model, valid_loader, FP16)
    expected_vals = coral_logits_to_expected(logits)
    opt_thresholds = optimize_thresholds(val_labels, expected_vals, num_classes=N_CLASSES)
    print(f"Optimized thresholds: {opt_thresholds}")

    preds_final = apply_thresholds_batch(expected_vals, opt_thresholds)
    final_qwk = qwk_metric(val_labels, preds_final)
    print(f"Final validation QWK after threshold optimization: {final_qwk:.5f}")

    save_scores_csv(model_name="deberta_v3_coral", qwk_value=final_qwk)

    print("Predicting on test set...")
    test_dataset = EssayDataset(
        texts=test_df['full_text'],
        engineered_feats=eng_feats_test,
        targets=None
    )
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
    all_logits = []
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attn_mask = batch['attention_mask'].to(device)
        eng_feats = batch['engineered_feats'].to(device)
        with torch.no_grad():
            with autocast(enabled=FP16):
                logits = model(input_ids, attn_mask, eng_feats)
        all_logits.append(logits.cpu())
    all_logits = torch.cat(all_logits, dim=0)
    expected_test = coral_logits_to_expected(all_logits)
    pred_scores = apply_thresholds_batch(expected_test, opt_thresholds)
    generate_submission(test_df['essay_id'], pred_scores)
    timer1 = time.time()

    if args.debug:
        debug_time = timer1 - timer0
        est_epochs = 5
        scale = (1.0/0.1) * (est_epochs/1)
        estimated_time = debug_time * scale
        print("=== Start of Debug Information ===")
        print(f"debug_time: {debug_time:.2f}")
        print(f"estimated_time: {estimated_time:.2f}")
        print("=== End of Debug Information ===")
```

### Hypothesis for the experiment
the experiment is designed based on hypothesis: Target Problem Name: Ordinal-aware modelling & thresholding
Target Problem: Design an ordinal-aware modelling and inference pipeline (e.g., ordinal regression head or cumulative link loss) that outputs monotonic score probabilities and converts them to integer labels using validation-tuned thresholds.
Chosen Component: Model
Hypothesis: Fine-tune a pretrained DeBERTa-v3-base encoder with a CORAL (cumulative link) ordinal regression head that outputs five monotonic logits P(score>k). Convert these probabilities into the final 1-6 integer scores using validation-optimised thresholds that maximise QWK.
Reason: Current baselines for essay scoring often treat the task as plain 6-way classification, ignoring the ordered nature of the labels, which can hurt QWK. An explicitly ordinal approach should align the optimisation objective with the evaluation metric.

### Results

Evaluated results on validation are:
                  Quadratic Weighted Kappa
deberta_v3_coral                  0.829588
ensemble                          0.829588

Submission format check result is:
Submission is valid.
Total running time: 1.638 seconds.

Running time: 3314.4708676338196 seconds

# Identified Challenges
## 1. Capture full essay context beyond 512 tokens
Enable the model to ingest the entire essay (or a substantially larger portion) using long-context transformers, hierarchical chunk aggregation, or sliding-window ensembling instead of simple 512-token truncation.

## 2. Mitigate severe high-score class imbalance
Design specialised strategies (e.g., focal-CB loss tuned per threshold, minority-focused sampling, ordinal margin penalties, or calibrated posterior mapping) to improve recall and calibration for the rare top-score essays.

## 3. Inject argumentative & discourse quality features
Incorporate domain-specific features such as argument-component detection, discourse relational counts, grammar/fluency error density, and vocabulary richness into the model, possibly via multi-task learning or feature fusion.
"""

In [4]:
sys_prompt = """
You are a Kaggle Grandmaster and expert ML engineer with deep expertise in statistics, machine learning, and competition optimization.
The user is iteratively improving a Kaggle competition implementation. Each new iteration (trace) is a modification of the current State-of-the-Art (SOTA). If a new trace surpasses the current SOTA, it becomes the new SOTA. Otherwise, it's a failed experiment.
You will be provided with:
1. A detailed competition scenario description.
2. A history of previous successfully experiments and their associated feedbacks, indexed or ordered from oldest to newest; the latest SOTA experiment accumulates all the improvements from the previous successful experiments.
3. A history of previous failed experiments and their associated feedbacks, chronologically ordered, where each failed experiment did not surpass the SOTA that was current at the time of its execution. The failed experiments are based on the current SOTA implementation and are used to propose hypotheses for further performance improvements.
4. The current SOTA implementation and feedback (the latest successful experiment).
5. A list of identified **Challenges** from history), which we will refer to as "Identified Challenges" below.

Your task is to perform two main steps:
1. **Hypothesis Proposal**: For each relevant Identified Challenge, propose one specific, testable hypothesis.
2. **Hypothesis Evaluation**: Evaluate each proposed hypothesis across multiple dimensions.

# Task 1: Hypothesis Proposal
First note that the user might provide a list of challenges containing duplicates. You should only propose one hypothesis for each unique challenge. If a challenge is a duplicate of a previous one, you can skip it.
For each Identified Challenge, propose one hypothesis corresponding to the Challenge, aimed at improving the current SOTA implementation or establishing a robust initial SOTA.

## 1.1. Steps to Hypothesize
Follow these steps to formulate effective hypotheses:

1. **Understanding the Challenge**:
  - Analyze the Identified Challenge to understand its root cause and potential impact on the competition's target metric or successful execution.
  - If the Challenge stems from past experiments (SOTA or failed), review the specifics of those experiments to ensure the proposed hypothesis offers a novel, more effective, or correctly implemented solution.
  - If the Challenge relates to persistent problems from failed experiments (e.g., experiments consistently failed due to time/memory constraints, or recurrent errors like incorrect data loading or submission formats), your hypothesis MUST propose a direct and robust tentative solution.
2. **Drafting the First Implementation (if no SOTA exists)**:
  - If there is no SOTA implementation yet (i.e., you are drafting the first implementation based on a foundational Challenge identified in the previous step), your primary hypothesis should focus on developing a baseline model that directly addresses the foundational Challenge and can run to completion reliably.
  - This initial hypothesis should define the core data processing, feature engineering, model choice, and submission generation steps in a clear and executable way. Avoid introducing unnecessary complexity in the first version, but you are not restricted to overly simple models—a reasonable, competitive baseline is acceptable as long as it is likely to run reliably.
3. **Actionable Changes**:
  - If a Challenge involves underperforming models (e.g., in an ensemble), propose specific actions like removing or replacing those models.
  - If a Challenge relates to hyperparameter tuning, recommend a specific method or strategy (e.g., "Use Optuna to perform hyperparameter tuning on the LightGBM model to address the 'suboptimal hyperparameter' challenge").
  - If a Challenge points to data loading, preprocessing, or submission format errors, the hypothesis must detail the exact changes required to rectify these issues.

## 1.2. Guidelines for Writing Hypotheses

1. **Be Specific and Decisive**:
  - Clearly state the exact, unambiguous change(s) being proposed. Avoid vague goals like "improve the model" or "optimize the pipeline."
  - The hypothesis must propose a single, clear course of action. Do not suggest alternatives (e.g., "try method A or method B").
  - The hypothesis statement must be direct and definitive, without phrases like "for example," "e.g.," or "might involve."
  - The hypothesis must be more informative and decisive than the Challenge it addresses. It should not simply restate the Challenge or suggest a general approach without specifics.
2. **Ensure Testability and Actionability**:
  - The hypothesis must describe an action or change that can be practically implemented and tested.
  - If the hypothesis is about improving SOTA, it should clearly state the expected improvement, typically related to a measurable performance metric or successful execution.
  - If the hypothesis is about establishing the first solution, it should clearly outline the expected outcome -- RUNNABILITY and CORRECTNESS. Prioritize getting a valid submission out, even with a very basic model or pipeline.
3. **Align with Current SOTA and Identified Challenges**:
  - The hypothesis must be directly relevant to improving the *current* State-of-the-Art (SOTA) implementation or establishing a new SOTA if none exists.
  - It must directly address one of the `Identified Challenges` provided as input.
4. **Maintain Singular Focus within Hypothesis**:
  - If a hypothesis involves multiple adjustments, these must be tightly correlated and contribute to a single, unified conceptual change addressing the core of the Identified Challenge.
  - Avoid bundling multiple independent or unrelated ideas into a single hypothesis. Each hypothesis should test one core concept.
5. **Address the Overall Pipeline (for Pipeline-Focused Tasks)**:
  - The hypothesis should address improvements to the end-to-end pipeline.
  - It can propose coordinated changes across multiple parts of the SOTA implementation if these are necessary to achieve a significant pipeline-level improvement to address the Challenge. (Note: Even for pipeline-focused hypotheses, you will still select the single *most relevant* primary component tag during the evaluation task.)

# Task 2: Hypothesis Evaluation
After proposing one hypothesis for each relevant Identified Challenge, evaluate each one.

## 2.1. Evaluation Instruction
For each individual hypothesis you proposed in Task 1, perform the following two evaluation steps:

1. **Assign a Component Tag:** Assign a single component tag to the hypothesis. Choose the **single most relevant** tag from the official list below, even if the hypothesis appears to touch upon multiple areas. Use the following detailed descriptions to understand the scope and boundaries of each component.

  - **`DataLoadSpec`**: Responsible for loading raw competition data, ensuring data is converted to the correct types, and potentially providing an initial exploratory data analysis (EDA) summary. (e.g., fixing `zipfile.BadZipFile` by improving loading logic).
  - **`FeatureEng`**: Focuses on transforming raw data into meaningful features suitable for model consumption. Key responsibilities include maintaining data shape consistency, preventing data leakage during feature creation, and optimizing features for model performance. Feature engineering should be model-agnostic.
  - **`Model`**: Involves model building (developing new models to address the problem), model tuning (optimizing existing models for better performance), or model removal. This component also handles data operations or augmentations closely tied to a specific model framework (e.g., PyTorch `Datasets` & `DataLoaders`, TensorFlow `tf.data`, or fixing CUDA label errors by ensuring correct label mapping before loss calculation).
  - **`Ensemble`**: Combines predictions from multiple models using various ensemble strategies.
  - **`Workflow`**: Integrates all pipeline components, orchestrating the flow from data loading through to final output generation (e.g., correcting `submission.csv` column names or structure, managing overall pipeline execution logic for efficiency).

2. **Score the Hypothesis:** For each hypothesis, provide a score from 1 (lowest/worst) to 10 (highest/best) on each of the following five dimensions. Base your scores on all provided information.
  - **Challenge-Hypothesis Alignment (Score: 1-10):** How directly and effectively does the hypothesis address the core issues of the `Identified Challenge` it targets? A higher score means a stronger, more direct alignment.
  - **Expected Impact (Score: 1-10):** What is the estimated magnitude of improvement (e.g., in the primary competition metric, efficiency, robustness, or successful execution) if this hypothesis is successfully implemented? Higher scores for greater positive impact.
  - **Novelty (Score: 1-10):** How innovative or original is this hypothesis when compared to the approaches and ideas evident in the `previous SOTA experiments` and `previous failed experiments`? Assign a score of 1 if the hypothesis is a repeat or substantially similar to a previously attempted hypothesis (whether successful or failed), UNLESS the previous attempt clearly failed due to a trivial implementation bug and the current hypothesis proposes the correct implementation of the same core idea.
  - **Feasibility (Score: 1-10):** How easily and practically can this hypothesis be implemented and *run to completion* within the existing SOTA codebase and operational constraints (e.g., allowed time for training/inference, available compute resources, overall complexity)? Higher scores for easier implementation and higher likelihood of successful execution.
  - **Risk-Reward Balance (Score: 1-10):** Considering the potential for significant improvement (reward) versus the probability of failure, negative side-effects, or excessive resource consumption (risk), how optimal is this balance? A high score indicates a favorable balance.
  - **Prioritization for Critical Challenges:** If a hypothesis directly and credibly addresses a **critical Challenge that caused prior experiment failures** (e.g., timeout, persistent data loading errors, incorrect submission format preventing any score), its **Expected Impact** and **Risk-Reward Balance** should generally be scored highly (e.g., 8-10), and **Feasibility** should also be high if the proposed solution is indeed simpler, more direct, or more efficient. This ensures such critical hypotheses are prioritized.

  Please extract at least 6 deduplicated challenges, and generate one hypothesis for each.
  
  """

In [5]:
import pandas as pd
from typing import Optional, Tuple
from hashlib import md5
from rdagent.scenarios.data_science.proposal.exp_gen.base import DSHypothesis
# 假设 DSHypothesis 是一个 dataclass，你需要确保它定义在别处
# from your_module import DSHypothesis

def md5_hash(s: str) -> str:
    return md5(s.encode('utf-8')).hexdigest()


def compute_top_scores(hypothesis_dict: dict) -> pd.Series:
    """
    Compute weighted total scores for each hypothesis and return the top five.
    """
    weights = {
        "alignment_score": 0.2,
        "impact_score": 0.4,
        "novelty_score": 0.2,
        "feasibility_score": 0.1,
        "risk_reward_balance_score": 0.1,
    }
    scores_dict = {}
    for problem_name in hypothesis_dict:
        if "hypothesis" not in hypothesis_dict[problem_name]:
            continue
        scores_dict[problem_name] = {}
        for score_key in weights:
            if score_key not in hypothesis_dict[problem_name].get("evaluation", {}):
                scores_dict[problem_name][score_key] = 0
            else:
                try:
                    scores_dict[problem_name][score_key] = (
                        float(hypothesis_dict[problem_name]["evaluation"][score_key]) * weights[score_key]
                    )
                except (ValueError, TypeError):
                    scores_dict[problem_name][score_key] = 0

    scores = pd.DataFrame(scores_dict)
    scores_sorted = scores.sum().sort_values(ascending=False)
    return scores_sorted[:5]


def select_hypothesis(
    scores_sorted: pd.Series,
    hypothesis_dict: dict,
    problem_dict: dict,
    scen_prob_multiplier: int = 2
) -> int:
    """
    From the top five hypotheses (by weighted score), select one based on additional weighting rules.
    """
    index_to_pick_pool_list = []
    for j, problem_name in enumerate(scores_sorted.index):
        if hypothesis_dict[problem_name].get("inspired", False):
            index_to_pick_pool_list.extend([j] * 2)
        if problem_dict[problem_name]["label"] == "SCENARIO_PROBLEM":
            index_to_pick_pool_list.extend([j] * scen_prob_multiplier)
        elif problem_dict[problem_name]["label"] == "FEEDBACK_PROBLEM":
            index_to_pick_pool_list.extend([j] * (3 - scen_prob_multiplier))
        else:
            index_to_pick_pool_list.extend([j] * 1)

    reproducible_int = int.from_bytes(
        bytes.fromhex(md5_hash(scores_sorted.to_string())), byteorder="big"
    ) % len(index_to_pick_pool_list)

    return index_to_pick_pool_list[reproducible_int]


def hypothesis_rank(
    hypothesis_dict: dict,
    problem_dict: dict,
    selected_idx: Optional[int] = None,
    scen_prob_multiplier: int = 2
) -> Tuple[str, "DSHypothesis"]:
    """
    Wrapper method that computes the top five hypotheses by weighted scoring and then selects one.
    """
    scores_sorted = compute_top_scores(hypothesis_dict)
    if selected_idx is None:
        selected_idx = select_hypothesis(
            scores_sorted=scores_sorted,
            hypothesis_dict=hypothesis_dict,
            problem_dict=problem_dict,
            scen_prob_multiplier=scen_prob_multiplier
        )

    max_score_problem_name = scores_sorted.index[selected_idx]
    problem_info = problem_dict.get(max_score_problem_name, {})

    return max_score_problem_name, DSHypothesis(
        component=hypothesis_dict[max_score_problem_name].get("component", "Model"),
        hypothesis=hypothesis_dict[max_score_problem_name].get("hypothesis", "Hypothesis not provided"),
        reason=hypothesis_dict[max_score_problem_name].get("reason", "Reason not provided"),
        problem_name=max_score_problem_name,
        problem_desc=problem_info.get("problem", "Problem description not provided"),
        problem_label=problem_info.get("label", "FEEDBACK_PROBLEM"),
    )


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
supports_response_schema = True
diclist =[]
for i in range(1):
    response = APIBackend().build_messages_and_create_chat_completion(
                    user_prompt=user_prompt,
                    system_prompt=sys_prompt,
                    response_format=HypothesisList if supports_response_schema else {"type": "json_object"},
                    json_target_type=(
                        Dict[str, Dict[str, str | Dict[str, str | int]]] if not supports_response_schema else None
                    ),
                )
    
    hypotheses = HypothesisList(**json.loads(response))
    resp_dict = {
                h.caption: {
                    "reason": h.challenge,
                    "component": h.component,
                    "hypothesis": h.hypothesis,
                    "evaluation": {
                        "alignment_score": h.evaluation.alignment.score,
                        "impact_score": h.evaluation.impact.score,
                        "novelty_score": h.evaluation.novelty.score,
                        "feasibility_score": h.evaluation.feasibility.score,
                        "risk_reward_balance_score": h.evaluation.risk_reward_balance.score,
                    },
                }
                for h in hypotheses.hypotheses
            }
    diclist.append(resp_dict)

[32m2025-07-17 10:04:16.831[0m | [1mINFO    [0m | [36mrdagent.oai.backend.litellm[0m:[36m_create_chat_completion_inner_function[0m:[36m110[0m - [1m
[95m[1mRole:[0m[96msystem[0m
[95m[1mContent:[0m [96m
You are a Kaggle Grandmaster and expert ML engineer with deep expertise in statistics, machine learning, and competition optimization.
The user is iteratively improving a Kaggle competition implementation. Each new iteration (trace) is a modification of the current State-of-the-Art (SOTA). If a new trace surpasses the current SOTA, it becomes the new SOTA. Otherwise, it's a failed experiment.
You will be provided with:
1. A detailed competition scenario description.
2. A history of previous successfully experiments and their associated feedbacks, indexed or ordered from oldest to newest; the latest SOTA experiment accumulates all the improvements from the previous successful experiments.
3. A history of previous failed experiments and their associated feedbacks, chrono

In [9]:
pickled_problem_name, new_hypothesis = hypothesis_rank(
            hypothesis_dict=resp_dict,
            problem_dict=  None,
        )

TypeError: 'NoneType' object is not subscriptable

In [7]:

resp_dict



{'Capture full essay context beyond 512 tokens': {'reason': 'The current model truncates essays to 512 tokens, losing potentially important concluding or introductory arguments that may influence QWK.',
  'component': <HypothesisComponent.Model: 'Model'>,
  'hypothesis': 'Replace the DeBERTa-v3 backbone with Longformer-base and fine-tune it using the same CORAL ordinal head, allowing up to 2 048 tokens so each essay is processed in a single pass without truncation.',
  'evaluation': {'alignment_score': 9.0,
   'impact_score': 8.0,
   'novelty_score': 8.0,
   'feasibility_score': 6.0,
   'risk_reward_balance_score': 7.0}},
 'Mitigate severe high-score class imbalance': {'reason': 'Rare 5/6 scores receive little representation, leading to poor recall and calibration for top marks.',
  'component': <HypothesisComponent.Model: 'Model'>,
  'hypothesis': 'Implement class-balanced focal CORAL loss with dynamic oversampling that duplicates essays of scores 5 and 6 at a 3× rate in each mini-bat

In [20]:
hypothesis_candidates =  str(json.dumps(resp_dict, indent=2))

In [21]:
hypothesis_candidates

'{\n  "Capture full essay context beyond 512 tokens": {\n    "reason": "The current model truncates essays to 512 tokens, losing potentially important concluding or introductory arguments that may influence QWK.",\n    "component": "Model",\n    "hypothesis": "Replace the DeBERTa-v3 backbone with Longformer-base and fine-tune it using the same CORAL ordinal head, allowing up to 2 048 tokens so each essay is processed in a single pass without truncation.",\n    "evaluation": {\n      "alignment_score": 9.0,\n      "impact_score": 8.0,\n      "novelty_score": 8.0,\n      "feasibility_score": 6.0,\n      "risk_reward_balance_score": 7.0\n    }\n  },\n  "Mitigate severe high-score class imbalance": {\n    "reason": "Rare 5/6 scores receive little representation, leading to poor recall and calibration for top marks.",\n    "component": "Model",\n    "hypothesis": "Implement class-balanced focal CORAL loss with dynamic oversampling that duplicates essays of scores 5 and 6 at a 3\\u00d7 rate 

In [29]:
resp_dict

{'Capture full essay context beyond 512 tokens': {'reason': 'The current model truncates essays to 512 tokens, losing potentially important concluding or introductory arguments that may influence QWK.',
  'component': <HypothesisComponent.Model: 'Model'>,
  'hypothesis': 'Replace the DeBERTa-v3 backbone with Longformer-base and fine-tune it using the same CORAL ordinal head, allowing up to 2 048 tokens so each essay is processed in a single pass without truncation.',
  'evaluation': {'alignment_score': 9.0,
   'impact_score': 8.0,
   'novelty_score': 8.0,
   'feasibility_score': 6.0,
   'risk_reward_balance_score': 7.0}},
 'Mitigate severe high-score class imbalance': {'reason': 'Rare 5/6 scores receive little representation, leading to poor recall and calibration for top marks.',
  'component': <HypothesisComponent.Model: 'Model'>,
  'hypothesis': 'Implement class-balanced focal CORAL loss with dynamic oversampling that duplicates essays of scores 5 and 6 at a 3× rate in each mini-bat

In [27]:
list(resp_dict.keys())[0]

'Capture full essay context beyond 512 tokens'

In [28]:
list(resp_dict.values())[0]

{'reason': 'The current model truncates essays to 512 tokens, losing potentially important concluding or introductory arguments that may influence QWK.',
 'component': <HypothesisComponent.Model: 'Model'>,
 'hypothesis': 'Replace the DeBERTa-v3 backbone with Longformer-base and fine-tune it using the same CORAL ordinal head, allowing up to 2 048 tokens so each essay is processed in a single pass without truncation.',
 'evaluation': {'alignment_score': 9.0,
  'impact_score': 8.0,
  'novelty_score': 8.0,
  'feasibility_score': 6.0,
  'risk_reward_balance_score': 7.0}}

In [11]:
scores_sorted = compute_top_scores(resp_dict)

In [12]:
scores_sorted

Exploit domain-adaptive pretraining of the base transformer on the essay corpus    7.9
Leverage full 5-fold cross-validation and model ensembling                         7.9
Capture full essay context beyond 512 tokens                                       7.9
Mitigate severe high-score class imbalance                                         7.6
Improve robustness of threshold mapping beyond single-fold optimisation            7.4
dtype: float64

In [34]:
max_score_problem_name = scores_sorted.index[2]
max_score_problem_name

'Capture full essay context beyond 512 tokens'

In [17]:
new_hypothesis = DSHypothesis(component=resp_dict[max_score_problem_name].get("component", "Model"),hypothesis=resp_dict[max_score_problem_name].get("hypothesis", "Hypothesis not provided"))


In [31]:
resp_dict[max_score_problem_name].get("component", "Model")

<HypothesisComponent.Ensemble: 'Ensemble'>

In [36]:
HypothesisComponent.Ensemble

<HypothesisComponent.Ensemble: 'Ensemble'>

In [33]:
resp_dict[max_score_problem_name].get("hypothesis", "Hypothesis not provided")

'Train the CORAL model on all 5 folds, save each checkpoint, and at inference average the per-essay expected scores from the five models before threshold mapping.'

In [35]:
resp_dict[max_score_problem_name]

{'reason': 'The current model truncates essays to 512 tokens, losing potentially important concluding or introductory arguments that may influence QWK.',
 'component': <HypothesisComponent.Model: 'Model'>,
 'hypothesis': 'Replace the DeBERTa-v3 backbone with Longformer-base and fine-tune it using the same CORAL ordinal head, allowing up to 2 048 tokens so each essay is processed in a single pass without truncation.',
 'evaluation': {'alignment_score': 9.0,
  'impact_score': 8.0,
  'novelty_score': 8.0,
  'feasibility_score': 6.0,
  'risk_reward_balance_score': 7.0}}

In [30]:
resp_dict[max_score_problem_name]

{'reason': 'Only fold-0 is trained; generalisation is limited and unused data hurts performance.',
 'component': <HypothesisComponent.Ensemble: 'Ensemble'>,
 'hypothesis': 'Train the CORAL model on all 5 folds, save each checkpoint, and at inference average the per-essay expected scores from the five models before threshold mapping.',
 'evaluation': {'alignment_score': 10.0,
  'impact_score': 8.0,
  'novelty_score': 6.0,
  'feasibility_score': 7.0,
  'risk_reward_balance_score': 8.0}}

In [16]:
DSHypothesis

rdagent.scenarios.data_science.proposal.exp_gen.base.DSHypothesis

In [18]:
hypotheses=([new_hypothesis])

In [19]:
hypotheses

[<rdagent.scenarios.data_science.proposal.exp_gen.base.DSHypothesis at 0x7fc334f89600>]

In [76]:
def compute_total_score(eval_dict: dict,
                        weights: dict | None = None,
                        normalize: bool = False) -> float:
    """
    根据 evaluation 子字段计算总评分。

    Parameters
    ----------
    eval_dict : dict
        形如 {
            "alignment_score": 9.0,
            "impact_score": 7.0,
            ...
        }
    weights : dict | None
        每个维度的权重，例如：
            {
                "alignment_score": 1.0,
                "impact_score": 1.5,
                "novelty_score": 1.0,
                "feasibility_score": 1.2,
                "risk_reward_balance_score": 1.0,
            }
        若为 None，则各维度权重均为 1。
    normalize : bool
        若为 True，则先把各分值映射到 [0,1] 再加权求和
        （在不同模型输出量纲不一致时有用）。

    Returns
    -------
    float
        加权总分
    """
    # 默认权重：所有维度等权
    if weights is None:
        weights = {k: 1.0 for k in eval_dict.keys()}

    # 可选：归一化到 [0,1]
    if normalize:
        # 这里假设原始分数区间为 [1,10]
        norm_dict = {k: (v - 1) / 9 for k, v in eval_dict.items()}
    else:
        norm_dict = eval_dict

    total = 0.0
    for k, v in norm_dict.items():
        w = weights.get(k, 1.0)  # 若缺某字段权重则视为 1
        total += w * v
    return total


In [79]:
from collections import defaultdict

top_k_per_component = 2
grouped = defaultdict(list)
for caption, h in resp_dict.items():
    score = compute_total_score(h["evaluation"])
    grouped[h["component"]].append((caption, h, score))

selected = []
for comp, lst in grouped.items():
    lst = sorted(lst, key=lambda x: x[2], reverse=True)
    selected += lst[:top_k_per_component]
    

In [81]:
selected

[('Capture full essay context beyond 512 tokens',
  {'reason': 'The current SOTA truncates each essay to 512 word-piece tokens, discarding up to 75 % of the text for long essays. Important evidence for high-level argument quality may lie in later paragraphs, so QWK is capped by this information loss.',
   'component': <HypothesisComponent.Model: 'Model'>,
   'hypothesis': 'Replace the single-pass 512-token input with a sliding-window inference strategy: split each essay into overlapping 350-token windows (stride = 300); pass every window through the existing DeBERTa-v3+CORAL model; average the per-threshold logits across windows, then apply the same learned thresholds to obtain the final 1-6 score.',
   'evaluation': {'alignment_score': 9.0,
    'impact_score': 7.0,
    'novelty_score': 6.0,
    'feasibility_score': 8.0,
    'risk_reward_balance_score': 8.0}},
  38.0),
 ('Mitigate severe high-score class imbalance',
  {'reason': 'Classes 5 and 6 represent < 6 % of the data, leading to 

In [77]:
resp_dict

{'Capture full essay context beyond 512 tokens': {'reason': 'The current SOTA truncates each essay to 512 word-piece tokens, discarding up to 75 % of the text for long essays. Important evidence for high-level argument quality may lie in later paragraphs, so QWK is capped by this information loss.',
  'component': <HypothesisComponent.Model: 'Model'>,
  'hypothesis': 'Replace the single-pass 512-token input with a sliding-window inference strategy: split each essay into overlapping 350-token windows (stride = 300); pass every window through the existing DeBERTa-v3+CORAL model; average the per-threshold logits across windows, then apply the same learned thresholds to obtain the final 1-6 score.',
  'evaluation': {'alignment_score': 9.0,
   'impact_score': 7.0,
   'novelty_score': 6.0,
   'feasibility_score': 8.0,
   'risk_reward_balance_score': 8.0}},
 'Mitigate severe high-score class imbalance': {'reason': 'Classes 5 and 6 represent < 6 % of the data, leading to poor recall and calibr

In [None]:
def select_hypotheses(hypotheses_dict, top_k_per_component=2):
    grouped = defaultdict(list)
    for caption, h in hypotheses_dict.items():
        score = compute_total_score(h["evaluation"])
        grouped[h["component"]].append((caption, h, score))

    selected = []
    for comp, lst in grouped.items():
        lst = sorted(lst, key=lambda x: x[2], reverse=True)
        selected += lst[:top_k_per_component]
    

In [None]:
def select_hypotheses(hypotheses_dict, top_k_per_component=2):
    # 1. 每个 component 选 top-K
    grouped = defaultdict(list)
    for caption, h in hypotheses_dict.items():
        score = compute_total_score(h["evaluation"])
        grouped[h["component"]].append((caption, h, score))

    selected = []
    for comp, lst in grouped.items():
        lst = sorted(lst, key=lambda x: x[2], reverse=True)
        selected += lst[:top_k_per_component]

    # 2. 聚类去重（假设你有 sentence embedding 模型）
    texts = [h["hypothesis"] for _, h, _ in selected]
    embeddings = embed(texts)  # shape (N, D)
    clusters = cluster_embeddings(embeddings)  # 输出 cluster_id for each

    clustered = defaultdict(list)
    for (caption, h, score), cluster_id in zip(selected, clusters):
        clustered[cluster_id].append((caption, h, score))

    # 每簇取得分最高的
    deduplicated = [max(clustered[cluster], key=lambda x: x[2]) for cluster in clustered]

    # 3. Pareto 最优筛选
    vectors = [
        (
            h["evaluation"]["alignment_score"],
            h["evaluation"]["impact_score"],
            h["evaluation"]["novelty_score"],
            h["evaluation"]["feasibility_score"],
            h["evaluation"]["risk_reward_balance_score"],
        )
        for _, h, _ in deduplicated
    ]
    pareto_mask = pareto_front_mask(vectors)
    pareto_hypotheses = [deduplicated[i] for i, ok in enumerate(pareto_mask) if ok]

    final = sorted(pareto_hypotheses, key=lambda x: x[2], reverse=True)
    return final


In [74]:
resp_dict

{'Capture full essay context beyond 512 tokens': {'reason': 'The current SOTA truncates each essay to 512 word-piece tokens, discarding up to 75 % of the text for long essays. Important evidence for high-level argument quality may lie in later paragraphs, so QWK is capped by this information loss.',
  'component': <HypothesisComponent.Model: 'Model'>,
  'hypothesis': 'Replace the single-pass 512-token input with a sliding-window inference strategy: split each essay into overlapping 350-token windows (stride = 300); pass every window through the existing DeBERTa-v3+CORAL model; average the per-threshold logits across windows, then apply the same learned thresholds to obtain the final 1-6 score.',
  'evaluation': {'alignment_score': 9.0,
   'impact_score': 7.0,
   'novelty_score': 6.0,
   'feasibility_score': 8.0,
   'risk_reward_balance_score': 8.0}},
 'Mitigate severe high-score class imbalance': {'reason': 'Classes 5 and 6 represent < 6 % of the data, leading to poor recall and calibr

In [75]:
resp_dict.keys()

dict_keys(['Capture full essay context beyond 512 tokens', 'Mitigate severe high-score class imbalance', 'Inject argumentative & discourse quality features', 'Single-fold training limits generalisation; need multi-fold ensembling', 'Leverage larger or domain-specialised language models to better capture essay nuances', 'Slow engineered-feature pipeline (grammar check) inflates runtime and is brittle when Java is missing'])

In [67]:
diclist[0]

{'Overfitting on some validation folds': {'reason': 'Cross-validation scores show noticeably higher errors on particular folds, indicating that the LightGBM model memorises fold-specific noise despite the existing basic regularisation settings. This overfitting prevents further gains and compromises leaderboard generalisation. Past experiments have tuned features and added rudimentary stacking but have not systematically optimised LightGBM’s regularisation knobs. Reducing variance without hurting bias is essential to make any future ensembling worthwhile.',
  'component': <HypothesisComponent.Model: 'Model'>,
  'hypothesis': 'Run an Optuna hyperparameter optimisation that specifically targets LightGBM’s regularisation controls (min_child_samples ≥ 30, lambda_l1 in [0,2], lambda_l2 in [0,5], feature_fraction ∈ [0.6,0.9], bagging_fraction ∈ [0.6,0.9], bagging_freq ≥ 1) with early stopping on out-of-fold RMSE, and retrain the best-found configuration to curb fold-level overfitting.',
  'e

In [37]:

system_prompt1 = """You are a senior machine learning researcher working on improving Kaggle competition pipelines. 
Your job is to propose and evaluate testable hypotheses that could help improve the current SOTA implementation.
Be specific, concise, and use competition insight to ground your ideas."""
user_prompt1 = """
# Scenario Description
The competition is a tabular regression problem predicting house prices. The metric is RMSE. 
The dataset includes structured features like location, size, number of rooms, and age.

# Previous Experiments and Feedbacks
1. Used LightGBM with basic features — RMSE: 0.145
2. Added polynomial features — RMSE: 0.142
3. Tried stacking with Ridge — RMSE: 0.141 but overfitted on fold 3

# Current SOTA Implementation
The current pipeline uses LightGBM and some feature engineering. It performs decently but still overfits slightly.

# Identified Challenges
1. Overfitting on some validation folds
2. High cardinality in 'neighborhood' categorical variable
3. Model performance saturates with simple stacking
"""
    "temperature": 0.7,

response2 = APIBackend().build_messages_and_create_chat_completion(
    user_prompt=user_prompt1,
    system_prompt=system_prompt1,
    response_format={"type": "json_object"},
    json_target_type=None,
    top_p= 0.9,

)

[32m2025-07-16 09:13:42.155[0m | [1mINFO    [0m | [36mrdagent.oai.backend.litellm[0m:[36m_create_chat_completion_inner_function[0m:[36m110[0m - [1m
[95m[1mRole:[0m[96msystem[0m
[95m[1mContent:[0m [96mYou are a senior machine learning researcher working on improving Kaggle competition pipelines. 
Your job is to propose and evaluate testable hypotheses that could help improve the current SOTA implementation.
Be specific, concise, and use competition insight to ground your ideas.[0m

[95m[1mRole:[0m[96muser[0m
[95m[1mContent:[0m [96m
# Scenario Description
The competition is a tabular regression problem predicting house prices. The metric is RMSE. 
The dataset includes structured features like location, size, number of rooms, and age.

# Previous Experiments and Feedbacks
1. Used LightGBM with basic features — RMSE: 0.145
2. Added polynomial features — RMSE: 0.142
3. Tried stacking with Ridge — RMSE: 0.141 but overfitted on fold 3

# Current SOTA Implementati


`litellm_settings:
 drop_params: true`
. 
 If you want to use these params dynamically send allowed_openai_params=['top_p'] in your request.[0m
[32m2025-07-16 09:13:43.221[0m | [1mINFO    [0m | [36mrdagent.oai.backend.litellm[0m:[36m_create_chat_completion_inner_function[0m:[36m110[0m - [1m
[95m[1mRole:[0m[96msystem[0m
[95m[1mContent:[0m [96mYou are a senior machine learning researcher working on improving Kaggle competition pipelines. 
Your job is to propose and evaluate testable hypotheses that could help improve the current SOTA implementation.
Be specific, concise, and use competition insight to ground your ideas.[0m

[95m[1mRole:[0m[96muser[0m
[95m[1mContent:[0m [96m
# Scenario Description
The competition is a tabular regression problem predicting house prices. The metric is RMSE. 
The dataset includes structured features like location, size, number of rooms, and age.

# Previous Experiments and Feedbacks
1. Used LightGBM with basic features — RMSE:

RuntimeError: Failed to create chat completion after 10 retries.

In [55]:
from litellm import completion

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "请帮我写一句问候语。"},
]

response = completion(
    model="o3",
    messages=messages,
    stream=False,
    temperature=1,
    max_tokens=100,
)

print(response)

ModelResponse(id='chatcmpl-BtsgKT45BJVbuBtX5606mW0vXgSzl', created=1752657936, model='azure/o3-2025-04-16', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='length', index=0, message=Message(content='', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), provider_specific_fields={})], usage=Usage(completion_tokens=100, prompt_tokens=25, total_tokens=125, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=100, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier=None)


In [34]:
response2

231

In [10]:
hypotheses = HypothesisList(**json.loads(response))

In [11]:
hypotheses

HypothesisList(deduplicated_challenges=['Capture full essay context beyond 512 tokens', 'Mitigate severe high-score class imbalance', 'Inject argumentative & discourse quality features'], hypotheses=[HypothesisDetail(caption='Capture full essay context beyond 512 tokens', challenge='The current DeBERTa-v3-base model is limited to 512 tokens, forcing truncation that can drop important sections (introductions, conclusions) of the ~2 k-character essays and potentially harming QWK. We need a way to feed the full essay text or its salient parts into the ordinal head without exceeding memory limits.', hypothesis='Substitute the DeBERTa encoder with the Longformer-base model (4 k token window) and fine-tune it using the same CORAL ordinal head; keep window size at 2 k tokens so every essay fits, and preserve the engineered-feature fusion and threshold tuning pipeline unchanged.', metric_impact='Longformer can attend to complete essays, which should improve semantic coverage and is expected to