# Ensemble Solutions

Ideas taken from:
- https://www.kaggle.com/code/minglv/ultimate-ensemble-fusion/notebook
- https://www.kaggle.com/code/verracodeguacas/guarded-assembly-map-3

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 30/09/2025   | Martin | Created   | Notebook created to try solutions from Kaggle | 

# Contents

* [Introduction](#introduction)
* [Load Data](#load-data)
* [Preprocessing](#preprocessing)
* [Prediction](#prediction)

# Introduction

In [None]:
%load_ext watermark

In [1]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
import polars as pl
import numpy as np
import gc

from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from torch.utils.data import DataLoader
from datasets import Dataset
from peft import PeftModel




# Load Data

In [49]:
# Directory holding the raw train/test CSV files
path = "data/raw"

# Read both splits with the same call; the generator is consumed left to right,
# so train.csv is read before test.csv.
train, test = (pl.read_csv(f"{path}/{split}.csv") for split in ("train", "test"))

# Preprocessing

In [50]:
# Add new labels
# Target joins Category and Misconception into one class string; Label is its
# integer encoding produced by the LabelEncoder fitted here.
le = LabelEncoder()
train = train.with_columns(
  Target = pl.col('Category') + ":" + pl.col('Misconception'),
  # NOTE(review): flags rows whose last "_"-separated Category component is
  # "Correct" - confirm this is the intended definition of a correct answer.
  Correct = pl.col("Category").str.split("_").list.last() == "Correct"
)
train = train.with_columns(
  Label = pl.col("Target").map_batches(le.fit_transform)
)

# Get known answers: for every QuestionId keep the single MC_Answer that occurs
# most often among the rows flagged Correct.
temp = (
  train.filter(pl.col('Correct'))
  .group_by(['QuestionId', 'MC_Answer'])
  .len()
  # Highest count first; MC_Answer breaks ties so the result is reproducible.
  .sort(['len', 'MC_Answer'], descending=[True, False])
  # keep='first' + maintain_order=True is what makes the sort above matter:
  # the default keep='any' may retain an arbitrary row per QuestionId.
  .unique(subset=['QuestionId'], keep='first', maintain_order=True)
  .drop('len')
  .with_columns(Correct=pl.lit(1))
)

# Evaluate correctness of test set answer
test = test.join(
  temp,
  how='left',
  on=['QuestionId', 'MC_Answer']
)
# Rows without a matching known answer get Correct = 0. Only this column is
# filled so that nulls in other test columns are left untouched.
test = test.with_columns(pl.col('Correct').fill_null(0))


In [51]:
def format_input(df):
  """Return ``df`` with a ``Correctness`` sentence and a ``text`` prompt column.

  ``Correctness`` is "This answer is correct." where ``Correct == 1`` and
  "This answer is wrong." otherwise. ``text`` combines the question, the chosen
  answer, that sentence and the student explanation into one string.
  """
  verdict = (
    pl.when(pl.col("Correct") == 1)
      .then(pl.lit("This answer is correct."))
      .otherwise(pl.lit("This answer is wrong."))
  )
  with_verdict = df.with_columns(verdict.alias("Correctness"))

  # Columns are substituted into the template in this order.
  prompt_columns = ("QuestionText", "MC_Answer", "Correctness", "StudentExplanation")
  prompt = pl.format(
    "Question:\n{}\nAnswer:\n{}\nCorrect:\n{}\nExplanation:\n{}",
    *(pl.col(name) for name in prompt_columns),
  )
  return with_verdict.with_columns(prompt.alias("text"))

In [52]:
# Add the Correctness and text columns to both splits (train first, then test).
train, test = format_input(train), format_input(test)

In [None]:
def _load_peft_model(model_path, num_labels):
  """Attach the adapter at ``model_path`` to the first base model that loads.

  Returns the loaded ``PeftModel``, or ``None`` when every candidate base fails.
  """
  base_model_paths = [
    model_path,
    "/kaggle/input/gemma2-9b-it-bf16",
    "google/gemma-2-9b-it"
  ]
  for base_path in base_model_paths:
    base_model = None
    try:
      print(f"Trying base model: {base_path}")
      # Auto classes cannot be instantiated directly; they must be built
      # through from_pretrained.
      base_model = AutoModelForSequenceClassification.from_pretrained(
        base_path,
        num_labels=num_labels,
        torch_dtype=torch.bfloat16,
        device_map="auto",
      )
      model = PeftModel.from_pretrained(base_model, model_path)
    except Exception as e:
      # Best-effort by design: report the failure and try the next candidate.
      print(f"Failed with base {base_path}: {str(e)[:100]}")
      # Release a base model that loaded but could not take the adapter
      # before the next candidate is loaded.
      base_model = None
      gc.collect()
      torch.cuda.empty_cache()
      continue
    print(f"PEFT loaded successfully with base: {base_path}")
    return model
  return None


def get_predictions(model_path, test, model_type="standard", num_labels=65):
  """Run sequence-classification inference on ``test`` and return the logits.

  Args:
    model_path: Identifier or directory passed to ``from_pretrained`` for the
      tokenizer and model (and for the adapter when ``model_type == "peft"``).
    test: Dataset with a ``text`` column and a ``.map`` method.
    model_type: ``"peft"`` first tries to load an adapter on top of a base
      model and falls back to a plain load; any other value loads
      ``model_path`` directly.
    num_labels: Size of the classification head. Defaults to 65, the value
      that used to be hard-coded.

  Returns:
    The ``predictions`` field of ``Trainer.predict`` (presumably one row of
    ``num_labels`` logits per example - confirm).
  """
  print(f"Loading model from {model_path}")

  # Clear memory
  torch.cuda.empty_cache()
  gc.collect()

  # Load tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_path)
  if tokenizer.pad_token is None:
    # Padding needs a pad token; reuse EOS when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

  # Try PEFT, else fallback standard
  if model_type == 'peft':
    print("Attempting PEFT model loading")
    model = _load_peft_model(model_path, num_labels)
    if model is None:
      print("Falling back to standard model")
      model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        # Same head size as every other load path.
        num_labels=num_labels
      )
  else:
    print("Using standard model")
    model = AutoModelForSequenceClassification.from_pretrained(
      model_path,
      device_map="auto",
      torch_dtype=torch.bfloat16,
      num_labels=num_labels
    )

  model.config.pad_token_id = tokenizer.pad_token_id

  def tokenize(batch):
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=256)

  test_tokenized = test.map(tokenize, batched=True)

  # Trainer and inference
  # NOTE(review): the weights are loaded as bfloat16 while fp16=True is
  # requested here - confirm this combination is intended.
  trainer = Trainer(
    model=model,
    args=TrainingArguments(
      output_dir='results',
      do_predict=True,
      per_device_eval_batch_size=2,
      fp16=True,
      report_to="none"
    ),
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer)
  )

  print("Inference")
  predictions = trainer.predict(test_tokenized)
  logits = predictions.predictions

  # Drop every reference to the model before the next one is loaded.
  del model, trainer, tokenizer, test_tokenized
  gc.collect()
  torch.cuda.empty_cache()

  return logits

In [55]:
# Wrap the formatted polars test frame in a Dataset; get_predictions calls
# .map on the object it receives.
ds_test = Dataset.from_polars(test)

In [None]:
# Model Ensemble
# Checkpoints that make up the ensemble
model_1 = "deepseek-ai/deepseek-math-7b-base"
model_2 = "justsomerandomdude264/Math_Homework_Solver_Llama318B"
model_3 = "Qwen/Qwen2.5-Math-1.5B-Instruct"

# Score the same test set with each checkpoint, one after the other.
# Model 1
predictions_1 = get_predictions(model_path=model_1, test=ds_test, model_type="standard")
# Model 2
predictions_2 = get_predictions(model_path=model_2, test=ds_test, model_type="standard")
# Model 3
predictions_3 = get_predictions(model_path=model_3, test=ds_test, model_type="standard")

# Prediction

In [None]:
# Weighting results
# Per-model weights of the blend; they sum to 1.
model_1_weight = 0.1
model_2_weight = 0.6
model_3_weight = 0.3

# Weighted sum of the three models' outputs, element by element.
ensemble_prediction = (
  model_1_weight * predictions_1
  + model_2_weight * predictions_2
  + model_3_weight * predictions_3
)

# Class indices ordered from highest to lowest blended score along the last
# axis. argsort sorts ascending, so the scores are negated. Slice the result
# (e.g. [:, :k]) to keep only the top k.
top_indices = np.argsort(-ensemble_prediction, axis=-1)

In [None]:
%watermark