### Step 1: Import Libraries
Major Prabhat Pandey, DA25M002 | MTech AI & DS

In [3]:
# Libraries for data manipulation and machine learning
# Learned during MTech coursework - applying ensemble methods
import numpy as np
import pandas as pd
import json
from pathlib import Path

# ML Models - Using gradient boosting variants for regression
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

print("Libraries imported successfully - Ready for model training")

Libraries imported successfully - Ready for model training


### Step 2: Load Data
Preprocessed embeddings (768-dimensional) from google/embeddinggemma-300m

In [4]:
# Locate the input files relative to the workspace root.
# Inside a notebook kernel __file__ is undefined, so the current working
# directory is used; when run as a script the script's own folder is used.
WORKSPACE_ROOT = Path.cwd() if '__file__' not in globals() else Path(__file__).parent
TRAIN_DATA_PATH = WORKSPACE_ROOT / "train_data_fixed_embeddings.json"
TEST_DATA_PATH = WORKSPACE_ROOT / "test_data_fixed_embeddings.json"
METRIC_EMBEDDINGS_PATH = WORKSPACE_ROOT / "metric_name_embeddings.npy"

# Each load is announced before it starts so a failure can be tied to a file.
print(f"Loading training data from: {TRAIN_DATA_PATH}")
train_data = json.loads(TRAIN_DATA_PATH.read_text(encoding='utf-8'))

print(f"Loading test data from: {TEST_DATA_PATH}")
test_data = json.loads(TEST_DATA_PATH.read_text(encoding='utf-8'))

print(f"Loading metric embeddings from: {METRIC_EMBEDDINGS_PATH}")
metric_embeddings = np.load(METRIC_EMBEDDINGS_PATH)

# Quick sanity summary of what was read
n_train, n_test = len(train_data), len(test_data)
print(f"\nData loaded successfully:")
print(f"   Training samples: {n_train:,}")
print(f"   Test samples: {n_test:,}")
print(f"   Metric embeddings shape: {metric_embeddings.shape}")

Loading training data from: c:\Users\lonew\OneDrive\Desktop\KAGGLE CHALLENGE\final_submission\train_data_fixed_embeddings.json
Loading test data from: c:\Users\lonew\OneDrive\Desktop\KAGGLE CHALLENGE\final_submission\test_data_fixed_embeddings.json
Loading test data from: c:\Users\lonew\OneDrive\Desktop\KAGGLE CHALLENGE\final_submission\test_data_fixed_embeddings.json
Loading metric embeddings from: c:\Users\lonew\OneDrive\Desktop\KAGGLE CHALLENGE\final_submission\metric_name_embeddings.npy

Data loaded successfully:
   Training samples: 5,000
   Test samples: 3,638
   Metric embeddings shape: (145, 768)
Loading metric embeddings from: c:\Users\lonew\OneDrive\Desktop\KAGGLE CHALLENGE\final_submission\metric_name_embeddings.npy

Data loaded successfully:
   Training samples: 5,000
   Test samples: 3,638
   Metric embeddings shape: (145, 768)


### Step 3: Feature Engineering
Siamese network features: 6,912 dimensions (9 × 768: base embeddings + signed differences + absolute differences + element-wise products)

In [5]:
def create_siamese_features(metric_emb, prompt_emb, response_emb):
    """
    Build a Siamese-style feature vector from three embeddings.

    The metric embedding is paired with the prompt embedding and with the
    response embedding; each pair contributes a signed difference, an
    absolute difference and an element-wise product. No prompt/response
    pair features are produced.

    Args:
        metric_emb: Metric name embedding, 1-D array-like of length d
        prompt_emb: Prompt embedding, 1-D array-like of length d
        response_emb: Response embedding, 1-D array-like of length d

    Returns:
        1-D array of length 9 * d (6,912 when d = 768), laid out as
        [metric, prompt, response,
         metric - prompt, metric - response,
         |metric - prompt|, |metric - response|,
         metric * prompt, metric * response]

    Raises:
        ValueError: If the inputs are not 1-D or their lengths differ.
    """
    # Accept plain lists (e.g. straight from JSON) as well as arrays
    metric_emb = np.asarray(metric_emb)
    prompt_emb = np.asarray(prompt_emb)
    response_emb = np.asarray(response_emb)

    # Reject mismatched inputs up front; otherwise NumPy broadcasting could
    # silently produce a feature vector with the wrong layout
    same_shape = metric_emb.shape == prompt_emb.shape == response_emb.shape
    if metric_emb.ndim != 1 or not same_shape:
        raise ValueError(
            "Embeddings must be 1-D and of equal length, got shapes "
            f"{metric_emb.shape}, {prompt_emb.shape}, {response_emb.shape}"
        )

    # Compute each difference once and reuse it for the absolute-value block
    diff_prompt = metric_emb - prompt_emb
    diff_response = metric_emb - response_emb

    features = [
        # Base features - the three embeddings unchanged (3 * d)
        metric_emb,
        prompt_emb,
        response_emb,
        # Relationship features between the metric and each text (6 * d)
        diff_prompt,                 # directional difference
        diff_response,               # directional difference
        np.abs(diff_prompt),         # magnitude of difference
        np.abs(diff_response),       # magnitude of difference
        metric_emb * prompt_emb,     # element-wise interaction
        metric_emb * response_emb,   # element-wise interaction
    ]

    return np.concatenate(features)

def prepare_features(data, metric_embeddings, metric_names=None):
    """
    Build the feature matrix (and targets, when present) from raw samples.

    Samples are processed one at a time: each sample's metric embedding is
    looked up by name and combined with its prompt and response embeddings
    via create_siamese_features.

    Args:
        data: Iterable of dicts with keys 'metric_name', 'prompt_embedding',
            'response_embedding' and, optionally, 'score'
        metric_embeddings: Array indexed by metric position; row i belongs
            to the i-th entry of metric_names
        metric_names: Optional ordered list of metric names. When omitted,
            the list is read from WORKSPACE_ROOT / "metric_names.json"

    Returns:
        Tuple (X, y): X is the stacked feature matrix; y is a float array of
        scores, or None when no sample carries a 'score'

    Raises:
        KeyError: If a sample's metric name is not in metric_names
        ValueError: If only some of the samples carry a 'score'
    """
    # Fall back to the on-disk name list only when the caller did not supply one
    if metric_names is None:
        with open(WORKSPACE_ROOT / "metric_names.json", 'r', encoding='utf-8') as f:
            metric_names = json.load(f)
    metric_to_id = {name: idx for idx, name in enumerate(metric_names)}

    X = []
    y = []

    for position, sample in enumerate(data):
        # Get metric embedding using metric name
        metric_name = sample['metric_name']
        if metric_name not in metric_to_id:
            raise KeyError(
                f"Sample {position}: metric name {metric_name!r} not found in metric_names"
            )
        metric_emb = metric_embeddings[metric_to_id[metric_name]]
        prompt_emb = np.array(sample['prompt_embedding'])
        response_emb = np.array(sample['response_embedding'])

        # Generate feature vector using Siamese architecture
        X.append(create_siamese_features(metric_emb, prompt_emb, response_emb))

        if 'score' in sample:
            y.append(float(sample['score']))  # Scores may arrive as strings or ints

    # A partially labelled dataset would leave y shorter than X and misalign
    # targets with rows, so refuse it instead of returning mismatched arrays
    if y and len(y) != len(X):
        raise ValueError(
            f"Only {len(y)} of {len(X)} samples have a 'score'; "
            "targets cannot be aligned with features"
        )

    return np.array(X), np.array(y) if y else None

# Build the design matrices for both splits with the shared helper.
print("Creating training features with Siamese architecture...")
X_train, y_train = prepare_features(train_data, metric_embeddings)

# The second return value (targets) is discarded for the test split.
print("Creating test features...")
X_test, _ = prepare_features(test_data, metric_embeddings)

# Summarise shapes and the target distribution as a sanity check
train_shape, test_shape = X_train.shape, X_test.shape
target_mean, target_std = y_train.mean(), y_train.std()

print(f"\nFeatures created:")
print(f"   Training shape: {train_shape}")
print(f"   Test shape: {test_shape}")
print(f"   Training target mean: {target_mean:.3f}, std: {target_std:.3f}")

Creating training features with Siamese architecture...
Creating test features...
Creating test features...

Features created:
   Training shape: (5000, 6912)
   Test shape: (3638, 6912)
   Training target mean: 9.120, std: 0.942

Features created:
   Training shape: (5000, 6912)
   Test shape: (3638, 6912)
   Training target mean: 9.120, std: 0.942


### Step 4: Train Ensemble Models
5 models: CatBoost, XGBoost, LightGBM, Random Forest, CatBoost Deep

In [6]:
# Registry of (name, fitted_model) pairs consumed by the ensemble step.
# Re-running this cell empties the registry, so the later model cells
# have to be re-run afterwards.
models = []

# Model 1: CatBoost with moderate depth, trained on GPU device 0
# NOTE(review): the hyperparameters are described by the author as tuned
# through experimentation; the tuning itself is not visible in this section
print("Training Model 1/5: CatBoost (Siamese features)")
catboost_params = dict(
    iterations=2000,
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=3,
    random_seed=42,
    task_type='GPU',
    devices='0',
    verbose=False,
)
model1 = CatBoostRegressor(**catboost_params)
model1.fit(X_train, y_train)
models.append(('CatBoost_Siamese', model1))
print("   Model 1 trained successfully")

Training Model 1/5: CatBoost (Siamese features)
   Model 1 trained successfully
   Model 1 trained successfully


In [7]:
# Model 2: XGBoost, mirroring Model 1's depth, learning rate and L2 strength
# (reg_lambda=3 here, l2_leaf_reg=3 there)
# GPU training is requested through tree_method='gpu_hist' on device 0
# NOTE(review): XGBoost 2.0 deprecated 'gpu_hist' / 'gpu_id' in favour of
# device='cuda' with tree_method='hist' - confirm against the installed version
print("Training Model 2/5: XGBoost (Siamese features)")
model2 = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=6,
    reg_lambda=3,
    random_state=42,
    tree_method='gpu_hist',
    gpu_id=0,
    verbosity=0
)
model2.fit(X_train, y_train)

# Drop any entry left by an earlier run of this cell before registering;
# a duplicate would be counted twice in the simple ensemble average
models = [entry for entry in models if entry[0] != 'XGBoost_Siamese']
models.append(('XGBoost_Siamese', model2))
print("   Model 2 trained successfully")

Training Model 2/5: XGBoost (Siamese features)
   Model 2 trained successfully
   Model 2 trained successfully


In [8]:
# Model 3: LightGBM with the same tree count, learning rate, depth cap and
# L2 strength as the other boosted models
# GPU training is requested via device='gpu' (platform 0, device 0)
# num_leaves is not set here, so the library default applies
print("Training Model 3/5: LightGBM (Siamese features)")
model3 = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=6,
    reg_lambda=3,
    random_state=42,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
    verbose=-1
)
model3.fit(X_train, y_train)

# Drop any entry left by an earlier run of this cell before registering;
# a duplicate would be counted twice in the simple ensemble average
models = [entry for entry in models if entry[0] != 'LightGBM_Siamese']
models.append(('LightGBM_Siamese', model3))
print("   Model 3 trained successfully")

Training Model 3/5: LightGBM (Siamese features)
   Model 3 trained successfully
   Model 3 trained successfully


In [9]:
# Model 4: Random Forest (bagged trees rather than boosting)
# Note: despite the "Basic features" label in the log message and registry
# name, this model is fit on the same X_train matrix as the other models
print("Training Model 4/5: Random Forest (Basic features)")
model4 = RandomForestRegressor(
    n_estimators=500,
    max_depth=10,
    random_state=42,
    n_jobs=-1,   # use all available CPU cores
    verbose=0
)
model4.fit(X_train, y_train)

# Drop any entry left by an earlier run of this cell before registering;
# a duplicate would be counted twice in the simple ensemble average
models = [entry for entry in models if entry[0] != 'RandomForest_Basic']
models.append(('RandomForest_Basic', model4))
print("   Model 4 trained successfully")

Training Model 4/5: Random Forest (Basic features)
   Model 4 trained successfully
   Model 4 trained successfully


In [10]:
# Model 5: deeper CatBoost variant (depth 8, l2_leaf_reg 5 vs. 6 and 3 in Model 1)
# Runs on CPU with thread_count=-1; the author's stated reason is avoiding
# GPU memory issues with the deeper trees
print("Training Model 5/5: CatBoost Deep (All features)")
model5 = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=5,
    random_seed=42,
    task_type='CPU',
    thread_count=-1,
    verbose=False
)
model5.fit(X_train, y_train)

# Drop any entry left by an earlier run of this cell before registering;
# a duplicate would be counted twice in the simple ensemble average
models = [entry for entry in models if entry[0] != 'CatBoost_Deep']
models.append(('CatBoost_Deep', model5))
print("   Model 5 trained successfully")

print(f"\nAll 5 models trained - ensemble ready for predictions")


Training Model 5/5: CatBoost Deep (All features)
   Model 5 trained successfully

All 5 models trained - ensemble ready for predictions
   Model 5 trained successfully

All 5 models trained - ensemble ready for predictions


### Step 5: Generate Ensemble Predictions
Simple averaging across all models

In [11]:
def _rmse(y_true, y_pred):
    """Root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


print("Generating predictions from each model...\n")
train_predictions = []
test_predictions = []

# Score every registered model on both splits.
# NOTE(review): the RMSE reported here is measured on the training data the
# models were fit on, so it is not an estimate of generalisation error.
for name, model in models:
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_predictions.append(train_pred)
    test_predictions.append(test_pred)

    print(f"   {name}: Train RMSE = {_rmse(y_train, train_pred):.4f}")

# Unweighted mean across models, taken element-wise per sample
print("\nCreating ensemble (simple average)...")
ensemble_train = np.mean(train_predictions, axis=0)
ensemble_test = np.mean(test_predictions, axis=0)

ensemble_train_rmse = _rmse(y_train, ensemble_train)
test_mean, test_std = ensemble_test.mean(), ensemble_test.std()

print(f"\nEnsemble created:")
print(f"   Ensemble Train RMSE: {ensemble_train_rmse:.4f}")
print(f"   Ensemble Test Mean: {test_mean:.3f}")
print(f"   Ensemble Test Std: {test_std:.3f}")

Generating predictions from each model...

   CatBoost_Siamese: Train RMSE = 0.4104
   CatBoost_Siamese: Train RMSE = 0.4104
   XGBoost_Siamese: Train RMSE = 0.1089
   XGBoost_Siamese: Train RMSE = 0.1089




   LightGBM_Siamese: Train RMSE = 0.1105
   RandomForest_Basic: Train RMSE = 0.7129
   RandomForest_Basic: Train RMSE = 0.7129
   CatBoost_Deep: Train RMSE = 0.1500

Creating ensemble (simple average)...

Ensemble created:
   Ensemble Train RMSE: 0.2566
   Ensemble Test Mean: 9.104
   Ensemble Test Std: 0.296
   CatBoost_Deep: Train RMSE = 0.1500

Creating ensemble (simple average)...

Ensemble created:
   Ensemble Train RMSE: 0.2566
   Ensemble Test Mean: 9.104
   Ensemble Test Std: 0.296


### Step 6: Apply Spread Calibration
Linear calibration: alpha=1.00, beta=-0.2 | Floor and clip to [0,10]

In [12]:
# Calibration parameters applied to the raw ensemble output
# NOTE(review): described by the author as found through experimentation;
# the search that produced them is not visible in this section
ALPHA = 1.00   # Spread scaling factor (1.00 leaves the spread unchanged)
BETA = -0.2    # Bias adjustment, added before the scores are floored

# Echo the values so the run log records what was used
print(f"Calibration parameters:")
print(f"   Alpha (spread): {ALPHA}")
print(f"   Beta (bias): {BETA}")

def apply_spread_calibration(predictions, alpha, beta, min_score=0, max_score=10):
    """
    Map raw predictions onto integer scores.

    Process:
    1. Linear transformation: predictions * alpha + beta
    2. Floor to integer values
    3. Clip to the valid range [min_score, max_score]

    Flooring and clipping can map distinct raw predictions to the same
    score, so the original ordering is preserved only up to ties (and only
    when alpha is positive).

    Args:
        predictions: Raw predictions, array-like (lists are accepted)
        alpha: Multiplicative spread factor
        beta: Additive bias, applied before flooring
        min_score: Lowest score allowed after clipping (default 0)
        max_score: Highest score allowed after clipping (default 10)

    Returns:
        Integer array of calibrated scores with the same shape as the input
    """
    # Coerce first so plain Python lists work with the arithmetic below
    calibrated = np.asarray(predictions) * alpha + beta

    # Floor to integer for discrete scores
    calibrated = np.floor(calibrated)

    # Clip to valid scoring range
    calibrated = np.clip(calibrated, min_score, max_score).astype(int)

    return calibrated

# Calibrate the averaged test predictions, then compare summary statistics
# of the raw and calibrated scores
print("\nCalibrating test predictions...")
test_calibrated = apply_spread_calibration(ensemble_test, ALPHA, BETA)

raw_mean, raw_std = ensemble_test.mean(), ensemble_test.std()
cal_mean, cal_std = test_calibrated.mean(), test_calibrated.std()
cal_low, cal_high = test_calibrated.min(), test_calibrated.max()

print(f"\nCalibration complete:")
print(f"   Before calibration: mean={raw_mean:.3f}, std={raw_std:.3f}")
print(f"   After calibration: mean={cal_mean:.3f}, std={cal_std:.3f}")
print(f"   Range: [{cal_low}, {cal_high}]")

Calibration parameters:
   Alpha (spread): 1.0
   Beta (bias): -0.2

Calibrating test predictions...

Calibration complete:
   Before calibration: mean=9.104, std=0.296
   After calibration: mean=8.360, std=0.502
   Range: [2, 9]


### Step 7: Create Submission File

In [16]:
# Rows are numbered 0..N-1 in file order; per the author's note the test
# records carry no identifier of their own
test_ids = list(range(len(test_data)))

# Two-column frame: row id and calibrated integer score
submission = pd.DataFrame({'id': test_ids, 'score': test_calibrated})

# Written relative to the current working directory
OUTPUT_PATH = 'submission.csv'
submission.to_csv(OUTPUT_PATH, index=False)

n_rows = len(submission)
column_names = list(submission.columns)
print(f"Submission file created: {OUTPUT_PATH}")
print(f"   Rows: {n_rows:,}")
print(f"   Columns: {column_names}")

# Summary of the submitted scores (pandas .std() uses the sample formula,
# unlike the NumPy .std() used in the calibration step)
scores = submission['score']
print(f"\nSubmission statistics:")
print(f"   Mean: {scores.mean():.3f}")
print(f"   Std: {scores.std():.3f}")
print(f"   Min: {scores.min()}")
print(f"   Max: {scores.max()}")

Submission file created: submission.csv
   Rows: 3,638
   Columns: ['id', 'score']

Submission statistics:
   Mean: 8.360
   Std: 0.502
   Min: 2
   Max: 9


### Step 8: Score Distribution