## this notebook from https://www.kaggle.com/code/imaadmahmood/llm-finetuning

In [2]:
# import zipfile
# import os
# from pyexpat import features
#
# from lark.tools import flags
#
# zip_path = "../../llm-classification-finetuning.zip"
# extract_path = "../../kaggle"
#
# # Create folder if it doesn't exist
# os.makedirs(extract_path, exist_ok=True)
#
# # Open and extract
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_path)
#
# print(f"Unzipped to {extract_path}")

Unzipped to ../kaggle


In [3]:
import pandas as pd

sample_submission = pd.read_csv("../../kaggle/LLM_Classification_FineTuning/sample_submission.csv")
sample_submission

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.333333,0.333333,0.333333
1,211333,0.333333,0.333333,0.333333
2,1233961,0.333333,0.333333,0.333333


In [4]:
test_df = pd.read_csv("../../kaggle/LLM_Classification_FineTuning/test.csv")
train_df = pd.read_csv("../../kaggle/LLM_Classification_FineTuning/train.csv")

train_df.head(2)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0


In [5]:
train_df.columns.to_list()

['id',
 'model_a',
 'model_b',
 'prompt',
 'response_a',
 'response_b',
 'winner_model_a',
 'winner_model_b',
 'winner_tie']

In [6]:
train_df.dtypes

id                 int64
model_a           object
model_b           object
prompt            object
response_a        object
response_b        object
winner_model_a     int64
winner_model_b     int64
winner_tie         int64
dtype: object

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import re
import warnings

warnings.filterwarnings("ignore")

# Set random seed for reproducibility
np.random.seed(42)


class OptimizedLLMPredictor:
    def __init__(self):
        """Initialize the predictor with optimized setting"""
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.models = {}
        self.feature_names = []

        print("OptimizedLLMPredictor Initialized!")

    def extract_fast_features(self, df: pd.DataFrame, is_train=True):
        """Extract optimized features with focus on speed and performance"""

        print("Starting feature engineering...")
        features: pd.DataFrame = df[['id']].copy()

        for resp in ['response_a', 'response_b']:
            text_col = df[resp].astype(str)

            # Basic length metrics
            features[f'{resp}_len'] = text_col.str.len()
            features[f'{resp}_words'] = text_col.str.split().str.len()
            features[f'{resp}_sentences'] = text_col.str.count(r'[.!?]+')

            # Advanced metrics
            features[f'{resp}_avg_word_len'] = features[f'{resp}_len'] / (features[f'{resp}_words'] + 1)
            features[f'{resp}_punct_ratio'] = text_col.str.count(r'\w\s') / (features[f'{resp}_len'] + 1)
            features[f'{resp}_upper_ratio'] = text_col.str.count(r'[A-Z]') / (features[f'{resp}_len'] + 1)

            # Structure indicators
            features[f'{resp}_newlines'] = text_col.str.count(r"\n")
            features[f'{resp}_code_blocks'] = text_col.str.count(r"```")
            features[f'{resp}_bullets'] = text_col.str.count(r'^\s*[-*•]\s', flags=re.MULTILINE)
            features[f'{resp}_numbers'] = text_col.str.count(r'^\s*\d+\.\s', flags=re.MULTILINE)

            # Quality indicators
            features[f'{resp}_questions'] = text_col.str.count(r'\?')
            features[f'{resp}_exclamations'] = text_col.str.count(r'\!')
        print("Basic text features extracted")

        prompt_text = df['prompt'].astype(str)
        features['prompt_len'] = prompt_text.str.len()
        features['prompt_words'] = prompt_text.str.split().str.len()
        features['prompt_questions'] = prompt_text.str.count(r'\?')

        print("Prompt features extracted")

        # Length comparisons
        features['len_ratio_a_b'] = features['response_a_len'] / features['response_b_len']
        features['len_diff_a_b'] = features['response_a_len'] - features['response_b_len']
        features['word_ratio_a_b'] = features['response_a_words'] / (features['response_b_words'] + 1)
        features['word_diff_a_b'] = features['response_a_words'] - features['response_b_words']

        # Quality comparisons
        features['struct_diff_a_b'] = (features['response_a_bullets'] + features['response_a_numbers']) - (
                features['response_b_bullets'] + features['response_b_numbers'])
        features['engagement_diff_a_b'] = (features['response_a_questions'] + features['response_a_exclamations']) - (
                features['response_b_questions'] + features['response_b_exclamations'])
        print("Comparative features extracted")

        if is_train and 'model_a' in df.columns:
            # Fast model encoding
            for model_col in ['model_a', 'model_b']:
                if model_col not in self.label_encoders:
                    self.label_encoders[model_col] = LabelEncoder()
                    features[f'{model_col}_id'] = self.label_encoders[model_col].fit_transform(df[model_col])
                    features[f'{model_col}_id'] = self.label_encoders[model_col].transform(df[model_col])
                else:
                    features[f'{model_col}_id'] = self.label_encoders[model_col].transform(df[model_col])

            # Quick model stats
            model_wins = df.groupby('model_a')['winner_model_a'].mean()
            model_wins_b = df.groupby('model_b')['winner_model_b'].mean()

            features['model_a_win_rate'] = df['model_a'].map(model_wins).fillna(0.33)
            features['model_b_win_rate'] = df['model_b'].map(model_wins_b).fillna(0.33)

            print("Model features extracted")
        elif not is_train:
            # For test data, create dummy model features to maintain consistency
            features['model_a_id'] = 0  # default encoding for unknown models
            features['model_b_id'] = 0
            features['model_a_win_rate'] = 0.33  # default win rate
            features['model_b_win_rate'] = 0.33

            print("Dummy model features added for test consistency")

        def fast_word_overlap(row):
            words_a = set(str(row['response_a']).lower().split())
            words_b = set(str(row['response_b']).lower().split())
            if len(words_a) == 0 or len(words_b) == 0:
                return 0
            return len(words_a & words_b) / len(words_a | words_b)

        features['word_overlap'] = df.apply(fast_word_overlap, axis=1)

        print("Similarity features extracted")

        features = features.fillna(0)
        self.feature_names = [col for col in features.columns if col != 'id']

        print(f" Feature engineering completed! Total features: {len(self.feature_names)}")

        print("=" * 60)

        return features

    def train_optimized_ensemble(self, X, y):
        """Train optimized ensemble with focus on speed and performance"""

        print("Training optimized ensemble models...")
        print("=" * 60)

        y_multiclass = np.argmax(y.values, axis=1)

        # Model 1: LightGBM (Primary Model)
        print("Training LightGBM (Primary Model) ...")
        self.models['lgb'] = lgb.LGBMClassifier(
            n_estimators=800,
            learning_rate=0.08,
            max_depth=6,
            num_leaves=31,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            objective='multiclass',
            num_class=3,
            verbose=-1,
            force_col_wise=True  # Optimization for speed
        )
        self.models['lgb'].fit(X, y_multiclass)

        # Model 2: XGBoost (Secondary Model)
        print("Training XGBoost")
        self.models['xgb'] = xgb.XGBClassifier(
            n_estimators=600,
            learning_rate=0.08,
            max_depth=6,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            objective='multi:softprob',
            eval_metric='mlogloss',
            verbosity=0,
            tree_method='hist'  # Faster training
        )
        self.models['xgb'].fit(X, y_multiclass)
        print("XGBoost training completed!")

        # Model 3: CatBoost (Robust Model)
        print("Training CatBoost...")
        self.models['catboost'] = CatBoostClassifier(
            iterations=500,
            learning_rate=0.1,
            depth=6,
            random_state=42,
            verbose=False,
            loss_function='MultiClass',
            task_type="CPU",
            logging_level='Silent',
            train_dir=None  # No log files
        )
        self.models['catboost'].fit(X, y_multiclass)
        print("CatBoost training completed!")

        # Ensemble Weights (Optimized)
        self.ensemble_weights = {
            'lgb': 0.5,  # Primary model
            'xgb': 0.35,  # Strong secondary
            'catboost': 0.15  # Robustness
        }

        print(" Ensemble training completed!")
        print("=" * 60)

    def predict_optimized(self, X):
        """Make optimized ensemble predictions with feature validation"""

        print("Making ensemble predictions...")

        if hasattr(self, 'feature_names'):
            if X.shape[1] != len(self.feature_names):
                print(f"Feature mismatch detected!")
                print(f"Expected {len(self.feature_names)} features")
                print(f"Received: {X.shape[1]} features")

                # Ensure X has the same columns as training
                if isinstance(X, pd.DataFrame):
                    missing_features = set(self.feature_names) - set(X.columns)
                    extra_features = set(X.columns) - set(self.feature_names)

                    if missing_features:
                        print(f"Adding missing feature: {missing_features}")
                        for feature in missing_features:
                            X[feature] = 0

                    if extra_features:
                        print(f"Removing extra features: {extra_features}")
                        X = X.drop(columns=list(extra_features))

                    # Reorder columns to match training
                    X = X[self.feature_names]
        print(f"Feature validation completed. Shape {X.shape}")

        predictions = np.zeros((X.shape[0], 3))

        for name, model in self.models.items():
            pred = model.predict_proba(X)
            predictions += self.ensemble_weights[name] * pred
            print(f"{name} predictions added (weight: {self.ensemble_weights[name]}")

        print("Ensemble predictions completed!")
        print("=" * 60)
        return predictions

    def quick_validation(self, X, y, n_splits=3):
        """Quick cross-validation for performance check"""

        print("Running quick validation")
        print("="*60)

        y_multiclass = np.argmax(y.values, axis=1)
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        # Only validate primary model for speed
        model = self.models['lgb']
        scores = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_multiclass)):
            X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
            y_fold_train, y_fold_val = y_multiclass[train_idx], y_multiclass[val_idx]

            # Quick training
            fold_model = lgb.LGBMClassifier(
                n_estimators=200, learning_rate=0.1, max_depth=6,
                random_state=42, verbose=-1, objective='multiclass', num_class=3
            )

            fold_model.fit(X_fold_train, y_fold_train)

            # Prediction and scoring
            pred_proba = fold_model.predict_proba(X_fold_val)
            y_val_onehot = np.eye(3)[y_fold_val]
            score = log_loss(y_val_onehot, pred_proba)
            scores.append(score)

            print(f"Fold {fold+1} Log Loss: {score:.4f}")
        avg_score = np.mean(scores)
        std_score = np.std(scores)

        print(f"Average CV Score: {avg_score:.4f} (+/- {std_score: .4f})")
        print("=" * 60)
        return avg_score

In [14]:
def main():
    """Optimized main pipeline for maximum performance"""

    print("STARTING LLM PREFERENCE PREDICTION PIPELINE")
    print("="*60)

    print("Loading competition data...")
    test_df = pd.read_csv("../../kaggle/LLM_Classification_FineTuning/test.csv")
    train_df = pd.read_csv("../../kaggle/LLM_Classification_FineTuning/train.csv")
    sample_submission = pd.read_csv('../../kaggle/LLM_Classification_FineTuning/sample_submission.csv')

    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    print("=" * 60)

    predictor = OptimizedLLMPredictor()

    train_features = predictor.extract_fast_features(train_df, is_train=True)
    X_train = train_features.drop(['id'], axis=1)
    y_train = train_df[['winner_model_a', 'winner_model_b', 'winner_tie']].copy()

    # Store feature names for consistency
    predictor.feature_names = list(X_train.columns)

    print(f"Final training shape: {X_train.shape}")
    print(f"Feature names stored: {len(predictor.feature_names)} features")
    print("=" * 60)

    predictor.train_optimized_ensemble(X_train, y_train)

    try:
        cv_score = predictor.quick_validation(X_train, y_train)
        performance_indicator = "🔥 EXCELLENT" if cv_score < 1.05 else "✅ GOOD" if cv_score < 1.10 else "⚠️ NEEDS IMPROVEMENT"
        print(f"Model Performance: {performance_indicator}")
    except Exception as e:
        print(f"Validation skipped: {str(e)}")

    print("=" * 60)

    print("Processing test data...")
    test_features = predictor.extract_fast_features(test_df, is_train=False)
    X_test = test_features.drop(['id'], axis=1)

    print(f"Test features shape: {X_test.shape}")
    print(f"Expected shape: ({X_test.shape[0]}, {len(predictor.feature_names)})")

    # Make predictions with feature validation
    predictions = predictor.predict_optimized(X_test)

    print("Creating optimized submission...")

    submission = pd.DataFrame({
        'id': test_df['id'],
        'winner_model_a': predictions[:, 0],
        'winner_model_b': predictions[:, 1],
        'winner_tie': predictions[:, 2]
    })

    # Normalize probabilities to ensure they sum to 1
    prob_sums = submission[['winner_model_a', 'winner_model_b', 'winner_tie']].sum(axis=1)
    submission[['winner_model_a', 'winner_model_b', 'winner_tie']] = \
        submission[['winner_model_a', 'winner_model_b', 'winner_tie']].div(prob_sums, axis=0)

    # Save submission
    submission.to_csv('submission.csv', index=False)

    print("Submission saved as 'submission.csv'")
    print("=" * 60)

    print("SUBMISSION SUMMARY")
    print("=" * 60)
    print(f"Shape: {submission.shape}")
    print(f"Columns: {list(submission.columns)}")

    # Check probability distributions
    prob_stats = submission[['winner_model_a', 'winner_model_b', 'winner_tie']].describe()
    print("\nProbability Distributions:")
    print(prob_stats.round(4))

    print("\nSample Predictions:")
    print(submission.head().round(4))

    # Final validation
    prob_sums_check = submission[['winner_model_a', 'winner_model_b', 'winner_tie']].sum(axis=1)
    print(f"\nProbability sums (should be ~1.0): Min={prob_sums_check.min():.6f}, Max={prob_sums_check.max():.6f}")

    print("=" * 80)
    print("PIPELINE COMPLETED SUCCESSFULLY!")
    print("Ready for submission to leaderboard!")
    print("=" * 80)

    return predictor, submission

In [15]:
if __name__ == '__main__':
    predictor, submission = main()

STARTING LLM PREFERENCE PREDICTION PIPELINE
Loading competition data...
Train shape: (57477, 9)
Test shape: (3, 4)
OptimizedLLMPredictor Initialized!
Starting feature engineering...
Basic text features extracted
Prompt features extracted
Comparative features extracted
Model features extracted
Similarity features extracted
 Feature engineering completed! Total features: 38
Final training shape: (57477, 38)
Feature names stored: 38 features
Training optimized ensemble models...
Training LightGBM (Primary Model) ...
Training XGBoost
XGBoost training completed!
Training CatBoost...
CatBoost training completed!
 Ensemble training completed!
Running quick validation
Fold 1 Log Loss: 1.0078
Fold 2 Log Loss: 1.0046
Fold 3 Log Loss: 1.0063
Average CV Score: 1.0062 (+/-  0.0013)
Model Performance: 🔥 EXCELLENT
Processing test data...
Starting feature engineering...
Basic text features extracted
Prompt features extracted
Comparative features extracted
Dummy model features added for test consistenc