In [1]:
import zipfile
import os
# NOTE(review): `features` is never used as an imported name in the visible cells
# (later code only uses `features` as a local variable) — this looks like an
# accidental IDE auto-import; confirm and remove.
from pyexpat import features

# NOTE(review): `flags` is likewise only used later as a keyword-argument name;
# this line makes the notebook depend on `lark` being installed — confirm and remove.
from lark.tools import flags

# Location of the downloaded competition archive and where to unpack it.
zip_path = "llm-classification-finetuning.zip"
extract_path = "./kaggle"

# The destination directory may already exist from a previous run; that is fine.
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into the destination directory.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print(f"Unzipped to {extract_path}")

Unzipped to ./kaggle


In [2]:
import pandas as pd

# Load the sample submission from the extracted archive and display it,
# which shows the column layout a submission file is expected to have.
sample_submission = pd.read_csv("./kaggle/sample_submission.csv")
sample_submission

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.333333,0.333333,0.333333
1,211333,0.333333,0.333333,0.333333
2,1233961,0.333333,0.333333,0.333333


In [7]:
# Read the labelled training rows and the unlabelled test rows from the extracted archive.
train_df = pd.read_csv("./kaggle/train.csv")
test_df = pd.read_csv("./kaggle/test.csv")

# Preview the first two training rows.
train_df.head(n=2)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0


In [9]:
# Show the training frame's column names as a plain Python list.
# NOTE(review): the saved output below is an `Index(...)` repr, not a list, which
# suggests this cell was edited after it last ran — re-run to refresh the output.
train_df.columns.to_list()

Index(['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b',
       'winner_model_a', 'winner_model_b', 'winner_tie'],
      dtype='object')

In [14]:
# Inspect the dtype pandas inferred for each training column.
train_df.dtypes

id                 int64
model_a           object
model_b           object
prompt            object
response_a        object
response_b        object
winner_model_a     int64
winner_model_b     int64
winner_tie         int64
dtype: object

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import re
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Pin NumPy's global random state so stochastic steps are repeatable.
np.random.seed(seed=42)


class OptimizedLLMPredictor:
    def __init__(self):
        """Create an empty predictor: no encoders, no models, no feature names yet.

        A fresh ``StandardScaler`` is attached, and a confirmation line is
        printed once the instance is ready.
        """
        # Containers that are filled in later by the feature / training methods.
        self.feature_names = []
        self.models = {}
        self.label_encoders = {}

        self.scaler = StandardScaler()

        print("OptimizedLLMPredictor Initialized!")

    def extract_fast_features(self, df: pd.DataFrame, is_train=True):
        """Extract optimized features with focus on speed and performance"""

        print("Starting feature engineering...")
        features: pd.DataFrame = df[['id']].copy()

        for resp in ['response_a', 'response_b']:
            text_col = df[resp].astype(str)

            # Basic length metrics
            features[f'{resp}_len'] = text_col.str.len()
            features[f'{resp}_words'] = text_col.str.split().str.len()
            features[f'{resp}_sentences'] = text_col.str.count(r'[.!?]+')

            # Advanced metrics
            features[f'{resp}_avg_word_len'] = features[f'{resp}_len'] / (features[f'{resp}_words'] + 1)
            features[f'{resp}_punct_ratio'] = text_col.str.count(r'\w\s') / (features[f'{resp}_len'] + 1)
            features[f'{resp}_upper_ratio'] = text_col.str.count(r'[A-Z]') / (features[f'{resp}_len'] + 1)

            # Structure indicators
            features[f'{resp}_newlines'] = text_col.str.count(r"\n")
            features[f'{resp}_code_blocks'] = text_col.str.count(r"```")
            features[f'{resp}_bullets'] = text_col.str.count(r'^\s*[-*•]\s', flags=re.MULTILINE)
            features[f'{resp}_numbers'] = text_col.str.count(r'^\s*\d+\.\s', flags=re.MULTILINE)

            # Quality indicators
            features[f'{resp}_questions'] = text_col.str.count(r'\?')
            features[f'{resp}_exclamations'] = text_col.str.count(r'\!')
        print("Basic text features extracted")

        prompt_text = df['prompt'].astype(str)
        features['prompt_len'] = prompt_text.str.len()
        features['prompt_words'] = prompt_text.str.split().str.len()
        features['prompt_questions'] = prompt_text.str.count(r'\?')

        print("Prompt features extracted")

        # Length comparisons
        features['len_ratio_a_b'] = features['response_a_len'] / features['response_b_len']
        features['len_diff_a_b'] = features['response_a_len'] - features['response_b_len']
        features['word_ratio_a_b'] = features['response_a_words'] / (features['response_b_words'] + 1)
        features['word_diff_a_b'] = features['response_a_words'] - features['response_b_words']

        # Quality comparisons
        features['struct_diff_a_b'] = (features['response_a_bullets'] + features['response_a_numbers']) - (
                features['response_b_bullets'] + features['response_b_numbers'])
        features['engagement_diff_a_b'] = (features['response_a_questions'] + features['response_a_exclamations']) - (
                features['response_b_questions'] + features['response_b_exclamations'])
        print("Comparative features extracted")

        if is_train and 'model_a' in df.columns:
            # Fast model encoding
            for model_col in ['model_a', 'model_b']:
                if model_col not in self.label_encoders:
                    self.label_encoders[model_col] = LabelEncoder()
                    features[f'{model_col}_id'] = self.label_encoders[model_col].fit_transform(df[model_col])
                    features[f'{model_col}_id'] = self.label_encoders[model_col].transform(df[model_col])
                else:
                    features[f'{model_col}_id'] = self.label_encoders[model_col].transform(df[model_col])

            # Quick model stats
            model_wins = df.groupby('model_a')['winner_model_a'].mean()
            model_wins_b = df.groupby('model_b')['winner_model_b'].mean()

            features['model_a_win_rate'] = df['model_a'].map(model_wins).fillna(0.33)
            features['model_b_win_rate'] = df['model_b'].map(model_wins_b).fillna(0.33)

            print("Model features extracted")
        elif not is_train:
            # For test data, create dummy model features to maintain consistency
            features['model_a_id'] = 0  # default encoding for unknown models
            features['model_b_id'] = 0
            features['model_a_win_rate'] = 0.33  # default win rate
            features['model_b_win_rate'] = 0.33

            print("Dummy model features added for test consistency")

        def fast_word_overlap(row):
            words_a = set(str(row['response_a']).lower().split())
            words_b = set(str(row['response_b']).lower().split())
            if len(words_a) == 0 or len(words_b) == 0:
                return 0
            return len(words_a & words_b) / len(words_a | words_b)

        features['word_overlap'] = df.apply(fast_word_overlap, axis=1)

        print("Similarity features extracted")

        features = features.fillna(0)
        self.feature_names = [col for col in features.columns if col != 'id']

        print(f" Feature engineering completed! Total features: {len(self.feature_names)}")

        print("=" * 60)

        return features

    def train_optimized_ensemble(self, X, y):
        """Train optimized ensemble with focus on speed and performance"""

        print("Training optimized ensemble models...")
        print("=" * 60)

        y_multiclass = np.argmax(y.values, axis=1)


        # Model 1: LightGBM (Primary Model)
        print("Training LightGBM (Primary Model) ...")
        self.models['lgb'] = lgb.LGBMClassifier(
            n_estimators=800,
            learning_rate=0.08,
            max_depth=6,
            num_leaves=31,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            objective='multiclass',
            num_class=3,
            verbose=-1,
            force_col_wise=True # Optimization for speed
        )
        self.models['lgb'].fit(X, y_multiclass)

        # Model 2: XGBoost (Secondary Model)
        print("Training XGBoost")
        self.models['xgb'] = xgb.XGBClassifier(
            n_estimators=600,
            learning_rate=0.08,
            max_depth=6,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            objective='multi:softprob',
            eval_metric='mlogloss',
            verbosity=0,
            tree_method='hist' # Faster training
        )

        # Model 3: CatBoost (Robust Model)
        print("Training CatBoost...")


