In [None]:
!pip install -q -U sentence-transformers --no-index --find-links ../input/lmsys-pip/

In [None]:
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, precision_score, accuracy_score

import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping

from scipy.sparse import hstack

from sentence_transformers import SentenceTransformer

In [None]:
# Read the competition training set and report its (rows, columns).
train_csv_path = '/kaggle/input/lmsys-chatbot-arena/train.csv'
df_train = pd.read_csv(train_csv_path)
print(df_train.shape)

In [None]:
# Concatenate strings in list
def process(input_str):
    """Flatten a JSON-encoded list of strings into one space-joined string.

    The input is parsed as JSON so that separators with whitespace
    (``", "``), escape sequences (``\\n``, ``\\"``, ``\\uXXXX``) and ``null``
    entries are handled correctly instead of leaking into the text.

    Args:
        input_str: Text such as ``'["first turn","second turn"]'``.

    Returns:
        The list items joined with a single space. ``null`` items are
        dropped. Non-string input (e.g. a missing value) yields ``''``.
        Input that is not a JSON list is handled by the previous
        strip/split heuristic, so already-flattened text passes through.
    """
    if not isinstance(input_str, str):
        return ''

    try:
        turns = json.loads(input_str)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        turns = None

    if not isinstance(turns, list):
        # Not a JSON list: keep the original best-effort behaviour.
        stripped_str = input_str.strip('[]')
        return ' '.join(s.strip('"') for s in stripped_str.split('","'))

    joined = ' '.join(str(turn) for turn in turns if turn is not None)
    # JSON may decode to lone surrogates, which cannot be UTF-8 encoded by
    # downstream consumers; replace them with '?'.
    return joined.encode('utf-8', 'replace').decode('utf-8')

# Flatten each raw conversation column in place with process().
for text_column in ('prompt', 'response_a', 'response_b'):
    df_train.loc[:, text_column] = df_train[text_column].apply(process)

# Combine the prompt and both responses into one labelled text column.
df_train['text'] = (
    'User prompt: ' + df_train['prompt']
    + '\n\nModel A :\n' + df_train['response_a']
    + '\n\n--------\n\nModel B:\n' + df_train['response_b']
)


# display(df_test.head(3))

# Data Preprocessing

### 1. Embedding

In [None]:
# Load the sentence-embedding model from the local dataset directory.
sentence_model_dir = "/kaggle/input/sentence-transformers/minilm-l6-v2/all-MiniLM-L6-v2"
model = SentenceTransformer(sentence_model_dir)

In [None]:
# Encode the combined text of every training row.
texts = df_train['text'].tolist()
embeddings = model.encode(texts)

# Keep one vector per row on the frame, stored as plain Python lists.
df_train['embeddings'] = embeddings.tolist()

# Stack the per-row lists back into a single 2-D feature matrix.
X = np.vstack(df_train['embeddings'].tolist())

In [None]:
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']

# For each row, take the name of the target column holding the largest value.
y = df_train[target_columns].idxmax(axis=1)
print(y.shape)

# Map those column names to integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)
print(y_encoded.shape)

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, **split_options)
print(X_train.shape, X_val.shape)

In [None]:
# LightGBM parameters for the model trained on sentence embeddings
params = {
    'n_estimators': 100,
    'max_depth': 12,
    'num_leaves': 500,
    'subsample': 0.7,
    # LightGBM only performs row subsampling (bagging) when the bagging
    # frequency is non-zero; without this entry 'subsample' is silently ignored.
    'subsample_freq': 1,
    'min_child_samples': 200,
    'objective': 'multiclass',
    # NOTE(review): patience is not smaller than n_estimators, so training can
    # never stop before the last round - confirm this is intended.
    'early_stopping_rounds': 100,
    'num_class': 3,
    'metric': 'multi_logloss',
    'random_state': 42,
    'learning_rate': 0.01,
    'device': 'gpu',  # requires a GPU-enabled LightGBM build and a GPU runtime
    'verbose': -1,
}

# Train on the embedding features; both the training and the validation split
# are passed as evaluation sets.
lgbm_embedding = lgb.LGBMClassifier(**params)
lgbm_embedding.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='multi_logloss')

# Save model
lgbm_embedding.booster_.save_model('lgbm_embedding.txt')

### 2. TfidfVectorizer + CountVectorizer

In [None]:
# Settings shared by both vectorizers.
#
# The documents passed to these vectorizers are raw strings, so scikit-learn's
# built-in word analyzer is used. An identity tokenizer/preprocessor is only
# valid for pre-tokenized input: on a raw string the "token list" is the string
# itself, so every single character becomes a feature, and a custom
# preprocessor also bypasses strip_accents.
vectorizer_kwargs = dict(
    strip_accents='unicode',
    analyzer='word',
    # NOTE(review): vocabulary cap added to bound the width of the feature
    # matrix now that features are words; the value is a starting point - tune.
    max_features=20000,
)

# TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(**vectorizer_kwargs)

# CountVectorizer
count_vectorizer = CountVectorizer(**vectorizer_kwargs)

In [None]:
columns_to_vectorize = ['prompt', 'response_a', 'response_b']

# Fit both vectorizers on one document per row: the selected columns joined by spaces.
train_vector = df_train[columns_to_vectorize].astype(str).apply(' '.join, axis=1)
for vectorizer in (tfidf_vectorizer, count_vectorizer):
    vectorizer.fit(train_vector)

# Transform every column separately with each fitted vectorizer.
count_vectorized_columns = []
tfidf_vectorized_columns = []
for column in columns_to_vectorize:
    count_vectorized_columns.append(count_vectorizer.transform(df_train[column]))
    tfidf_vectorized_columns.append(tfidf_vectorizer.transform(df_train[column]))

# Place all count blocks, then all tf-idf blocks, side by side.
combined_train = hstack([*count_vectorized_columns, *tfidf_vectorized_columns])
print(combined_train.shape)

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    combined_train,
    y_encoded,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

In [None]:
# LightGBM parameters for the model trained on count / tf-idf features
params = {
    'n_estimators': 100,
    'max_depth': 12,
    'num_leaves': 800,
    'subsample': 0.7,
    # LightGBM only performs row subsampling (bagging) when the bagging
    # frequency is non-zero; without this entry 'subsample' is silently ignored.
    'subsample_freq': 1,
    'min_child_samples': 300,
    'objective': 'multiclass',
    # NOTE(review): patience (300) exceeds n_estimators (100), so training can
    # never stop before the last round - confirm this is intended.
    'early_stopping_rounds': 300,
    'num_class': 3,
    'metric': 'multi_logloss',
    'random_state': 42,
    'learning_rate': 0.01,
    'device': 'gpu',  # requires a GPU-enabled LightGBM build and a GPU runtime
    'verbose': -1,
}

# Train on the sparse vectorizer features; both the training and the
# validation split are passed as evaluation sets.
lgbm_tfidf = lgb.LGBMClassifier(**params)
lgbm_tfidf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='multi_logloss')

# Save model
lgbm_tfidf.booster_.save_model('lgbm_tfidf.txt')

# Testing Data
## Combines the predictions from two different models using weighted averaging 

In [None]:
# Read the competition test set.
df_test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

# --- 1. Embedding features -------------------------------------------------
# Same text preparation as for the training frame.
for text_column in ('prompt', 'response_a', 'response_b'):
    df_test.loc[:, text_column] = df_test[text_column].apply(process)
df_test['text'] = (
    'User prompt: ' + df_test['prompt']
    + '\n\nModel A :\n' + df_test['response_a']
    + '\n\n--------\n\nModel B:\n' + df_test['response_b']
)

texts = df_test['text'].tolist()
embeddings = model.encode(texts)
df_test['embeddings'] = embeddings.tolist()
X = np.vstack(df_test['embeddings'].tolist())

# --- 2. Count / tf-idf features --------------------------------------------
# Reuse the already-fitted vectorizers; transform each column separately.
count_vectorized_columns = []
tfidf_vectorized_columns = []
for column in columns_to_vectorize:
    count_vectorized_columns.append(count_vectorizer.transform(df_test[column]))
    tfidf_vectorized_columns.append(tfidf_vectorizer.transform(df_test[column]))
combined_test = hstack([*count_vectorized_columns, *tfidf_vectorized_columns])
print(combined_test.shape)

# --- 3. Blend the two models' class probabilities --------------------------
pred_embedding = lgbm_embedding.predict_proba(X)
pred_tfidf = lgbm_tfidf.predict_proba(combined_test)

# Weight given to the embedding model; the tf-idf model gets the remainder.
embedding_weight = 0.7
tfidf_weight = 1 - embedding_weight

preds = embedding_weight * pred_embedding + tfidf_weight * pred_tfidf

In [None]:
# Column i of `preds` is written under the i-th name below, after the row id.
class_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
submission_data = {'id': df_test["id"]}
for position, column_name in enumerate(class_columns):
    submission_data[column_name] = preds[:, position]
submission = pd.DataFrame(submission_data)

print(submission.head())
submission.to_csv('submission.csv', index=False)