# Notebook Versions

- Version 1 (11/23/2024)
   * Baseline modeling 1.0


- Version 2 (11/23/2024)
   * Fixing bug.
 

- Version 3 (11/23/2024)
   * Fixing bug. 
 
     
# Loading Libraries

In [1]:
%%time
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import gc
import difflib

import warnings
warnings.filterwarnings('ignore')

from tqdm.notebook import tqdm

import re

from functools import partial
from scipy.stats import kurtosis, skew, gmean, mode

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, FunctionTransformer, PowerTransformer, PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import KNNImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, RepeatedKFold
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, cohen_kappa_score, log_loss, f1_score, r2_score, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.calibration import CalibrationDisplay, CalibratedClassifierCV
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Ridge, RidgeCV
from collections import Counter
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC, LinearSVR

from category_encoders import TargetEncoder

import ydf
from ydf import RandomForestLearner

import xgboost as xgb

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool

from sklearn.neural_network import MLPClassifier

import optuna

CPU times: user 3.21 s, sys: 534 ms, total: 3.75 s
Wall time: 6.41 s


# Reading Data

In [2]:
%%time
# Load the competition's train and test splits from their parquet files
# (train first, then test).
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../input/wsdm-cup-multilingual-chatbot-arena/train.parquet',
        '../input/wsdm-cup-multilingual-chatbot-arena/test.parquet',
    )
)

# Report (rows, columns) for each frame as a quick sanity check.
print('The dimension of the train dataset is:', train.shape)
print('The dimension of the test dataset is:', test.shape)

The dimension of the train dataset is: (48439, 8)
The dimension of the test dataset is: (3, 5)
CPU times: user 1.46 s, sys: 1.21 s, total: 2.66 s
Wall time: 3.23 s


In [3]:
# Preview the first rows of the training frame to check the columns that were loaded.
train.head()

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,o1-preview,reka-core-20240904,Slovak
1,00010ed04b536f56ebe43eef1100c13906abea12bf9855...,You will be given a piece of news. Analyze it ...,Let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a,gemma-2-27b-it,gemini-1.5-flash-002,Russian
2,0003800d510e38803efba5ceaec122bc66408fe367b0be...,"Dört basamaklı, rakamları birbirinden ve sıfır...","Bu soruyu çözmek için, verilen koşulları adım ...",Bu problemi adım adım çözelim:\n\n1) ABCD - DC...,model_a,gpt-4-0125-preview,claude-3-5-sonnet-20240620,Turkish
3,00072026c68f5418ef2da238394e418ce72a534b9b22d5...,"현재 추천된 탑 3 종목인 Cabaletta Bio (CABA), Rocket Ph...","죄송하지만 저는 금융 조언을 제공할 수 없습니다. 저는 AI 모델이며, 투자 결정에...",현재 추천된 탑 3 종목에 순위를 매기기 위해서는 여러 가지 요소들을 고려해야 합니...,model_b,gemma-2-2b-it,llama-3.1-nemotron-70b-instruct,English
4,0007ce7cf6bc1b5a8f8a4669b854fb12030863c970d9dc...,Please be boring,"Alright, I'll be as boring as possible.\n\nTod...","Understood. Here is a straightforward, unadorn...",model_a,reka-flash-20240722,grok-2-2024-08-13,English


In [4]:
# Display the whole test frame; the public test file loaded above has only 3 rows.
test

Unnamed: 0,id,prompt,response_a,response_b,scored
0,327228,Caso Clínico: Un hombre de 70 años con anteced...,**Diagnóstico Diferencial de Anemia en Pacient...,"Basándonos en el caso clínico presentado, pode...",False
1,1139415,Peel Company received a cash dividend from a ...,The correct answer is **(a) No No**. Here's ...,The correct answer is **(a) No No**. Here's wh...,False
2,1235630,Há um grave problema com o relógio da torre da...,Dois problemas interessantes!\n\n**Problema 1:...,Vamos resolver os dois problemas em sequência....,False


# Baseline Modeling 1.0

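First, we engineer a few basic features: character and word counts for the prompt and both responses, a flag for whether the prompt contains a question mark, and prompt-to-response length ratios.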

In [5]:
def fe(df, train=False):
    """Add basic length, word-count and ratio features for a prompt/response frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the text columns ``prompt``, ``response_a`` and
        ``response_b``; when ``train`` is true it must also contain ``winner``.
    train : bool, default False
        When true, ``winner`` is encoded as ``'model_a' -> 0`` and
        ``'model_b' -> 1``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the feature columns added. The input frame is
        left untouched, so the function can be re-applied to the same raw
        frame and always yields the same result.
    """
    # Work on a copy: mutating the caller's frame makes notebook cells
    # non-idempotent (re-encoding an already encoded `winner` yields NaN).
    df = df.copy()

    df['prompt_len'] = df['prompt'].str.len()
    df['prompt_words'] = df['prompt'].str.split().str.len()
    # Literal match instead of the regex '\?' (an invalid escape sequence in a
    # normal string); na=False keeps missing prompts from breaking astype(int).
    df['prompt_question'] = df['prompt'].str.contains('?', regex=False, na=False).astype(int)

    df['response_a_len'] = df['response_a'].str.len()
    df['response_b_len'] = df['response_b'].str.len()

    df['response_a_words'] = df['response_a'].str.split().str.len()
    df['response_b_words'] = df['response_b'].str.split().str.len()

    # An empty response has length 0; mask it to NaN so the ratio is reported
    # as missing rather than +inf from a division by zero.
    df['prompt_response_a_len_ratio'] = df['prompt_len'] / df['response_a_len'].where(df['response_a_len'] > 0)
    df['prompt_response_b_len_ratio'] = df['prompt_len'] / df['response_b_len'].where(df['response_b_len'] > 0)

    if train:
        # Binary target: 0 when model A won, 1 when model B won.
        df['winner'] = df['winner'].map({'model_a': 0, 'model_b': 1})

    return df

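Next, we run a 10-fold stratified cross-validation experiment with a YDF random forest, scoring each fold by accuracy and keeping each fold's predictions on the test set.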

In [6]:
%%time
# Apply the feature engineering to both frames.
train = fe(train, train=True)
test = fe(test)

# Single definition of the model inputs so the train and test matrices
# cannot drift apart.
FEATURES = [
    'prompt_len', 'prompt_words', 'prompt_question',
    'response_a_len', 'response_b_len',
    'response_a_words', 'response_b_words',
    'prompt_response_a_len_ratio', 'prompt_response_b_len_ratio',
]

X = train[FEATURES]
y = train['winner']

test_cv = test[FEATURES]

SEED = 42
ydf.verbose(-1)
scores, ydf_test_preds = [], []
skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=SEED)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):

    # split() yields positional indices, so both X and y must be sliced with
    # .iloc; y[train_index] would be a label lookup and is only correct while
    # the frame carries a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # The learner is given one frame holding both the features and the label column.
    train_data = pd.concat([X_train, y_train], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)

    ydf_md = RandomForestLearner(label='winner',
                                 num_threads=10,
                                 num_trees=1000,
                                 max_depth=15).train(train_data)
    # Presumably the predicted probability of class 1 (model_b); it is
    # thresholded at 0.5 below -- confirm against the YDF predict() docs.
    ydf_pred = ydf_md.predict(test_data)

    score = accuracy_score(y_test, np.where(ydf_pred>0.5, 1, 0))
    print('Fold:', i, 'accuracy:', score)
    scores.append(score)

    # Keep this fold's predictions on the real test set; they are collected
    # per fold so they can be combined later.
    ydf_test_preds.append(ydf_md.predict(test_cv))

print('The 10 fold average out-of-fold accuracy is:', np.mean(scores))

Fold: 0 accuracy: 0.5862923203963666
Fold: 1 accuracy: 0.5920726672171759
Fold: 2 accuracy: 0.5889760528488852
Fold: 3 accuracy: 0.5949628406275805
Fold: 4 accuracy: 0.583195706028076
Fold: 5 accuracy: 0.597440132122213
Fold: 6 accuracy: 0.5955821635012386
Fold: 7 accuracy: 0.5867052023121387
Fold: 8 accuracy: 0.5842279108175062
Fold: 9 accuracy: 0.5909560189964898
The 10 fold average out-of-fold accuracy is: 0.590041101486767
CPU times: user 34min 23s, sys: 6.52 s, total: 34min 29s
Wall time: 9min 12s


In [7]:
%%time
# Load the sample submission file as the template for the output.
submission = pd.read_csv('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv')

# Average the per-fold test predictions, threshold the mean at 0.5 and
# translate the resulting 0/1 codes back to the competition's labels.
mean_test_pred = np.mean(ydf_test_preds, axis=0)
winner_code = np.where(mean_test_pred > 0.5, 1, 0)
submission['winner'] = pd.Series(winner_code, index=submission.index).map({0: 'model_a', 1: 'model_b'})

print(submission.head())

        id   winner
0   327228  model_b
1  1139415  model_b
2  1235630  model_a
CPU times: user 5.45 ms, sys: 1 ms, total: 6.45 ms
Wall time: 19.6 ms


In [8]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)