In [1]:
import sklearn

import numpy as np 
import pandas as pd
from tqdm import tqdm
import json

import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.express as px

from sklearn.model_selection import train_test_split

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import ModelsUtils as Utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#sequence_length = 64
sequence_length = 256
#sequence_length = 512
sample_size = 0.01
BUILD_DATASET = True    # Will load from file pre-preprocessed data if False
MINI_RUN = True         # Test run with very little data

#model_name = "microsoft/mdeberta-v3-base" # For multilingual purpose


In [3]:
BASE_PATH = './kaggle/input/llm-classification-finetuning'
CUSTOM_BASE_PATH = '../Data'

## Files

`train.csv`
- `id`
- `model_[a/b]`: Model identity, present in train.csv but not in test.csv.
- `prompt`: Input prompt given to both models.
- `response_[a/b]`: Model_[a/b]'s response to the prompt.
- `winner_model_[a/b/tie]`: Binary columns indicating the judge's selection (ground truth target).

`test.csv`
- `id`: Unique identifier for each row.
- `prompt`: Input prompt given to both models.
- `response_[a/b]`: Model_[a/b]'s response to the prompt.

> !!!! Note that each interaction may have multiple prompts and responses, but this notebook will use only one prompt per interaction. You can choose to use all prompts and responses. Additionally, prompts and responses in the dataframe are provided as string-formatted lists, so they need to be converted to literal lists using eval().

> !!! TODO : use all prompts

In [4]:
# Load Train Data
df = pd.DataFrame()

if BUILD_DATASET:
    df = pd.read_csv(f'{BASE_PATH}/train.csv')
else:
    if MINI_RUN:
        df = pd.read_csv(f'{CUSTOM_BASE_PATH}/train_preprocessed_mini.csv')
    else:
        df = pd.read_csv(f'{CUSTOM_BASE_PATH}/train_preprocessed_full.csv')

## Train Data

In [5]:
# Sample data
if BUILD_DATASET & MINI_RUN:
    df = df.sample(frac=sample_size)

if BUILD_DATASET :
    # Take the first prompt and its associated response
    #df["prompt"] = df.prompt.map(lambda x: eval(x)[0])
    df["prompt"] = df.prompt.map(lambda x: ' '.join(eval(x.replace("null","''"))))
    df["response_a"] = df.response_a.map(lambda x: ' '.join(eval(x.replace("null","''"))))
    df["response_b"] = df.response_b.map(lambda x: ' '.join(eval(x.replace("null", "''"))))

    # Label conversion
    df["class_name"] = df[["winner_model_a", "winner_model_b", "winner_tie"]].idxmax(axis=1)
    df["class_label"] = df.class_name.map(Utils.name2label)

In [6]:
df['prompt'] = df['prompt'].astype(str)
df['response_a'] = df['response_a'].astype(str)
df['response_b'] = df['response_b'].astype(str)

## Encoding

In [7]:
if BUILD_DATASET :
    df = df.apply(Utils.reencode, axis=1)  # Apply the make_pairs function to each row in df
    display(df.head(2))  # Display the first 2 rows of df

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,class_name,class_label,encode_fail
4165,316711818,qwen-14b-chat,mistral-7b-instruct,List the largest global religious terrorist or...,1. Islamic State (ISIS\/ISIL) - Islam\n 2. T...,"1. Islamic State (ISIS) - Approximately 30,00...",0,0,1,winner_tie,2,False
20711,1541658215,gpt-4-0613,claude-2.0,shopify how do i get section settings translat...,To translate Shopify section settings to other...,Here are a few ways to get section settings tr...,0,0,1,winner_tie,2,False


In [8]:
df.encode_fail.value_counts(normalize=False)

encode_fail
False    564
True      11
Name: count, dtype: int64

## EDA

In [9]:
model_df = pd.concat([df.model_a, df.model_b])
counts = model_df.value_counts().reset_index()
counts.columns = ['LLM', 'Count']

# Create a bar plot with custom styling using Plotly
fig = px.bar(counts, x='LLM', y='Count',
                title='Distribution of LLMs',
                color='Count', color_continuous_scale='viridis', width=1000)

fig.update_layout(xaxis_tickangle=-45)  # Rotate x-axis labels for better readability

fig.show()

### Winning distribution

In [10]:
counts = df['class_name'].value_counts().reset_index()
counts.columns = ['Winner', 'Win Count']

fig = px.bar(counts, x='Winner', y='Win Count',
                title='Winner distribution for Train Data',
                labels={'Winner': 'Winner', 'Win Count': 'Win Count'},
                color='Winner', color_continuous_scale='viridis', width=1000)

fig.update_layout(xaxis_title="Winner", yaxis_title="Win Count")

fig.show()

### Winning distribution ratio per model

In [11]:
models_a = df.query('winner_model_a == 1').groupby(['model_a'])['winner_model_a'].count().reset_index() 
models_a.columns = ['model', 'wins']
models_a['losses'] = df.query('winner_model_a == 0').groupby(['model_a'])['winner_model_a'].count().reset_index()['winner_model_a']

models_b = df.query('winner_model_b == 1').groupby(['model_b'])['winner_model_b'].count().reset_index() 
models_b.columns = ['model', 'wins']
models_b['losses'] = df.query('winner_model_b == 0').groupby(['model_b'])['winner_model_b'].count().reset_index()['winner_model_b']

models = models_a
models[['wins', 'losses']] = models_a[['wins', 'losses']] + models_b[['wins', 'losses']]

In [12]:
models['winsRatio'] = (models['wins'] / (models['wins'] + models['losses']))
models.sort_values(by='winsRatio', ascending=False, inplace=True)

In [13]:
#models.sort_values(by='wins', ascending=False, inplace=True)

fig = px.bar(
    data_frame = models,
    x = "model",
    y = ["winsRatio"],
    opacity = 0.9,
    #orientation = "v",
    #barmode = 'stack',
    title='Wins ratio per model',
)

fig.update_layout(xaxis_tickangle=-45)  # Rotate x-axis labels for better readability

fig.show()

> Data we can create from this dataset:
> - ratio length response/prompt
> - embeddings cosine similarity
> - check embbeding vector difference between prompt/response (create a 'mean' difference vector from all best response) and check cosine similarity distribution.

In [14]:
df.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,class_name,class_label,encode_fail
4165,316711818,qwen-14b-chat,mistral-7b-instruct,List the largest global religious terrorist or...,1. Islamic State (ISIS\/ISIL) - Islam\n 2. T...,"1. Islamic State (ISIS) - Approximately 30,00...",0,0,1,winner_tie,2,False
20711,1541658215,gpt-4-0613,claude-2.0,shopify how do i get section settings translat...,To translate Shopify section settings to other...,Here are a few ways to get section settings tr...,0,0,1,winner_tie,2,False
2166,167265396,mistral-7b-instruct,mixtral-8x7b-instruct-v0.1,"read the nanoGPT reposotory, write some code t...",The nanoGPT repository contains the code for t...,"I have reviewed the nanoGPT repository, and I ...",0,1,0,winner_model_b,1,False
16826,1254354571,gpt-4-1106-preview,gemini-pro-dev-api,anti-racism === racism but anti-racism still p...,"""Anti-racism"" and ""racism"" are not equivalent ...",This statement is false. Anti-racism is the op...,0,0,1,winner_tie,2,False
45886,3421248584,mistral-medium,pplx-70b-online,provide me with a random question to which you...,Question: Why do cats purr?\n\nAnswer: Cats pu...,"Here's a random question: ""What is a black hol...",0,0,1,winner_tie,2,False


## Feature engineering

#### 1. Response Length

In [15]:
def add_length_features(df):
    df['resp1_length'] = df['response_a'].apply(len)
    df['resp2_length'] = df['response_b'].apply(len)
    df['length_diff'] = df['resp1_length'] - df['resp2_length']  # Difference in lengths
    return df

#### 2. Lexical Diversity

In [16]:
def lexical_diversity(text):
    tokens = text.split()  # Tokenize by whitespace
    return len(set(tokens)) / len(tokens) if len(tokens) > 0 else 0

def add_lexical_features(df):
    df['resp1_lexical_div'] = df['response_a'].apply(lexical_diversity)
    df['resp2_lexical_div'] = df['response_b'].apply(lexical_diversity)
    df['lexical_div_diff'] = df['resp1_lexical_div'] - df['resp2_lexical_div']
    return df

#### 3. Sentiment analysis

In [17]:
from transformers import pipeline

# Load sentiment analysis pipeline (ensure it's multilingual)
sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment", device=0)

def get_sentiment(text):
    result = sentiment_analyzer(text[:512])  # Truncate to 512 tokens for BERT-based models
    return result[0]['label']

def add_sentiment_features(df):
    df['resp1_sentiment'] = df['response_a'].apply(get_sentiment)
    df['resp2_sentiment'] = df['response_b'].apply(get_sentiment)
    # Convert sentiments to numeric scale (e.g., positive=1, neutral=0, negative=-1)
    sentiment_map = {'positive': 1, 'neutral': 0, 'negative': -1}
    df['resp1_sentiment_num'] = df['resp1_sentiment'].map(sentiment_map)
    df['resp2_sentiment_num'] = df['resp2_sentiment'].map(sentiment_map)
    df['sentiment_diff'] = df['resp1_sentiment_num'] - df['resp2_sentiment_num']
    return df


#### 4. Semantic Similarity

In [18]:
# Load a multilingual sentence transformer model
embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2__')

#embedder.save('paraphrase-multilingual-MiniLM-L12-v2__')

def calculate_similarity(prompt, response):
    embeddings = embedder.encode([prompt, response])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def add_similarity_features(df):
    df['resp1_similarity'] = df.apply(lambda x: calculate_similarity(x['prompt'], x['response_a']), axis=1)
    df['resp2_similarity'] = df.apply(lambda x: calculate_similarity(x['prompt'], x['response_b']), axis=1)
    df['similarity_diff'] = df['resp1_similarity'] - df['resp2_similarity']
    return df

#### 5. Question-Answer Alignment

In [19]:
from keybert import KeyBERT

# Use KeyBERT for keyword extraction
#kw_model = KeyBERT()

kw_model = KeyBERT(model='paraphrase-multilingual-MiniLM-L12-v2__')

def get_keyword_overlap(prompt, response):
    prompt_keywords = set([kw[0] for kw in kw_model.extract_keywords(prompt)])
    response_keywords = set([kw[0] for kw in kw_model.extract_keywords(response)])
    overlap = len(prompt_keywords & response_keywords)
    return overlap / len(prompt_keywords) if len(prompt_keywords) > 0 else 0

def add_keyword_overlap_features(df):
    df['resp1_keyword_overlap'] = df.apply(lambda x: get_keyword_overlap(x['prompt'], x['response_a']), axis=1)
    df['resp2_keyword_overlap'] = df.apply(lambda x: get_keyword_overlap(x['prompt'], x['response_b']), axis=1)
    df['keyword_overlap_diff'] = df['resp1_keyword_overlap'] - df['resp2_keyword_overlap']
    return df

#### 6. Language-Specific Formality or Tone (TODO for each language)

In [20]:
# # Example with spaCy and third-party plugins
# import spacy

# # Load spaCy models for specific languages
# nlp_en = spacy.load("en_core_web_sm")  # English example

# def detect_formality(text):
#     doc = nlp_en(text)
#     formality_score = sum(1 for token in doc if token.pos_ in ["VERB", "ADV"]) / len(doc)
#     return formality_score if len(doc) > 0 else 0

# def add_formality_features(df):
#     df['resp1_formality'] = df['response_a'].apply(detect_formality)
#     df['resp2_formality'] = df['response_b'].apply(detect_formality)
#     df['formality_diff'] = df['resp1_formality'] - df['resp2_formality']
#     return df

# # wont add until all languages supported

#### 7. Named Entity Recognition (NER)

In [21]:
def count_entities(text, nlp_model):
    doc = nlp_model(text)
    return len(doc.ents)

def add_ner_features(df, nlp_model):
    df['resp1_entities'] = df['response_a'].apply(lambda x: count_entities(x, nlp_model))
    df['resp2_entities'] = df['response_b'].apply(lambda x: count_entities(x, nlp_model))
    df['entity_diff'] = df['resp1_entities'] - df['resp2_entities']
    return df

### Extract all features

In [23]:
def extract_all_features(df):
    df = add_length_features(df)
    df = add_lexical_features(df)
    #df = add_sentiment_features(df)
    df = add_similarity_features(df)
    df = add_keyword_overlap_features(df)
    #df = add_formality_features(df)
    #df = add_ner_features(df)
    return df

# test will need preprocess no matter what, can't pre load them from kaggle
#test_df = extract_all_features(test_df)

if BUILD_DATASET:
    df = extract_all_features(df)
    if MINI_RUN :
        df.to_csv(f'{CUSTOM_BASE_PATH}/train_preprocessed_mini.csv', index = False)
    else:
        df.to_csv(f'{CUSTOM_BASE_PATH}/train_preprocessed_full.csv', index = False)
    