-------------------------------------------------------------

In [1]:
import sklearn

import numpy as np 
import pandas as pd
from tqdm import tqdm
import json

import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.express as px

from sklearn.model_selection import train_test_split

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from ModelsUtils import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Print the installed transformers / sentence-transformers versions so the
# environment of this run is recorded alongside its outputs.
import transformers as trsf
import sentence_transformers as sntc_trasfmr
print("Transformers:", trsf.__version__)
print("Sentence Transformers:", sntc_trasfmr.__version__)

Transformers: 4.46.3
Sentence Transformers: 2.4.0


In [3]:
import torch

from torch.utils.data import Dataset, DataLoader

import torch.optim as optim
from torch.nn.functional import cross_entropy

from transformers import AutoModel, AutoTokenizer

# Report the torch version and pick the compute device:
# CUDA when torch.cuda.is_available() is true, otherwise CPU.
print('Torch version:', torch.__version__)
# The value printed here is the runtime availability check of a CUDA device.
print('Torch is build with CUDA:', torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Torch device : {device}')
print('------------------------------')

Torch version: 2.5.1+cu118
Torch is build with CUDA: True
Torch device : cuda
------------------------------


In [4]:
# Run configuration.
# sequence_length is passed as max_length when the test dataset is built.
#sequence_length = 64
sequence_length = 256
#sequence_length = 512
BATCH_SIZE = 1  # batch size of the test DataLoader
# NOTE(review): sample_size, EPOCHS, BUILD_DATASET and MINI_RUN are not read by
# any cell visible in this notebook; presumably they are carried over from the
# training notebook — confirm before removing.
sample_size = 0.01
EPOCHS = 2
BUILD_DATASET = False#True # Will load from file pre-preprocessed data if False
MINI_RUN = True # Test run with very little data

# Checkpoint name used for the tokenizer and passed to the model as transformer_name.
model_name = "microsoft/mdeberta-v3-base" # For multilingual purpose

In [5]:
# Folder containing the competition input files (relative to the working directory).
BASE_PATH = './kaggle/input/llm-classification-finetuning'

In [6]:
# Load the test set from test.csv located under BASE_PATH.
test_df = pd.read_csv(f'{BASE_PATH}/test.csv')

## Test Data

In [7]:
# Each text column holds a JSON-encoded list of conversation turns, e.g.
# '["first turn", null]'.  Decode it with the JSON parser and join ALL turns
# with a single space; `null` turns become empty strings.
#
# json.loads replaces the previous eval()-based parsing because:
#   * eval() executes arbitrary code found in the data file;
#   * the blanket str.replace("null", "''") also rewrote the word "null"
#     inside the actual text (e.g. "nullable" -> "''able");
#   * JSON escapes (such as surrogate pairs for emoji) are decoded by the
#     JSON rules instead of Python's string-literal rules.
def _join_turns(raw):
    """Decode a JSON list of turns and join them into one space-separated string."""
    return ' '.join('' if turn is None else turn for turn in json.loads(raw))

for _col in ("prompt", "response_a", "response_b"):
    test_df[_col] = test_df[_col].map(_join_turns)

# Show Sample
test_df.head()

Unnamed: 0,id,prompt,response_a,response_b
0,136060,"I have three oranges today, I ate an orange ye...",You have two oranges today.,You still have three oranges. Eating an orange...
1,211333,You are a mediator in a heated political debat...,Thank you for sharing the details of the situa...,Mr Reddy and Ms Blue both have valid points in...
2,1233961,How to initialize the classification head when...,When you want to initialize the classification...,To initialize the classification head when per...


In [8]:
# Make sure the three text columns hold plain Python strings.
test_df = test_df.astype({
    'prompt': str,
    'response_a': str,
    'response_b': str,
})

## (re)Encoding

In [9]:
# Apply `reencode` to every row of test_df.  `reencode` is not defined in this
# notebook; presumably it comes from the `ModelsUtils` star-import — confirm.
# Judging by the displayed output, the result carries an extra `encode_fail` column.
test_df = test_df.apply(reencode, axis=1)
display(test_df.head(2))  # Display the first 2 rows of test_df

Unnamed: 0,id,prompt,response_a,response_b,encode_fail
0,136060,"I have three oranges today, I ate an orange ye...",You have two oranges today.,You still have three oranges. Eating an orange...,False
1,211333,You are a mediator in a heated political debat...,Thank you for sharing the details of the situa...,Mr Reddy and Ms Blue both have valid points in...,False


## Feature engineering (for test set)

#### 1. Response Length

In [10]:
def add_length_features(df):
    """Add character-length features for both responses.

    Writes three columns onto ``df`` (the frame is modified in place):

    * ``resp1_length`` -- ``len`` of each ``response_a`` value
    * ``resp2_length`` -- ``len`` of each ``response_b`` value
    * ``length_diff``  -- ``resp1_length - resp2_length``

    Returns:
        The same DataFrame object that was passed in.
    """
    len_a = df['response_a'].map(len)
    len_b = df['response_b'].map(len)
    df['resp1_length'] = len_a
    df['resp2_length'] = len_b
    df['length_diff'] = len_a - len_b  # Difference in lengths
    return df

#### 2. Lexical Diversity

In [11]:
def lexical_diversity(text):
    """Return the ratio of distinct to total whitespace-separated tokens.

    Args:
        text: String to analyse.

    Returns:
        ``len(unique tokens) / len(tokens)``, or ``0`` when the text contains
        no tokens (empty or whitespace-only).
    """
    words = text.split()  # Tokenize by whitespace
    if not words:
        return 0
    return len(set(words)) / len(words)

def add_lexical_features(df):
    """Add lexical-diversity features for both responses.

    Writes ``resp1_lexical_div`` and ``resp2_lexical_div`` (the result of
    ``lexical_diversity`` on ``response_a`` / ``response_b``) and
    ``lexical_div_diff`` (first minus second) onto ``df`` in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    column_pairs = (
        ('response_a', 'resp1_lexical_div'),
        ('response_b', 'resp2_lexical_div'),
    )
    for source_col, target_col in column_pairs:
        df[target_col] = df[source_col].apply(lexical_diversity)
    df['lexical_div_diff'] = df['resp1_lexical_div'] - df['resp2_lexical_div']
    return df

#### 3. Sentiment analysis

In [12]:
from transformers import pipeline

# Load sentiment analysis pipeline (ensure it's multilingual).
# Use the `device` chosen earlier in the notebook (CUDA if available, else CPU)
# instead of hard-coding GPU 0, so this cell also runs on CPU-only machines.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=device,
)

def get_sentiment(text):
    """Return the label predicted by ``sentiment_analyzer`` for ``text``.

    The tokenizer truncates the input to at most 512 tokens.  Slicing the
    string to 512 *characters* (the previous approach) does not bound the
    token count: for scripts where one character is roughly one token, the
    added special tokens can push the sequence past the model limit.

    Args:
        text: String to classify.

    Returns:
        The ``'label'`` field of the first prediction.
    """
    result = sentiment_analyzer(text, truncation=True, max_length=512)
    return result[0]['label']

def add_sentiment_features(df):
    """Add sentiment features for both responses.

    Writes onto ``df`` in place:

    * ``resp1_sentiment`` / ``resp2_sentiment`` -- raw labels from ``get_sentiment``
    * ``resp1_sentiment_num`` / ``resp2_sentiment_num`` -- labels mapped to -1 / 0 / 1
    * ``sentiment_diff`` -- ``resp1_sentiment_num - resp2_sentiment_num``

    Returns:
        The same DataFrame object that was passed in.
    """
    df['resp1_sentiment'] = df['response_a'].apply(get_sentiment)
    df['resp2_sentiment'] = df['response_b'].apply(get_sentiment)
    # Convert sentiments to a numeric scale (positive=1, neutral=0, negative=-1).
    # The nlptown model emits star ratings ("1 star" ... "5 stars"), so those
    # labels must be mapped as well; with only positive/neutral/negative keys
    # every value became NaN.  The generic keys are kept so the function still
    # works if a different sentiment model is plugged in.
    sentiment_map = {
        '1 star': -1,
        '2 stars': -1,
        '3 stars': 0,
        '4 stars': 1,
        '5 stars': 1,
        'positive': 1,
        'neutral': 0,
        'negative': -1,
    }
    df['resp1_sentiment_num'] = df['resp1_sentiment'].map(sentiment_map)
    df['resp2_sentiment_num'] = df['resp2_sentiment'].map(sentiment_map)
    df['sentiment_diff'] = df['resp1_sentiment_num'] - df['resp2_sentiment_num']
    return df


#### 4. Semantic Similarity

In [13]:
# Load a multilingual sentence transformer model.
# The instance is a notebook-level global used by calculate_similarity.
embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

def calculate_similarity(prompt, response):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded in a single ``embedder.encode`` call and the
    similarity of the two resulting vectors is returned as a scalar.
    """
    prompt_vec, response_vec = embedder.encode([prompt, response])
    score_matrix = cosine_similarity([prompt_vec], [response_vec])
    return score_matrix[0][0]

def add_similarity_features(df):
    """Add prompt/response similarity features.

    Writes ``resp1_similarity`` and ``resp2_similarity`` (the result of
    ``calculate_similarity`` between ``prompt`` and ``response_a`` /
    ``response_b``, computed row by row) and ``similarity_diff`` (first minus
    second) onto ``df`` in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _similarity_to_prompt(response_col):
        # Row-wise similarity between the prompt and the given response column.
        return df.apply(
            lambda row: calculate_similarity(row['prompt'], row[response_col]),
            axis=1,
        )

    df['resp1_similarity'] = _similarity_to_prompt('response_a')
    df['resp2_similarity'] = _similarity_to_prompt('response_b')
    df['similarity_diff'] = df['resp1_similarity'] - df['resp2_similarity']
    return df

#### 5. Question-Answer Alignment

In [14]:
from keybert import KeyBERT

# Use KeyBERT for keyword extraction.
# The instance is a notebook-level global used by get_keyword_overlap;
# it is created with KeyBERT's default arguments.
kw_model = KeyBERT()

def get_keyword_overlap(prompt, response):
    """Return the share of prompt keywords that also appear as response keywords.

    Keywords are taken from the first element of each entry returned by
    ``kw_model.extract_keywords``.

    Returns:
        ``len(prompt keywords ∩ response keywords) / len(prompt keywords)``,
        or ``0`` when no keywords are extracted from the prompt.
    """
    prompt_terms = {entry[0] for entry in kw_model.extract_keywords(prompt)}
    response_terms = {entry[0] for entry in kw_model.extract_keywords(response)}
    if not prompt_terms:
        return 0
    shared_terms = prompt_terms.intersection(response_terms)
    return len(shared_terms) / len(prompt_terms)

def add_keyword_overlap_features(df):
    """Add keyword-overlap features between the prompt and each response.

    Writes ``resp1_keyword_overlap`` and ``resp2_keyword_overlap`` (the result
    of ``get_keyword_overlap`` for ``response_a`` / ``response_b``, computed
    row by row) and ``keyword_overlap_diff`` (first minus second) onto ``df``
    in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _overlap_with_prompt(response_col):
        # Row-wise keyword overlap between the prompt and the given response column.
        return df.apply(
            lambda row: get_keyword_overlap(row['prompt'], row[response_col]),
            axis=1,
        )

    df['resp1_keyword_overlap'] = _overlap_with_prompt('response_a')
    df['resp2_keyword_overlap'] = _overlap_with_prompt('response_b')
    df['keyword_overlap_diff'] = df['resp1_keyword_overlap'] - df['resp2_keyword_overlap']
    return df

#### 6. Language-Specific Formality or Tone (TODO for each language)

In [15]:
# # Example with spaCy and third-party plugins
# import spacy

# # Load spaCy models for specific languages
# nlp_en = spacy.load("en_core_web_sm")  # English example

# def detect_formality(text):
#     doc = nlp_en(text)
#     formality_score = sum(1 for token in doc if token.pos_ in ["VERB", "ADV"]) / len(doc)
#     return formality_score if len(doc) > 0 else 0

# def add_formality_features(df):
#     df['resp1_formality'] = df['response_a'].apply(detect_formality)
#     df['resp2_formality'] = df['response_b'].apply(detect_formality)
#     df['formality_diff'] = df['resp1_formality'] - df['resp2_formality']
#     return df

# # Won't be added until all languages are supported

#### 7. Named Entity Recognition (NER)

In [16]:
def count_entities(text, nlp_model):
    """Return the number of entities ``nlp_model`` finds in ``text``.

    Args:
        text: String to analyse.
        nlp_model: Callable that takes the text and returns an object with an
            ``ents`` attribute supporting ``len``.

    Returns:
        ``len`` of the ``ents`` attribute of the processed document.
    """
    return len(nlp_model(text).ents)

def add_ner_features(df, nlp_model):
    """Add entity-count features for both responses.

    Writes ``resp1_entities`` and ``resp2_entities`` (the result of
    ``count_entities`` on ``response_a`` / ``response_b`` using ``nlp_model``)
    and ``entity_diff`` (first minus second) onto ``df`` in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _entity_count(text):
        return count_entities(text, nlp_model)

    df['resp1_entities'] = df['response_a'].apply(_entity_count)
    df['resp2_entities'] = df['response_b'].apply(_entity_count)
    df['entity_diff'] = df['resp1_entities'] - df['resp2_entities']
    return df

### Extract all features

In [17]:
def extract_all_features(df):
    """Run every enabled feature-engineering step on ``df`` and return the result.

    The steps are applied in order, each receiving the frame returned by the
    previous one.  Disabled steps are kept as comments in their original
    position.
    """
    enabled_steps = (
        add_length_features,
        add_lexical_features,
        # add_sentiment_features,
        add_similarity_features,
        add_keyword_overlap_features,
        # add_formality_features,
        # add_ner_features,
    )
    for step in enabled_steps:
        df = step(df)
    return df

# The test set always has to be preprocessed at run time; its features cannot be
# pre-loaded from a file on Kaggle.
test_df = extract_all_features(test_df)

## Tokenizer

In [18]:
# Load the tokenizer that matches the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)



## Build the test dataset and run inference

In [19]:
# Wrap the test frame in the project dataset (test=True) and a DataLoader.
dataset_test = ChatbotArenaDataset(test_df, tokenizer, max_length=sequence_length, test=True)
# shuffle must be False at inference time: the predictions are later assigned
# positionally to the ids of test_df, so the loader has to yield the rows in
# their original order or every probability ends up on the wrong id.
dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False)

In [20]:
# Initialize the model (no optimizer is created in this inference notebook).
# num_classes=3 matches the three probability columns of the submission
# (winner_model_a, winner_model_b, winner_tie).
# NOTE(review): feature_dim=4 presumably equals the number of engineered
# features the dataset feeds to the model — confirm against ModelsUtils.
model = PreferencePredictionModel(transformer_name=model_name, feature_dim=4, num_classes=3)

In [21]:
# load best epoch
# map_location="cpu" makes the checkpoint loadable on a machine without the
# device it was saved from (e.g. a GPU-saved checkpoint on a CPU-only host);
# load_state_dict then copies the tensors into the model's own parameters.
checkpoint = torch.load('./Prebuilt/PreferencePredictionModel.pt', map_location="cpu")
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [22]:
# Run the model over the test DataLoader.  `predict` is not defined in this
# notebook; presumably it comes from the `ModelsUtils` star-import — confirm.
prediction = predict(model, dataloader_test)

In [23]:
# Display the raw predictions (one list of three class probabilities per test row).
prediction

[[0.4150875210762024, 0.09427947551012039, 0.4906330406665802],
 [0.9964725375175476, 0.0015812570927664638, 0.0019460591720417142],
 [1.0, 1.1172556213523421e-09, 2.0532313627086296e-09]]

In [24]:
# Build the submission frame: the test ids plus one probability column per class.
# `class_names` is not defined in this notebook; presumably it comes from the
# `ModelsUtils` star-import — confirm.
# NOTE: the assignment below is positional, so `prediction` must be in the same
# row order as `test_df`.
sub_df = test_df[["id"]].copy()
sub_df[class_names] = prediction
sub_df.head()

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.415088,0.09427948,0.490633
1,211333,0.996473,0.001581257,0.001946059
2,1233961,1.0,1.117256e-09,2.053231e-09


In [25]:
# Write the submission file to the working directory, without the index column.
sub_df.to_csv("submission.csv", index=False)