# Track 3
**(✨BONUS✨): Open Text Representation**. In this track, you can use any combination of the two previous representation methods, or a different representation method altogether. This could include methods not covered in class.

## TF-IDF + SBERT

In [None]:
# Install these versions of the libraries in case of conflicts between them
# Terminal
# pip install nltk==3.9.1 pandas==2.2.3 scikit-learn==1.6.1 numpy==1.26.4 sentence-transformers==3.4.1
# Jupyter Notebook
# %pip install nltk==3.9.1 pandas==2.2.3 scikit-learn==1.6.1 numpy==1.26.4 sentence-transformers==3.4.1

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Download required NLTK data.
# NLTK >= 3.8.2 makes word_tokenize load the 'punkt_tab' resource; the legacy
# 'punkt' package alone raises LookupError there. 'punkt' is still fetched so
# the cell keeps working on older NLTK installs.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# 1. Loading Data
train_prompts = pd.read_csv("train_prompts.csv")
train_responses = pd.read_csv("train_responses.csv")
dev_prompts   = pd.read_csv("dev_prompts.csv")
dev_responses = pd.read_csv("dev_responses.csv")
# test_prompts.csv is intentionally not read in this cell: the name
# `test_prompts` is rebound to the held-out split below, so a frame loaded
# from that file would be discarded without ever being used.

# 2. Combine Datasets and Split (80% Train, 20% Test)
# NOTE(review): this assumes row i of each *_prompts file corresponds to row i
# of the matching *_responses file -- confirm, or join on a shared key instead.
combined_prompts   = pd.concat([train_prompts, dev_prompts], ignore_index=True)
combined_responses = pd.concat([train_responses, dev_responses], ignore_index=True)
# Passing both frames to one call applies the same shuffle to each, so prompts
# and responses stay paired row-for-row in the resulting splits.
train_prompts, test_prompts, train_responses, test_responses = train_test_split(
    combined_prompts, combined_responses, test_size=0.2, random_state=100
)

# 3. Preprocessing
# Compiled once: matches any single character from string.punctuation.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
# Filled on first use so the stop-word corpus is read once, not once per row.
_ENGLISH_STOP_WORDS = None


def preprocess_text(text):
    """Normalize a prompt into a space-joined token string for TF-IDF.

    Steps: lowercase the text, delete every character in
    ``string.punctuation``, tokenize with NLTK ``word_tokenize``, and drop
    tokens found in NLTK's English stop-word list.

    Args:
        text: The raw prompt as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

    cleaned = _PUNCTUATION_RE.sub('', text.lower())
    tokens = word_tokenize(cleaned)
    return " ".join(tok for tok in tokens if tok not in _ENGLISH_STOP_WORDS)  # Return a joined string for TF-IDF

train_prompts["processed_text"] = train_prompts["user_prompt"].astype(str).apply(preprocess_text)
test_prompts["processed_text"]  = test_prompts["user_prompt"].astype(str).apply(preprocess_text)

# 4. Vectorization & Two-Phase Retrieval
# Phase 1: TF-IDF Retrieval
# The vocabulary and IDF weights are fitted on the training prompts only; the
# held-out prompts are projected into that same space.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(train_prompts["processed_text"])
tfidf_test  = tfidf_vectorizer.transform(test_prompts["processed_text"])

# Phase 2: SBERT Re-Ranking
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
# Precompute SBERT embeddings for training prompts (use original text for full semantic content).
# The whole list is encoded in one batched call rather than one encode() per
# row; the result is a 2-D array with one embedding row per training prompt,
# in the same positional order as train_prompts.
train_sbert = sbert_model.encode(
    train_prompts["user_prompt"].astype(str).tolist(),
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
)


# Retrieve candidate responses:
top_N = 100  # number of candidates from TF-IDF phase

# Encode every held-out prompt once, in batches, instead of calling encode()
# inside the loop. astype(str) mirrors the coercion applied to the training
# prompts so a non-string cell (e.g. NaN) is handled the same way on both sides.
test_sbert = sbert_model.encode(
    test_prompts["user_prompt"].astype(str).tolist(),
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Pull the response column out once; it is indexed positionally below.
train_response_texts = train_responses["model_response"]

retrieved_responses = []
for i in range(tfidf_test.shape[0]):
    # Get TF-IDF similarity scores between the test prompt and all training prompts
    tfidf_sim = cosine_similarity(tfidf_test[i], tfidf_train).flatten()
    # Select top_N indices based on TF-IDF scores (argsort is ascending, so
    # the best lexical matches are the last top_N positions)
    candidate_indices = np.argsort(tfidf_sim)[-top_N:]

    # Compute SBERT similarity for the test prompt against the candidate set
    sbert_sim = cosine_similarity(test_sbert[i:i + 1], train_sbert[candidate_indices]).flatten()

    # Re-rank solely on SBERT similarity; TF-IDF only decides who is a candidate
    best_candidate_idx = candidate_indices[np.argmax(sbert_sim)]

    # Retrieve the corresponding response from the training responses.
    # Positional (.iloc) lookup: best_candidate_idx is a row position in the
    # TF-IDF matrix, not a DataFrame index label.
    retrieved_responses.append(train_response_texts.iloc[best_candidate_idx])

# 5. Compute BLEU Score on Test Split
smoothing_function = SmoothingFunction()


def _row_bleu(row):
    """Score one row: BLEU of the retrieved response against the true response.

    Both texts are whitespace-tokenized; the weights give equal mass to
    1-gram and 2-gram precision and none to higher orders.
    """
    reference_tokens = row["model_response"].split()
    hypothesis_tokens = row["retrieved_response"].split()
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        weights=(0.5, 0.5, 0, 0),
        smoothing_function=smoothing_function.method3,
    )


test_split = test_prompts.copy()
# retrieved_responses is in the positional order of test_prompts, so it is
# attached against that frame's own index.
test_split["retrieved_response"] = pd.Series(retrieved_responses, index=test_split.index).astype(str)
# Index-aligned assignment: test_responses carries the same index labels as
# test_prompts because both came out of the same split.
test_split["model_response"] = test_responses["model_response"].astype(str)
test_split["bleu_score"] = test_split.apply(_row_bleu, axis=1)

average_bleu = test_split["bleu_score"].mean()
print("Average BLEU on Test Split (Two-Phase TF-IDF + SBERT):", average_bleu)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lucamilani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucamilani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Average BLEU on Test Split (Two-Phase TF-IDF + SBERT): 0.10239148667597395


## Creating the track_3_test.csv

In [8]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Download required NLTK data.
# NLTK >= 3.8.2 makes word_tokenize load the 'punkt_tab' resource; the legacy
# 'punkt' package alone raises LookupError there. 'punkt' is still fetched so
# the cell keeps working on older NLTK installs.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

# 1. Loading Data
train_prompts = pd.read_csv("train_prompts.csv")
train_responses = pd.read_csv("train_responses.csv")
dev_prompts = pd.read_csv("dev_prompts.csv")
dev_responses = pd.read_csv("dev_responses.csv")
test_prompts = pd.read_csv("test_prompts.csv")

# Remove duplicates if they exist (first occurrence of each conversation_id is kept)
train_prompts = train_prompts.drop_duplicates(subset=['conversation_id'])
dev_prompts = dev_prompts.drop_duplicates(subset=['conversation_id'])
test_prompts = test_prompts.drop_duplicates(subset=['conversation_id'])

# 2. Combine TRAIN and DEV datasets for training
combined_prompts = pd.concat([train_prompts, dev_prompts], ignore_index=True)
# NOTE(review): the response frames are not de-duplicated like the prompts,
# so combined_responses may not line up row-for-row with combined_prompts.
# Do not index it by a position taken from combined_prompts without checking.
combined_responses = pd.concat([train_responses, dev_responses], ignore_index=True)

# 3. Preprocessing
# Compiled once: matches any single character from string.punctuation.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
# Filled on first use so the stop-word corpus is read once, not once per row.
_ENGLISH_STOP_WORDS = None


def preprocess_text(text):
    """Normalize a prompt into a space-joined token string for TF-IDF.

    Steps: lowercase the text, delete every character in
    ``string.punctuation``, tokenize with NLTK ``word_tokenize``, and drop
    tokens found in NLTK's English stop-word list.

    Args:
        text: The raw prompt as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

    cleaned = _PUNCTUATION_RE.sub('', text.lower())
    tokens = word_tokenize(cleaned)
    return " ".join(tok for tok in tokens if tok not in _ENGLISH_STOP_WORDS)  # Return a joined string for TF-IDF

# Preprocess prompts
combined_prompts["processed_text"] = combined_prompts["user_prompt"].astype(str).apply(preprocess_text)
test_prompts["processed_text"] = test_prompts["user_prompt"].astype(str).apply(preprocess_text)

# 4. Vectorization & Two-Phase Retrieval
# Phase 1: TF-IDF Retrieval
# The vocabulary and IDF weights are fitted on the combined train+dev prompts;
# the test prompts are projected into that same space.
tfidf_vectorizer = TfidfVectorizer()
tfidf_combined = tfidf_vectorizer.fit_transform(combined_prompts["processed_text"])
tfidf_test = tfidf_vectorizer.transform(test_prompts["processed_text"])

# Phase 2: SBERT Re-Ranking
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
# Precompute SBERT embeddings for combined prompts (use original text for full semantic content).
# The whole list is encoded in one batched call rather than one encode() per
# row; the result is a 2-D array with one embedding row per combined prompt,
# in the same positional order as combined_prompts.
combined_sbert = sbert_model.encode(
    combined_prompts["user_prompt"].astype(str).tolist(),
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Retrieve candidate responses
top_N = 100  # number of candidates from TF-IDF phase

# Encode every test prompt once, in batches, instead of calling encode()
# inside the loop. astype(str) mirrors the coercion applied to the combined
# prompts so a non-string cell (e.g. NaN) is handled the same way on both sides.
test_sbert = sbert_model.encode(
    test_prompts["user_prompt"].astype(str).tolist(),
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
)

retrieved_indices = []

for i in range(tfidf_test.shape[0]):
    # Get TF-IDF similarity scores between the test prompt and all combined prompts
    tfidf_sim = cosine_similarity(tfidf_test[i], tfidf_combined).flatten()
    # Select top_N indices based on TF-IDF scores (argsort is ascending, so
    # the best lexical matches are the last top_N positions)
    candidate_indices = np.argsort(tfidf_sim)[-top_N:]

    # Compute SBERT similarity for the test prompt against the candidate set
    sbert_sim = cosine_similarity(test_sbert[i:i + 1], combined_sbert[candidate_indices]).flatten()

    # Select the best candidate index: a row position in combined_prompts,
    # chosen by SBERT similarity alone
    best_candidate_idx = candidate_indices[np.argmax(sbert_sim)]
    retrieved_indices.append(best_candidate_idx)

# Create submission CSV
# Left column: the id of each test prompt, in test order.
query_ids = test_prompts['conversation_id'].reset_index(drop=True)
# Right column: the id of the combined (train+dev) prompt retrieved for it,
# looked up by row position.
matched_ids = combined_prompts.iloc[retrieved_indices]['conversation_id'].reset_index(drop=True)

submission = pd.concat(
    [query_ids.rename('conversation_id'), matched_ids.rename('response_id')],
    axis=1,
)

# Save submission CSV
submission.to_csv('track_3_test.csv', index=False)

print("Submission CSV created successfully:")
print(submission.head())
print(f"Total rows: {len(submission)}")



Submission CSV created successfully:
                    conversation_id                       response_id
0  0cf125095fa74e129f9b7b6054d2993e  a0addd7b3ccd4de4a7e65ca4ecc853cb
1  e6296e2a7a554a3db3152704d065498e  36c96269917546819714296935de4793
2  ee22ccf57c064f5f955f1fd2f9ed5e90  80e76b8afd034f2bbfe5da8c80eab817
3  f5ef6be6d11746e39ec404496c307ab8  66bb4159f47c48ebabcd028de3b944a7
4  1fcea667861046d1834b17e7851dcca4  717f4d21de9c4ff6863049d546fc310e
Total rows: 5000
