# Track 1
**Discrete Text Representation**: Choose a discrete representation method we have seen in class, such as word- or character-level n-gram representations, Count Vectorizer, or TF-IDF.

## TF-IDF

In [None]:
# Install these versions of the libraries in case of conflicts between them
# Terminal
# pip install pandas==2.2.3 numpy==1.26.4 scikit-learn==1.6.1 nltk==3.9.1
# Jupyter Notebook
# %pip install pandas==2.2.3 numpy==1.26.4 scikit-learn==1.6.1 nltk==3.9.1

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import re
import string

# 1. Loading Data
# Read the train/dev prompt and response tables from the working directory.
train_prompts, train_responses, dev_prompts, dev_responses = map(
    pd.read_csv,
    (
        "train_prompts.csv",
        "train_responses.csv",
        "dev_prompts.csv",
        "dev_responses.csv",
    ),
)

# 2. Combine datasets and split 80% train and 20% test
# Train rows are stacked on top of dev rows with a fresh 0..n-1 index.
combined_prompts = pd.concat((train_prompts, dev_prompts), ignore_index=True)
combined_responses = pd.concat((train_responses, dev_responses), ignore_index=True)

# Both tables go through one split call so that row i of the prompts stays
# paired with row i of the responses. No key-based join is done here, so this
# relies on the CSVs listing prompts and responses in the same order.
# The fixed random_state makes the 80/20 partition repeatable.
(
    train_prompts,
    test_prompts,
    train_responses,
    test_responses,
) = train_test_split(
    combined_prompts,
    combined_responses,
    random_state=100,
    test_size=0.2,
)

# 3. Preprocessing
def preprocess_text(text):
    """Normalize a prompt before TF-IDF vectorization.

    Lowercases the text, removes every character that is not a word
    character, whitespace, or one of ``' - ? ! . : @``, then collapses runs
    of whitespace into single spaces and trims both ends.

    Args:
        text: The raw prompt. Missing values (``None``/NaN) are treated as an
            empty prompt; any other non-string value is converted with ``str``.

    Returns:
        The cleaned, lowercase string.
    """
    # Empty CSV cells are read by pandas as NaN (a float), which has no .lower().
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation while keeping apostrophes, hyphens, some meaningful characters.
    # The hyphen must be escaped: an unescaped "'-?" inside the class is a
    # character range that also keeps ( ) * + , / ; < = >.
    text = re.sub(r"[^\w\s'\-?!.:@]", '', text)

    # Normalize spaces and handle multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean the prompt text of both splits with the same preprocessing function.
for prompt_frame in (train_prompts, test_prompts):
    prompt_frame["user_prompt"] = prompt_frame["user_prompt"].apply(preprocess_text)

# 4. TF-IDF and Retrieving
# No stop word removal to preserve all potential meaningful tokens
# Using unigrams only for simplicity
vectorizer = TfidfVectorizer(stop_words=None, ngram_range=(1, 1))

# The vectorizer is fitted on the training prompts only; the test prompts are
# then transformed with that fitted vectorizer.
tfidf_matrix = vectorizer.fit_transform(train_prompts["user_prompt"])
test_tfidf = vectorizer.transform(test_prompts["user_prompt"])

# One row per test prompt, one column per training prompt.
similarities = cosine_similarity(test_tfidf, tfidf_matrix)

# Row position (within the training split) of the best match for each test prompt,
# used positionally to pull the response stored at that same row.
retrieved_indices = similarities.argmax(axis=1)
retrieved_responses = train_responses["model_response"].iloc[retrieved_indices].values

# 5. Compute BLEU Score on Test Split
# Work on a copy so the scoring columns do not end up on test_prompts itself.
test_split = test_prompts.copy()
test_split["retrieved_response"] = retrieved_responses
test_split["retrieved_response"] = test_split["retrieved_response"].astype(str)
# Index-aligned assignment: test_prompts and test_responses come from the same split.
test_split["model_response"] = test_responses["model_response"].astype(str)

smoothingfunction = SmoothingFunction()


def _bigram_bleu(row):
    """Score one row's retrieved response against its reference response.

    Both texts are tokenized on whitespace; the weights put 0.5 on each of
    the first two n-gram orders and 0 on the remaining two.
    """
    reference_tokens = row["model_response"].split()
    candidate_tokens = row["retrieved_response"].split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        weights=(0.5, 0.5, 0, 0),
        smoothing_function=smoothingfunction.method3,
    )


test_split["bleu_score"] = test_split.apply(_bigram_bleu, axis=1)

print("Average BLEU on Test Split:", test_split["bleu_score"].mean())

Average BLEU on Test Split: 0.08988348156035171


## Creating the track_1_test.csv

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# 1. Loading Data
# Read the train/dev tables plus the test prompts from the working directory.
train_prompts, train_responses, dev_prompts, dev_responses, test_prompts = map(
    pd.read_csv,
    (
        "train_prompts.csv",
        "train_responses.csv",
        "dev_prompts.csv",
        "dev_responses.csv",
        "test_prompts.csv",
    ),
)

# Remove duplicates if they exist
# Only the first row per conversation_id is kept in each prompt table.
id_columns = ['conversation_id']
train_prompts = train_prompts.drop_duplicates(subset=id_columns)
dev_prompts = dev_prompts.drop_duplicates(subset=id_columns)
test_prompts = test_prompts.drop_duplicates(subset=id_columns)

# 2. Combine TRAIN and DEV datasets for training
combined_prompts = pd.concat((train_prompts, dev_prompts), ignore_index=True)
# NOTE(review): the response tables are not de-duplicated above, so if any prompt
# rows were dropped, combined_responses no longer lines up row-for-row with
# combined_prompts. Confirm before indexing it by position.
combined_responses = pd.concat((train_responses, dev_responses), ignore_index=True)

# 3. Preprocessing
def preprocess_text(text):
    """Normalize a prompt before TF-IDF vectorization.

    Lowercases the text, removes every character that is not a word
    character, whitespace, or one of ``' - ? ! . : @``, then collapses runs
    of whitespace into single spaces and trims both ends.

    Args:
        text: The raw prompt. Missing values (``None``/NaN) are treated as an
            empty prompt; any other non-string value is converted with ``str``.

    Returns:
        The cleaned, lowercase string.
    """
    # Empty CSV cells are read by pandas as NaN (a float), which has no .lower().
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation while keeping apostrophes, hyphens, some meaningful characters.
    # The hyphen must be escaped: an unescaped "'-?" inside the class is a
    # character range that also keeps ( ) * + , / ; < = >.
    text = re.sub(r"[^\w\s'\-?!.:@]", '', text)

    # Normalize spaces and handle multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean the prompt text of the train+dev pool and of the test set in the same way.
for prompt_frame in (combined_prompts, test_prompts):
    prompt_frame["user_prompt"] = prompt_frame["user_prompt"].apply(preprocess_text)

# 4. TF-IDF and Retrieving
# No stop word removal; unigrams only.
vectorizer = TfidfVectorizer(stop_words=None, ngram_range=(1, 1))

# The vectorizer is fitted on the train+dev prompts; the test prompts are then
# transformed with that fitted vectorizer.
tfidf_matrix = vectorizer.fit_transform(combined_prompts["user_prompt"])
test_tfidf = vectorizer.transform(test_prompts["user_prompt"])

# Compute similarities and find most similar prompts
# One row per test prompt, one column per train+dev prompt.
similarities = cosine_similarity(test_tfidf, tfidf_matrix)
retrieved_indices = similarities.argmax(axis=1)

# 5. Create submission CSV
# Both id columns are re-indexed from 0 so they pair up by position: row i holds
# the i-th test prompt's id and the id of the train+dev prompt retrieved for it.
test_ids = test_prompts['conversation_id'].reset_index(drop=True)
matched_ids = (
    combined_prompts['conversation_id']
    .iloc[retrieved_indices]
    .reset_index(drop=True)
)
submission = pd.DataFrame({
    'conversation_id': test_ids,
    'response_id': matched_ids,
})

# Save submission CSV
submission.to_csv('track_1_test.csv', index=False)


Submission CSV created successfully:
                    conversation_id                       response_id
0  0cf125095fa74e129f9b7b6054d2993e  084ff7f8d7b64fa39032743aae1b64d2
1  e6296e2a7a554a3db3152704d065498e  65df79369c95468fbf53a7a9064c9a76
2  ee22ccf57c064f5f955f1fd2f9ed5e90  556ad1d8aff84d268c267dfc4d076de0
3  f5ef6be6d11746e39ec404496c307ab8  66bb4159f47c48ebabcd028de3b944a7
4  1fcea667861046d1834b17e7851dcca4  cad50072d7874e66b7cc223ba1f91fd8
Total rows: 5000
