# Track 2
**Distributed Static Text Representation**. Choose a static distributed representation method we have seen in class, such as Word2vec, Doc2Vec, or pretrained embeddings like FastText.

## spaCy / Google News Word2Vec

In [None]:
# Install these versions of the libraries in case of conflicts between them
# Terminal
# pip install pandas==2.2.3 numpy==1.26.4 scikit-learn==1.6.1 nltk==3.9.1 spacy==3.8.4 gensim==4.3.3
# Jupyter Notebook
# %pip install pandas==2.2.3 numpy==1.26.4 scikit-learn==1.6.1 nltk==3.9.1 spacy==3.8.4 gensim==4.3.3

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import spacy
import gensim.downloader as api

# Download required NLTK data.
# Recent NLTK releases (including the pinned 3.9.1) make word_tokenize load the
# 'punkt_tab' resource; 'punkt' is still fetched for older NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# 1. Loading Data
train_prompts = pd.read_csv("train_prompts.csv")
train_responses = pd.read_csv("train_responses.csv")
dev_prompts = pd.read_csv("dev_prompts.csv")
dev_responses = pd.read_csv("dev_responses.csv")
# test_prompts.csv is deliberately not read in this cell: the evaluation below
# needs reference responses, so `test_prompts` is produced by the split instead.

# 2. Combine Datasets and Split (80% Train, 20% Test)
TEST_FRACTION = 0.2  # share of the combined data held out for evaluation
SPLIT_SEED = 100     # fixed seed so the split is reproducible

combined_prompts = pd.concat([train_prompts, dev_prompts], ignore_index=True)
combined_responses = pd.concat([train_responses, dev_responses], ignore_index=True)
# Prompts and responses are paired by row position, so they must have equal length.
assert len(combined_prompts) == len(combined_responses), "prompts and responses are not row-aligned"
# Both frames are split in one call, so each prompt stays paired with its
# response at the same position in the resulting train/test frames.
train_prompts, test_prompts, train_responses, test_responses = train_test_split(
    combined_prompts, combined_responses, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# 3. Preprocessing with Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Compiled once: the punctuation character class never changes between calls.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
# English stop words, filled on first use so the corpus is read once instead of
# once per document.
_STOP_WORDS = None


def preprocess_text(text):
    """Normalize a raw string into a list of lemmatized content tokens.

    Steps: lowercase, strip every character in ``string.punctuation``,
    tokenize with NLTK ``word_tokenize``, drop English stop words, then
    lemmatize each remaining token with the module-level ``lemmatizer``.

    Args:
        text: The input string.

    Returns:
        A list of token strings (empty if nothing survives filtering).
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))

    cleaned = _PUNCTUATION_RE.sub('', text.lower())
    # Filter first, lemmatize second, matching the original processing order.
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in _STOP_WORDS
    ]

# Tokenize the prompt text of both splits with the shared preprocessing routine
for _split in (train_prompts, test_prompts):
    _split["tokens"] = _split["user_prompt"].astype(str).apply(preprocess_text)

# 4. Vectorization & Retrieval using Google News Word2Vec
# Embedding sources: spaCy's medium English model and the Google News
# Word2Vec vectors fetched through gensim's downloader
SPACY_MODEL_NAME = "en_core_web_md"
WORD2VEC_MODEL_NAME = "word2vec-google-news-300"

nlp = spacy.load(SPACY_MODEL_NAME)
google_news_model = api.load(WORD2VEC_MODEL_NAME)

def get_spacy_vector(tokens):
    """Return spaCy's document vector for a list of tokens.

    The tokens are joined with single spaces and re-tokenized by spaCy.

    Args:
        tokens: List of token strings.

    Returns:
        The ``Doc.vector`` array for the joined text.
    """
    # Only the tokenizer is needed: Doc.vector defaults to the average of the
    # tokens' static vectors, so running the full pipeline through nlp(...)
    # adds cost without changing the result.
    return nlp.make_doc(" ".join(tokens)).vector

def get_google_news_vector(tokens, vector_size=300, model=None):
    """Average the word embeddings of ``tokens`` into a single vector.

    Each token is looked up as-is, then upper-cased, then capitalized (the
    Google News vocabulary is case-sensitive); the first form found is used.
    Tokens with no matching form are skipped.

    Args:
        tokens: Iterable of token strings.
        vector_size: Length of the zero vector returned when no token matches.
        model: Mapping-like embedding lookup supporting ``in`` and ``[]``.
            Defaults to the module-level ``google_news_model``.

    Returns:
        The element-wise mean of the found vectors, or a float32 zero vector
        of length ``vector_size`` if none of the tokens is in the vocabulary.
    """
    if model is None:
        model = google_news_model

    vectors = []
    for word in tokens:
        # Try the casings in priority order and keep the first hit only.
        for candidate in (word, word.upper(), word.capitalize()):
            if candidate in model:
                vectors.append(model[candidate])
                break

    if vectors:
        return np.mean(vectors, axis=0)
    # float32 matches the dtype of the Word2Vec vectors, so stacking results
    # does not silently upcast the whole matrix to float64.
    return np.zeros(vector_size, dtype=np.float32)

# Precompute vectors for the training set using both methods
train_prompts["spacy_vector"] = train_prompts["tokens"].apply(get_spacy_vector)
train_prompts["google_news_vector"] = train_prompts["tokens"].apply(get_google_news_vector)

# Convert training vectors to matrices for similarity computation
train_spacy_matrix = np.vstack(train_prompts["spacy_vector"].values)
train_google_news_matrix = np.vstack(train_prompts["google_news_vector"].values)

# Precompute test vectors similarly
test_prompts["spacy_vector"] = test_prompts["tokens"].apply(get_spacy_vector)
test_prompts["google_news_vector"] = test_prompts["tokens"].apply(get_google_news_vector)

# Retrieval using an ensemble of spaCy and Google News Word2Vec (equal weights)
retrieved_responses = []
w_spacy = 0.5
w_google_news = 0.5

for idx, row in test_prompts.iterrows():
    # cosine_similarity expects 2-D inputs, hence the (1, dim) reshape
    test_spacy_vec = row["spacy_vector"].reshape(1, -1)
    test_google_news_vec = row["google_news_vector"].reshape(1, -1)

    sim_spacy = cosine_similarity(test_spacy_vec, train_spacy_matrix)[0]
    sim_google_news = cosine_similarity(test_google_news_vec, train_google_news_matrix)[0]

    # Weighted sum of the two similarity profiles; the best training prompt wins
    combined_sim = w_spacy * sim_spacy + w_google_news * sim_google_news
    best_match_idx = np.argmax(combined_sim)
    # Positional lookup: train_responses rows are in the same order as the
    # rows of the training matrices
    retrieved_response = train_responses.iloc[best_match_idx]["model_response"]
    retrieved_responses.append(retrieved_response)


# 5. Compute BLEU Score on Test Split
test_split = test_prompts.copy()
test_split["retrieved_response"] = retrieved_responses
# Assigned by index label, which test_prompts and test_responses share from the split
test_split["model_response"] = test_responses["model_response"].astype(str)
test_split["retrieved_response"] = test_split["retrieved_response"].astype(str)

# Unigram + bigram BLEU (weights 0.5/0.5) with NLTK smoothing method 3
smoothing_function = SmoothingFunction()
test_split["bleu_score"] = test_split.apply(
    lambda x: sentence_bleu(
        [x["model_response"].split()],
        x["retrieved_response"].split(),
        weights=(0.5, 0.5, 0, 0),
        smoothing_function=smoothing_function.method3
    ),
    axis=1
)

# The label names the embeddings actually used (Google News Word2Vec, not FastText)
print("Average BLEU on Test Split (Ensemble spaCy + Google News Word2Vec):", test_split["bleu_score"].mean())

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lucamilani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucamilani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lucamilani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Average BLEU on Test Split (Ensemble spaCy + FastText): 0.085622132385527


## Creating the track_2_test.csv

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import gensim.downloader as api

# Download required NLTK data.
# Recent NLTK releases (including the pinned 3.9.1) make word_tokenize load the
# 'punkt_tab' resource; 'punkt' is still fetched for older NLTK versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# 1. Loading Data
train_prompts = pd.read_csv("train_prompts.csv")
train_responses = pd.read_csv("train_responses.csv")
dev_prompts = pd.read_csv("dev_prompts.csv")
dev_responses = pd.read_csv("dev_responses.csv")
test_prompts = pd.read_csv("test_prompts.csv")

# Remove duplicates if they exist
train_prompts = train_prompts.drop_duplicates(subset=['conversation_id'])
dev_prompts = dev_prompts.drop_duplicates(subset=['conversation_id'])
test_prompts = test_prompts.drop_duplicates(subset=['conversation_id'])

# 2. Combine TRAIN and DEV datasets for training
combined_prompts = pd.concat([train_prompts, dev_prompts], ignore_index=True)
# NOTE(review): the response frames are not deduplicated above, so if any prompt
# rows were dropped, combined_responses is no longer row-aligned with
# combined_prompts. Do not pair the two by position without checking first.
combined_responses = pd.concat([train_responses, dev_responses], ignore_index=True)

# 3. Preprocessing with Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Compiled once: the punctuation character class never changes between calls.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
# English stop words, filled on first use so the corpus is read once instead of
# once per document.
_STOP_WORDS = None


def preprocess_text(text):
    """Normalize a raw string into a list of lemmatized content tokens.

    Steps: lowercase, strip every character in ``string.punctuation``,
    tokenize with NLTK ``word_tokenize``, drop English stop words, then
    lemmatize each remaining token with the module-level ``lemmatizer``.

    Args:
        text: The input string.

    Returns:
        A list of token strings (empty if nothing survives filtering).
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))

    cleaned = _PUNCTUATION_RE.sub('', text.lower())
    # Filter first, lemmatize second, matching the original processing order.
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in _STOP_WORDS
    ]

# Embedding sources: spaCy's medium English model and the Google News
# Word2Vec vectors fetched through gensim's downloader
SPACY_MODEL_NAME = "en_core_web_md"
WORD2VEC_MODEL_NAME = "word2vec-google-news-300"

nlp = spacy.load(SPACY_MODEL_NAME)
google_news_model = api.load(WORD2VEC_MODEL_NAME)

def get_google_news_vector(tokens, vector_size=300, model=None):
    """Average the word embeddings of ``tokens`` into a single vector.

    Each token is looked up as-is, then upper-cased, then capitalized (the
    Google News vocabulary is case-sensitive); the first form found is used.
    Tokens with no matching form are skipped.

    Args:
        tokens: Iterable of token strings.
        vector_size: Length of the zero vector returned when no token matches.
        model: Mapping-like embedding lookup supporting ``in`` and ``[]``.
            Defaults to the module-level ``google_news_model``.

    Returns:
        The element-wise mean of the found vectors, or a float32 zero vector
        of length ``vector_size`` if none of the tokens is in the vocabulary.
    """
    if model is None:
        model = google_news_model

    vectors = []
    for word in tokens:
        # Try the casings in priority order and keep the first hit only.
        for candidate in (word, word.upper(), word.capitalize()):
            if candidate in model:
                vectors.append(model[candidate])
                break

    if vectors:
        return np.mean(vectors, axis=0)
    # float32 matches the dtype of the Word2Vec vectors, so stacking results
    # does not silently upcast the whole matrix to float64.
    return np.zeros(vector_size, dtype=np.float32)

# Tokenize each prompt, then embed the token lists with the averaged
# Google News Word2Vec vectors (same treatment for both frames)
for _frame in (combined_prompts, test_prompts):
    _frame["tokens"] = _frame["user_prompt"].astype(str).apply(preprocess_text)
    _frame["google_news_vector"] = _frame["tokens"].apply(get_google_news_vector)

# Stack the per-row vectors into 2-D arrays, one row per prompt
combined_matrix = np.vstack(combined_prompts["google_news_vector"].to_numpy())
test_matrix = np.vstack(test_prompts["google_news_vector"].to_numpy())

# For every test prompt, take the position of the most cosine-similar combined prompt
similarities = cosine_similarity(test_matrix, combined_matrix)
retrieved_indices = similarities.argmax(axis=1)

# Create submission CSV: each test conversation is paired with the
# conversation_id of the prompt retrieved for it
submission = pd.DataFrame({
    'conversation_id': test_prompts['conversation_id'].reset_index(drop=True),
    'response_id': combined_prompts['conversation_id'].iloc[retrieved_indices].reset_index(drop=True),
})

# Save submission CSV
submission.to_csv('track_2_test.csv', index=False)

print("Submission CSV created successfully:")
print(submission.head())
print(f"Total rows: {len(submission)}")

Submission CSV created successfully:
                    conversation_id                       response_id
0  0cf125095fa74e129f9b7b6054d2993e  3d3e33d6cb114ff990e82cea8c1db716
1  e6296e2a7a554a3db3152704d065498e  86ee55deab3e4197b5b2df0f94d9c5ef
2  ee22ccf57c064f5f955f1fd2f9ed5e90  1a9bbe337a78466b93b001ff7af8c4c0
3  f5ef6be6d11746e39ec404496c307ab8  66bb4159f47c48ebabcd028de3b944a7
4  1fcea667861046d1834b17e7851dcca4  26fddc26f1a94412b4c3e6eb79af4ef2
Total rows: 5000
