# Programming Assignment 2

In [2]:
# Import relevant libraries
import os
from transformers import GPT2Model, GPT2Tokenizer
import torch
import pandas as pd
import IPython.display as display

## Task 1

In [3]:
# Download the dataset
# Check if the file exists
if not os.path.exists('word-test.v1.txt'):
    # os.system returns the command's exit status; fail loudly here rather than
    # falling through to a less helpful error when the file is opened below.
    exit_status = os.system('wget https://www.cs.fsu.edu/~liux/courses/deepRL/assignments/word-test.v1.txt')
    if exit_status != 0:
        raise RuntimeError(f"Downloading word-test.v1.txt failed (wget exit status {exit_status}).")

# Read the dataset
with open('word-test.v1.txt') as f:
    lines = f.readlines()

# Parse the dataset
# The file is a sequence of ": <group-name>" headers, each followed by rows of
# four whitespace-separated words; lines starting with "/" are comments.
data = {}  # group name -> list of 4-token rows belonging to that group
groups = []  # group names in the order they appear in the file
# Rows of the group currently being read. Until the first header is seen this
# list is not attached to any group, so stray leading rows are simply ignored.
temporary_data = []

for line in lines:
    if line.startswith(':'):
        group_name = line.split()[-1]  # get the group name excluding the colon
        groups.append(group_name)
        # Register the group as soon as its header is seen. Storing it only
        # when the *next* header arrives would drop the last group in the file.
        temporary_data = []
        data[group_name] = temporary_data
        continue
    if line.startswith('/'):
        continue  # Skip the comments
    tokens = line.split()
    if len(tokens) == 4:
        temporary_data.append(tokens)

# Print the list of groups
print(f"List of groups: {groups}")

# Print the first group
print(f"First group: {data['capital-common-countries']}")

# We will use the following groups:
# capital-common-countries, currency, family
# Collect the query and candidate words for each group
query_words = {}
candidate_words = {}
groups_to_use = ['capital-common-countries', 'currency', 'family']

for group in groups_to_use:
    rows = data[group]
    # Columns 0 and 2 of each row are the query words, columns 1 and 3 the
    # candidate words. Duplicates are removed with a set and the result is
    # sorted: the iteration order of a set of strings can change between
    # interpreter runs, and the positions in these lists are used as tensor
    # indices later on, so a fixed order keeps runs reproducible.
    query_words[group] = sorted({word for row in rows for word in (row[0], row[2])})
    candidate_words[group] = sorted({word for row in rows for word in (row[1], row[3])})

# Print the candidate words
print(f"List of candidate words: {candidate_words}")
print(f"List of query words: {query_words}")

List of groups: ['capital-common-countries', 'capital-world', 'currency', 'city-in-state', 'family', 'gram1-adjective-to-adverb', 'gram2-opposite', 'gram3-comparative', 'gram4-superlative', 'gram5-present-participle', 'gram6-nationality-adjective', 'gram7-past-tense', 'gram8-plural', 'gram9-plural-verbs']
First group: [['Athens', 'Greece', 'Baghdad', 'Iraq'], ['Athens', 'Greece', 'Bangkok', 'Thailand'], ['Athens', 'Greece', 'Beijing', 'China'], ['Athens', 'Greece', 'Berlin', 'Germany'], ['Athens', 'Greece', 'Bern', 'Switzerland'], ['Athens', 'Greece', 'Cairo', 'Egypt'], ['Athens', 'Greece', 'Canberra', 'Australia'], ['Athens', 'Greece', 'Hanoi', 'Vietnam'], ['Athens', 'Greece', 'Havana', 'Cuba'], ['Athens', 'Greece', 'Helsinki', 'Finland'], ['Athens', 'Greece', 'Islamabad', 'Pakistan'], ['Athens', 'Greece', 'Kabul', 'Afghanistan'], ['Athens', 'Greece', 'London', 'England'], ['Athens', 'Greece', 'Madrid', 'Spain'], ['Athens', 'Greece', 'Moscow', 'Russia'], ['Athens', 'Greece', 'Oslo', '

In [63]:
# Load the pre-trained GPT-2 tokenizer and model (both from the 'gpt2' checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

def get_word_embedding(word):
    """ Given a word, it returns the embedding of the word using the GPT-2 model.

    The word is encoded with the module-level ``tokenizer``, passed through the
    module-level ``model`` with gradient tracking disabled, and the last-layer
    hidden states are averaged over dimension 1 so that the result is a single
    vector per word.

    Args:
        word: str, the non-empty word for which the embedding is to be computed.

    Returns:
        word_embedding: torch.Tensor of shape (1, dimension), the embedding of the word.

    Raises:
        ValueError: if ``word`` is not a string, or is an empty string.

    """

    # Check if the input is str
    if not isinstance(word, str):
        raise ValueError("Input must be a string.")
    # An empty string gives the tokenizer nothing to encode, so there would be
    # no hidden states to average; reject it up front with a clear message.
    if not word:
        raise ValueError("Input must be a non-empty string.")

    # Tokenize the input
    input_ids = tokenizer.encode(word, return_tensors='pt')
    # Get the hidden states (inference only, so no gradients are needed)
    with torch.no_grad():
        outputs = model(input_ids)
    # Get the hidden states of the last layer
    last_hidden_states = outputs.last_hidden_state
    # Get the word embedding by averaging over dimension 1
    word_embedding = last_hidden_states.mean(dim=1)
    return word_embedding

def _embed_word_groups(words_by_group, label):
    """Embed every word of every group in ``groups_to_use``.

    Args:
        words_by_group: dict mapping a group name to its list of words.
        label: str used as the leading word of the progress message
            (e.g. "Query" or "Candidate").

    Returns:
        dict mapping each group name to the stacked embeddings of its words,
        in the same order as the word list.
    """
    embeddings = {}
    for group in groups_to_use:
        # stack the per-word embeddings into one tensor per group
        embeddings[group] = torch.stack([get_word_embedding(word) for word in words_by_group[group]])
        print(f"{label} word embeddings for group {group}: {embeddings[group].shape}")
    return embeddings


# Get the embedding of the query words
query_word_embeddings = _embed_word_groups(query_words, "Query")

# Get the embedding of the candidate words
candidate_word_embeddings = _embed_word_groups(candidate_words, "Candidate")

# Create query-candidate difference matrices
query_candidate_difference_matrices = {}
for group in groups_to_use:
    # Entry [i, j] is (embedding of query i) - (embedding of candidate j);
    # the inserted axes make the two stacks broadcast against each other.
    query_candidate_difference_matrices[group] = (
        query_word_embeddings[group][:, None, :, :]
        - candidate_word_embeddings[group][None, :, :, :]
    )
    print(f"Query-candidate difference matrix for group {group}: {query_candidate_difference_matrices[group].shape}")

# Compute the q-c q-c cosine similarity tensors
# Entry [q1, c1, q2, c2] compares the difference vector (q1 - c1) with (q2 - c2).
# Only the trailing singleton axis is squeezed: a bare .squeeze() would also
# drop any query/candidate axis that happens to have length 1 and break the
# four-index lookups done on these tensors afterwards.
cos_similarity_tensors = {}
for group in groups_to_use:
    differences = query_candidate_difference_matrices[group]
    cos_similarity_tensors[group] = torch.nn.functional.cosine_similarity(
        differences[:, :, None, None, :, :],
        differences[None, None, :, :, :, :],
        dim=-1,
    ).squeeze(-1)
    print(f"Q-C Q-C cosine similarity tensor for group {group}: {cos_similarity_tensors[group].shape}")

# Compute the l2 distance tensors (a distance: smaller means closer)
# NOTE: the broadcast subtraction materializes one difference vector for every
# (q1, c1, q2, c2) combination, so memory grows with (#queries * #candidates)^2.
l2_similarity_tensors = {}
for group in groups_to_use:
    differences = query_candidate_difference_matrices[group]
    l2_similarity_tensors[group] = torch.norm(
        differences[:, :, None, None, :, :] - differences[None, None, :, :, :, :],
        dim=-1,
    ).squeeze(-1)
    print(f"L2 similarity tensor for group {group}: {l2_similarity_tensors[group].shape}")


Query word embeddings for group capital-common-countries: torch.Size([23, 1, 768])
Query word embeddings for group currency: torch.Size([30, 1, 768])
Query word embeddings for group family: torch.Size([23, 1, 768])
Candidate word embeddings for group capital-common-countries: torch.Size([23, 1, 768])
Candidate word embeddings for group currency: torch.Size([28, 1, 768])
Candidate word embeddings for group family: torch.Size([23, 1, 768])
Query-candidate difference matrix for group capital-common-countries: torch.Size([23, 23, 1, 768])
Query-candidate difference matrix for group currency: torch.Size([30, 28, 1, 768])
Query-candidate difference matrix for group family: torch.Size([23, 23, 1, 768])
Q-C Q-C cosine similarity tensor for group capital-common-countries: torch.Size([23, 23, 23, 23])
Q-C Q-C cosine similarity tensor for group currency: torch.Size([30, 28, 30, 28])
Q-C Q-C cosine similarity tensor for group family: torch.Size([23, 23, 23, 23])
L2 similarity tensor for group capi

In [44]:
# Build the empty results table (one row per k) that every group starts from
k_values = [1, 2, 5, 10, 20]
cosine_column = 'Accuracy Using Cosine Similarity (larger is closer)'
l2_column = 'Accuracy Using L2 Distance (smaller is closer)'
temp_data = {
    'k': k_values,
    cosine_column: [0 for _ in k_values],
    l2_column: [0 for _ in k_values],
}
template_df = pd.DataFrame(temp_data).set_index('k')

# Evaluate top-k accuracy group by group
for group in groups_to_use:
    df = template_df.copy()
    analogies = data[group]
    # Word -> position lookups into the lists that index the similarity tensors
    query_index_of = query_words[group].index
    candidate_index_of = candidate_words[group].index
    for query_1, candidate_1, query_2, answer in analogies:
        query_1_idx = query_index_of(query_1)
        candidate_1_idx = candidate_index_of(candidate_1)
        query_2_idx = query_index_of(query_2)
        answer_idx = candidate_index_of(answer)
        # Score of every candidate for this (query_1, candidate_1, query_2) triple
        cosine_scores = cos_similarity_tensors[group][query_1_idx, candidate_1_idx, query_2_idx, :]
        l2_scores = l2_similarity_tensors[group][query_1_idx, candidate_1_idx, query_2_idx, :]
        # Candidate indices ordered from closest to farthest under each measure
        cosine_ranking = torch.argsort(cosine_scores, descending=True)
        l2_ranking = torch.argsort(l2_scores, descending=False)
        # Count a hit for every k whose top-k slice contains the expected answer
        for k in k_values:
            df.loc[k, cosine_column] += int(answer_idx in cosine_ranking[:k])
            df.loc[k, l2_column] += int(answer_idx in l2_ranking[:k])
    # Turn hit counts into fractions of the group's analogies
    df = df / len(analogies)
    print(f"Results for group {group}:")
    display.display(df)

Results for group capital-common-countries:


Unnamed: 0_level_0,Accuracy Using Cosine Similarity (larger is closer),Accuracy Using L2 Distance (smaller is closer)
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.031621,0.173913
2,0.134387,0.399209
5,0.312253,0.565217
10,0.5,0.687747
20,0.889328,0.942688


Results for group currency:


Unnamed: 0_level_0,Accuracy Using Cosine Similarity (larger is closer),Accuracy Using L2 Distance (smaller is closer)
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,0.0
2,0.049654,0.06351
5,0.172055,0.248268
10,0.340647,0.445727
20,0.722864,0.735566


Results for group family:


Unnamed: 0_level_0,Accuracy Using Cosine Similarity (larger is closer),Accuracy Using L2 Distance (smaller is closer)
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.039526,0.438735
2,0.158103,0.583004
5,0.322134,0.810277
10,0.494071,0.922925
20,0.87747,0.956522
