In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np
import csv
import os

os.listdir('data')

['dist_sim_data.txt',
 'EN-wform.w.2.ppmi.svd.500.rcv_vocab.txt',
 'EN_syn_verb.txt',
 'GoogleNews-vectors-rcv_vocab.txt',
 'SAT-package-V3.txt']

In [2]:
google_vec_df = pd.read_csv(os.path.join('data','GoogleNews-vectors-rcv_vocab.txt'), sep=' ', header=None)
google_vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,Lundazi,0.106934,0.144531,-0.081543,0.111816,-0.016846,-0.075684,-0.196289,0.040283,-0.359375,...,-0.273438,-0.062012,-0.152344,-0.072754,-0.129883,0.05249,-0.347656,-0.055908,0.056152,0.196289
1,Eket,-0.25,-0.017944,-0.08252,-0.031128,-0.143555,-0.292969,0.012756,0.154297,-0.229492,...,-0.051025,0.165039,-0.384766,-0.433594,-0.310547,0.171875,-0.460938,-0.099121,-0.120605,-0.318359
2,Asir,-0.073242,0.103027,-0.175781,0.102539,0.283203,0.080566,0.02356,-0.188477,-0.333984,...,-0.180664,-0.115234,0.220703,-0.049805,-0.249023,0.542969,-0.128906,-0.101074,0.167969,0.4375
3,Simha,0.088379,0.116211,-0.137695,0.121582,0.129883,-0.554688,0.302734,-0.124512,0.002457,...,-0.132812,0.140625,-0.267578,-0.122559,-0.155273,-0.123535,-0.318359,0.179688,0.146484,0.367188
4,HRCP,-0.316406,0.023438,0.158203,0.03418,-0.119629,-0.134766,0.142578,0.029053,-0.21582,...,0.074219,0.011902,0.008606,-0.018677,-0.013428,0.289062,-0.194336,0.093262,0.006927,-0.063477


In [3]:
google_words = list(google_vec_df[0])
google_word_dict = {k:v for v, k in enumerate(google_words)}
len(google_words)

140922

In [4]:
google_vecs = google_vec_df.drop(0, axis=1).values
google_vecs.shape

(140922, 300)

In [5]:
ppmi_df = pd.read_csv(os.path.join('data','EN-wform.w.2.ppmi.svd.500.rcv_vocab.txt'), header=None, sep=' ', quoting=csv.QUOTE_NONE)
ppmi_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
0,neo-classic,0.183098,0.129868,0.176383,-0.053295,-0.05552,0.043304,-0.133636,0.092237,-0.094436,...,0.014659,0.001169,0.036292,-0.008851,0.027492,-0.009265,-0.052609,-0.044142,-0.059494,0.012797
1,auberge,0.187866,0.017577,0.115123,0.056515,-0.142405,0.116539,-0.062118,0.135252,-0.078248,...,-0.009394,0.01332,0.001646,0.027234,0.01183,0.013212,0.007839,0.024726,-0.017699,-0.014483
2,deeps,0.284431,0.153209,0.247658,0.01008,-0.218465,0.113874,-0.183892,0.058796,-0.098555,...,0.016969,0.034882,-0.005492,-0.063476,-0.023179,-0.047171,-0.061298,-0.054737,-0.041229,0.038749
3,1997-2007,0.160917,0.017448,0.272126,0.139675,0.033443,0.131233,-0.163524,0.094085,-0.220436,...,-0.000998,0.061768,-0.032469,-0.010853,-0.019737,-0.016588,-0.03473,-0.011669,0.017729,0.037919
4,refectory,0.27882,0.083047,0.156682,0.047539,-0.244677,0.088986,-0.08845,0.180136,0.123281,...,-0.006584,-0.037627,-0.040274,-0.009409,-0.020078,-0.003393,0.025949,-0.011941,-0.027034,-0.007744


In [6]:
ppmi_words = ppmi_df[0].values.astype(str)
ppmi_word_dict = {k: v for v, k in enumerate(ppmi_words)}
len(ppmi_words)

65362

In [7]:
ppmi_vecs = ppmi_df.drop(0, axis=1).values
ppmi_vecs.shape

(65362, 500)

In [8]:
def google_get_dist(word1, word2, dist_func):
    if word1 not in google_word_dict or word2 not in google_word_dict:
        return np.inf
    idx1 = google_word_dict[word1]
    idx2 = google_word_dict[word2]
    return dist_func(google_vecs[idx1], google_vecs[idx2])

def ppmi_get_dist(word1, word2, dist_func):
    word1 = word1.replace('_', '-')
    word2 = word2.replace('_', '-')
    if word1 not in ppmi_word_dict or word2 not in ppmi_word_dict:
        return np.inf
    idx1 = ppmi_word_dict[word1]
    idx2 = ppmi_word_dict[word2]
    return dist_func(ppmi_vecs[idx1], ppmi_vecs[idx2])

# read in the original synonym data

In [9]:
syn_dict = defaultdict(set)
syn_verb = pd.read_csv(os.path.join('data','EN_syn_verb.txt'), sep='\t')
syn_verb.head()

Unnamed: 0,Input.word,Answer.suggestion
0,to_interpret,to_clarify
1,to_interpret,to_explain
2,to_interpret,to_explain
3,to_interpret,to_understand
4,to_interpret,to_clarify


# remove the leading 'to_'

In [10]:
syn_verb = syn_verb[syn_verb['Answer.suggestion'] != '0'].applymap(lambda word: word.split('_', 1)[1])
syn_verb.head()

Unnamed: 0,Input.word,Answer.suggestion
0,interpret,clarify
1,interpret,explain
2,interpret,explain
3,interpret,understand
4,interpret,clarify


# remove synonym pairs which have out-of-voc words

In [11]:
for index, row in syn_verb.iterrows():
    word1 = row['Input.word']
    word2 = row['Answer.suggestion']
    if word1 not in google_word_dict or word1.replace('_', '-') not in ppmi_word_dict:
        continue
    if word2 not in google_word_dict or word2.replace('_', '-') not in ppmi_word_dict:
        continue
    syn_dict[word1].add(word2)
    syn_dict[word2].add(word1)
question_word_set = set(syn_dict)

# generate the dataset

In [12]:
import random
random.seed(123)
question_set = []
columns = ['given_word', 'choice1', 'choice2', 'choice3', 'choice4', 'choice5', 'correct_answer']

for i in range(1000):
    given_word = random.sample(question_word_set, 1)[0]
    answer = random.sample(syn_dict[given_word], 1)[0]
    choices = random.sample(question_word_set.difference(syn_dict[given_word]).difference({given_word}), 4)
    choices.append(answer)
    random.shuffle(choices)
    question = [given_word, *choices, answer]
    question_set.append(question)
len(syn_dict)

455

# save the dataset to file

In [13]:
dataset = pd.DataFrame(question_set, columns = columns)
dataset.to_csv('synonym_dataset.csv')

# check the dataset generated

In [14]:
dataset.head()

Unnamed: 0,given_word,choice1,choice2,choice3,choice4,choice5,correct_answer
0,blink,whack,document,accomplish,wink,reveal,wink
1,phrase,study,term,proof,toil,categorize,term
2,dye,ink,worsen,edit,determine,organize,ink
3,travel,journey,attack,designate,hold,engrave,journey
4,devolve,transfer,protrude,criticize,slam,resonate,transfer


In [15]:
len(dataset)

1000

# read the dataset back

In [16]:
synonym_dataset = pd.read_csv('synonym_dataset.csv')

In [17]:
synonym_dataset.head()

Unnamed: 0.1,Unnamed: 0,given_word,choice1,choice2,choice3,choice4,choice5,correct_answer
0,0,blink,whack,document,accomplish,wink,reveal,wink
1,1,phrase,study,term,proof,toil,categorize,term
2,2,dye,ink,worsen,edit,determine,organize,ink
3,3,travel,journey,attack,designate,hold,engrave,journey
4,4,devolve,transfer,protrude,criticize,slam,resonate,transfer


# Test the accuracy of both approaches

In [18]:
from scipy.spatial.distance import cosine, euclidean
google_cosine_count, google_euclidean_count, ppmi_cosine_count, ppmi_euclidean_count = 0, 0, 0, 0
# google_out_of_v_count, ppmi_out_of_v_count = 0, 0
for index, row in synonym_dataset.iterrows():
    word = row['given_word']
    choices = [row['choice' + num] for num in list('12345')]
    correct_answer = row['correct_answer']
    
    google_cosine_dists = [google_get_dist(word, candidate, cosine) for candidate in choices]
#     if any([dist == np.inf for dist in google_cosine_dists]):
#         google_out_of_v_count += 1
#     else:
    google_cosine_answer = choices[np.argmin(google_cosine_dists)]
    if google_cosine_answer == correct_answer:
        google_cosine_count += 1
    google_euclidean_dists = [google_get_dist(word, candidate, euclidean) for candidate in choices]
    google_euclidean_answer = choices[np.argmin(google_euclidean_dists)]
    if google_euclidean_answer == correct_answer:
        google_euclidean_count += 1
    
    ppmi_cosine_dists = [ppmi_get_dist(word, candidate, cosine) for candidate in choices]
#     if any([dist == np.inf for dist in ppmi_cosine_dists]):
#         ppmi_out_of_v_count += 1
#     else:
    ppmi_cosine_answer = choices[np.argmin(ppmi_cosine_dists)]
    if ppmi_cosine_answer == correct_answer:
        ppmi_cosine_count += 1
    ppmi_euclidean_dists = [ppmi_get_dist(word, candidate, euclidean) for candidate in choices]
    ppmi_euclidean_answer = choices[np.argmin(ppmi_euclidean_dists)]
    if ppmi_euclidean_answer == correct_answer:
        ppmi_euclidean_count += 1

# print("google's accuracy on the dataset after removing %d questions that contains out-of-vocabulary words:\
#        %.3f using cosine, %.3f using euclidean"\
#        %(google_out_of_v_count, google_cosine_count / (1000 - google_out_of_v_count),\
#         google_euclidean_count / (1000 - google_out_of_v_count)))
# print("Classic Approach's accuracy on the dataset after removing %d questions that contains out-of-vocabulary words:\
#        %.3f using cosine, %.3f using euclidean"\
#        %(ppmi_out_of_v_count, ppmi_cosine_count / (1000 - ppmi_out_of_v_count),\
#         ppmi_euclidean_count / (1000 - ppmi_out_of_v_count)))

print("Google's accuracy on the dataset:           %.3f using cosine, %.3f using euclidean"\
       %(google_cosine_count / 1000, google_euclidean_count / 1000))
print("Classic Approach's accuracy on the dataset: %.3f using cosine, %.3f using euclidean"\
       %(ppmi_cosine_count / 1000, ppmi_euclidean_count / 1000))


Google's accuracy on the dataset:           0.680 using cosine, 0.532 using euclidean
Classic Approach's accuracy on the dataset: 0.525 using cosine, 0.525 using euclidean


# The SAT Questions

In [20]:
sat_questions = []
with open(os.path.join('data','SAT-package-V3.txt'), 'r') as f:
    content = f.read()
    entries = content.split('\n\n')[1:]
    for entry in entries:
        question = [i for i in entry.split('\n')[1:] if i]
        answer = question[-1]
        question = [x.split()[:-1] for x in question[:-1]]
        question.append(answer)
        sat_questions.append(question)

sat_questions[:5]

[[['lull', 'trust'],
  ['balk', 'fortitude'],
  ['betray', 'loyalty'],
  ['cajole', 'compliance'],
  ['hinder', 'destination'],
  ['soothe', 'passion'],
  'c'],
 [['ostrich', 'bird'],
  ['lion', 'cat'],
  ['goose', 'flock'],
  ['ewe', 'sheep'],
  ['cub', 'bear'],
  ['primate', 'monkey'],
  'a'],
 [['word', 'language'],
  ['paint', 'portrait'],
  ['poetry', 'rhythm'],
  ['note', 'music'],
  ['tale', 'story'],
  ['week', 'year'],
  'c'],
 [['coop', 'poultry'],
  ['aquarium', 'fish'],
  ['forest', 'wildlife'],
  ['crib', 'nursery'],
  ['fence', 'yard'],
  ['barn', 'tool'],
  'a'],
 [['legend', 'map'],
  ['subtitle', 'translation'],
  ['bar', 'graph'],
  ['figure', 'blueprint'],
  ['key', 'chart'],
  ['footnote', 'information'],
  'd']]

Here, I choose to make prediction based on the **cosine similarity of the difference** between each pair of word vectors.

In [21]:
def similarity(sample1, sample2, word1, word2):
    if sample1 not in google_word_dict or sample2 not in google_word_dict:
        return np.inf
    if word1 not in google_word_dict or word2 not in google_word_dict:
        return np.inf
    
    vec1 = google_vecs[google_word_dict[sample1]] - google_vecs[google_word_dict[sample2]]
    vec2 = google_vecs[google_word_dict[word1]] - google_vecs[google_word_dict[word2]]
    return cosine(vec1, vec2)

In [22]:
passed_question = 0
correct_count = 0
for question in sat_questions:
    correct_answer = ord(question[-1]) - ord('a')
    sample1, sample2 = question[0]
    candidates = question[1:-1]
    dists = [similarity(sample1, sample2, word1, word2) for word1, word2 in candidates]
    if any([x == np.inf for x in dists]):
        passed_question += 1
        continue
    answer = np.argmin(dists)
    if answer == correct_answer:
        correct_count += 1

print("With %d out of %d questions including out-of-voc words are passed, %d out of %d are answered correctly.\nFinal accuracy is %.3f"\
      %(passed_question, len(sat_questions), correct_count, len(sat_questions) - passed_question,\
        correct_count / (len(sat_questions) - passed_question)))

With 115 out of 374 questions including out-of-voc words are passed, 114 out of 259 are answered correctly.
Final accuracy is 0.440


### We can see that this accuracy is significantly higher than random guess (0.2).