Introduction to NLP course (2017-2018).

Homework 3: Distributional semantic models.

Objectives:

1) Obtain co-occurrence vector representations with the following properties:
- window size 1, pmi, svd (50)
- window size 3, no modifications
- window size 3, pmi, no svd
- window size 3, no pmi, svd (50)
- window size 3, pmi, svd (50)

2) Obtain word2vec embeddings with the following properties
- window size 1, 50 dimensions
- window size 1, 200 dimensions
- window size 3, 50 dimensions
- window size 3, 200 dimensions
- window size 5, 50 dimensions

3) Compare the performance of the 10 representations in 1 and 2 on the following tasks:
- similarity between "man" and "woman"
- the 5 most similar words to "car"
- for DISSECT representations, correlation with gold standard
- for Word2Vec, the similarity between "queen" and "king + woman - man"

In [2]:
# Import section
import nltk
from nltk.corpus import gutenberg
from nltk import FreqDist
from nltk.collocations import *
import re
from collections import Counter
import numpy as np
import operator
from scipy import spatial

# Dissect
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.similarity.cos import CosSimilarity
from composes.utils import scoring_utils

# Gensim
import gensim 

In [3]:
## Load the corpus
# `corpus` holds the result of NLTK's Gutenberg reader `words()` call (no file
# ids passed). getSpace() below reads it as a module-level global, so this
# cell has to run before any semantic space is built.
corpus = gutenberg.words()

In [4]:
def _symmetric_counts(pair_freqs):
    """Fold ((word1, word2), freq) items into a symmetric nested count dict."""
    cooc_dict = {}
    for (word1, word2), freq in pair_freqs:
        # Make sure both words own a context dictionary, then credit the
        # co-occurrence in both directions so the resulting matrix is symmetric.
        contexts_1 = cooc_dict.setdefault(word1, {})
        contexts_2 = cooc_dict.setdefault(word2, {})
        contexts_1[word2] = contexts_1.get(word2, 0) + freq
        contexts_2[word1] = contexts_2.get(word1, 0) + freq
    return cooc_dict


def _write_lines(filename, entries):
    """Write each entry of `entries` to `filename` as one UTF-8 encoded line."""
    # Binary mode: the entries are encoded explicitly, and writing the encoded
    # bytes works on both Python 2 and Python 3 (a text-mode handle rejects
    # bytes on Python 3).
    with open(filename, "wb") as out_file:
        for entry in entries:
            out_file.write(entry.encode('utf8', 'replace') + b"\n")


def getSpace(windowSize, words=None, base_name="gutenberg_surface_3"):
    """Build a DISSECT semantic space of raw co-occurrence counts.

    The counts are collected with NLTK's BigramCollocationFinder, made
    symmetric, dumped to `<base_name>.rows`, `<base_name>.cols` and
    `<base_name>.sm` in the current directory, and loaded back with
    `Space.build(..., format="sm")`.

    Parameters:
        windowSize: number of following tokens that count as context of a
            word (1 = only the adjacent token).
        words: token sequence to count over. Defaults to the module-level
            `corpus` when omitted.
        base_name: base name (optionally with a path prefix) of the three
            files written and then read. The default is the name that was
            previously hard-coded; pass a different one per window size to
            keep the files of several runs side by side.

    Returns:
        The object returned by `Space.build` for the written files.
    """
    if words is None:
        words = corpus

    ## generate the raw co-occurrence counts within the requested window
    # NLTK's window_size is the span of the bigram window in tokens (adjacent
    # words correspond to 2), hence the + 1.
    finder = BigramCollocationFinder.from_words(words, window_size=windowSize + 1)
    cooc = finder.ngram_fd.items()

    ## convert the list of collocates into a dictionary
    print("Going to loop through the list")
    cooc_dict = _symmetric_counts(cooc)

    ## Generate the row, col and data variables for DISSECT
    rows = []
    cols = []
    data = []
    # Set used only for membership tests; `cols` keeps first-seen order.
    # Testing membership on the list itself is linear per lookup and made
    # this loop quadratic in the vocabulary size.
    seen_cols = set()
    print("Going to loop through the dictionary")
    for word_1 in cooc_dict:
        # Dictionary keys are unique, so no duplicate check is needed here
        rows.append(word_1)
        contexts = cooc_dict[word_1]
        for word_2 in contexts:
            if word_2 not in seen_cols:
                seen_cols.add(word_2)
                cols.append(word_2)
            # One "row column count" line per non-zero cell
            data.append(word_1 + " " + word_2 + " " + str(contexts[word_2]))

    ## Output the row, col, data to files
    rows_file = base_name + ".rows"
    cols_file = base_name + ".cols"
    data_file = base_name + ".sm"
    print("Going to loop through out var")
    for filename, content in ((rows_file, rows), (cols_file, cols), (data_file, data)):
        _write_lines(filename, content)

    # Loading the matrix from the three files that were just written
    my_space = Space.build(data = data_file,
                           rows = rows_file,
                           cols = cols_file,
                           format = "sm")

    return my_space

In [5]:
# START TASK 1-1
# Objective 1, first configuration: window size 1, pmi, svd (50).
# All names assigned in this cell are notebook globals read by later cells.

# Raw co-occurrence space for window size 1
space_ws1 = getSpace(1)

# Transforming the semantic space using PPMI
my_ppmi_space = space_ws1.apply(PpmiWeighting())

# Apply Svd(50) on top of the PPMI-weighted space
ws1_pmi_svd50_Task_1_1  = my_ppmi_space.apply(Svd(50))
# END TASK 1-1

# START TASK 1-2to5
# Objective 1, remaining configurations: all four share one window-size-3 space.
space_ws3 = getSpace(3)
# Transforming the semantic space using PPMI
ppmi_space = space_ws3.apply(PpmiWeighting())

# 1-2: raw counts (same object as space_ws3, not a copy)
ws3_Task_1_2 = space_ws3
# 1-3: PPMI only (same object as ppmi_space)
ws3_pmi_Task_1_3 = ppmi_space
# 1-4: Svd(50) applied to the raw counts
ws3_svd50_Task_1_4 = space_ws3.apply(Svd(50))
# 1-5: Svd(50) applied to the PPMI-weighted space
ws3_pmi_svd50_Task_1_5 = ppmi_space.apply(Svd(50))

# END TASK 1-2to5

print("Task 1 prepared")

Going to loop through the list
Going to loop through the dictionary
Going to loop through out var
Progress...1000000
Going to loop through the list
Going to loop through the dictionary
Going to loop through out var
Progress...1000000
Progress...2000000
Progress...3000000
Task 1 prepared


In [6]:
# START TASK 3

# Cosine similarity between "man" and "woman" in each DISSECT space.
# Every entry is (configuration printed first, tag printed with the score, space).
man_woman_runs = [
    ("window size 1, pmi, svd (50)", "ws1_pmi_svd50_Task_1_1", ws1_pmi_svd50_Task_1_1),
    ("window size 3", "ws3_Task_1_2", ws3_Task_1_2),
    ("window size 3, pmi", "ws3_pmi_Task_1_3", ws3_pmi_Task_1_3),
    ("window size 3, svd (50)", "ws3_svd50_Task_1_4", ws3_svd50_Task_1_4),
    ("window size 3, pmi, svd (50)", "ws3_pmi_svd50_Task_1_5", ws3_pmi_svd50_Task_1_5),
]

print("Calculating similarity between man and woman")
for run_description, run_tag, sem_space in man_woman_runs:
    print(run_description)
    print(run_tag, sem_space.get_sim("man", "woman", CosSimilarity()))

Calculating similarity between man and woman
window size 1, pmi, svd (50)
('ws1_pmi_svd50_Task_1_1', 0.90197623371148516)
window size 3
('ws3_Task_1_2', 0.96862311057068851)
window size 3, pmi
('ws3_pmi_Task_1_3', 0.10418482739863029)
window size 3, svd (50)
('ws3_svd50_Task_1_4', 0.98125139575855458)
window size 3, pmi, svd (50)
('ws3_pmi_svd50_Task_1_5', 0.78366326238532669)


In [7]:
# The 5 nearest neighbours of "car" (cosine similarity) in each DISSECT space.
# Every entry is (configuration printed first, tag printed with the result, space).
car_neighbour_runs = [
    ("window size 1, pmi, svd (50)", "ws1_pmi_svd50_Task_1_1", ws1_pmi_svd50_Task_1_1),
    ("window size 3", "ws3_Task_1_2", ws3_Task_1_2),
    ("window size 3, pmi", "ws3_pmi_Task_1_3", ws3_pmi_Task_1_3),
    ("window size 3, svd (50)", "ws3_svd50_Task_1_4", ws3_svd50_Task_1_4),
    ("window size 3, pmi, svd (50)", "ws3_pmi_svd50_Task_1_5", ws3_pmi_svd50_Task_1_5),
]

print("Obtaining the 5 most similar words to 'car'")
for run_description, run_tag, sem_space in car_neighbour_runs:
    print(run_description)
    print(run_tag, sem_space.get_neighbours("car", 5, CosSimilarity()))

Obtaining the 5 most similar words to 'car'
window size 1, pmi, svd (50)
('ws1_pmi_svd50_Task_1_1', [('car', 1.0), ('table', 0.85092128215815366), ('floor', 0.84634566222010721), ('chimney', 0.84395823872514431), ('dining', 0.84119473120215704)])
window size 3
('ws3_Task_1_2', [('car', 1.0), ('key', 0.93305619090166425), ('wall', 0.93037210089756406), ('street', 0.93034341502114104), ('sea', 0.92498355681372335)])
window size 3, pmi
('ws3_pmi_Task_1_3', [('car', 1.0), ('bicycles', 0.12817727288119293), ('popping', 0.12467056194780535), ('corpusants', 0.1071900718295629), ('stoical', 0.10669112901298419)])
window size 3, svd (50)
('ws3_svd50_Task_1_4', [('car', 1.0000000000000002), ('key', 0.98703744774949487), ('lawn', 0.98307380285577894), ('street', 0.97942380897437853), ('level', 0.97878276935629771)])
window size 3, pmi, svd (50)
('ws3_pmi_svd50_Task_1_5', [('car', 1.0), ('stick', 0.88739669849138947), ('window', 0.88371037927555984), ('lawn', 0.87665219281880669), ('corner', 0.875

In [8]:
my_path = ""
# Comparing the similarity with "gold standard"
# The file is read twice: fields 0 and 1 of each line give the word pair,
# field 2 gives the gold score. It must exist under `my_path`.
fname = my_path + "synonyms.txt"
# Load the pairs
word_pairs = io_utils.read_tuple_list(fname, fields=[0,1])
# Load the score
gold = io_utils.read_list(fname, field=2)


def printSimilarity(model, label=None):
    """Print how well `model` reproduces the gold-standard similarity scores.

    Reads the module-level `word_pairs` and `gold`, computes the cosine
    similarity of every pair in `model` (rounded to 2 decimals) and prints
    the pairs, the gold scores, the predictions and their Spearman and
    Pearson correlations with the gold scores.

    Parameters:
        model: space exposing `get_sims(word_pairs, similarity)`.
        label: optional description of the model configuration. The heading
            used to be hard-coded to "window size 1, pmi, svd (50)" /
            "PPMI and SVD matrix" regardless of the model being scored.

    Returns:
        None.
    """
    print("Comparing similarity with 'gold standard'")
    if label is not None:
        print(label)
    predicted = [round(sim,2) for sim in model.get_sims(word_pairs, CosSimilarity())]
    print ("Pairs:",word_pairs)
    print ("Gold scores",gold)
    print ("\n %s:" % (label if label is not None else "Model"))
    print ("Predicted scores",predicted)
    print ("Spearman correlation:",scoring_utils.score(gold, predicted, "spearman"))
    print ("Pearson correlation:",scoring_utils.score(gold, predicted, "pearson"))


# Each space is paired with the configuration it was built with so that the
# report is labelled correctly.
labelled_models = [
    ("window size 1, pmi, svd (50)", ws1_pmi_svd50_Task_1_1),
    ("window size 3", ws3_Task_1_2),
    ("window size 3, pmi", ws3_pmi_Task_1_3),
    ("window size 3, svd (50)", ws3_svd50_Task_1_4),
    ("window size 3, pmi, svd (50)", ws3_pmi_svd50_Task_1_5),
]
# Kept for any later cell that reads `models`
models = [model for _, model in labelled_models]

# Plain loop instead of map(): the calls are made for their printing side
# effect, and on Python 3 map() is lazy so nothing would be evaluated.
for model_label, model in labelled_models:
    printSimilarity(model, model_label)

IOError: [Errno 2] No such file or directory: 'synonyms.txt'