Introduction to NLP course (2017-2018).

Homework 3: Distributional semantic models.

Objectives:

1) Obtain co-occurrence vector representations with the followin properties:
- window size 1, pmi, svd (50)
- window size 3, no modifications
- window size 3, pmi, no svd
- window size 3, no pmi, svd (50)
- window size 3, pmi, svd (50)

2) Obtain word2vec embeddings with the following properties
- window size 1, 50 dimensions
- window size 1, 200 dimensions
- window size 3, 50 dimensions
- window size 3, 200 dimensions
- window size 5, 50 dimensions

3) Compare the performance of the 10 representations in 1 and 2 on the following tasks:
- similarity between "man" and "woman"
- the 5 most similar words to "car"
- for DISSECT representations , correlation with gold standard
- for Word2Vec, the similarity between "queen" and "king + woman - man"

In [6]:
# Import section
import nltk
from nltk.corpus import gutenberg
from nltk import FreqDist
from nltk.collocations import *
import re
from collections import Counter
import numpy as np
import operator
from scipy import spatial

# Dissect
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.similarity.cos import CosSimilarity
from composes.utils import scoring_utils

# Gensim
import gensim
from gensim.models import Word2Vec

## Word2Vec

In [13]:
## Load the corpus
corpus = gutenberg.sents()

def word_2_vec_models(corpus, sizes, windows):
    models = {}
    for size, window in zip(sizes,windows):
        model = Word2Vec(corpus, size=size, window=window, workers=4, compute_loss=True)
        model.train(corpus, total_examples=len(corpus), epochs=10)
        models['w2v_size_' + str(size) + '_window_' + str(window)] = model
    return models

sizes = [50, 200, 50, 200, 50]
windows = [1, 1, 3, 3, 5]

w2v_models = word_2_vec_models(corpus, sizes, windows)

In [15]:
# print loss of models
for key, model in w2v_models.iteritems():
    print model.get_latest_training_loss()

0.0
0.0
0.0
0.0
0.0


## Compare performance

In [43]:
import pandas as pd
pd.set_option('display.expand_frame_repr', False)

performance_df = {k: [] for k in w2v_models}

for key, model in w2v_models.iteritems():
    performance_df[key].append(model.wv.similarity(w1="man", w2="woman"))
    top_5_cars = [(el1, round(el2,2)) for el1, el2 in model.wv.most_similar(positive="car", topn=5)]
    performance_df[key].append(top_5_cars)
    
    custom_queen = np.add(np.subtract(model["king"],model["man"]),model["woman"])
    performance_df[key].append(1 - spatial.distance.cosine(custom_queen,model["queen"]))

performance_df = pd.DataFrame(performance_df, index=['man_woman', 'top_5_car', 'queen'])
performance_df.reset_index().T

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


Unnamed: 0,0,1,2
index,man_woman,top_5_car,queen
w2v_size_200window_1,0.674201,"[(shutters, 0.72), (bushes, 0.71), (forwards, ...",0.440707
w2v_size_200window_3,0.64307,"[(lane, 0.76), (bushes, 0.74), (coach, 0.73), ...",0.410389
w2v_size_50window_1,0.860462,"[(bushes, 0.76), (shutters, 0.76), (bows, 0.75...",0.468468
w2v_size_50window_3,0.813659,"[(lane, 0.87), (deck, 0.82), (bushes, 0.82), (...",0.457162
w2v_size_50window_5,0.702896,"[(coach, 0.84), (lane, 0.84), (boat, 0.82), (w...",0.640341
