# Homework original system: Word similarity

In [10]:
__author__ = "Pierre Jaumier"
__version__ = "CS224u, Stanford, Fall 2020"

`VSM` = Vector Space-Model

In [40]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from collections import defaultdict
import csv
import itertools
from scipy.stats import spearmanr
from IPython.display import display
import numpy as np
import os
import pandas as pd
import vsm
import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
from hw1_utils import *

In [13]:
best_results = [(0.4913448227592214, ['giga5', 'ppmi']),
    (0.5048935638458103, ['giga5', 'ppmi', 'lsa', '500']),
    (0.4439572029666641, ['giga20', 'ttest']),
    (0.514953519371117, ['giga5', 'ppmi', 'ttest', 'lsa', '1000'])]

In [14]:
imdb5 = pd.read_csv(
    os.path.join(VSM_HOME, 'imdb_window5-scaled.csv.gz'), index_col=0)
imdb20 = pd.read_csv(
    os.path.join(VSM_HOME, 'imdb_window20-flat.csv.gz'), index_col=0)
giga5 = pd.read_csv(
    os.path.join(VSM_HOME, 'giga_window5-scaled.csv.gz'), index_col=0)
giga20 = pd.read_csv(
    os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'), index_col=0)

In [6]:
giga20.shape

(5000, 5000)

In [7]:
count_dfs = {'imdb5':imdb5, 'imdb20':imdb20, 'giga5':giga5, 'giga20':giga20}

In [17]:
def format_result(x):
    return "{:.4f}".format(x)

def macro_average_score(df, parameters):
    series = full_word_similarity_evaluation(df)
    score = series['Macro-average']
    print(format_result(score), '\t'.join(parameters))
    return score

## Baseline PPMI

In [27]:
results = []
for name, count_df in count_dfs.items():
    # Modèle
    df_pmi = vsm.pmi(count_df)
    
    # Evaluation
    series = full_word_similarity_evaluation(df_pmi)
    
    # Affichage
    score = series['Macro-average']
    parameters = [name, 'ppmi']
    results.append((score, parameters))
    print(format_result(score), '\t'.join(parameters))
    
print('\nMeilleure combinaison:\n\t',max(results))

0.4764 imdb5	ppmi
0.3539 imdb20	ppmi
0.4913 giga5	ppmi
0.4186 giga20	ppmi

Meilleure combinaison:
	 (0.4913448227592214, ['giga5', 'ppmi'])


In [23]:
print(max(results))

(0.4913448227592214, ['giga5', 'ppmi'])


## Baseline PPMI - LSA

In [29]:
results = []
for name, count_df in count_dfs.items():
    # Modèle
    df_pmi = vsm.pmi(count_df)

    for k in [10, 100, 500, 1000]:
        df_lsa = vsm.lsa(df_pmi, k)
        
        # Evaluation
        series = full_word_similarity_evaluation(df_lsa)
    
       # Affichage
        score = series['Macro-average']
        parameters = [name, 'ppmi', 'lsa', str(k)]
        results.append((score, parameters))
        print(format_result(score), '\t'.join(parameters))
    
print('\nMeilleure combinaison:\n\t',max(results))

0.3499 imdb5	ppmi	lsa	10
0.5001 imdb5	ppmi	lsa	100
0.5013 imdb5	ppmi	lsa	500
0.4920 imdb5	ppmi	lsa	1000
0.3030 imdb20	ppmi	lsa	10
0.4127 imdb20	ppmi	lsa	100
0.3760 imdb20	ppmi	lsa	500
0.3565 imdb20	ppmi	lsa	1000
0.3677 giga5	ppmi	lsa	10
0.4841 giga5	ppmi	lsa	100
0.5049 giga5	ppmi	lsa	500
0.5011 giga5	ppmi	lsa	1000
0.3465 giga20	ppmi	lsa	10
0.4184 giga20	ppmi	lsa	100
0.4262 giga20	ppmi	lsa	500
0.4235 giga20	ppmi	lsa	1000

Meilleure combinaison:
	 (0.5048935638458103, ['giga5', 'ppmi', 'lsa', '500'])


## t-test reweighting

In [30]:
results = []
for name, count_df in count_dfs.items():
    # Modèle
    df_ttest = ttest(count_df)
    
    # Evaluation
    series = full_word_similarity_evaluation(df_ttest)
    
    # Affichage
    score = series['Macro-average']
    parameters = [name, 'ttest']
    results.append((score, parameters))
    print(format_result(score), '\t'.join(parameters))
    
print('\nMeilleure combinaison:\n\t',max(results))

0.3875 imdb5	ttest
0.4087 imdb20	ttest
0.3969 giga5	ttest
0.4440 giga20	ttest

Meilleure combinaison:
	 (0.4439572029666641, ['giga20', 'ttest'])


In [11]:
# Meilleure combinaison:
# 	 (0.5048935638458103, ['giga5', 'ppmi', 'lsa', '500'])

count_df = giga5
# Modèle
df_pmi = vsm.pmi(count_df)
df_lsa = vsm.lsa(df_pmi, k=500)
        
# Evaluation
series = full_word_similarity_evaluation(df_lsa)
    
# Affichage
score = series['Macro-average']
parameters = ['giga5', 'ppmi', 'lsa', str(500)]
print(format_result(score), '\t'.join(parameters))

0.5049 giga5	ppmi	lsa	500


In [33]:
count_df = giga5
df_pmi = ttest(vsm.pmi(count_df))

series = full_word_similarity_evaluation(df_lsa)
score = series['Macro-average']
parameters = ['giga5', 'ppmi', 'ttest']
results.append((score, parameters))
print(format_result(score), '\t'.join(parameters))
        
for k in [500]:
    df_lsa = vsm.lsa(df_pmi, k)

    series = full_word_similarity_evaluation(df_lsa)
    score = series['Macro-average']
    parameters = ['giga5', 'ppmi', 'ttest', 'lsa', str(k)]
    results.append((score, parameters))
    print(format_result(score), '\t'.join(parameters))

0.3539 giga5	ppmi	ttest
0.5134 giga5	ppmi	ttest	lsa	500


## Modèle ppmi ttest lsa

In [9]:
results = []
for name, count_df in count_dfs.items():
    # Modèle
    df_pmi = vsm.pmi(count_df)
    df_ttest = ttest(df_pmi)

    for k in [100, 300, 500, 750, 1000]:
        try:
            df_lsa = vsm.lsa(df_ttest, k)
            series = full_word_similarity_evaluation(df_lsa)
            score = series['Macro-average']
        except:
            print("Pb de convergence de lsa pour {} et k={}".format(name, k))
            score = 0

       # Affichage
        parameters = [name, 'ppmi', 'ttest', 'lsa', str(k)]
        results.append((score, parameters))
        print(format_result(score), '\t'.join(parameters))
    
print('\nMeilleure combinaison:\n\t',max(results))

0.4704 imdb5	ppmi	ttest	lsa	100
0.5085 imdb5	ppmi	ttest	lsa	300
0.5119 imdb5	ppmi	ttest	lsa	500
0.5084 imdb5	ppmi	ttest	lsa	750
0.5089 imdb5	ppmi	ttest	lsa	1000
0.4262 imdb20	ppmi	ttest	lsa	100
0.4525 imdb20	ppmi	ttest	lsa	300
0.4529 imdb20	ppmi	ttest	lsa	500
0.4496 imdb20	ppmi	ttest	lsa	750
0.4477 imdb20	ppmi	ttest	lsa	1000
0.4667 giga5	ppmi	ttest	lsa	100
0.5056 giga5	ppmi	ttest	lsa	300
0.5134 giga5	ppmi	ttest	lsa	500
0.5146 giga5	ppmi	ttest	lsa	750
0.5150 giga5	ppmi	ttest	lsa	1000
0.4653 giga20	ppmi	ttest	lsa	750
0.4645 giga20	ppmi	ttest	lsa	1000

Meilleure combinaison:
	 (0.514953519371117, ['giga5', 'ppmi', 'ttest', 'lsa', '1000'])


In [31]:
count_df = giga5
df_pmi = vsm.pmi(count_df)
df_ttest = ttest(df_pmi)

parameters = ['giga5', 'ppmi', 'ttest']
score = macro_average_score(df_ttest, parameters)
result = score, parameters
print('\nMeilleure combinaison:\n\t', result)

0.5151 giga5	ppmi	ttest

Meilleure combinaison:
	 (0.5151364833538356, ['giga5', 'ppmi', 'ttest'])


Utilisation de glove avec la meilleure configuration  
(Le but de glove est d'avoir des vecteurs dont le produit scalaire est proportionnel à la log-probabilité de la co-occurrence)  
Renvoie des vecteurs de dim 300

In [None]:
# Depuis le cours vsm_02

In [34]:
from torch_glove import TorchGloVe
glove_model = TorchGloVe()
imdb5_glv = glove_model.fit(imdb5)

Finished epoch 1000 of 1000; error is 226226.6328125

In [36]:
macro_average_score(imdb5_glv, ['imdb5', 'glove'])

0.3225 imdb5	glove


0.32252879041096516

In [41]:
# Retrofitting
from nltk.corpus import wordnet as wn

In [42]:
import nltk
nltk.download('wordnet')
wn_edges = get_wordnet_edges()

[nltk_data] Downloading package wordnet to /home/neo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
def convert_edges_to_indices(edges, Q):
    lookup = dict(zip(Q.index, range(Q.shape[0])))
    index_edges = defaultdict(set)
    for start, finish_nodes in edges.items():
        s = lookup.get(start)
        if s:
            f = {lookup[n] for n in finish_nodes if n in lookup}
            if f:
                index_edges[s] = f
    return index_edges

In [49]:
X_glove = imdb5_glv
wn_index_edges = convert_edges_to_indices(wn_edges, X_glove)

In [50]:
from retrofitting import Retrofitter
wn_retro = Retrofitter(verbose=True)

In [51]:
X_retro = wn_retro.fit(X_glove, wn_index_edges)

Converged at iteration 8; change was 0.0058 

In [52]:
macro_average_score(X_retro, ['imdb5', 'glove', 'retro'])

0.3488 imdb5	glove	retro


0.3488462883538586

In [60]:
df_ttest.shape

(5000, 5000)

In [54]:
## Autoencoder

In [55]:
from torch_autoencoder import TorchAutoencoder

In [57]:
count_df = giga5
df_pmi = vsm.pmi(count_df)
df_ttest = ttest(df_pmi)
df_lsa = vsm.lsa(df_ttest, k=1000)
#series = full_word_similarity_evaluation(df_lsa)
parameters = ['giga5', 'ppmi', 'ttest', 'lsa', '1000']
score = macro_average_score(df_ttest, parameters)

0.5151 giga5	ppmi	ttest	lsa	1000


In [58]:
df_ae = TorchAutoencoder(max_iter=1000, hidden_dim=300, eta=0.01).fit(df_lsa)

Stopping after epoch 19. Training loss did not improve more than tol=1e-05. Final error is 1.3323446864887956e-05.

In [59]:
parameters = ['giga5', 'ppmi', 'ttest', 'lsa', '1000']
score = macro_average_score(df_ae, parameters)

0.3409 giga5	ppmi	ttest	lsa	1000


C'est pas vraiment probant...

Pas facile d'utiliser glove et quand ça marche, on reste toujours bien en deçà des autres méthodes quant aux résultats

In [None]:
# Solution proposée par la meilleure team

In [65]:
count_df = imdb5
df_pmi = vsm.pmi(count_df)
df_ttest = ttest(df_pmi)

parameters = ['imdb5', 'ppmi', 'ttest']
score = macro_average_score(df_ttest, parameters)

0.5101 imdb5	ppmi	ttest


In [66]:
wn_retro = Retrofitter(verbose=True)
df_retro = wn_retro.fit(df_ttest, wn_index_edges)
parameters = ['imdb5', 'ppmi', 'ttest', 'retro']
score = macro_average_score(df_retro, parameters)

Converged at iteration 3; change was 0.0041 

0.5646 imdb5	ppmi	ttest	retro


In [67]:
df_retro_l2 = df_retro.apply(vsm.length_norm, axis=1)
df_ae = TorchAutoencoder(max_iter=100, hidden_dim=1000, eta=0.001).fit(df_retro_l2)
parameters = ['imdb5', 'ppmi', 'ttest', 'retro', 'l2', 'ae']
score = macro_average_score(df_ae, parameters)

Stopping after epoch 33. Training loss did not improve more than tol=1e-05. Final error is 0.0005137750194990076.

0.5782 imdb5	ppmi	ttest	retro	l2	ae


Bravo, en effet c'est mieux. A noter que l'on peut faire du rétrofitting sans passer par glove ;)

### Your original system [3 points]

This question asks you to design your own model. You can of course include steps made above (ideally, the above questions informed your system design!), but your model should not be literally identical to any of the above models. Other ideas: retrofitting, autoencoders, GloVe, subword modeling, ... 

Requirements:

1. Your code must operate on one or more of the count matrices in `data/vsmdata`. You can choose which subset of them; this is an important design feature of your system. __Other pretrained vectors cannot be introduced__.

1. Retrofitting is permitted.

1. Your code must be self-contained, so that we can work with your model directly in your homework submission notebook. If your model depends on external data or other resources, please submit a ZIP archive containing these resources along with your submission.

In the cell below, please provide a brief technical description of your original system, so that the teaching team can gain an understanding of what it does. This will help us to understand your code and analyze all the submissions to identify patterns and strategies. We also ask that you report the best score your system got during development, just to help us understand how systems performed overall.

VSM: retrofitting

## Bake-off [1 point]

For the bake-off, we will release two additional datasets. The announcement will go out on the discussion forum. We will also release reader code for these datasets that you can paste into this notebook. You will evaluate your custom model $M$ (from the previous question) on these new datasets using `full_word_similarity_evaluation`. Rules:

1. Only one evaluation is permitted.
1. No additional system tuning is permitted once the bake-off has started.

The cells below this one constitute your bake-off entry.

People who enter will receive the additional homework point, and people whose systems achieve the top score will receive an additional 0.5 points. We will test the top-performing systems ourselves, and only systems for which we can reproduce the reported results will win the extra 0.5 points.

Late entries will be accepted, but they cannot earn the extra 0.5 points. Similarly, you cannot win the bake-off unless your homework is submitted on time.

The announcement will include the details on where to submit your entry.

In [None]:
# Enter your bake-off assessment code into this cell.
# Please do not remove this comment.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    pass
    # Please enter your code in the scope of the above conditional.
    ##### YOUR CODE HERE


In [None]:
# On an otherwise blank line in this cell, please enter
# your "Macro-average" value as reported by the code above.
# Please enter only a number between 0 and 1 inclusive.
# Please do not remove this comment.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    pass
    # Please enter your score in the scope of the above conditional.
    ##### YOUR CODE HERE
