In [1]:
cd ..

/Users/louis.guitton/workspace/papers/papers


https://towardsdatascience.com/word-distance-between-word-embeddings-cc3e9cf1d632

In [None]:
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager
# !jupyter labextension install @bokeh/jupyter_bokeh

In [2]:
import spacy
from spacy.language import Language
from textacy.spacier.doc_extensions import to_bag_of_words, to_bag_of_terms
from textacy import preprocessing as textacy_preprocessing
import wmd
from pyemd import emd
import sklearn
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool

output_notebook()

%matplotlib inline

# Solution based on wmd-relax

In [3]:
nlp = spacy.load('en_core_web_md')
nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)

In [4]:
doc1 = nlp("Politician speaks to the media in Illinois.")
doc2 = nlp("The president greets the press in Chicago.")
doc3 = nlp("The band gave a concert in Japan")
print(doc1.similarity(doc2))
print(doc1.similarity(doc3))

6.070105075836182
7.920230865478516


# My implementation, based on pyemd

In [5]:
class WMDSimilarityModel(object):
    def __init__(self, nlp, **kwargs):
        self.nlp = nlp

    def __call__(self, doc):
        doc.user_hooks["similarity"] = self.compute_similarity
        return doc
    
    def compute_similarity(self, doc1, doc2):
        weights = pd.DataFrame([doc1._.nbow, doc2._.nbow]).fillna(0)
        vocabulary = weights.columns
        
        evec = pd.DataFrame([nlp.vocab[tok].vector for tok in vocabulary], index=vocabulary)
        
        evec_square = evec @ evec.T
        dists = np.sqrt(np.diag(evec_square).reshape(1, -1) + np.diag(evec_square).reshape(-1, 1) - 2 * evec @ evec.T)
        
        w1 = weights.iloc[0].values.copy(order='C')
        w2 = weights.iloc[1].values.copy(order='C')
        dists = dists.values.copy(order='C')
        
        return emd(w1, w2, dists)

In [6]:
nlp = spacy.load('en_core_web_md')

In [7]:
def bag_of_words(doc):
    """Normalised Bag of Words."""
    # https://github.com/chartbeat-labs/textacy/blob/606c2ec8dc4e3183217f71d4b2b1d4e8bf2f49c5/textacy/spacier/doc_extensions.py
    doc.set_extension("nbow", getter=lambda d: to_bag_of_words(d, weighting="freq", as_strings=True), force=True)
    return doc

In [8]:
nlp.add_pipe(bag_of_words, after="parser")

In [9]:
nlp.add_pipe(WMDSimilarityModel(nlp), last=True)

In [10]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x12e2f2d10>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x12e446de0>),
 ('bag_of_words', <function __main__.bag_of_words(doc)>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x12e446d70>),
 ('WMDSimilarityModel', <__main__.WMDSimilarityModel at 0x1637742d0>)]

In [11]:
doc1 = nlp("Politician speaks to the media in Illinois.")
doc2 = nlp("The president greets the press in Chicago.")
doc3 = nlp("The band gave a concert in Japan")
print(doc1.similarity(doc2))
print(doc1.similarity(doc3))

3.148351993149684
4.265628528887262


  app.launch_new_instance()


# Play around with Football data

In [12]:
text = """
Barcelona confirm another injury for Ousmane Dembele

 Barcelona have confirmed that winger Ousmane Dembélé has suffered yet another injury.
Dembélé hasn’t played since November 27 due to a hamstring problem, with the club now confirming another setback in training.
“During this morning’s training session, Ousmane Dembélé felt some discomfort in his right leg as a consequence of muscle fatigue,” read an official statement.
“He will continue with his rehabilitation.”
The LaLiga club have given no official time frame on his potential return date.
"""

In [13]:
doc1 = nlp(textacy_preprocessing.remove_accents(
            textacy_preprocessing.normalize_quotation_marks(
                text
            )
        ))

In [14]:
text = """
Chelsea 'opted against' signing Salomon Rondón on deadline day

 Chelsea reportedly opted against signing Salomón Rondón on deadline day despite their long search for a new centre forward.
With Olivier Giroud expected to leave, the Blues targeted Edinson Cavani, Dries Mertens and Moussa Dembele – only to end up with none of them.
According to Telegraph Sport, Dalian Yifang offered Rondón to Chelsea only for them to prefer keeping Giroud at the club.
Manchester United were also linked with the Venezuela international before agreeing a deal for Shanghai Shenhua striker Odion Ighalo.
Manager Frank Lampard made no secret of his transfer window frustration, hinting that to secure top four football he ‘needed’ signings.
Their draw against Leicester on Saturday means they have won just four of the last 13 Premier League matches.
"""

In [15]:
doc2 = nlp(textacy_preprocessing.remove_accents(
            textacy_preprocessing.normalize_quotation_marks(
                text
            )
        ))

In [16]:
doc1.similarity(doc2)

  app.launch_new_instance()


3.311242183284258

# Document plot

In [17]:
is_token_kept = lambda tok: not (tok.is_punct or tok.is_space or tok.is_stop)

In [18]:
def plot(
    doc_1: spacy.tokens.doc.Doc, 
    doc_2: spacy.tokens.doc.Doc
):
    docs = {1: doc_1, 2: doc_2}
    
    tokens_of_interest = {k: [tok for tok in v if is_token_kept(tok)] for k, v in docs.items()}
    X_300D = {k: pd.DataFrame.from_dict({tok.text: list(tok.vector) for tok in tokens_of_interest[k]}, orient='index') for k in docs.keys()}
    # TODO: do something with duplicates

    oov_tokens = {k: set((tok.text, tok.pos_) for tok in tokens_of_interest[k] if not tok.has_vector) for k in docs.keys()}
    print(oov_tokens)
    
    pca = PCA(n_components=2)
    pca.fit(pd.concat(X_300D.values(), axis=0))
    
    X_2D = {k: pd.DataFrame(
        pca.transform(X_300D[k]), index=X_300D[k].index, columns=['C1', 'C2']
    ).reset_index() for k in docs.keys()}    
    
    colors =  {1: "brown", 2: "green"}
    for k in docs.keys():
        X_2D[k].insert(3, "doc_color", colors[k])
    
    
    return pd.concat(X_2D.values(), axis=0)

In [19]:
X_2D = plot(doc1, doc2)

{1: {('Dembele', 'PROPN'), ('LaLiga', 'PROPN'), ('Ousmane', 'PROPN')}, 2: {('Odion', 'PROPN'), ('Shenhua', 'PROPN'), ('Edinson', 'PROPN'), ('Dembele', 'PROPN'), ('Ighalo', 'PROPN'), ('Yifang', 'PROPN')}}


In [20]:
p = figure(title="2 documents in their 300D embedding space", tooltips=[('token', '@index')])
source = ColumnDataSource(X_2D)

p.circle("C1", "C2", size=10, color="doc_color", alpha=0.5, source=source)

show(p)

In [479]:
TOOLTIPS = [
    ("token", "@index"),
]
p = figure(tooltips=TOOLTIPS,
           title="2 documents in the 300D embedding space")
source = ColumnDataSource(X_2D)

p.circle("C1", "C2", source=source, size=10, color="doc_color", alpha=0.5)

show(p)

# Open questions
- Every football specific term will be Out of Vocabulary (OOV). Maybe we should consider combining spacy.Language embeddings with custom football embeddings for those terms?

# Bonus: add pre-processing as part of the spacy.Language model

In [None]:
# TODO: figure out how to load the model properly with pipeline
# nlp = FootballLanguage(lang="en_core_web_md").from_disk("/Users/louis.guitton/workspace/papers/venv/lib/python3.7/site-packages/en_core_web_md/en_core_web_md-2.2.5")
# this doesn't work
class FootballLanguage(Language):
    def __call__(self, text, disable=[], component_cfg=None):
        text = textacy_preprocessing.remove_accents(
            textacy_preprocessing.normalize_quotation_marks(
                text
            )
        )
        return super().__call__(text, disable, component_cfg)

In [None]:
nlp.pipeline

In [None]:
   @classmethod
    def create_spacy_pipeline(cls, nlp, **kwargs):
        """
        Provides the integration with `spaCy <https://spacy.io>`_. Use this the
        following way:
        ::
           nlp = spacy.load('en', create_pipeline=wmd.WMD.create_spacy_pipeline)
        Please note that if you are going to search for the nearest documents
        then you should use :func:`~wmd.WMD.nearest_neighbors()` instead of
        evaluating multiple WMDs pairwise, as the former is much optimized and
        provides a lower complexity.
        :param nlp: `spaCy language object <https://spacy.io/docs/api/language>`_.
        :param kwargs: ignore_stops, only_alpha and frequency_processor. Refer \
                       to :func:`~wmd.WMD.SpacySimilarityHook.__init__()`.
        :return: The spaCy pipeline.
        :rtype: list.
        """
        return [nlp.tagger, nlp.parser, cls.SpacySimilarityHook(nlp, **kwargs)]