In [2]:
#first we look at the way we can use pre-trained embeddings using the PyMagnitude package

# BEFORE YOU START
# You need to have gone to this URL : https://github.com/plasticityai/magnitude#pre-converted-magnitude-formats-of-popular-embeddings-models
# downloaded some vectors of your choice (these files can be massive, so it might take a while), and stored
# them somewhere you can access (e.g. with the file path I specify below).

# YOU MAY ALSO
# need to have run the data_preparation.R script if you lack the `sentences.csv` file

In [30]:
from pymagnitude import *
import re
import numpy as np
import pandas as pd
from sklearn import preprocessing
from nltk.corpus import stopwords
import time

In [53]:
# Show up to 500 characters per cell so that whole sentences stay readable in
# the results tables further down. The fully qualified option name is used
# because abbreviated names are resolved by pattern matching and can stop
# working if pandas adds another option with a similar name.
pd.set_option('display.max_colwidth', 500)

In [8]:
%%time
# Load the pre-converted embeddings into `vectors`, which every later cell uses.
# Replace the placeholder path below with the location of the .magnitude file
# you downloaded; %%time reports how long the load takes.
vectors = Magnitude("here/you/put/path/to/magnitude_file.magnitude")

CPU times: user 780 ms, sys: 30.2 ms, total: 810 ms
Wall time: 826 ms


In [None]:
#some things we can do with magnitude vectors for words

In [9]:
# Distance between the vectors of two words. In the outputs of this notebook a
# smaller distance goes together with a larger similarity score.
vectors.distance("biscuit","macaroon")

0.8671641

In [22]:
# Passing a list as the second argument returns one distance per listed word.
vectors.distance("biscuit",["macaroon", "cabbage"])

[0.8671641, 1.0519347]

In [11]:
# Similarity score between two words (higher = more alike; a word compared
# with its own vector scores 1.0 in a later cell).
vectors.similarity("biscuit","macaroon")

0.62401325

In [21]:
# As with distance, a list of words gives back one similarity score per word.
vectors.similarity("biscuit",["macaroon", "cabbage"])

[0.62401325, 0.4467167]

In [20]:
# From the candidate list, return the single word most similar to "biscuit".
vectors.most_similar_to_given("biscuit", ["macaroon", "cabbage"])

'macaroon'

In [19]:
# Odd one out: return the word in the list that fits least well with the rest.
vectors.doesnt_match(["biscuit", "macaroon", "cabbage"])

'cabbage'

In [2]:
# Import logging explicitly: this cell otherwise only works if the name happens
# to leak in through the wildcard `from pymagnitude import *`.
import logging

# Emit INFO-level (and above) log records, prefixed with a timestamp and level.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [23]:
# The 10 nearest neighbours of a word as (word, similarity) pairs, best first.
# When queried by key, the query word itself is not included in the results.
vectors.most_similar("biscuit", topn = 10)

[('biscuits', 0.8173666000366211),
 ('biscuity', 0.6973364353179932),
 ('cake', 0.6754744052886963),
 ('chocolate', 0.6582432985305786),
 ('oatcake', 0.6574128866195679),
 ('teacake', 0.6451709866523743),
 ('loaf', 0.6432216167449951),
 ('cornflake', 0.6331499814987183),
 ('flapjack', 0.6270235776901245),
 ('shortbread', 0.6263179779052734)]

In [24]:
# Same search, but starting from the word's vector rather than its key: the
# query word now comes back as its own top hit with similarity 1.0.
vectors.most_similar(vectors.query("biscuit"), topn = 10)

[('biscuit', 1.0),
 ('biscuits', 0.8173666000366211),
 ('biscuity', 0.6973364353179932),
 ('cake', 0.6754744052886963),
 ('chocolate', 0.6582432985305786),
 ('oatcake', 0.6574128866195679),
 ('teacake', 0.6451709866523743),
 ('loaf', 0.6432216167449951),
 ('cornflake', 0.6331499814987183),
 ('flapjack', 0.6270235776901245)]

In [25]:
# Word analogy: "Paris is to France as ? is to Germany" (presumably computed
# as Paris - France + Germany on the word vectors).
vectors.most_similar(positive=['Paris', 'Germany'], negative=['France'])

[('Berlin', 0.7935939431190491),
 ('Munich', 0.7534011006355286),
 ('Frankfurt', 0.7376378774642944),
 ('Cologne', 0.7260650992393494),
 ('Stuttgart', 0.7239525318145752),
 ('Leipzig', 0.7191416025161743),
 ('Vienna', 0.7057973146438599),
 ('Hamburg', 0.7021979093551636),
 ('Frankfurt-am-Main', 0.6996399164199829),
 ('Düsseldorf', 0.6979357004165649)]

In [26]:
# The analogy also works from capital to country: "London is to England as
# Baghdad is to ?".
vectors.most_similar(positive=['England', 'Baghdad'], negative=['London'])

[('Iraq', 0.7341831922531128),
 ('Kuwait', 0.6448562145233154),
 ('Mosul', 0.6440438628196716),
 ('Basra', 0.6172256469726562),
 ('Iraq--and', 0.6111494302749634),
 ('Al-Anbar', 0.6111178398132324),
 ('Baghdady', 0.6097682118415833),
 ('Al-Basrah', 0.6095267534255981),
 ('Al-Najaf', 0.6082150936126709),
 ('Basrah', 0.6069812774658203)]

In [44]:
# Embed a search phrase or report sentence in the model's vector space by
# simply averaging the vectors of its words.
# I have no doubt there are better ways of doing this.
def embed_phrase(phrase):
    """Return a unit-length embedding for `phrase`.

    Punctuation is stripped, the text is lower-cased and split on whitespace,
    and NLTK English stopwords and words missing from `vectors` are dropped.
    The vectors of the remaining words are averaged and L2-normalised, so the
    dot product of two results is their cosine similarity.

    Parameters
    ----------
    phrase : str
        Text to embed. Anything that makes the text processing raise a
        TypeError (e.g. a NaN from a missing CSV field) is treated as empty.

    Returns
    -------
    numpy.ndarray
        1-D array of length `vectors.dim`. It is all zeros when no usable
        words remain or when `phrase` is not a string.
    """
    dim = vectors.dim  # follow the loaded model instead of hard-coding 300
    try:
        no_punct = re.sub(r'[^\w\s]', '', phrase)  # take out punctuation
        tokens = no_punct.lower().split()  # lower-case and split into words
        # Build the stopword set once per call instead of re-reading the NLTK
        # list for every token. NLTK's file id is the lowercase "english";
        # the capitalised form only resolves on case-insensitive filesystems.
        stop_words = set(stopwords.words("english"))
        kept_words = [word for word in tokens
                      if word not in stop_words and word in vectors]
        if kept_words:
            # mean of the vectors of the words that remain
            unnorm_vector = np.mean(vectors.query(kept_words), axis=0)
        else:
            unnorm_vector = np.zeros(dim)  # no words remain
    except TypeError:
        unnorm_vector = np.zeros(dim)  # input doesn't seem to be a string
    # Normalise to length 1 so dot products give cosine similarity; an all-zero
    # vector is left as zeros by sklearn's normalize.
    norm_array = preprocessing.normalize(unnorm_vector.reshape(-1, 1), norm='l2', axis=0)
    return norm_array.ravel()

# This lets you search for a phrase and rank a set of comparison sentences by
# how similar they are to it. Again, no doubt this could be greatly improved.
def search_phrase(phrase, comparison_set, topn=None):
    """Rank the rows of `comparison_set` by similarity to `phrase`.

    Parameters
    ----------
    phrase : str
        Search text; it is embedded with `embed_phrase`.
    comparison_set : pandas.DataFrame
        Must have a 'vectors' column holding one embedding array per row,
        as produced by `embed_phrase`.
    topn : int, optional
        If given, only the `topn` best matches are returned. The default
        (None) returns every row.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added 'search_results' column (dot product with
        the phrase embedding; larger = more similar), most similar first.
        The caller's DataFrame is left unmodified.
    """
    if len(comparison_set) == 0:
        # Nothing to compare against: return an empty, correctly shaped result
        # rather than failing on the matrix product below.
        return comparison_set.assign(search_results=np.array([], dtype=float))
    # Stack the per-row embeddings into an (n_rows, dim) matrix.
    sentence_matrix = np.array(comparison_set['vectors'].values.tolist())
    scores = np.dot(embed_phrase(phrase), sentence_matrix.T)
    # assign() builds a new frame, so the input is not mutated. Sorting
    # ascending and then reversing puts the largest dot product first.
    ranked = comparison_set.assign(search_results=scores).sort_values(by=['search_results'])[::-1]
    if topn is None:
        return ranked
    return ranked.head(topn)

In [49]:
# Sanity check: embed one sentence and inspect the resulting vector.
embed_phrase("jaffa cakes are my favourite and I like them the best")

array([-6.28026947e-03, -2.43652239e-02,  1.32176944e-03, -9.69787687e-03,
       -3.74229029e-02, -8.92745480e-02, -2.56897439e-03, -1.03370681e-01,
       -6.57415316e-02,  1.29279522e-02,  8.24017625e-04,  7.51992390e-02,
        2.91189346e-02, -2.35687736e-02,  5.70440330e-02,  4.56786007e-02,
        1.60301358e-01,  1.96943642e-03,  1.33410349e-01,  3.72173935e-02,
        2.48401333e-03,  7.24656135e-02,  6.67128386e-03,  4.35236953e-02,
        3.62036712e-02, -1.57978833e-02,  5.85241467e-02, -2.75252834e-02,
        4.37867455e-02, -3.61571088e-02,  1.13285063e-02, -2.19578911e-02,
       -1.92831922e-02,  7.66812358e-03,  3.62293907e-02, -5.31622693e-02,
        9.04731266e-03, -8.52390379e-03,  1.10962684e-03,  4.68735360e-02,
       -2.17354670e-02, -1.00070961e-01, -9.81302857e-02, -9.39529669e-03,
        4.40335227e-03, -2.58727856e-02,  1.92488786e-02, -5.05256131e-02,
       -1.49174258e-02,  3.19914962e-03, -1.35877803e-02,  1.67603744e-03,
        2.21653134e-02, -

In [28]:
# Load the guide sentences (sentences.csv comes from the data_preparation.R
# script mentioned at the top of the notebook) and preview the first rows.
corpus = pd.read_csv("sentences.csv")
corpus.head()

Unnamed: 0.1,Unnamed: 0,ID,sentence
0,1,FeatureSelection.md_1,* Decisions about how to do these things are u...
1,2,FeatureSelection.md_2,"This is usually easily accomplished - in R, fo..."
2,3,FeatureSelection.md_3,"Despite the practical ease of achieving this, ..."
3,4,FeatureSelection.md_4,You can choose to replace the punctuation mark...
4,5,FeatureSelection.md_5,"On the other hand, removing punctuation altoge..."


In [50]:
# Embed every sentence. apply() already yields one embedding array per row,
# so the result is stored directly, with no extra array conversion.
corpus['vectors'] = corpus['sentence'].apply(embed_phrase)

In [51]:
# Check that the new 'vectors' column holds an embedding for each sentence.
corpus.head()

Unnamed: 0.1,Unnamed: 0,ID,sentence,vectors
0,1,FeatureSelection.md_1,* Decisions about how to do these things are u...,"[-0.0080329105, 0.021076743, 0.0016090235, 0.0..."
1,2,FeatureSelection.md_2,"This is usually easily accomplished - in R, fo...","[0.013527644, -0.012912958, 0.05678807, -0.002..."
2,3,FeatureSelection.md_3,"Despite the practical ease of achieving this, ...","[-0.020053409, -0.0069089155, -0.0060613644, -..."
3,4,FeatureSelection.md_4,You can choose to replace the punctuation mark...,"[0.03999276, 0.03196642, 0.05058594, 0.0055737..."
4,5,FeatureSelection.md_5,"On the other hand, removing punctuation altoge...","[0.0051377737, 0.006850067, 0.021656906, 0.001..."


In [54]:
# Search the corpus for a phrase and show the ID and text of the top 20 matches.
search_phrase("latent dirichlet allocation", corpus)[['ID','sentence']][:20]

Unnamed: 0,ID,sentence
96,LDA.md_1,* Latent Dirichlet Allocation is a probabilistic method for [*Topic*](Topics.md) Modelling.
229,TLDR.md_9,* Latent Dirichlet Allocation is a probabilistic method for [*Topic*](Topics.md) Modelling.
46,Glossary.md_29,* Latent semantic analysis.
69,Glossary.md_52,Used in [*Latent Semantic Analysis*](LSA.md).
209,README.md_2,"* [Search through a set of documents](Search.md) * [Find topics in a set of documents](Topics.md) * [Feature Selection](FeatureSelection.md) * [Latent Semantic Analysis (LSA)](LSA.md) * [Latent Dirichlet Allocation (LDA)](LDA.md) * [Word2Vec, Doc2Vec, fastText (Neural Network models)](NNmodels.md) * There is also R code for LSA and LDA accessible in `code/NLP-guidance."
221,TLDR.md_1,"* Following [feature selection](FeatureSelection.md), some methodologies we've had success with are [Latent Semantic Analysis](LSA.md) and [Neural Network models like Word2Vec and Doc2Vec](NNmodels.md)."
211,Search.md_1,"* Following [feature selection](FeatureSelection.md), some methodologies we've had success with are [Latent Semantic Analysis](LSA.md) and [Neural Network models like Word2Vec and Doc2Vec](NNmodels.md)."
116,LDA.md_21,"All dimensionality reduction type approaches in natural language processing suffer from this to some extent, but the complexity of LDA makes it worse here in my opinion."
148,LSA.md_28,"Finally, note that in terms of being an algorithm that * uses matrix algebra; and * results in the creation of a reduced-rank subspace; which * is somehow closest to the original space; and * has a basis that is a linear combination of the original basis vectors SVD is clearly similar in ethos to principal component analysis."
23,Glossary.md_6,"Clustering algorithms typically require some measure of distance (or, to some extent equivalently, similarity) between *documents* in a vector space."
