In [3]:
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 960 µs


In [4]:
import sys
sys.path.append('../')

time: 641 µs


In [5]:
from IPython.display import display
from pathlib import Path
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import PlaintextCorpusReader, stopwords
from nltk import FreqDist
import cytoolz as tlz
import string
import zipfile

import seaborn as sns
import spacy
from spacy.tokens import Doc
from scipy import sparse
from nltk.stem import WordNetLemmatizer
from nltk.data import ZipFilePathPointer
from spacy.matcher import PhraseMatcher, Matcher
from sklearn import feature_extraction as fe
from textacy import Corpus
from textacy.ke.textrank import textrank

import tensorflow as tf
import tensorflow_hub as hub

import pandas as pd

from keyword_extractor.extractors import TfIdfKeywordExtractor

sns.set()

time: 6.64 s


In [6]:
DATADIR = Path('../data')

time: 1.76 ms


In [21]:
nlp = spacy.load('en_core_web_sm')

time: 471 ms


In [22]:
text_corpus = PlaintextCorpusReader(DATADIR.joinpath('text_corpus').as_posix(), 'doc.*\.txt')
english_stopwords = set(stopwords.words('english'))
data = [(text_corpus.raw(fid), {'fileid': fid}) for fid in text_corpus.fileids()]
corpus = Corpus(nlp, data)

time: 2.53 s


In [67]:
MODEL_URL = {
    'Transformer': 'https://tfhub.dev/google/universal-sentence-encoder-large/5',
    'DAN': 'https://tfhub.dev/google/universal-sentence-encoder/4'
}

time: 1.21 ms


In [268]:
embed = hub.load(MODEL_URL['DAN'])

time: 8.48 s


In [269]:
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])

print(embeddings)

tf.Tensor(
[[-0.03133018 -0.06338634 -0.01607502 ... -0.03242778 -0.04575741
   0.05370456]
 [ 0.05080861 -0.01652428  0.01573781 ...  0.00976659  0.03170123
   0.0178812 ]], shape=(2, 512), dtype=float32)
time: 352 ms


In [270]:
doc = corpus[-1]

time: 1.09 ms


In [271]:
noun_chunks = list({nc.text for nc in doc.noun_chunks})

time: 7.42 ms


In [272]:
doc_embedding = embed([doc.text]).numpy()

time: 13.9 ms


In [273]:
len(set(noun_chunks)) / len(noun_chunks)

1.0

time: 2.86 ms


In [274]:
candidate_embeddings = embed(noun_chunks).numpy()

time: 18.5 ms


In [275]:
similiarities = doc_embedding.dot(candidate_embeddings.T).flatten()

time: 1.65 ms


In [276]:
argsorted = np.argsort(similiarities)

time: 1.11 ms


In [277]:
topn = 15
top_kw_idx = np.flip(argsorted[-topn:])

time: 1.44 ms


In [278]:
display(top_kw_idx)
display(similiarities[top_kw_idx])
display([noun_chunks[i] for i in top_kw_idx])

array([335, 318,  17,   8, 303, 201, 251, 336,  96, 328, 154, 168,  65,
       210,  66])

array([0.3616948 , 0.33865315, 0.32892054, 0.32784045, 0.31540003,
       0.31468648, 0.30139074, 0.29434103, 0.27571428, 0.2685988 ,
       0.25075483, 0.25062177, 0.24531764, 0.23816955, 0.23635092],
      dtype=float32)

['former Soviet weapons scientists',
 'the Soviet biological weapons program',
 'biological weapons threat reduction programs',
 'improvised nuclear weapons',
 'nuclear weapons',
 "the world's most dangerous weapons",
 'the nuclear, chemical, and biological weapons',
 'nuclear, biological, and chemical weapons',
 'the Cold War',
 'a nuclear weapon',
 'the deadliest weapons',
 'highly enriched uranium',
 'these dangerous weapons',
 'civilian reactors',
 'Soviet-supplied, civilian research reactors']

time: 8.12 ms


In [279]:
def min_max_scale(a):
     return (a - a.min()) / (a.max() - a.min())

time: 1.24 ms


In [280]:
candidate_similarities = candidate_embeddings.dot(candidate_embeddings.T)
candidate_similarities = candidate_similarities - np.diag(candidate_embeddings) * np.eye(candidate_embeddings.shape[0])

time: 2.94 ms


In [281]:
norm_candidate_similarities = min_max_scale(candidate_similarities)
max_sim_by_candidate  = norm_candidate_similarities.max(axis=1)

time: 2.57 ms


In [282]:
norm_similarities = min_max_scale(similiarities)

time: 1.01 ms


In [283]:
l = 0.5
marginal_relevance = l * norm_similarities - (1-l) * max_sim_by_candidate

time: 1.08 ms


In [284]:
def cosine_similarity(a1, a2):
    return a1.dot(a2.T)

def get_marginal_relevance(candidate_embeddings, doc_embedding, l=0.5):
    similiarities = doc_embedding.dot(candidate_embeddings.T).flatten()
    scaled_similarities = min_max_scale(similiarities)
    norm_similarities = normalize_importances(scaled_similarities)
    
    candidate_similarities = candidate_embeddings.dot(candidate_embeddings.T)
    candidate_similarities = candidate_similarities - np.diag(candidate_embeddings) * np.eye(candidate_embeddings.shape[0])
    scaled_candidate_similarities = min_max_scale(candidate_similarities)
    norm_candidate_similarities = normalize_importances(scaled_candidate_similarities)

    max_sim_by_candidate  = norm_candidate_similarities.max(axis=0)
    
    marginal_relevance = l * norm_similarities - (1-l) * max_sim_by_candidate
    return marginal_relevance

def normalize_importances(importances):
    return 0.5 + (importances - importances.mean()) / importances.std()


time: 2.44 ms


In [285]:
marginal_relevance = get_marginal_relevance(candidate_embeddings, doc_embedding, 0.5)

time: 5.56 ms


In [286]:
def show_top_keywords(candidates, scores, topn):
    argsorted = np.argsort(scores)
    top_kw_idx = np.flip(argsorted[-topn:])
#     display(top_kw_idx)
#     display(scores[top_kw_idx])
#     display([candidates[i] for i in top_kw_idx])
    top_scores = scores[top_kw_idx]
    keywords = [candidates[i] for i in top_kw_idx]
    display(pd.DataFrame({'keyword': keywords, 'score': top_scores}))
#     for i, kw, score in zip(top_kw_idx, scores, keywords):
#         print(f'* i: {i}, score: {score}, keyword: {kw}')

time: 1.28 ms


In [287]:
show_top_keywords(noun_chunks, similiarities, 15)

Unnamed: 0,keyword,score
0,former Soviet weapons scientists,0.361695
1,the Soviet biological weapons program,0.338653
2,biological weapons threat reduction programs,0.328921
3,improvised nuclear weapons,0.32784
4,nuclear weapons,0.3154
5,the world's most dangerous weapons,0.314686
6,"the nuclear, chemical, and biological weapons",0.301391
7,"nuclear, biological, and chemical weapons",0.294341
8,the Cold War,0.275714
9,a nuclear weapon,0.268599


time: 7.86 ms


In [288]:
show_top_keywords(noun_chunks, marginal_relevance, 15)

Unnamed: 0,keyword,score
0,former Soviet weapons scientists,-1.458852
1,the Soviet biological weapons program,-1.478876
2,"nuclear, biological, and chemical weapons",-1.589511
3,nuclear weapons,-1.755292
4,the Cold War,-1.839976
5,"the nuclear, chemical, and biological weapons",-1.917356
6,improvised nuclear weapons,-1.935032
7,biological weapons threat reduction programs,-1.949645
8,a nuclear weapon,-1.965364
9,the world's most dangerous weapons,-2.009738


time: 9.19 ms


### Chunking the document
We are going to split the document in half to reduce the amount of information the encoder has to attend.

In [289]:
sents = list(doc.sents)

time: 1.11 ms


In [290]:
mid = int(len(sents) / 2)

time: 1.04 ms


In [291]:
head, tail = sents[:mid], sents[mid:]
head_text = ' '.join([sent.text for sent in head])
tail_text = ' '.join([sent.text for sent in tail])

time: 3.07 ms


In [292]:
head_embedding = embed([head_text]).numpy()
tail_embedding = embed([tail_text]).numpy()

time: 14.9 ms


In [293]:
head_similiarities = head_embedding.dot(candidate_embeddings.T).flatten()
tail_similiarities = tail_embedding.dot(candidate_embeddings.T).flatten()

time: 1.57 ms


In [294]:
show_top_keywords(noun_chunks, head_similiarities, 15)
show_top_keywords(noun_chunks, tail_similiarities, 15)

Unnamed: 0,keyword,score
0,improvised nuclear weapons,0.396719
1,former Soviet weapons scientists,0.382709
2,nuclear weapons,0.382109
3,the world's most dangerous weapons,0.362242
4,the Soviet biological weapons program,0.357044
5,biological weapons threat reduction programs,0.349947
6,"the nuclear, chemical, and biological weapons",0.34493
7,"nuclear, biological, and chemical weapons",0.339674
8,a nuclear weapon,0.332664
9,highly enriched uranium,0.298978


Unnamed: 0,keyword,score
0,former Soviet weapons scientists,0.330695
1,the Soviet biological weapons program,0.323326
2,biological weapons threat reduction programs,0.287705
3,the Cold War,0.261209
4,even the zealous Russian border guard,0.253303
5,the former Soviet Union,0.249836
6,the world's most dangerous weapons,0.236315
7,Russian meddling,0.232863
8,improvised nuclear weapons,0.232249
9,discernible security,0.228499


time: 15.3 ms


In [295]:
l = 0.1
head_marginal_relevance = get_marginal_relevance(candidate_embeddings, head_embedding, l)
tail_marginal_relevance = get_marginal_relevance(candidate_embeddings, tail_embedding, l)

time: 10.4 ms


In [296]:
show_top_keywords(noun_chunks, head_marginal_relevance, 15)
show_top_keywords(noun_chunks, tail_marginal_relevance, 15)

Unnamed: 0,keyword,score
0,"nuclear, biological, and chemical weapons",-5.401329
1,this proliferation threat,-5.435826
2,biological threat reduction,-5.485261
3,these weapons,-5.540659
4,U.S. interests,-5.550962
5,town,-5.624581
6,two areas,-5.63603
7,security,-5.636514
8,a model,-5.63954
9,despotic regimes,-5.644756


Unnamed: 0,keyword,score
0,this proliferation threat,-5.459572
1,biological threat reduction,-5.48157
2,"nuclear, biological, and chemical weapons",-5.482782
3,U.S. interests,-5.521677
4,senior officials,-5.566259
5,despotic regimes,-5.583289
6,these weapons,-5.603049
7,the Soviet biological weapons program,-5.636947
8,two areas,-5.645697
9,security,-5.646102


time: 15 ms
