In [92]:
import os, stanza, cld3, logging, random, nltk, torch
import pandas as pd
from multiprocessing import Pool
from toolz import partition
import tensorflow_hub as hub
import tensorflow_text # https://github.com/tensorflow/tensorflow/issues/38597
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')
torch.cuda.is_available()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Preprocessing

In [24]:
# Read a text document
doc=open('txt/aarpus-a178676817aa4aee08acc216161a5cf3-3164bffa749c22e91ff36ea0fbd98fb4').read()

## With [NLTK](https://www.nltk.org/)

In [25]:
# lower case and tokenize
from nltk import word_tokenize
doc_tokens = word_tokenize(doc.lower())
doc_tokens[0:10]

['how',
 'does',
 'earnings',
 'inequality',
 'affect',
 'social',
 'security',
 'financing',
 '?',
 '-']

In [26]:
# Remove stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [29]:
doc_tokens_clean=[word for word in doc_tokens if word not in stop_words]
doc_tokens_clean[0:10]

['earnings',
 'inequality',
 'affect',
 'social',
 'security',
 'financing',
 '?',
 '-',
 'aarp',
 'insight']

In [32]:
# Remove punctuation
from string import punctuation
doc_tokens_clean=[word for word in doc_tokens_clean if word not in punctuation]
doc_tokens_clean[0:10]

['earnings',
 'inequality',
 'affect',
 'social',
 'security',
 'financing',
 'aarp',
 'insight',
 'issues',
 'lality']

In [34]:
# Stemming
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
doc_tokens_clean_stemmed = [porter.stem(word) for word in doc_tokens_clean]
doc_tokens_clean_stemmed[0:10]

['earn',
 'inequ',
 'affect',
 'social',
 'secur',
 'financ',
 'aarp',
 'insight',
 'issu',
 'laliti']

In [36]:
# Compare size of tokens in different sets
len(set(doc_tokens)), len(set(doc_tokens_clean)), len(set(doc_tokens_clean_stemmed))

(1457, 1347, 1212)

## With [Stanza](https://stanfordnlp.github.io/stanza/)

In [65]:
#### Tokenize and remove stopwords according to Universal POS tags using Stanza, English only.
# https://universaldependencies.org/u/pos/
''' with sentence split, try True later '''
nlp_en = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', 
                         tokenize_no_ssplit=False, 
                         use_gpu=True)
remove_pos_list=['DET', 'SYM', 'PUNCT', 'PART', 'CCONJ', 'SCONJ', 'AUX', 'X', 'ADP']

2023-02-10 16:45:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-10 16:45:02 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2023-02-10 16:45:02 INFO: Use device: gpu
2023-02-10 16:45:02 INFO: Loading: tokenize
2023-02-10 16:45:02 INFO: Loading: pos
2023-02-10 16:45:03 INFO: Loading: lemma
2023-02-10 16:45:03 INFO: Done loading processors!


### Process a single document

In [55]:
out_doc=nlp_en(doc)

In [63]:
np.mean([len(sent.to_dict()) for sent in out_doc.sentences])

26.763803680981596

In [70]:
# Try a sentence
[word['lemma'] for word in out_doc.sentences[32].to_dict() if word['upos'] not in remove_pos_list]

['taxable',
 'maximum',
 'adjust',
 'change',
 'consumer',
 'price',
 'index',
 'urban',
 'wage',
 'earner',
 'clerical',
 'worker']

In [71]:
# Iterate all sentences
doc_tokens_clean_stanza=[]
for sent in out_doc.sentences:
    doc_tokens_clean_stanza+=[word['lemma'] for word in sent.to_dict() if word['upos'] not in remove_pos_list]

In [74]:
len(set(doc_tokens_clean_stanza))

1213

### Process documents in batch

In [93]:
#### Tokenize and remove stopwords according to Universal POS tags using Stanza, English only.
# https://universaldependencies.org/u/pos/
nlp_en = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', 
                         tokenize_no_ssplit=True, 
                         use_gpu=True)
remove_pos_list=['DET', 'SYM', 'PUNCT', 'PART', 'CCONJ', 'SCONJ', 'AUX', 'X', 'ADP']

def func_stanza_clean_en(documents):
    logging.info('Wrap documents as stanza objects ...')
    in_docs = [stanza.Document([], text=d) for d in documents] # Wrap each document as a stanza.Document object
    logging.info('NLP-ing ...')
    out_docs = nlp_en(in_docs)
    # Remove stopwords and not meaningful words using POS tagging.
    # https://universaldependencies.org/u/pos/
    remove_pos_list=['DET', 'SYM', 'PUNCT', 'PART', 'CCONJ', 'SCONJ', 'AUX', 'X', 'ADP']
    logging.info('Cleaning docs using POS tags ...')
    txt_tokens=[[t['lemma'] for t in token.to_dict() if t['upos'] not in remove_pos_list] for doc in out_docs for token in doc.sentences]
    return txt_tokens

2023-02-10 19:21:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-10 19:21:58 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2023-02-10 19:21:58 INFO: Use device: gpu
INFO:stanza:Use device: gpu
2023-02-10 19:21:58 INFO: Loading: tokenize
INFO:stanza:Loading: tokenize
2023-02-10 19:21:58 INFO: Loading: pos
INFO:stanza:Loading: pos
2023-02-10 19:21:59 INFO: Loading: lemma
INFO:stanza:Loading: lemma
2023-02-10 19:21:59 INFO: Done loading processors!
INFO:stanza:Done loading processors!


In [94]:
docs=[open('txt/'+file).read() for file in os.listdir('txt/')]
docs[-1][0:500]

'International Review of Experts’ Recommendations for Reforming the Long-Term Services and Supports Workforce - AARP Public Policy Institute LTSS Choices Spotlight ich v of we identify )21 ALL RIGHTS RESERVED | www.aarp.org/LTSSChoices1 | DECEMBER 2021 AARP Public Policy Institute © 2021 ALL RIGHTS RESERVED | www.aarp.org/LTSSChoices LT S S C H O I C E S International Review of Experts’ Recommendations for Reforming the Long-Term Services and Supports Workforce Susan C. Reinhard and Jane A. Tilly'

In [95]:
doc_tokens_clean_stanza=func_stanza_clean_en(docs)

### NER with Stanza

In [96]:
import stanza
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp(docs[2])
print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.ents], sep='\n')

2023-02-10 19:24:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/ner/ontonotes.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/pretrain/fasttextcrawl.pt:   0%…

2023-02-10 19:24:30 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

INFO:stanza:Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2023-02-10 19:24:30 INFO: Use device: gpu
INFO:stanza:Use device: gpu
2023-02-10 19:24:30 INFO: Loading: tokenize
INFO:stanza:Loading: tokenize
2023-02-10 19:24:30 INFO: Loading: ner
INFO:stanza:Loading: ner
2023-02-10 19:24:30 INFO: Done loading processors!
INFO:stanza:Done loading processors!


entity: David Ribeiro	type: PERSON
entity: Tyler Bailey	type: PERSON
entity: June 2017	type: DATE
entity: American Council	type: ORG
entity: 600	type: CARDINAL
entity: Washington	type: GPE
entity: 20045	type: CARDINAL
entity: 202	type: CARDINAL
entity: 507-4000	type: CARDINAL
entity: 1	type: CARDINAL
entity: 1	type: CARDINAL
entity: 2	type: CARDINAL
entity: 3	type: CARDINAL
entity: 4	type: CARDINAL
entity: 5	type: CARDINAL
entity: 6	type: CARDINAL
entity: 7	type: CARDINAL
entity: 8	type: CARDINAL
entity: 9	type: CARDINAL
entity: 10	type: CARDINAL
entity: 11	type: CARDINAL
entity: 12	type: CARDINAL
entity: 12	type: CARDINAL
entity: 14	type: CARDINAL
entity: David Ribeiro	type: PERSON
entity: biennial	type: DATE
entity: City Energy Efficiency Scorecard	type: ORG
entity: Tyler Bailey	type: PERSON
entity: the City Energy Efficiency Scorecard	type: ORG
entity: the Kresge Foundation	type: ORG
entity: ACEEE	type: ORG
entity: Steven Nadel	type: PERSON
entity: Neal Elliott	type: PERSON
entity: 

In [99]:
[s.to_dict()['text'] for s in doc.entities if s.type=='ORG']

['American Council',
 'City Energy Efficiency Scorecard',
 'the City Energy Efficiency Scorecard',
 'the Kresge Foundation',
 'ACEEE',
 'ACEEE',
 'Cutter, Burton',
 'Willis and Loa 2015',
 'Resilient Cities',
 'Baatz',
 'The Los Angeles Department of Water and Power',
 'Distributed Generation',
 'Energy Infrastructure',
 'the Energy Information Administration',
 'Natural Gas Monthly',
 'York, Kushler',
 'Syed, Gerber',
 'Sharp',
 'National Conference of State Legislatures',
 'Transportation Connectivity Communities',
 'the Center for Neighborhood Technology',
 'AllTransit',
 'Safe Track',
 'Capital Bikeshare',
 'I-85',
 'the Metropolitan Atlanta Rapid Transit Authority',
 'Riddell',
 'The ICF International CHP Installation Database',
 'CHP',
 'CHP',
 'CHP',
 'DOE 2016',
 'CHP',
 'CHP',
 'CHP',
 'The CHP Installation Database',
 'Urban Green Council',
 'Energy Burden',
 'NRDC',
 'The City Energy Project',
 'the Institute for Market Transformation',
 'the National Resources Defense Counc

# Word2Vec examples

Source: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

Example studies:

- Kozlowski, Austin C., Matt Taddy, and James A. Evans. 2019. “The Geometry of Culture: Analyzing the Meanings of Class through Word Embeddings.” American Sociological Review 84 (5): 905–49. https://doi.org/10.1177/0003122419877135.
- Jones, Jason J., Mohammad Ruhul Amin, Jessica Kim, and Steven Skiena. 2020. “Stereotypical Gender Associations in Language Have Decreased Over Time.” Sociological Science 7 (January): 1–35. https://doi.org/10.15195/v7.a1.

## [Google News](https://code.google.com/archive/p/word2vec/)

In [100]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [101]:
# Lookup the vector values of a word.
wv['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [102]:
# Unfortunately, the model is unable to infer vectors for unfamiliar words. 
# This is one limitation of Word2Vec: if this limitation matters to you, check out the FastText model.
wv['cameroon']

KeyError: "Key 'cameroon' not present"

In [103]:
# Calculate word similarity.
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [14]:
wv.most_similar('king')

[('kings', 0.7138045430183411),
 ('queen', 0.6510956287384033),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204219460487366),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864822864532471),
 ('ruler', 0.5797566771507263),
 ('princes', 0.5646552443504333),
 ('Prince_Paras', 0.5432944297790527),
 ('throne', 0.5422104597091675)]

In [106]:
wv.most_similar('chess')

[('Chess', 0.6993610858917236),
 ('grandmasters', 0.6455792784690857),
 ('grandmaster', 0.6356159448623657),
 ('blindfold_chess', 0.6172971129417419),
 ('chess_grandmaster', 0.5920405983924866),
 ('Kasparov_Karpov', 0.5834527611732483),
 ('Anatoli_Karpov', 0.5782327055931091),
 ('backgammon', 0.5762614011764526),
 ('Korchnoi', 0.5705455541610718),
 ('Scrabble', 0.5702669024467468)]

## [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/)

In [109]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
glove_file = datapath('/root/css_nlp/glove/glove.6B.50d.txt')
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

  _ = glove2word2vec(glove_file, tmp_file)


In [110]:
model['happy']

array([ 0.092086,  0.2571  , -0.58693 , -0.37029 ,  1.0828  , -0.55466 ,
       -0.78142 ,  0.58696 , -0.58714 ,  0.46318 , -0.11267 ,  0.2606  ,
       -0.26928 , -0.072466,  1.247   ,  0.30571 ,  0.56731 ,  0.30509 ,
       -0.050312, -0.64443 , -0.54513 ,  0.86429 ,  0.20914 ,  0.56334 ,
        1.1228  , -1.0516  , -0.78105 ,  0.29656 ,  0.7261  , -0.61392 ,
        2.4225  ,  1.0142  , -0.17753 ,  0.4147  , -0.12966 , -0.47064 ,
        0.3807  ,  0.16309 , -0.323   , -0.77899 , -0.42473 , -0.30826 ,
       -0.42242 ,  0.055069,  0.38267 ,  0.037415, -0.4302  , -0.39442 ,
        0.10511 ,  0.87286 ], dtype=float32)

In [111]:
model.most_similar('happy')

[("'m", 0.9142324328422546),
 ('everyone', 0.8976402878761292),
 ('everybody', 0.8965491056442261),
 ('really', 0.8839760422706604),
 ('me', 0.8784631490707397),
 ('definitely', 0.8762789368629456),
 ('maybe', 0.8756703734397888),
 ("'d", 0.8718011975288391),
 ('feel', 0.8707677721977234),
 ('i', 0.8707453012466431)]

# Calculate document similarity between documents/paragraphs/sentences

Example study: Ma, Ji, and René Bekkers. 2023. “Consensus Formation in Nonprofit and Philanthropic Studies: Networks, Reputation, and Gender.” Nonprofit and Voluntary Sector Quarterly, January, 08997640221146948. https://doi.org/10.1177/08997640221146948.

Max length of input documents ([caveat 1](https://github.com/tensorflow/hub/issues/244), [caveat 2](https://www.sbert.net/examples/applications/computing-embeddings/README.html?highlight=max#input-sequence-length))

## With [Word Mover Distance](http://proceedings.mlr.press/v37/kusnerb15.pdf)

[Gensim tutorial](https://radimrehurek.com/gensim/auto_examples/tutorials/run_wmd.html)

In [119]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [120]:
sentence_obama = 'Obama speaks to the media in Illinois'
sentence_president = 'The president greets the press in Chicago'
sentence_else= 'tomorrow is a rainy day'

In [122]:
wv.wmdistance(sentence_obama, sentence_president)

0.4437976459291981

In [123]:
wv.wmdistance(sentence_obama, sentence_else)

0.5462512284310116

In [124]:
wv.wmdistance(sentence_president, sentence_else)

0.6227118478717114

## With [universal-sentence-encoder](https://tfhub.dev/google/collections/universal-sentence-encoder)

In [113]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

In [114]:
docs1=['高兴', '狗狗真可爱', '出去走走', 'pets are our good friends', 'happy learning']
docs1_vectors=embed(docs1).numpy()
docs1_vectors

array([[ 0.13056417, -0.07187397,  0.0137589 , ...,  0.04946987,
         0.02765515,  0.04919228],
       [ 0.05786867, -0.06231322,  0.07939319, ...,  0.01655525,
         0.00354518,  0.03140444],
       [ 0.09586518, -0.0639194 , -0.03518083, ..., -0.01046547,
         0.05872947,  0.00258046],
       [ 0.05267328, -0.00014825, -0.04500623, ..., -0.07624684,
        -0.00496952, -0.05112167],
       [ 0.09143507, -0.03138757,  0.02067843, ..., -0.05256198,
         0.06383334, -0.0003199 ]], dtype=float32)

In [116]:
docs2=['动物是我们的好朋友', '开心学习']
docs2_vectors=embed(docs2).numpy()
docs2_vectors

array([[ 0.04859294, -0.02410855,  0.01296188, ..., -0.06364327,
        -0.01872791, -0.02179378],
       [ 0.10388371, -0.04205937, -0.02334163, ..., -0.04688105,
         0.04073469, -0.01615168]], dtype=float32)

In [117]:
np.inner(docs1_vectors, docs2_vectors)

array([[0.16803929, 0.3675997 ],
       [0.49006218, 0.21030766],
       [0.00966788, 0.29758108],
       [0.78957915, 0.11001315],
       [0.1702948 , 0.76996684]], dtype=float32)

In [118]:
# Verify: np.inner == 1-cosine distance
from scipy import spatial
1-spatial.distance.cdist(docs1_vectors, docs2_vectors, metric='cosine')

array([[0.16803931, 0.36759971],
       [0.49006217, 0.21030767],
       [0.00966789, 0.29758106],
       [0.78957921, 0.11001316],
       [0.17029482, 0.76996691]])