# Text Preprocessing and Word Embeddings

In [1]:
# importing packages & libraries
import nltk
import string
import json

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec

# Fetching the NLTK resources this notebook relies on:
#   - Punkt tokenizer data, required by word_tokenize(); recent NLTK releases
#     look it up as 'punkt_tab', older ones as 'punkt', so both are requested
#   - the stop-word corpus, required by stopwords.words()
# Packages that are already installed are reported as up-to-date.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/msonjap/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Text data

In [2]:
# reading data
# JSON files are UTF-8; stating the encoding keeps the result independent of
# the platform's default locale encoding.
with open("./data/sarcasm.json", 'r', encoding="utf-8") as f:
    datastore = json.load(f)

In [3]:
# collecting the "headline" text of every record in the parsed JSON
sentences = [record["headline"] for record in datastore]

In [4]:
# checking the first few sentences only -- displaying the whole list would
# flood the notebook output with every headline in the dataset
sentences[:15]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages",
 'airline passengers tackle man who rushes cockpit in bomb threat',
 'facebook reportedly working on healthcare features and apps',
 "north korea praises trump and urges us voters to reject 'dull hillary'",
 "actually, cnn's jeffrey lord has been 'indefensible' for a while",
 'barcelona holds huge protest in su

In [5]:
# counting how many headlines were collected into `sentences`
len(sentences)

26709

In [6]:
# getting a sample sentence (the headline at index 5) to eyeball the raw text
sentences[5]

"advancing the world's women"

### Stop words

In [7]:
# building the filter list: NLTK's English stop words followed by each
# character of string.punctuation as its own entry
english_stop_words = stopwords.words('english')
punctuation_marks = list(string.punctuation)
stpwrds = english_stop_words + punctuation_marks

### Preprocessing sentences
- tokenize
- make lowercase
- remove stop words/punctuation.

In [8]:
# preprocessing sentences: lowercase -> tokenize -> drop stop words/punctuation
# A set gives constant-time membership tests; scanning the `stpwrds` list for
# every token of every sentence is needlessly slow on the full corpus.
stop_tokens = set(stpwrds)
processed_sentences = [
    [tok for tok in word_tokenize(s.lower()) if tok not in stop_tokens]
    for s in sentences
]

### Using word2vec to make word embeddings from the sentences

In [9]:
# creating model: skip-gram (sg=1) Word2Vec with 32-dimensional vectors, a
# 10-token context window, 5 training epochs, and a vocabulary restricted to
# tokens that occur at least 10 times.
w2v_params = dict(sentences=processed_sentences, sg=1, window=10,
                  min_count=10, workers=4)
try:
    # keyword names used by gensim >= 4.0
    model = Word2Vec(vector_size=32, epochs=5, **w2v_params)
except TypeError:
    # gensim < 4.0 rejects the new names; it spells them `size` and `iter`
    model = Word2Vec(size=32, iter=5, **w2v_params)

### Displaying a sample embedding (weights)

In [10]:
# looking up the learned embedding vector for the token "woman"
model.wv["woman"]

array([-0.19728355,  0.25669697, -0.01172502,  0.22906916, -0.24559021,
        0.3737016 ,  0.07278163, -0.31762868, -0.20259683,  0.37936068,
        0.5756696 ,  0.27204508,  0.06656046, -0.36689496, -0.42443192,
        0.5035996 ,  0.4206337 ,  0.29308727,  0.03022297,  0.4956168 ,
        0.02698719, -0.51658547, -0.13900377, -0.6115825 ,  0.01110985,
       -0.1518039 ,  0.1185193 , -0.5793855 , -0.19693236,  0.2608732 ,
       -0.15896313, -0.36685145], dtype=float32)

### Exploring the top 4 embeddings most similar to a word of your choice

In [11]:
# getting the top 4 vocabulary words most similar to "woman" (using cosine
# similarity); each result is a (word, similarity score) pair
model.wv.most_similar('woman', topn=4)

[('dad', 0.9650973081588745),
 ('man', 0.9641059637069702),
 ('mom', 0.9552103281021118),
 ('little', 0.9411618113517761)]

### Getting the 5 words most similar to "president" in this corpus

In [12]:
# getting the top 5 vocabulary words most similar to "president" (using cosine
# similarity); each result is a (word, similarity score) pair
model.wv.most_similar('president', topn=5)

[('russia', 0.9821116328239441),
 ('iran', 0.9751731157302856),
 ('administration', 0.9702547192573547),
 ('colbert', 0.970198392868042),
 ('noah', 0.9675952792167664)]