# Applying Word2Vec on HackerNews
## Assumptions
This notebook assumes the following:
- This script assumes that the collection is stored by years
- Currently, we hardcode the years to 2016, 2017, 2018 which fits the scope of the project

## Requirements
In order to run the notebook, you have to do the following:
- source activate hackernews
- import the data into MongoDB <br>
```mongoimport --db HackerNews --collections hn_{{ $year }} {{ $year }}.fmt```

## Sources
https://rare-technologies.com/word2vec-tutorial/

# Getting data
## Connecting to MongoDB

In [1]:
from pymongo import MongoClient

In [2]:
client = MongoClient()
db = client.HackerNews

In [4]:
# hn_2016 = db.hn_2016
# hn_2017 = db.hn_2017
hn_2018 = db.hn_2018

In [None]:
cursor = hn_2016.find_one()
cursor['text']

In [4]:
cursor = hn_2016.find()

## Filtering via months

In [5]:
from datetime import date

In [6]:
epoch_dt = date(1970, 1,1)

In [7]:
months = {'jan': 1,
         'feb': 2,
         'mar': 3,
         'apr': 4,
         'may': 5,
         'jun': 6,
         'jul': 7,
         'aug': 8,
         'sep': 9,
         'oct': 10,
         'nov': 11,
         'dec': 12}

In [None]:
month_interested = 'jan'
month = months[month_interested]
start_time = int((date(2016, month, 1) - epoch_dt).total_seconds()) 
end_time = int((date(2016, month+1, 1) - epoch_dt).total_seconds())

start_time, end_time

In [None]:
cursor = hn_2016.find_one()
cursor

## Getting the text

In [None]:
# entries = []
entries_text = []
for entry in cursor:
    if int(entry['time']) > start_time and int(entry['time']) < end_time:
#         entries.append(entry)
        entries_text.append(entry['text'])

In [None]:
print(len(entries_text))

## Cleaning up the text
1) takes in unclean text <br>
2) clean(text) <br>
    - unescape, remove tags, unneccessary spaces, decontracted
3) sent_tokenize <br>
4) clean_2 <br>
5) word_tokenize <br>
6) clean_3 <br>
    - stopwords removal, pos_tag, only accept noun, verb, adj, adv, and lemmatize noun and verb

In [8]:
import string
import re
import html
import nltk
from nltk.corpus import stopwords

In [9]:
def clean(text):
    clean_text = html.unescape(text)
    clean_text = re.sub(r'\\n', ' ', clean_text)
    clean_text = re.sub(r'<a.*</a>',' ', clean_text)
    clean_text = re.sub(r'<p.*</p>', ' ', clean_text)
    clean_text = re.sub(r'<.?>', ' ', clean_text)
    clean_text = re.sub(r'</.?>', ' ', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won't", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        return phrase
    clean_text = decontracted(clean_text)
    return clean_text

def clean_2(text):
    # remove punctuations
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    clean_text = regex.sub('', text)
    return clean_text

def clean_3(tokens):
    clean_tokens = [token for token in tokens if token not in stopwords.words('english')]
    pos = nltk.pos_tag(clean_tokens, tagset='universal')
    wnl = nltk.WordNetLemmatizer()
    new_tokens = []
    accepted_pos = ['NOUN', 'VERB', 'ADJ','ADV']
    to_lemmatize = ['NOUN', 'VERB']
    change_dict = {'NOUN':'n',
                 'VERB':'v',
                 'ADJ':'a',
                 'ADV':'r'}
    for i in pos:
        if i[-1] in accepted_pos:
            temp = i[0]
            if i[-1] in to_lemmatize:
                temp = wnl.lemmatize(temp, pos = change_dict[i[-1]])
            temp.lower()
            new_tokens.append(temp.lower())
    return new_tokens

In [None]:
entries_text[8]

In [11]:
corpus = ''
for i in entries_text:
    if len(clean(i)) < 5:
        continue
    if clean(i)[-1] != '.':
        corpus += clean(i)[:-1] + '.'
    else:
        corpus += clean(i)
len(corpus)

NameError: name 'entries_text' is not defined

In [None]:
test = corpus
clean_test = clean(test)
clean_test

## Tokenization

In [None]:
sentences = nltk.sent_tokenize(clean_test)
for i in sentences:
    print(i)
    print('----')

In [None]:
sentences = [nltk.word_tokenize(clean_2(sentence)) for sentence in sentences]
for sent in sentences:
    print(sent)

In [None]:
test = [clean_3(tokens) for tokens in sentences]
for sent in test:
    print(sent)

# Putting everything together

In [10]:
clean_text = clean(corpus)
sentences = nltk.sent_tokenize(clean_text)
sentences = [nltk.word_tokenize(clean_2(sentence)) for sentence in sentences]
sentences = [clean_3(tokens) for tokens in sentences]

NameError: name 'corpus' is not defined

In [None]:
sentences[4]

# Word2Vec

In [12]:
from gensim.models import Word2Vec
import multiprocessing


In [11]:
cpu_count = multiprocessing.cpu_count()
print("Number of cpus: {}".format(cpu_count))
w2v_model = Word2Vec(sentences, size = 100, window = 20, min_count =5, workers = cpu_count, seed = 123)
# w2v_model.save('model')

Number of cpus: 4


NameError: name 'sentences' is not defined

In [None]:
w2v_model.save('model')

In [13]:
w2v_model = Word2Vec.load('model')

In [14]:
words = w2v_model.wv.vocab
len(words)

85953

In [None]:
words

In [9]:
w2v_model.most_similar(positive=['doctor', 'male'], negative = ['female'], topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('physician', 0.7035205364227295),
 ('patient', 0.694062352180481),
 ('doctors', 0.6193013191223145),
 ('hospital', 0.6171454787254333),
 ('patients', 0.616409182548523)]

In [10]:
w2v_model.similarity('doctor', 'male')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.24809471

In [20]:
similar = w2v_model.wv.most_similar('')
similar

  if np.issubdtype(vec.dtype, np.int):


[('orca', 0.6142606735229492),
 ('surfer', 0.5859272480010986),
 ('specimen', 0.5731604099273682),
 ('tiger', 0.5649954080581665),
 ('dwarves', 0.5575430393218994),
 ('eggplant', 0.5490908622741699),
 ('den', 0.5466058254241943),
 ('roo', 0.5462160110473633),
 ('fur', 0.5430614948272705),
 ('moose', 0.5396895408630371)]

# Plotting

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
labels = []
vectors = []
for word in w2v_model.wv.vocab:
    vectors.append(w2v_model[word])
    labels.append(word)

In [None]:
tsne_model = TSNE(perplexity=20, n_components= 3, init = 'pca', n_iter = 250, random_state=123)
new_values = tsne_model.fit_transform(vectors)

In [None]:
%matplotlib notebook

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

for i in range(len(new_values)):
    value = new_values[i]
    plt.scatter(value[0], value[1], value[2], alpha=0.5, color = 'steelblue')
    plt.annotate(labels[i],
                xy = (value[0], value[1], value[2]),
                xytext = (5,2),
                textcoords = 'offset points',
                ha = 'right',
                va = 'bottom',
                alpha = 0.7)

ax.set_axis_off()
# ax.set_xlabel('X Label')
# ax.set_ylabel('Y Label')
# ax.set_zlabel('Z Label')

plt.show()


# Training

In [12]:
w2v_model = Word2Vec.load('model')

In [None]:
# w2v_model.wv.accuracy('questions-words.txt')

In [11]:
len(w2v_model.wv.vocab)

65719

In [14]:
cursor = hn_2016.find()

In [16]:
import time
tic = time.time()
month_shortform = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

for month_interested in month_shortform:
    cursor = hn_2018.find()
    if month_interested == 'dec':
        start_time = int((date(2018, 12, 1) - epoch_dt).total_seconds()) 
        end_time = int((date(2018+1, 1, 1) - epoch_dt).total_seconds())
    else:
        month = months[month_interested]
        start_time = int((date(2018, month, 1) - epoch_dt).total_seconds()) 
        end_time = int((date(2018, month+1, 1) - epoch_dt).total_seconds())
    
    print(month_interested, start_time, end_time)
    entries_text = []
    for entry in cursor:
        if int(entry['time']) > start_time and int(entry['time']) < end_time:
            entries_text.append(entry['text'])
    print('Number of entries: ',len(entries_text))
    corpus = ''
    for i in entries_text:
        if len(clean(i)) < 5:
            continue
        if clean(i)[-1] != '.':
            corpus += clean(i)[:-1] + '.'
        else:
            corpus += clean(i)
    clean_text = clean(corpus)
    sentences = nltk.sent_tokenize(clean_text)
    sentences = [nltk.word_tokenize(clean_2(sentence)) for sentence in sentences]
    sentences = [clean_3(tokens) for tokens in sentences]
    print('Number of sentences: ',len(sentences))
    w2v_model.build_vocab(sentences, update=True)
    w2v_model.train(sentences, total_examples = w2v_model.corpus_count, epochs = 10)
    w2v_model.save('model')
    print('model saved')
    print('model corpus size: {}, time elapsed: {}'.format(len(w2v_model.wv.vocab), time.time() - tic))

jan 1514764800 1517443200
Number of entries:  234992
Number of sentences:  515331
model saved
model corpus size: 87430, time elapsed: 2175.947058200836
feb 1517443200 1519862400
Number of entries:  209738
Number of sentences:  483225
model saved
model corpus size: 88568, time elapsed: 4204.82554769516
mar 1519862400 1522540800
Number of entries:  237342
Number of sentences:  528557
model saved
model corpus size: 89937, time elapsed: 6471.897032499313
apr 1522540800 1525132800
Number of entries:  237609
Number of sentences:  534533
model saved
model corpus size: 91431, time elapsed: 9552.911533594131
may 1525132800 1527811200
Number of entries:  237648
Number of sentences:  528362
model saved
model corpus size: 92790, time elapsed: 12288.604754686356
jun 1527811200 1530403200
Number of entries:  231815
Number of sentences:  529024
model saved
model corpus size: 94101, time elapsed: 14585.267132520676
jul 1530403200 1533081600
Number of entries:  224880
Number of sentences:  520389
model

In [None]:
w2v_model.build_vocab(sentences, update=True)

In [None]:
w2v_model.train(sentences, total_examples = w2v_model.corpus_count, epochs = 10)

In [None]:
w2v_model.corpus_count, w2v_model.iter

In [None]:
len(w2v_model.wv.vocab)

In [None]:
w2v_model['raimi']