# Applying Word2Vec on HackerNews
## Assumptions
This notebook assumes the following:
- The HackerNews data is stored in MongoDB as one collection per year (e.g. `hn_2016`)
- The years are currently hardcoded to 2016, 2017 and 2018, which fits the scope of the project

## Requirements
In order to run the notebook, you have to do the following:
- source activate hackernews
- import the data into MongoDB <br>
```mongoimport --db HackerNews --collection hn_{{ $year }} {{ $year }}.fmt```

## Sources
https://rare-technologies.com/word2vec-tutorial/

# Word2Vec Results

In [1]:
from gensim.models import Word2Vec

In [2]:
# Load the Word2Vec model that was saved to disk under the name 'model_20161718'.
w2v_model = Word2Vec.load('model_20161718')

In [3]:
# Grab the vocabulary of the model's word vectors and show how many entries it holds.
# NOTE(review): `wv.vocab` is the gensim 3.x API; gensim 4 replaced it with
# `wv.key_to_index` — confirm the pinned gensim version before upgrading.
words = w2v_model.wv.vocab
len(words)

96769

In [None]:
# Display the whole vocabulary (large output: one entry per vocabulary word).
words

In [11]:
# Words closest to the combination of 'docker' and 'os' (top 5).
# Query through the model's KeyedVectors (`.wv`): calling `most_similar`
# directly on the Word2Vec object is deprecated in gensim 3.x and triggers a
# DeprecationWarning; `.wv` is also what the other similarity query in this
# notebook uses.
w2v_model.wv.most_similar(positive=['docker', 'os'], topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('vm', 0.7280181646347046),
 ('linux', 0.7059256434440613),
 ('chroots', 0.6838865280151367),
 ('lxd', 0.6769089102745056),
 ('rancheros', 0.6695564985275269)]

In [5]:
# Similarity score between the vectors of 'doctor' and 'male'.
# Use the KeyedVectors (`.wv`) accessor: `Word2Vec.similarity` called on the
# model itself is deprecated in gensim 3.x and emits a DeprecationWarning.
w2v_model.wv.similarity('doctor', 'male')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.33735788

In [12]:
# The 30 vocabulary entries most similar to 'os', as (word, score) pairs.
similar = w2v_model.wv.most_similar('os', topn=30)
similar

  if np.issubdtype(vec.dtype, np.int):


[('oses', 0.732519268989563),
 ('macosx', 0.6737253665924072),
 ('usermode', 0.6693651676177979),
 ('userland', 0.659410834312439),
 ('ring0', 0.6555902361869812),
 ('xmacos', 0.6456476449966431),
 ('microkernel', 0.6410301327705383),
 ('chromeos', 0.6395987272262573),
 ('nextstep', 0.6387742161750793),
 ('oslevel', 0.6381477117538452),
 ('userlands', 0.6297509670257568),
 ('linux', 0.6284586191177368),
 ('kext', 0.6250098943710327),
 ('genode', 0.6236993074417114),
 ('winnt', 0.6209630966186523),
 ('nonunix', 0.6192915439605713),
 ('osx', 0.6163003444671631),
 ('posixy', 0.6157057285308838),
 ('hypervisor', 0.6135842800140381),
 ('linuxbsd', 0.6103509664535522),
 ('linuxbased', 0.6099382638931274),
 ('unices', 0.6092256903648376),
 ('openstep', 0.6082422137260437),
 ('distrosi', 0.6082149744033813),
 ('keykos', 0.6048111915588379),
 ('windows', 0.6025958061218262),
 ('kernel', 0.6017416715621948),
 ('virtualisation', 0.6009407043457031),
 ('linuxwindows', 0.6005334854125977),
 ('kerne

# Getting data
## Connecting to MongoDB

In [13]:
from pymongo import MongoClient

In [14]:
# Connect with MongoClient's default connection settings (no host/port given)
# and select the `HackerNews` database that the data was imported into.
client = MongoClient()
db = client.HackerNews

In [15]:
# Handle to the `hn_2016` collection (the notebook stores one collection per year).
hn_2016 = db.hn_2016

In [17]:
# Peek at one document to see the schema. Despite the variable name,
# find_one() returns a single document (a dict), not a cursor.
cursor = hn_2016.find_one()
cursor

{'_id': ObjectId('5b98b7d9116cc408a06171ba'),
 'by': 'cm2187',
 'id': '12254432',
 'parent': '12254396',
 'text': 'And the company has no idea of whether the person being hire is as good as he pretends to be. This uncertainty goes both ways.',
 'time': '1470750510',
 'timestamp': '2016-08-09 13:48:30 UTC',
 'title': '',
 'type': 'comment',
 'url': ''}

In [12]:
# Cursor over every document in the 2016 collection (no query filter).
cursor = hn_2016.find()

## Filtering via months

In [6]:
from datetime import date

In [7]:
# Unix epoch as a date; subtracting it from another date gives the offset used
# to build timestamps comparable with the documents' `time` field.
epoch_dt = date(1970, 1,1)

In [8]:
# Three-letter lowercase month abbreviation -> 1-based month number.
# The abbreviations are listed in calendar order, so enumerate() from 1
# yields the month numbers directly.
months = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
        start=1,
    )
}

In [10]:
# Epoch-second bounds of the sampling window: from the 1st of the chosen
# month of 2016 up to the 8th, i.e. the first seven days of that month.
month_interested = 'jan'
month = months[month_interested]

window_start = date(2016, month, 1)
window_end = date(2016, month, 1 + 7)

# Seconds elapsed since 1970-01-01 for each boundary, truncated to int.
start_time = int((window_start - epoch_dt).total_seconds())
end_time = int((window_end - epoch_dt).total_seconds())

start_time, end_time

(1451606400, 1452211200)

In [5]:
# Peek at one document to recall the schema (note that `time` is a string).
# find_one() returns a single document (a dict), not a cursor, so it is kept
# under its own name: rebinding `cursor` here would replace the find() cursor
# with a dict when the notebook is run top to bottom.
sample_doc = hn_2016.find_one()
sample_doc

{'_id': ObjectId('5b98b7d9116cc408a06171ba'),
 'by': 'cm2187',
 'id': '12254432',
 'parent': '12254396',
 'text': 'And the company has no idea of whether the person being hire is as good as he pretends to be. This uncertainty goes both ways.',
 'time': '1470750510',
 'timestamp': '2016-08-09 13:48:30 UTC',
 'title': '',
 'type': 'comment',
 'url': ''}

## Getting the text

In [13]:
# Collect the `text` of every 2016 document whose `time` lies strictly between
# start_time and end_time. `time` is stored as a string, hence the int() cast.
#
# Iterate a fresh find() cursor rather than the shared `cursor` variable: a
# pymongo cursor is exhausted after one pass (re-running this cell would yield
# nothing), and other cells in this notebook rebind `cursor`.
#
# NOTE(review): both bounds are exclusive, so a document stamped exactly at
# start_time is skipped — confirm whether `<=` was intended for the lower bound.
entries_text = [
    entry['text']
    for entry in hn_2016.find()
    if start_time < int(entry['time']) < end_time
]

In [14]:
# Number of texts collected for the selected time window.
print(len(entries_text))

41301


## Cleaning up the text
1) takes in unclean text <br>
2) clean(text) <br>
    - unescape HTML entities, remove tags and unnecessary whitespace, expand contractions
3) sent_tokenize <br>
4) clean_2 <br>
5) word_tokenize <br>
6) clean_3 <br>
    - stopwords removal, pos_tag, only accept noun, verb, adj, adv, and lemmatize noun and verb

In [15]:
import string
import re
import html
import nltk
from nltk.corpus import stopwords

In [16]:
def _decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The rules are applied in order. The two irregular forms come first so the
    generic ``n't`` rule does not turn them into "wo not" / "ca not".

    NOTE(review): the ``'s`` rule cannot tell a contraction from a possessive,
    so "John's" also becomes "John is".
    """
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase


def clean(text):
    """First cleaning pass over a raw HackerNews text field.

    Steps, in order:
      1. unescape HTML entities;
      2. replace literal backslash-n sequences (the two characters ``\\`` and
         ``n``) with a space;
      3. drop ``<a>...</a>`` and ``<p>...</p>`` elements together with their
         content;
      4. strip any remaining HTML tags, keeping the text between them;
      5. collapse every run of whitespace into a single space;
      6. expand English contractions.

    Args:
        text: raw text, possibly containing HTML entities and tags.

    Returns:
        The cleaned string. Leading/trailing single spaces are not trimmed.
    """
    clean_text = html.unescape(text)
    clean_text = re.sub(r'\\n', ' ', clean_text)
    # Non-greedy, with a word boundary after the tag name: a greedy `.*` would
    # swallow all the text between the first opening and the LAST closing tag
    # (e.g. the words between two separate links), and without `\b` the
    # patterns would also start matching at `<abbr>` or `<pre>`.
    clean_text = re.sub(r'<a\b.*?</a>', ' ', clean_text)
    # NOTE(review): this removes the paragraph's content as well, as the
    # original pattern did — confirm that dropping the text is intended.
    clean_text = re.sub(r'<p\b.*?</p>', ' ', clean_text)
    # Strip remaining tags of any name length (<pre>, <code>, unclosed
    # <a href=...>); the two patterns below only cover one-character names.
    clean_text = re.sub(r'</?[A-Za-z][^<>]*>', ' ', clean_text)
    clean_text = re.sub(r'<.?>', ' ', clean_text)
    clean_text = re.sub(r'</.?>', ' ', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return _decontracted(clean_text)

def clean_2(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Nothing is substituted for the removed characters, so the words on either
    side of a punctuation mark are joined (e.g. "a-b" becomes "ab").
    """
    # Translation table that maps each punctuation character to "delete".
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table)

def clean_3(tokens):
    """Filter and normalise word tokens.

    Steps:
      1. drop English stopwords (compared case-insensitively);
      2. POS-tag the remaining tokens with the universal tagset;
      3. keep only nouns, verbs, adjectives and adverbs;
      4. lowercase each kept token and lemmatize the nouns and verbs.

    Args:
        tokens: list of word tokens, in their original casing (the casing is
            kept until after POS tagging).

    Returns:
        List of lowercase tokens; nouns and verbs are replaced by the
        lemmatizer's output.
    """
    # Build the stopword set once: calling stopwords.words() inside the
    # comprehension re-reads the list for every token, and list membership
    # is a linear scan.
    stop_words = set(stopwords.words('english'))
    # The output is lowercased, so compare lowercased: otherwise capitalised
    # stopwords such as "Very" slip through and come out as "very".
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]

    tagged = nltk.pos_tag(kept_tokens, tagset='universal')
    wnl = nltk.WordNetLemmatizer()

    # Universal POS tag -> WordNet POS code; the keys double as the set of
    # accepted tags.
    wordnet_pos = {'NOUN': 'n',
                   'VERB': 'v',
                   'ADJ': 'a',
                   'ADV': 'r'}
    to_lemmatize = ('NOUN', 'VERB')

    new_tokens = []
    for word, tag in tagged:
        if tag not in wordnet_pos:
            continue
        # Lowercase before lemmatizing so capitalised forms ("Dogs") are
        # looked up in the same form as their lowercase counterparts.
        word = word.lower()
        if tag in to_lemmatize:
            word = wnl.lemmatize(word, pos=wordnet_pos[tag])
        new_tokens.append(word)
    return new_tokens