In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics
from sklearn.decomposition import LatentDirichletAllocation
from gensim import corpora

In [44]:
import scrape as sc
from datetime import datetime

In [45]:
#load the data
scraper = sc.Scrape()
comments = scraper.return_comments_date(datetime(2016, 6, 20))

In [46]:
#test out the comments
print(len(comments)) #https://news.ycombinator.com/front?day=2015-04-09
print(len(comments[0])) #https://news.ycombinator.com/item?id=11941130

30
102


In [47]:
#flatten the list
comments = sum(comments, [])
print(comments[:3])

In [53]:
#basic stop words
from stop_words import get_stop_words
stop_words = get_stop_words('en')

#flatten the comments list

txts = [[word for word in sentence.lower().split() if word not in stop_words]
        for sentence in comments]

#collapse into tokens
from collections import Counter
frequency = Counter([word for txt in txts for word in txt])
tokens = [[token for token in txt if frequency[token] > 1] for txt in txts]

[['honestly', 'happy', 'stop', 'windows', '10', 'turning', 'laptop', 'hair', 'randomly', 'otherwise', 'entirely', 'thing', 'sitting', 'nothing,', 'suddenly', 'fan', 'will', 'start', 'cpu', 'utilization'], ['apparently', 'windows', '10', 'likes', 'use', 'idle', 'time', 'test', 'desktop', 'causing', 'rev', 'fan', 'idle', 'minutes.'], ['seriously', 'earth', 'happen?', 'even', 'edge', 'ends', 'rest', 'os', 'going', 'ensure', 'will', 'get', '3', 'hours', 'battery', 'life', 'laptop.'], ['windows', 'update', 'via', 'problem', 'machines', 'support.', 'randomly', '100%', 'cpu', 'minutes'], ['use', 'specific', 'pc', 'around', 'week,', 'basically', 'unusable', 'startup.', '100%', 'around', '20', 'minutes,', 'every', 'now', 'spike', 'hell', 'microsoft', 'playing', 'at,', 'process', 'switching', 'computers', 'linux.'], ['well,', 'one', 'thing', 'can', 'sure', 'will', 'never', 'happen', 'service', 'bothers', 'you,', 'can', 'always', 'reschedule', 'turn', 'off.', 'good', 'like', 'updates', 'windows',

In [54]:
#make the dictionary mapping
dict_gensim = corpora.Dictionary(tokens)
print(dict_gensim)

Dictionary(9977 unique tokens: ['counter?', '10k', 'universe', 'gap', 'teleport']...)


In [61]:
from gensim import models
#term frequency - inverse document frequency 
corpus = [dict_gensim.doc2bow(txt) for txt in txts]
tfidf = models.TfidfModel(corpus)

In [62]:
#compute the LDA model
corpus_tfidf = tfidf[corpus]
lda_gs = models.LdaModel(corpus = corpus_tfidf, id2word=dict_gensim, num_topics = 3, passes = 15)

In [63]:
lda_gs.print_topics(3)

[(0,
  '0.003*think + 0.003*can + 0.003*like + 0.002*people + 0.002*will + 0.002*just + 0.002*-- + 0.002*... + 0.002*use + 0.002*one'),
 (1,
  '0.002*docker + 0.002*just + 0.001*data + 0.001*like + 0.001*file + 0.001*can + 0.001*use + 0.001*using + 0.001*look + 0.001*chrome'),
 (2,
  '0.002*thanks! + 0.001*carbon + 0.001*eu + 0.001*gc + 0.001*sure. + 0.001*yep, + 0.001*thanks. + 0.001*bug + 0.001*trees + 0.001*now?')]

In [65]:
ttps = lda_gs.top_topics(corpus=corpus_tfidf, num_words=5)
idx = 0
for lst, val in ttps:
    print('Topic {0}'.format(idx))
    print(35*('-'))
    idx += 1
    for i, z in lst:
        print('    {0:20s}: {1:5.4f}'.format(z, i))
    print(35*('-'))


Topic 0
-----------------------------------
    think               : 0.0029
    can                 : 0.0028
    like                : 0.0027
    people              : 0.0023
    will                : 0.0023
-----------------------------------
Topic 1
-----------------------------------
    docker              : 0.0021
    just                : 0.0017
    data                : 0.0015
    like                : 0.0014
    file                : 0.0013
-----------------------------------
Topic 2
-----------------------------------
    thanks!             : 0.0018
    carbon              : 0.0013
    eu                  : 0.0012
    gc                  : 0.0011
    sure.               : 0.0009
-----------------------------------
