In [35]:
import json
import re
from pathlib import Path
from uuid import uuid4

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Fetch the 20 newsgroups corpus, shuffled with a fixed seed and with
# headers, footers and quoted replies removed from each post.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)
# The raw post texts that the vectorizers are fit on.
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Fitted decomposition object exposing ``components_``, iterated
            row by row (one row of per-feature weights per topic).
        feature_names: Sequence mapping a feature index to its word.
        no_top_words: How many of the top-weighted words to print per topic.

    For each topic, prints a ``Topic <idx>:`` header line followed by one line
    of space-separated words, ordered from highest to lowest weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[i] for i in ranked]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_words))


# Vocabulary size cap passed to the vectorizers as max_features.
no_features = 1000

In [5]:
# NMF works on tf-idf weights, so build a tf-idf document-term matrix for it.
# Terms in more than 95% of documents or in fewer than 2 documents are dropped,
# English stop words are removed, and the vocabulary is capped at no_features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=no_features,
    min_df=2,
    max_df=0.95,
)
tfidf = tfidf_vectorizer.fit_transform(documents)
# Index -> word lookup used when printing the NMF topics.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [6]:
# LDA is a probabilistic graphical model and needs raw term counts, so use
# plain counts here instead of tf-idf. Same vocabulary filtering as the
# tf-idf vectorizer: max_df=0.95, min_df=2, English stop words, no_features cap.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=no_features,
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(documents)
# Index -> word lookup used when printing the LDA topics.
tf_feature_names = tf_vectorizer.get_feature_names()

In [40]:
# Number of topics extracted by both models.
no_topics = 20

# Run NMF on the tf-idf matrix.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W`/`alpha_H` — confirm against the installed version before upgrading.
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)

# Run LDA on the raw term counts.
# The topic-count keyword is `n_components` (as in the constructor signature);
# the old `n_topics` spelling is deprecated and rejected by later releases.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)



In [39]:
LatentDirichletAllocation?

[0;31mInit signature:[0m [0mLatentDirichletAllocation[0m[0;34m([0m[0mn_components[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m [0mdoc_topic_prior[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mtopic_word_prior[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mlearning_method[0m[0;34m=[0m[0;34m'batch'[0m[0;34m,[0m [0mlearning_decay[0m[0;34m=[0m[0;36m0.7[0m[0;34m,[0m [0mlearning_offset[0m[0;34m=[0m[0;36m10.0[0m[0;34m,[0m [0mmax_iter[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m [0mbatch_size[0m[0;34m=[0m[0;36m128[0m[0;34m,[0m [0mevaluate_every[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m [0mtotal_samples[0m[0;34m=[0m[0;36m1000000.0[0m[0;34m,[0m [0mperp_tol[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m [0mmean_change_tol[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m [0mmax_doc_update_iter[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mrandom_sta

In [8]:
# Print the ten strongest words per topic, first for NMF and then for LDA,
# pairing each fitted model with the vocabulary of its own vectorizer.
no_top_words = 10
for fitted_model, vocabulary in ((nmf, tfidf_feature_names), (lda, tf_feature_names)):
    display_topics(fitted_model, vocabulary, no_top_words)

Topic 0:
people time right did good said say make way government
Topic 1:
window problem using server application screen display motif manager running
Topic 2:
god jesus bible christ faith believe christian christians sin church
Topic 3:
game team year games season players play hockey win league
Topic 4:
new 00 sale 10 price offer shipping condition 20 15
Topic 5:
thanks mail advance hi looking info help information address appreciated
Topic 6:
windows file files dos program version ftp ms directory running
Topic 7:
edu soon cs university ftp internet article email pub david
Topic 8:
key chip clipper encryption keys escrow government public algorithm nsa
Topic 9:
drive scsi drives hard disk ide floppy controller cd mac
Topic 10:
just ll thought tell oh little fine work wanted mean
Topic 11:
does know anybody mean work say doesn help exist program
Topic 12:
card video monitor cards drivers bus vga driver color memory
Topic 13:
like sounds looks look bike sound lot things really thing
To

In [18]:
# Resolve the reviews file under the current user's home directory instead of
# hard-coding one machine's absolute path.
reviews_path = Path.home() / "Downloads" / "reviews_Musical_Instruments_5.json"

# The file holds one JSON object per line. `data` keeps the raw lines (a later
# cell inspects data[0]); `out` holds the parsed records.
# Read explicitly as UTF-8 so decoding does not depend on the platform default.
with reviews_path.open(encoding="utf-8") as f:
    data = f.readlines()

# Skip whitespace-only lines (e.g. a trailing blank line), which json.loads
# would otherwise reject with a decode error.
out = [json.loads(row) for row in data if row.strip()]

In [22]:
# Show the first raw line read from the reviews file (still an unparsed JSON string).
data[0]

'{"reviewerID": "A2IBPI20UZIR0U", "asin": "1384719342", "reviewerName": "cassandra tu \\"Yeah, well, that\'s just like, u...", "helpful": [0, 0], "reviewText": "Not much to write about here, but it does exactly what it\'s supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,", "overall": 5.0, "summary": "good", "unixReviewTime": 1393545600, "reviewTime": "02 28, 2014"}\n'

In [37]:
# Build one export row per parsed review:
#   id           - a fresh random UUID string
#   project_name - constant project tag
#   text         - first 100 characters of the review with digit runs removed
#   type         - label drawn at random with probabilities
#                  GOOD 0.3 / WONDERING 0.3 / BAD 0.4
# A dedicated, seeded generator makes the label assignment reproducible:
# re-running this cell yields the same labels instead of a new random draw.
label_rng = np.random.RandomState(0)
# Compile the digit pattern once rather than on every iteration.
digit_run = re.compile(r"\d+")

d = []
for row in out:
    assign = label_rng.choice(["GOOD", "WONDERING", "BAD"], p=[0.3, 0.3, 0.4])
    d.append({
        "id": str(uuid4()),
        "project_name": "handsome-coyote",
        # Truncate first, then strip digits, so the text may be shorter than 100.
        "text": digit_run.sub("", row["reviewText"][:100]),
        "type": assign,
    })

In [38]:
# Export the first 200 labelled rows as a CSV on the Desktop, without the index column.
review_sample = pd.DataFrame(d).head(200)
review_sample.to_csv("~/Desktop/review.csv", index=False)