In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = set(nltk.corpus.stopwords.words('english'))


In [2]:
import json
## reading the json file
with open('C:\\Users\\Mansi\\Documents\\Harrisburg classes\\Sem 7 - 699\\ProjectCollaterals_610_MansiJoshi_20180817\\WMT\\data_feedsWMT.json') as json_datafeeds:
    load_datafeeds = json.load(json_datafeeds)

In [3]:
feed_titles2 = []

for feed in load_datafeeds:
    feed_titles2.append(str(feed['title']))

print("Total number of titles: " + str(len(feed_titles2)))


Total number of titles: 68074


In [4]:
def tokenize_titles(title):
    tokens = nltk.word_tokenize(title)
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []
    
    for token in tokens:
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token)
        if token not in stopwords:
            filtered_tokens.append(token.lower())
    
    lemmas = [lmtzr.lemmatize(t,'v') for t in filtered_tokens]

    return lemmas

In [5]:
def clstr_lda(num_topics, stories):
    # top words to be identified
    n_top_words = 100

    tf_vectorizer = CountVectorizer(max_df=500, min_df=2, max_features=50,  ##Orig max_df=90, min_df=2, max_features=1000
                                    tokenizer=tokenize_titles, ngram_range=(2,3))

    tf = tf_vectorizer.fit_transform(stories)

    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=30,   ##Orig max_iter=3000
                                    learning_method='batch', learning_offset=10.,
                                    random_state = 1)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # print top topic words
    topics = dict()
    for topic_idx, topic in enumerate(lda.components_):
        topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d:" % topic_idx)
        print(" | ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
    return topics

In [6]:
##topics = clstr_lda(10, feed_titles2)## Original combination
topics = clstr_lda(10, feed_titles2)

Topic #0:
 100 |  1 |   100 | post  | ship  | new  | gift card | sale  | save  | deal  | reg   | walmart   |  reg  | reg  |  reg |  walmart  |  save |  amazon | set  |  5 |  40 |  50 |  3 | amazon  |  25 |  20 |  get | free ship |   50 |  10 |  30 |  15 |   walmart | news  |  free ship | price  |  news |  35 |   20 | minimum wage |  page |  2019 | 2019  |  march | inc  wmt |  13 | walmart inc | walmart inc  | part time | retail merchandiser
Topic #1:
minimum wage |  13 | save  |  save | ship  | deal  | post  |  15 | walmart   |  30 | sale  |  100 |  10 |  news | amazon  |  5 |  20 | news  |  3 |  50 |  amazon |  40 |  35 |   100 |   walmart | set  |  1 | new  |  get |  25 |  walmart  | gift card | price  |  march | free ship | 2019  |  free ship |   20 |  reg  | reg  |  reg |  2019 |   50 |  page | inc  wmt | reg   | walmart inc | walmart inc  | part time | retail merchandiser
Topic #2:
 15 | part time | retail merchandiser | minimum wage |  13 |  5 |  25 | new  | post  | gift card |  

In [7]:
print(topics)

{0: [' 100', ' 1', '  100', 'post ', 'ship ', 'new ', 'gift card', 'sale ', 'save ', 'deal ', 'reg  ', 'walmart  ', ' reg ', 'reg ', ' reg', ' walmart ', ' save', ' amazon', 'set ', ' 5', ' 40', ' 50', ' 3', 'amazon ', ' 25', ' 20', ' get', 'free ship', '  50', ' 10', ' 30', ' 15', '  walmart', 'news ', ' free ship', 'price ', ' news', ' 35', '  20', 'minimum wage', ' page', ' 2019', '2019 ', ' march', 'inc  wmt', ' 13', 'walmart inc', 'walmart inc ', 'part time', 'retail merchandiser'], 1: ['minimum wage', ' 13', 'save ', ' save', 'ship ', 'deal ', 'post ', ' 15', 'walmart  ', ' 30', 'sale ', ' 100', ' 10', ' news', 'amazon ', ' 5', ' 20', 'news ', ' 3', ' 50', ' amazon', ' 40', ' 35', '  100', '  walmart', 'set ', ' 1', 'new ', ' get', ' 25', ' walmart ', 'gift card', 'price ', ' march', 'free ship', '2019 ', ' free ship', '  20', ' reg ', 'reg ', ' reg', ' 2019', '  50', ' page', 'inc  wmt', 'reg  ', 'walmart inc', 'walmart inc ', 'part time', 'retail merchandiser'], 2: [' 15', 'par

In [50]:
##topics2 = clstr_lda(10, feed_titles2)##

Topic #0:
   | user data | need know | facebook give |  facebook | device makers |  apple  | facebook  | users  | news 
Topic #1:
macos mojave |  ios | macos 1014 | dark mode | wwdc  | users  | device makers |  data | users  data | app store
Topic #2:
screen time |  iphone | tim cook | apple ceo | ceo tim | ceo tim cook | apple ceo tim | news  |  ios | user data
Topic #3:
apple watch | apple announce |   new | watchos 5 | new feature |    | wwdc 2018  | news  | iphone  | app store
Topic #4:
watchos 5 | apple introduce | group facetime | macos mojave |  ios | macos 1014 | 12  | new feature | ios 12  | apple unveil
Topic #5:
 wwdc | apple unveil | apple  wwdc |  wwdc 2018 | iphone  |  apple  | wwdc 2018  | wwdc  |  ios | screen time
Topic #6:
 new | iphone x |  apple  | new feature |   new | wwdc  | facebook  |  ios | screen time | wwdc 2018 
Topic #7:
12  | ios 12  | new feature | users  |  data | users  data |  ios | macos 1014 | macos mojave | need know
Topic #8:
wwdc 2018  | app stor

In [1]:
### Pls ignore - using the old Json file created for HW4

import webhoseio, os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = set(nltk.corpus.stopwords.words('english'))

webhoseio.config(token=os.environ['WEBHOSE_TOKEN'])
query_params = {
    "q": "organization:Tesla",
    "ts": "1523748602856",
    "sort": "crawled"
}

KeyError: 'WEBHOSE_TOKEN'

In [68]:
### Pls ignore - using the old Json file created for HW4
# be careful how many times you make this call
# get the first batch
output = webhoseio.query("filterWebContent", query_params)

In [None]:
### Pls ignore - using the old Json file created for HW4
output = webhoseio.get_next()

In [69]:
### Pls ignore - using the old Json file created for HW4
for feed in output['posts']:
    print(str(feed['title']) + '|||' + str(feed['published']))

Tesla needs to pull back on Autopilot before it's too late (TSLA)|||2018-04-15T03:23:00.000+03:00
Tesla needs to pull back on Autopilot before it's too late|||2018-04-15T05:10:00.000+03:00
Tips on how to save money when you travel | TravelWireNews|||2018-04-15T07:46:00.000+03:00
Why Tesla may be the most hated stock in America and the most loved|||2018-04-15T03:20:00.000+03:00
Why Tesla may be the most hated stock in America -- and the most loved|||2018-04-15T09:13:00.000+03:00
Investors love or loathe Tesla|||2018-04-15T10:20:00.000+03:00
Humans are underrated, excessive automation at Tesla my mistake: Elon Musk|||2018-04-14T22:03:00.000+03:00
Tesla (TSLA) Downgraded by Vetr to “Hold”|||2018-04-15T11:26:00.000+03:00
Tesla got a ‘dangerous transport’ problem in Norway – NORWAY NEWS|||2018-04-14T21:04:00.000+03:00
TeslaCoilCoin Price Up 15.8% Over Last Week (TESLA)|||2018-04-15T17:44:00.000+03:00
TeslaCoilCoin (TESLA) Reaches One Day Trading Volume of $755.00|||2018-04-15T16:26:00.000+0