In [1]:
import webhoseio, os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = set(nltk.corpus.stopwords.words('english'))

In [2]:
webhoseio.config(token="2c45808b-852d-4156-af2f-faf960992ea9")
query_params = {
    "q": "organization:Tesla",
    "ts": "1523748602856",
    "sort": "crawled"
}

In [3]:
output = webhoseio.query("filterWebContent", query_params)

In [4]:
for feed in output['posts']:
    print(str(feed['title']) + '|||' + str(feed['published']))

Meet Elon Musk’s new boss at Tesla: Australian telecom exec Robyn Denholm|||2018-11-08T02:00:00.000+02:00
New Tesla chairwoman's biggest challenge is controlling Musk|||2018-11-08T10:17:00.000+02:00
New Tesla chairwoman's biggest challenge is controlling Musk|||2018-11-08T02:00:00.000+02:00
Robyn Denholm to Replace Elon Musk as Tesla Chairman|||2018-11-08T23:21:00.000+02:00
Samsung gives first glimpse of foldable phone|||2018-11-08T20:03:00.000+02:00
Tesla Names Robyn Denholm as Chairman to Replace Elon Musk|||2018-11-08T19:55:00.000+02:00
New Tesla chairwoman's biggest challenge is controlling Musk|||2018-11-08T02:00:00.000+02:00
New Tesla chairwoman's biggest challenge is controlling Musk|||2018-11-08T10:17:00.000+02:00
New Tesla chairwoman's biggest challenge is controlling Musk|||2018-11-09T01:08:00.000+02:00
New Tesla chairwoman’s biggest challenge is controlling Musk|||2018-11-08T23:19:00.000+02:00
Robyn Denholm replaces Elon Musk as Tesla chairman|||2018-11-08T22:58:00.000+02:00

In [5]:
feed_titles = []

for feed in output['posts']:
    feed_titles.append(str(feed['title']))

print("Total number of titles: " + str(len(feed_titles)))

Total number of titles: 100


In [6]:
def tokenize_titles(title):
    tokens = nltk.word_tokenize(title)
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []
    
    for token in tokens:
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token)
        if token not in stopwords:
            filtered_tokens.append(token.lower())
    
    lemmas = [lmtzr.lemmatize(t,'v') for t in filtered_tokens]

    return lemmas

In [7]:
def clstr_lda(num_topics, titles):
    # top words to be identified
    n_top_words = 10

    tf_vectorizer = CountVectorizer(max_df=0.92, min_df=0.02, max_features=2000,
                                    tokenizer=tokenize_titles, ngram_range=(4,5))

    tf = tf_vectorizer.fit_transform(titles)

    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=1000,
                                    learning_method='batch', learning_offset=10.,
                                    random_state = 1)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # print top topic words
    topics = dict()
    for topic_idx, topic in enumerate(lda.components_):
        topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d:" % topic_idx)
        print(" | ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
    return topics

In [10]:
topics = clstr_lda(6, feed_titles)

Topic #0:
robyn denholm replace elon musk | denholm replace elon musk | robyn denholm replace elon | new tesla chairwoman  |  biggest challenge control | new tesla chairwoman  biggest | tesla chairwoman  biggest challenge | chairwoman  biggest challenge control | chairwoman  biggest challenge | tesla chairwoman  biggest
Topic #1:
vehicle battery production europe | plan electric vehicle battery | electric vehicle battery production | plan electric vehicle battery production | electric vehicle battery production europe | musk   tesla reach | elon musk   tesla reach | musk   tesla reach farthest |   tesla reach farthest | reach farthest point sun
Topic #2:
denholm take challenge tame elon | take challenge tame elon musk | take challenge tame elon | denholm take challenge tame | robyn denholm take challenge tame | challenge tame elon musk | robyn denholm take challenge | name director robyn denholm | tesla name director robyn denholm | denholm replace musk board chairman
Topic #3:
replace