In [1]:
import webhoseio, os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = set(nltk.corpus.stopwords.words('english'))

In [2]:
webhoseio.config(token="2c45808b-852d-4156-af2f-faf960992ea9")
query_params = {
    "q": "organization:Tesla",
    "ts": "1523748602856",
    "sort": "crawled"
}

In [3]:
output = webhoseio.query("filterWebContent", query_params)

In [4]:
for feed in output['posts']:
    print(str(feed['title']) + '|||' + str(feed['published']))

Elon Musk just mocked the SEC on Twitter after he was forced to pay $20 million in fines to the agency|||2018-10-04T23:57:00.000+03:00
Tesla CEO Elon Musk mocks SEC as 'Short-Seller Enrichment Commission'|||2018-10-04T23:50:00.000+03:00
Tesla CEO Elon Musk apparently rips SEC after deal: 'Shortseller Enrichment Commission'|||2018-10-04T23:54:00.000+03:00
Mark Cuban Prodded Tesla’s Musk to Settle SEC Charges|||2018-10-04T22:33:00.000+03:00
Elon Musk Doesn't Take SEC Seriously|||2018-10-05T00:06:00.000+03:00
Tesla CEO Elon Musk apparently rips SEC after deal: andapos;Shortseller Enrichment Commissionandapos;|||2018-10-04T23:54:00.000+03:00
Elon Musk Doesn't Take SEC Seriously|||2018-10-05T00:06:00.000+03:00
Musk takes swipe at SEC on heels of fraud settlement|||2018-10-05T00:14:00.000+03:00
Tesla CEO Elon Musk apparently rips SEC after deal: andapos;Shortseller Enrichment Commissionandapos;|||2018-10-04T23:54:00.000+03:00
Elon Musk Doesn't Take SEC Seriously|||2018-10-05T00:06:00.000+03:

In [5]:
feed_titles = []

for feed in output['posts']:
    feed_titles.append(str(feed['title']))

print("Total number of titles: " + str(len(feed_titles)))

Total number of titles: 100


In [6]:
def tokenize_titles(title):
    tokens = nltk.word_tokenize(title)
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []
    
    for token in tokens:
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token)
        if token not in stopwords:
            filtered_tokens.append(token.lower())
    
    lemmas = [lmtzr.lemmatize(t,'v') for t in filtered_tokens]

    return lemmas

In [7]:
def clstr_lda(num_topics, titles):
    # top words to be identified
    n_top_words = 10

    tf_vectorizer = CountVectorizer(max_df=0.92, min_df=0.02, max_features=2000,
                                    tokenizer=tokenize_titles, ngram_range=(4,5))

    tf = tf_vectorizer.fit_transform(titles)

    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=1000,
                                    learning_method='batch', learning_offset=10.,
                                    random_state = 1)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # print top topic words
    topics = dict()
    for topic_idx, topic in enumerate(lda.components_):
        topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d:" % topic_idx)
        print(" | ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
    return topics

In [8]:
topics = clstr_lda(6, feed_titles)

Topic #0:
tesla ceo elon musk |  hortseller enrichment commission  | elon musk apparently rip | rip sec deal  | ceo elon musk apparently rip | apparently rip sec deal | tesla ceo elon musk apparently | elon musk apparently rip sec | apparently rip sec deal  | ceo elon musk apparently
Topic #1:
swipe sec heel fraud settlement | swipe sec heel fraud | sec heel fraud settlement | musk take swipe sec heel | take swipe sec heel fraud | take swipe sec heel | musk take swipe sec | heel fraud settlement  | sec heel fraud settlement  | tesla share fall elon musk
Topic #2:
elon musk mock sec | musk mock sec twitter | elon musk mock sec twitter |  shortseller enrichment commission |  shortseller enrichment commission  | sec twitter   shortseller | sec twitter   | twitter   shortseller | twitter   shortseller enrichment |   shortseller enrichment commission
Topic #3:
musk mock us agency days | mock us agency days settle | us agency days settle | musk mock us agency | mock us agency days |   musk m