In [1]:
import webhoseio, os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = set(nltk.corpus.stopwords.words('english'))

webhoseio.config(token=os.environ['WEBHOSE_TOKEN'])
query_params = {
    "q": "organization:Tesla",
    "ts": "1523748602856",
    "sort": "crawled"
}

In [2]:
# be careful how many times you make this call
# get the first batch
output = webhoseio.query("filterWebContent", query_params)

In [3]:
for feed in output['posts']:
    print(str(feed['title']) + '|||' + str(feed['published']))

GM CEO Mary Barra Slips To Second Place On Most Powerful Women List|||2018-09-25T15:57:00.000+03:00
TeslaCoilCoin Price Up 25.8% Over Last 7 Days (CRYPTO:TESLA)|||2018-09-26T06:46:00.000+03:00
Tesla shares fall after CEO Musk abuses British diver|||2018-09-26T12:33:00.000+03:00
Tesla shares fall after CEO Musk abuses British diver|||2018-09-26T12:36:00.000+03:00
Tesla shares fall after CEO Musk abuses British diver|||2018-09-26T12:37:00.000+03:00
Tesla: Tesla building own car carriers to boost deliveries: Musk, Auto News, ET Auto|||2018-09-26T03:00:00.000+03:00
Reviewing VOLKSWAGEN (VLKAY) and Tesla (TSLA)|||2018-09-26T10:08:00.000+03:00
Tesla ex-HR Chief suggested promoting employees to prevent unionization|||2018-09-26T13:48:00.000+03:00
CEO tweeting: lessons from Elon Musk|||2018-09-26T15:19:00.000+03:00
Tesla offers incentives, taps volunteers in end-of-quarter rush|||2018-09-26T03:00:00.000+03:00
Skeptic Says Tesla Will Likey Hit Q3 Delivery Guidance|||2018-09-26T16:35:00.000+03:0

In [4]:
feed_titles = []

for feed in output['posts']:
    feed_titles.append(str(feed['title']))

print("Total number of titles: " + str(len(feed_titles)))

Total number of titles: 100


In [7]:
def tokenize_titles(title):
    tokens = nltk.word_tokenize(title)
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []
    
    for token in tokens:
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token)
        if token not in stopwords:
            filtered_tokens.append(token.lower())
    
    lemmas = [lmtzr.lemmatize(t,'v') for t in filtered_tokens]

    return lemmas

In [23]:
def clstr_lda(num_topics, titles):
    # top words to be identified
    n_top_words = 10

    tf_vectorizer = CountVectorizer(max_df=0.96, min_df=0.02, max_features=1000,
                                    tokenizer=tokenize_titles, ngram_range=(3,4))

    tf = tf_vectorizer.fit_transform(titles)

    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=1000,
                                    learning_method='batch', learning_offset=10.,
                                    random_state = 1)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # print top topic words
    topics = dict()
    for topic_idx, topic in enumerate(lda.components_):
        topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d:" % topic_idx)
        print(" | ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
    return topics

In [24]:
topics = clstr_lda(7, feed_titles)

Topic #0:
tesla share fall | ceo musk abuse british | abuse british diver | musk abuse british diver | musk abuse british | fall ceo musk abuse | fall ceo musk | share fall ceo musk | ceo musk abuse | tesla share fall ceo
Topic #1:
ceo elon musk | tesla ceo elon musk | tesla ceo elon | elon musk sue | elon musk sue sec | musk sue sec | lawsuit tesla ceo elon | lawsuit tesla ceo | file lawsuit tesla ceo | file lawsuit tesla
Topic #2:
sue elon musk | sec sue elon musk | sec sue elon | sue elon musk mislead | elon musk mislead | sec file complaint | file complaint tesla | sec file complaint tesla | charge elon musk | file complaint tesla ceo
Topic #3:
sec file complaint | sec file complaint tesla | file complaint tesla | tesla   elon |   elon musk | tesla   elon musk | complaint tesla   | complaint tesla   elon | file complaint tesla   | billion investment saudi arabia
Topic #4:
 tsla  | tesla  tsla  | tesla  tsla | musk say tesla | tesla   breakthrough  | tesla   breakthrough |  autopilo

In [None]:
#output = webhoseio.get_next()