In [38]:
import json
import re
from collections import defaultdict

import numpy as np
import tqdm
from langdetect import DetectorFactory, detect, lang_detect_exception
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from spacy_langdetect import LanguageDetector


In [39]:
# URL matcher (scheme://, www., or bare "domain.tld/" forms). Compiled once
# here instead of being re-parsed from source for every input line.
URL_PATTERN = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')

# langdetect is randomised unless seeded; pin it so a re-run puts every
# article in the same language bucket.
DetectorFactory.seed = 0

# language code -> list of {"content": title + text, "item": raw tweet dict}
languages = defaultdict(list)
# De-duplication state: article-bearing tweets already kept, by URL and by
# URL-stripped text.
urls = set()
tweet_texts = set()

# `with` closes the file even if a line fails to parse. The encoding is given
# explicitly so the read does not depend on the platform default
# (assumes the dump is UTF-8 JSON lines -- TODO confirm).
with open("data/2021_02_05.jsonl", encoding="utf-8") as tweets_file:
    for line in tqdm.tqdm(tweets_file):
        data = json.loads(line)["data"]
        tweet_text = URL_PATTERN.sub(" ", data["text"])

        # Skip repeats (same URL or same text once links are removed) and
        # tweets that carry no extracted article.
        if data["url"] in urls or tweet_text in tweet_texts or "article" not in data["extra"]:
            continue

        tweet_texts.add(tweet_text)
        urls.add(data["url"])
        article = data["extra"]["article"]
        content = article["title"] + " . " + article["text"]

        # Only detect() can raise LangDetectException; keep the try narrow.
        try:
            lang = detect(content)
        except lang_detect_exception.LangDetectException:
            # Best-effort: content langdetect cannot classify is left out of
            # every bucket.
            continue
        languages[lang].append({"content": content, "item": data})

13973it [00:38, 363.32it/s]


In [40]:
# How many de-duplicated, article-bearing tweets were detected as English.
english_count = len(languages["en"])
print(english_count)

1266


In [41]:
NUM_TOPICS = 20
# Fixed seed so Restart & Run All reproduces the same topics; online LDA is
# otherwise initialised randomly on every fit.
RANDOM_SEED = 0

# Bag-of-words over the English articles: tokens are runs of 3+ letters or
# hyphens, terms in fewer than 5 documents or in more than 90% of documents
# are dropped, as are English stop words.
# The pattern is a raw string: "\-" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.9,
    stop_words='english',
    lowercase=True,
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
)

data_vectorized = vectorizer.fit_transform([x["content"] for x in languages["en"]])

lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=20,
    learning_method='online',
    random_state=RANDOM_SEED,
    verbose=True,
)
# Document-topic matrix for the fitted corpus (one row per English article).
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 20
iteration: 2 of max_iter: 20
iteration: 3 of max_iter: 20
iteration: 4 of max_iter: 20
iteration: 5 of max_iter: 20
iteration: 6 of max_iter: 20
iteration: 7 of max_iter: 20
iteration: 8 of max_iter: 20
iteration: 9 of max_iter: 20
iteration: 10 of max_iter: 20
iteration: 11 of max_iter: 20
iteration: 12 of max_iter: 20
iteration: 13 of max_iter: 20
iteration: 14 of max_iter: 20
iteration: 15 of max_iter: 20
iteration: 16 of max_iter: 20
iteration: 17 of max_iter: 20
iteration: 18 of max_iter: 20
iteration: 19 of max_iter: 20
iteration: 20 of max_iter: 20


In [42]:
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : fitted decomposition model exposing ``components_``; each row is
        indexed by vocabulary position and supports ``argsort()``.
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` or,
        on older scikit-learn, ``get_feature_names()``.
    top_n : int, number of terms shown per topic (default 10).

    Output is one ``Topic <idx>:`` line followed by a list of
    ``(term, weight)`` tuples, strongest term first. Returns ``None``.
    """
    # Resolve the vocabulary once. The original rebuilt it for every printed
    # word, and get_feature_names() no longer exists in scikit-learn >= 1.2.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    # Plain str/float keep the printed tuples free of numpy scalar wrappers.
    feature_names = [str(name) for name in get_names()]

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice walks the top_n largest
        # weights from strongest to weakest.
        print([(feature_names[i], float(topic[i]))
               for i in topic.argsort()[:-top_n - 1:-1]])

In [43]:
# Show the ten strongest terms of every fitted LDA topic.
print("LDA Model:")
selected_topics(lda, vectorizer, top_n=10)

LDA Model:
Topic 0:
[('raspberry', 154.78419243050303), ('driver', 116.4606157703367), ('rtl', 115.84501306743468), ('wireless', 102.42171865499766), ('wifi', 93.49727235954511), ('realtek', 69.18982347036949), ('console', 45.8187665179099), ('firmware', 45.490462639147054), ('power', 45.33801473690249), ('box', 40.627909570589054)]
Topic 1:
[('batman', 142.9465692280746), ('superman', 93.34460973774662), ('new', 89.63692519889989), ('affleck', 77.3977899413624), ('movie', 73.86347494272778), ('com', 67.69214689043203), ('series', 65.80161921823158), ('arthur', 64.97432094982223), ('ben', 61.74775932245859), ('reeves', 51.537635366376335)]
Topic 2:
[('use', 159.5477806904329), ('user', 151.64019816327578), ('audio', 133.44232474341135), ('output', 131.8582160685391), ('device', 117.90171974295093), ('sync', 116.83960810997218), ('used', 100.81984421163237), ('make', 99.33763083598711), ('protocol', 84.53412991352104), ('using', 80.97874060926964)]
Topic 3:
[('days', 54.90461534194452),

In [44]:
top_n = 10

# Resolve the vocabulary once. The original rebuilt it for every single word,
# and get_feature_names() no longer exists in scikit-learn >= 1.2.
get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = [str(name) for name in get_names()]

# topics[k] is the list of (term, weight) pairs for topic k, strongest first.
topics = [
    [(feature_names[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]]
    for topic in lda.components_
]

# Score all English articles in one batched transform instead of one
# vectorise + transform round trip per document.
english_items = languages["en"]
if english_items:
    doc_topic = lda.transform(
        vectorizer.transform([item["content"] for item in english_items])
    )
else:
    # Nothing to score; the loop below is then a no-op.
    doc_topic = []

# topic index -> list of {"item": ..., "score": ...} for the documents whose
# strongest topic is that index.
items_topics = defaultdict(list)
for item, weights in zip(english_items, doc_topic):
    topic_idx = int(np.argmax(weights))
    items_topics[topic_idx].append({"item": item, "score": weights[topic_idx]})

In [45]:
# For every topic that is the strongest topic of at least one document, print
# its four leading terms, then the tweet text (newlines flattened to ".") of
# its three highest-scoring documents.
for topic_idx, members in items_topics.items():
    ranked = sorted(members, key=lambda member: member["score"], reverse=True)

    headline_terms = [term for term, _weight in topics[topic_idx][:4]]
    print(",".join(headline_terms))

    tweet_lines = [
        member["item"]["item"]["text"].replace("\n", ".")
        for member in ranked[:3]
    ]
    print("\t{}".format("\n\t".join(tweet_lines)))
    print()

diablo,overwatch,blizzard,year
	Diablo 4, Overwatch 2 Not Launching In 2021 Says Activision Blizzard - Screen Rant https://t.co/jCoDLps7Gc
	Diablo 4, Overwatch 2 Not Launching In 2021 Says Activision Blizzard..In an earnings call, Activision Blizzard conf… https://t.co/xKKaWADQUw
	Activision Blizzard Doesn’t Expect To Release Overwatch 2 Or Diablo 4 This Year.https://t.co/XMzA1rjzXx https://t.co/zE0QdTscra

raspberry,use,need,like
	How to Create an Image of a Raspberry Pi SD Card? (Win/Linux/Mac) https://t.co/dsAZ0mhBXi #raspberrypi https://t.co/j1J3s1hTs3
	How to Install Manjaro on Raspberry Pi? https://t.co/B4pAJHUG5r #raspberrypi https://t.co/IhUFlxls9a
	6 cheap Raspberry Pi 4 Cases for under £20 https://t.co/vJOisRwJCp via @jmdawson_blog

import,using,public,android
	RT @codewallblog: Running .Net Core Generic Host as a Windows Service, Linux Daemon or Console App - https://t.co/62XTpgHpvu   #Developer #…
	RT @AramT87: My Latest Article: A Complete Tutorial to Connect Android with 