In [2]:
import json 
import tqdm
import numpy as np

from langdetect import detect, lang_detect_exception
from collections import defaultdict
from spacy_langdetect import LanguageDetector

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD


In [3]:
languages = defaultdict(list)
urls = set()
for line in tqdm.tqdm(open("2021_02_05.jsonl")):
    data = json.loads(line)
    data = data["data"]
    if data["url"] not in urls and "article" in data["extra"]:
        urls.add(data["url"])
        article = data["extra"]["article"]
        #content = "{}\n{}\n{}".format(article["title"], article["text"], article["keywords"])
        content = article["text"]
        try:
            lang = detect(content)
            languages[lang].append({"content": content, "item": data})
        except lang_detect_exception.LangDetectException:
            pass

13973it [00:26, 526.44it/s]


In [4]:
print(len(languages["en"]))

1015


In [5]:
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english',
                             lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

data_vectorized = vectorizer.fit_transform([x["content"] for x in languages["en"]])

NUM_TOPICS = 20
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [6]:
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [7]:
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('add', 0.07431176744681985), ('batman', 0.06936543636785658), ('new', 0.06264394220936703), ('use', 0.06255989221185389), ('raspberry', 0.06195486669265004), ('code', 0.06153130391869498), ('port', 0.06147042654189851), ('data', 0.0612070113497002), ('machine', 0.061108013984025066), ('aws', 0.06069611065371877)]
Topic 1:
[('affleck', 73.7699805163911), ('snyder', 60.487345650366784), ('darkseid', 50.44075506727805), ('zack', 49.55038678067667), ('league', 46.73840649835856), ('ben', 46.3403913781781), ('batman', 45.61941151057166), ('cavill', 43.917598194681105), ('justice', 42.20553972791295), ('new', 35.947510438168244)]
Topic 2:
[('pico', 294.6000838257751), ('raspberry', 137.6984840775456), ('arduino', 109.7981128851849), ('image', 98.40800915280256), ('board', 84.5957175508373), ('micropython', 68.15714035535464), ('dragon', 66.85099788337742), ('code', 60.97306656705449), ('adafruit', 58.287948917973715), ('make', 57.74834421805032)]
Topic 3:
[('raspberry',

In [9]:
topics = []
top_n = 10
for idx, topic in enumerate(lda.components_):
    words = [(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]
    topics.append(words)

for item in languages["en"]:
    x = lda.transform(vectorizer.transform([item["content"]]))[0]
    topic_idx = np.argmax(x, axis=0)
    print("topic: {}\nurl: {}\ncontent: {}".format(topics[topic_idx], item["item"]["url"], item["content"][:250]))
    break

topic: [('add', 0.07431176744681985), ('batman', 0.06936543636785658), ('new', 0.06264394220936703), ('use', 0.06255989221185389), ('raspberry', 0.06195486669265004), ('code', 0.06153130391869498), ('port', 0.06147042654189851), ('data', 0.0612070113497002), ('machine', 0.061108013984025066), ('aws', 0.06069611065371877)]
url: https://www.teachingschools-sw.org.uk/news/send-a-message-to-space-on-astro-pi-mission-zero-and-cyberfirst-role-model-talk-with-west-country-computing-hub
content: Who we are
