https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
# necessary imports
import os, json
import pandas as pd
import numpy as np
import glob
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
import re
from nltk.corpus import stopwords
stopwords = stopwords.words('german')
import spacy
nlp = spacy.load('de_dep_news_trf')
import nltk
import gensim
from gensim import corpora, models
import warnings
warnings.filterwarnings('ignore')

In [2]:
# read lda_tweets
df = pd.read_csv('../../data/BTW17_Twitter/lda/lda_tweets.csv')
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,created_at,id,text,user,entities,tags
0,2017-06-28,8.798635e+17,RT @Junge_Freiheit: AfD-Vize Beatrix von Storc...,"{'id': 728326774027919360, 'id_str': '72832677...","{'hashtags': [{'text': 'Ehefueralle', 'indices...",ehefueralle
1,2017-06-28,8.798879e+17,RT @AfD_Bund: Dr. @Alice_Weidel:\n»#Ehefuerall...,"{'id': 1222045254, 'id_str': '1222045254', 'na...","{'hashtags': [{'text': 'Ehefueralle', 'indices...",ehefueralle
2,2017-06-28,8.798901e+17,"RT @hessenSPD: Abstimmen, abstimmen, abstimmen...","{'id': 2443895924, 'id_str': '2443895924', 'na...","{'hashtags': [{'text': 'ehefueralle', 'indices...",ehefueralle
3,2017-06-28,8.798903e+17,RT @Die_Gruenen: Hallo @spdbt! Jetzt hindert E...,"{'id': 12302322, 'id_str': '12302322', 'name':...","{'hashtags': [{'text': 'Ehefueralle', 'indices...",ehefueralle
4,2017-06-28,8.798903e+17,"RT @Die_Gruenen: ""Wer die Debatte auch nur hal...","{'id': 391983270, 'id_str': '391983270', 'name...","{'hashtags': [{'text': 'Ehefueralle', 'indices...",ehefueralle


In [3]:
tqdm.pandas()

accepted_pos = ['NOUN', 'PROPN', 'ADJ', 'ADV', 'VERB']

# nlp pipeline for text
df['tokens'] = df['text'].progress_apply(lambda x: [token.lemma_ for token in nlp(x) if token.pos_ in accepted_pos])
       
# stopword removal
df['tokens'] = df['tokens'].apply(lambda x: [item for item in x if item not in stopwords])

# remove short words
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if len(word) >= 3])

# lower all strings
df['tokens'] = df['tokens'].apply(lambda x: [word.lower() for word in x])

HBox(children=(FloatProgress(value=0.0, max=28899.0), HTML(value='')))




In [4]:
print('original document: ')
words = []
for word in df['text'][0].split(' '):
    words.append(word)
print(words)
print('\n\ntokenized and lemmatized document: ')
print(df['tokens'][0])

original document: 
['RT', '@Junge_Freiheit:', 'AfD-Vize', 'Beatrix', 'von', 'Storch', 'zur', '#Ehefueralle:', 'Merkel', 'hat', 'rechten', 'Flügel', 'der', 'CDU', 'fast', 'widerstandslos', 'ausradiert', 'https://…']


tokenized and lemmatized document: 
['afd-vize', 'beatrix', 'storch', 'ehefueralle', 'merkel', 'recht', 'flügel', 'cdu', 'fast', 'widerstandslos', 'ausradieren']


In [12]:
x[0][1]

'0.099*"ehefueralle" + 0.018*"@volker_beck" + 0.012*"@beatrix_vstorch" + 0.011*"merkel" + 0.011*"offen" + 0.011*"abstimmung" + 0.011*"freitag" + 0.010*"marx" + 0.010*"@die_gruenen" + 0.010*"kardinal"'

In [30]:
x[0][1].split('*')[10].split('+ ')[1]

IndexError: list index out of range

In [34]:
range(len([1,2,3]))

range(0, 3)

In [38]:
hashtag_list = df['tags'].unique().tolist()
topic_word_list = []
scores_list = []

for index in tqdm(range(len(hashtag_list))):
    hashtag = hashtag_list[index]
    
    # filter df
    temp_df = df[df['tags']==hashtag]
    
    # get dictionary
    dictionary = gensim.corpora.Dictionary(temp_df['tokens'])
        
    # get bag of words corpus
    bow_corpus = [dictionary.doc2bow(doc) for doc in temp_df['tokens']]
    
    # lda_model
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=1, id2word=dictionary, passes=2, workers=2)
    
    # get words and scores
    x = lda_model.show_topics()
    topic_words = []
    scores = []
    y = x[0][1].split('"')
    z = x[0][1].split('*')
    for i in range(len(y)):
        if i%2 != 0:
            topic_words.append(y[i])    
    for i in range(len(z)):
        if i == 0:
            scores.append(z[i])
        elif i != max(range(len(z))):
            scores.append(z[i].split('+ ')[1])         
    
    topic_word_list.append(topic_words)
    scores_list.append(scores)
    
# save hashtag and words to output df 
output_df = pd.DataFrame({'hashtag': hashtag_list, 'topic_words': topic_word_list, 'scores': scores_list})

HBox(children=(FloatProgress(value=0.0, max=145.0), HTML(value='')))




In [39]:
output_df.head()

Unnamed: 0,hashtag,topic_words,scores
0,ehefueralle,"[ehefueralle, @volker_beck, @beatrix_vstorch, ...","[0.099, 0.018, 0.012, 0.011, 0.011, 0.011, 0.0..."
1,gesundheit,"[gesundheit, pflege, prekär, dossier, neues, u...","[0.107, 0.043, 0.039, 0.039, 0.039, 0.039, 0.0..."
2,afd,"[afd, traudichdeutschland, @afd, @joerg_meuthe...","[0.103, 0.059, 0.027, 0.023, 0.018, 0.009, 0.0..."
3,btw17,"[btw17, afd, @afd, wählen, traudichdeutschland...","[0.082, 0.038, 0.027, 0.023, 0.020, 0.012, 0.0..."
4,btw2017,"[btw2017, traudichdeutschland, @fraukepetry, k...","[0.077, 0.057, 0.031, 0.030, 0.029, 0.028, 0.0..."


In [40]:
# save to csv
path_file = '../../data/BTW17_Twitter/lda/topics.csv'
output_df.to_csv(path_file)