In [None]:
%pip install HanTa
%pip install lda
%pip install pyldavis
%pip install germalemma == 0.1.3


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer, util
# import lda
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from HanTa import HanoverTagger as ht
from nltk.corpus import stopwords
from germalemma import GermaLemma
from bertopic import BERTopic
import pyLDAvis.gensim_models
import pyLDAvis
from bertopic import BERTopic
pyLDAvis.enable_notebook()

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
en_tagger = ht.HanoverTagger('morphmodel_en.pgz')
de_tagger = ht.HanoverTagger('morphmodel_ger.pgz')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moinam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Moinam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Moinam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moinam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Text Analysis Functions

def clean_data(filename: str):
    def remove_emojis(data):
        emoj = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002500-\U00002BEF"  # chinese char
                          u"\U00002702-\U000027B0"
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          u"\U0001f926-\U0001f937"
                          u"\U00010000-\U0010ffff"
                          u"\u2640-\u2642"
                          u"\u2600-\u2B55"
                          u"\u200d"
                          u"\u23cf"
                          u"\u23e9"
                          u"\u231a"
                          u"\ufe0f"  # dingbats
                          u"\u3030"
                          "]+", re.UNICODE)
        return re.sub(emoj, '', data)

    # Removing mentions and hashtags and url
    def remove_mentions_and_tags(text):
        text = re.sub(r'@\S*', '', text)
        text = re.sub(r'http\S+', '', text)
        return re.sub(r'#\S*', '', text)

    read_df = pd.read_parquet(filename)
    read_df = read_df.loc[read_df['tweet_type'] != 'retweet']
    read_df = read_df.drop_duplicates(subset=['tweet_id'], ignore_index=True)
    read_df.text = read_df.text.apply(remove_emojis)
    read_df.text = read_df.text.apply(remove_mentions_and_tags)
    read_df['text'] = read_df['text'].str.replace(
        r'[^\w\s]+', '')  # remove punctuations
    read_df['text'] = read_df['text'].str.replace('\d+', '')  # removes number
    read_df = remove_stopwords(read_df)
    en_df = read_df[read_df.tweet_language == 'en']
    de_df = read_df[read_df.tweet_language == 'de']
    return en_df, de_df, read_df


def extract_adj_noun(df, tagger: ht.HanoverTagger, lang: str):

    def noun(text, tagger):
        words = nltk.word_tokenize(text)
        tokens = [word for (word, x, pos) in tagger.tag_sent(
            words, taglevel=1) if pos == 'NN']
        return tokens

    def adj(text, tagger):
        words = nltk.word_tokenize(text)
        tokens = [word for (word, x, pos) in tagger.tag_sent(
            words, taglevel=1) if pos == 'ADJ']
        return tokens

    def to_lowercase(text):
        token = []
        for i in range(len(text)):
            val = text[i].lower()
            token.append(val)
        return token

    df['noun'] = df.text.apply(noun, tagger=tagger)
    df['adj'] = df.text.apply(adj, tagger=tagger)
    df.noun = df.noun.apply(to_lowercase)
    return df


def lementize_en_text(df):
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    def lemmatize_text(text):
        return [lemmatizer.lemmatize(w, 'n') for w in w_tokenizer.tokenize(text)]

    def lemmatize_text_adj(text):
        return [lemmatizer.lemmatize(w, 'a') for w in w_tokenizer.tokenize(text)]

    df['lemma_noun'] = df.noun.apply(lemmatize_text)
    df['lemma_adj'] = df.adj.apply(lemmatize_text_adj)
    dict_docs = df.lemma_noun.to_dict()
    return df, dict_docs


def lementize_de_text(df):
    lemmatizer = GermaLemma()

    def lemmatize_noun(x):
        arr = []
        token = x.split()
        for i in range(len(token)):
            lemma_noun = lemmatizer.find_lemma(token[i], 'N')
            arr.append(lemma_noun)
        return arr

    def lemmatize_adj(x):
        arr = []
        token = x.split()
        for i in range(len(token)):
            lemma_noun = lemmatizer.find_lemma(token[i], 'ADJ')
            arr.append(lemma_noun)
        return arr

    df['lemma_noun'] = df.noun.apply(lemmatize_noun)
    df['lemma_adj'] = df.adj.apply(lemmatize_adj)
    dict_docs = df.lemma_noun.to_dict()
    return df, dict_docs


def lda_topic_model(df):

    def generate_tokens(tweet):
        words = []
        for i in range(len(tweet)):
            word = tweet[i]
        # using the if condition because we introduced extra spaces during text cleaning
            if word != '':
                words.append(word)
        return words

    def create_dictionary(words):
        return corpora.Dictionary(words)

    def create_document_matrix(tokens, id2word):
        corpus = []
        for text in tokens:
            corpus.append(id2word.doc2bow(text))
        return corpus

    # storing the generated tokens in a new column named 'words'
    df['tokens'] = df.lemma_noun.apply(generate_tokens)

    # passing the dataframe column having tokens as the argument
    id2word = create_dictionary(df.tokens)

    # passing the dataframe column having tokens and dictionary
    corpus = create_document_matrix(df.tokens, id2word)
    return corpus, id2word


def fetch_doc_topic(df, corpus, lda_model, ntopics):
    def get_doc_topic(corpus, model):
        doc_topic = []
        for doc in range(len(corpus)):
            try:
                doc_topic.append(
                    {'Tweet': df.iloc[doc].text, 'Hashtags': df.iloc[doc].hashtags, 'Topic_Probability': model.__getitem__(corpus[doc], eps=0)})
            except Exception as e:
                print(e)
        return doc_topic

    def calc_doc_topic_count(doc_topic, count_arr):
        for topic in doc_topic:
            prob = topic['Topic_Probability']
            big = -99999
            for p in prob:
                if p[1] > big:
                    big = p[1]
                    max_top = p[0]
            count_arr[max_top] += 1
        return count_arr

    doc_topic = get_doc_topic(corpus, lda_model)
    doc_topic_df = pd.DataFrame(doc_topic)
    count_arr = calc_doc_topic_count(doc_topic, [0]*ntopics)
    return doc_topic, doc_topic_df, count_arr


def doc_topic_pie_chart(ntopics, count_arr):
    topics = []
    for i in range(ntopics):
        topics.append(i+1)
    plt.figure(figsize=(8, 8))
    plt.pie(count_arr, labels=topics,
            autopct='%1.1f%%', textprops={'fontsize': 18})
    plt.title('Tweets distribution in topics', fontsize=20)
    # plt.legend(data_pie.tweets_count,fontsize=20)
    plt.show()


def print_beauty(df, word_topic_df):
    for i in range(6, 10):
        print('Tweet:', df.iloc[i].Tweet)
        print('Hashtags:', df.iloc[i].Hashtags)
        print('Topic_Probability:', df.iloc[i].Topic_Probability)
        print("----------------------------------------------------------------\n")

    for i in range(4):
        print("Topic ", (i+1))
        print("Most Probable Word List: ",
              word_topic_df.iloc[i].most_prob_words)
        print("-------------------------------------------------------\n")


def fetch_word_topic(lda_model, id2word):
    def get_topic_to_wordids(model):
        p = list()
        for topicid in range(model.num_topics):
            topic = model.state.get_lambda()[topicid]
            topic = topic / max(topic)  # normalize to probability dist
            p.append(topic)
        return p

    def create_dict_word_topic(id2word, word_topic):
        word_topic_dict = []
        i = 0
        for topic in word_topic:
            prob_words = []
            for word_ind in range(len(topic)):
                if (topic[word_ind] >= 0.15):
                    prob_words.append(id2word[word_ind])
            word_topic_dict.append(
                {'Topic': (i+1), 'most_prob_words': prob_words})
            i += 1
        return word_topic_dict

    word_topic = get_topic_to_wordids(lda_model)
    word_topic_dict = create_dict_word_topic(id2word, word_topic)
    word_topic_df = pd.DataFrame(word_topic_dict)
    return word_topic_dict, word_topic_df


def remove_stopwords(df):
    stop_en = stopwords.words('english')
    stop_de = stopwords.words('german')
    df['tweet_without_stopwords'] = df['text'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in (stop_de)]))
    df['tweet_without_stopwords'] = df['tweet_without_stopwords'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in (stop_en)]))
    return df


def run_bert_topic_model(df, method: str):
    if method == "roberta":
        sen_embed_model = SentenceTransformer(
            'T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
        model = BERTopic(embedding_model=sen_embed_model, nr_topics="auto")
    else:
        model = BERTopic(
            verbose=True, language="multilingual", nr_topics="auto")
    docs = df.tweet_without_stopwords.to_list()
    topics, probabilities = model.fit_transform(docs)
    return model, topics, probabilities

def load_bert_topic_model(filename:str):
    model = BERTopic.load(filename)
    return model


  read_df['text'] = read_df['text'].str.replace('\d+', '')  # removes number


In [3]:
# Hashtag Analysis Functions

def extract_most_frequent_hashtags(df):
    dates = list()

    for row, record in enumerate(df.hashtags):
        if record != '[]' and df['timestamp'][row][0:7] > '2021-05':
            dates.append(df['timestamp'][row][0:7])
    date = list()
    for dt in dates:
        if dt not in date:
            date.append(dt)

    i = 1
    j = 6
    for i in range(1, 3):
        for j in range(1, 13):
            if i == 1 and j > 5 and j < 10 or i == 2 and j < 10:
                globals()[f'hash202{i}_0{j}'] = list()
                globals()[f'hash202{i}_0{j}'].extend([hash for row, hash in enumerate(
                    df['hashtags']) if df['timestamp'][row][0:7] == f'202{i}-0{j}' and hash != '[]'])

            elif i == 1 and j > 5 and j >= 10 or i == 2 and j >= 10:
                globals()[f'hash202{i}_{j}'] = list()
                globals()[f'hash202{i}_{j}'].extend([hash for row, hash in enumerate(
                    df['hashtags']) if df['timestamp'][row][0:7] == f'202{i}-{j}' and hash != '[]'])

    # preprocessing the hashtags list from specific periods
    i = 1
    j = 6
    for i in range(1, 3):
        for j in range(1, 13):
            if i == 1 and j > 5 and j < 10 or i == 2 and j < 10:
                globals()[f'all_hashtags_202{i}_0{j}'] = list()
                for s in globals()[f'hash202{i}_0{j}']:
                    bb = s.split(', ')
                    a = [re.search(r"[\[\'\"]*(\w*)[\]\']*", i).group(1)
                         for i in bb]
                    globals()[f'all_hashtags_202{i}_0{j}'].extend(a)

            elif i == 1 and j > 5 and j >= 10 or i == 2 and j >= 10:
                globals()[f'all_hashtags_202{i}_{j}'] = list()
                for s in globals()[f'hash202{i}_{j}']:
                    bb = s.split(', ')
                    a = [re.search(r"[\[\'\"]*(\w*)[\]\']*", i).group(1)
                         for i in bb]
                    globals()[f'all_hashtags_202{i}_{j}'].extend(a)

    def hashtag_set_finder():
        hashtag_set = set()
        for dt in date:
            i, j = dt[3], dt[5:]
            lis = globals()[f'all_hashtags_202{i}_{j}']
            for i in lis:
                hashtag_set.add(i)
        return hashtag_set
    hashtags_in_DS = hashtag_set_finder()
    hashtags_in_DS2 = {i.lower() for i in hashtags_in_DS}

    def generate_hashtags(dataset):
        listOfAllHashtags = list()
        for row, hash in enumerate(dataset.hashtags):
            if dataset['timestamp'][row][0:10] >= "2021-06-01" and hash != '[]':
                bb = hash.split(', ')

                a = [re.search(r"[\[\']*(\w*)[\]\']*", i).group(1) for i in bb]
                listOfAllHashtags.extend(a)
        listOfAllHashtagsL = [i.lower() for i in listOfAllHashtags]
        return listOfAllHashtagsL
    list_of_all_hashtags = generate_hashtags(df)
    dic_of_mfh = {i: list_of_all_hashtags.count(
        i)/len(list_of_all_hashtags) for i in hashtags_in_DS2 if list_of_all_hashtags.count(i) > 500}
    #input_hashtags = [i[0] for i in list_of_mfh]
    return dic_of_mfh


def hashtag_groups(df):
    representative_hashtags = list()
    model = SentenceTransformer(
        'T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
    dic_of_mfh = extract_most_frequent_hashtags(df)
    input_hashtags = list(dic_of_mfh.keys())
    sentence = input_hashtags
    clusterd_hashtags = list()
    for hash in input_hashtags:
        embedding = model.encode(hash)
        sublist = {hash}
        for j in range(len(input_hashtags)):
            if j < len(input_hashtags)-1:
                if util.pytorch_cos_sim(embedding, model.encode(input_hashtags[j+1])) > 0.5:
                    sublist.add(input_hashtags[j+1])
                    input_hashtags.remove(input_hashtags[j+1])
        clusterd_hashtags.append(sublist)

        clusterd_hashtags.sort(reverse=True, key=lambda i: len(i))
    print("for each cluster a group name has been extracted based on the most frequent hashtag of that group or a Group Representative\n")
    for i in clusterd_hashtags:
        i = list(i)
        group_n = i[0]
        for j in i:
            if dic_of_mfh[j] > dic_of_mfh[group_n]:
                group_n = j
        representative_hashtags.append(group_n)
        print(f"group name  :  {group_n}")
        print(f'{i}\n')
    return representative_hashtags


def tweet_clustering(df, tweet_id, rep_hashtags, model):
  hashclusters = set()
  tweet_hashtags = list()
  hashtags = str(list(df[df.tweet_id == tweet_id].hashtags))
  hashtags = hashtags.split(', ')
  a = [re.search(r"[\[\'\"]*(\w*)[\]\']*", i).group(1) for i in hashtags]
  tweet_hashtags.extend(a)
  #print(a)
  if not tweet_hashtags[0]:
    print("no hashtags found for this tweet")
    return

  for tw_hash in tweet_hashtags:
    embedding = model.encode(tw_hash)
    for rep_hash in rep_hashtags:
        if util.pytorch_cos_sim(embedding, model.encode(rep_hash)) > 0.5:
            hashclusters.add(rep_hash)

  return list(hashclusters)


In [4]:
en_df, de_df, read_df = clean_data("twitter_data.parquet")


In [5]:
read_df

Unnamed: 0,timestamp,tweet_id,conversation_id,author_id,text,retweet_count,reply_count,like_count,quote_count,referenced_tweets,hashtags,tweet_type,tweet_language,tweet_without_stopwords
0,2021-06-20T21:29:24.000Z,1406725899744157698,1406725899744157698,2981738470,Wenn Wirtschaftsjounalistinnen über schreiben...,9,1,62,4,[],['IchbinHanna'],original,de,Wenn Wirtschaftsjounalistinnen schreiben amp D...
1,2021-06-20T16:17:25.000Z,1406647386542325764,1406647386542325764,2981738470,,0,0,1,0,"[{'type': 'quoted', 'id': '1406620276822061057'}]","['IchbinHanna', 'PeerReview']",original,nl,
2,2021-06-18T13:10:36.000Z,1405875593711964166,1405875593711964166,1132055796571877376,Thread about the cruel in German The debate...,5,0,19,0,"[{'type': 'quoted', 'id': '1405846267759054851'}]","['precarity', 'academia', 'IchbinHanna', 'Acad...",original,en,Thread cruel German The debate started days ag...
3,2021-06-18T10:21:31.000Z,1405833045224087555,1405833045224087555,242424959,Liebe und Forsa schön dass ihr euch per Umfr...,0,1,2,0,[],"['Mittelbau', 'IchBinHanna']",original,de,Liebe Forsa schön per Umfrage interessiert Die...
4,2021-06-18T08:35:29.000Z,1405806358335832065,1405806358335832065,1132055796571877376,Wichtiger Thread zu in der der durch die A...,2,0,13,0,"[{'type': 'quoted', 'id': '1405494574533984264'}]","['Machtmissbrauch', 'Wissenschaft', 'prekär', ...",original,de,Wichtiger Thread Arbeitsverträge begünstigt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50901,2021-09-17T14:36:31.000Z,1438874511605125123,1438766234489794560,743696568939753475,I still cant get over how they titled their ...,0,1,0,0,"[{'type': 'replied_to', 'id': '143887433770928...",[],original,en,I still cant get titled youngins Juniorprofess...
50902,2021-08-25T04:56:31.000Z,1430393629651177473,1430093188757401618,1134768684315160577,WissZeitVG Maximaldauer erreicht\n\nEinfach u...,0,0,0,0,"[{'type': 'replied_to', 'id': '143009318875740...",[],original,de,WissZeitVG Maximaldauer erreicht Einfach unfas...
50903,2022-08-13T13:39:47.000Z,1558448232631762944,1558448232631762944,329984940,fellow profs is OUR problem \nSee this collec...,2,0,4,0,"[{'type': 'quoted', 'id': '1556700609155317761'}]",['academicprecarity'],original,en,fellow profs OUR problem See collection info r...
50904,2021-09-02T19:41:11.000Z,1433515364932067338,1433515364932067338,3005636663,beschließt neues Gesetz zur Stärkung der Berl...,1,0,6,0,[],"['Wissenschaft', 'Postdocs', 'Promotionsrecht'...",original,de,beschließt neues Gesetz Stärkung Berliner Daue...


In [6]:
representative_hashtags = hashtag_groups(read_df)

  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_

for each cluster a group name has been extracted based on the most frequent hashtag of that group or a Group Representative

group name  :  ichbinhanna
['ichbinhanna', 'ichbinreyhan']

group name  :  wissenschaft
['wissenschaft']

group name  :  95vswisszeitvg
['95vswisszeitvg']

group name  :  frististfrust
['frististfrust']

group name  :  hannaimbundestag
['hannaimbundestag']



In [9]:
# Text Analysis LDA pipeline
en_topics = 4
de_topics = 5


def run_lda(df, tagger: ht.HanoverTagger, language: str, n_topics: int):
    df = extract_adj_noun(df, tagger, language)
    if language == "english":
        df, dict_docs = lementize_en_text(df)
    else:
        df, dict_docs = lementize_de_text(df)
    corpus, id2word = lda_topic_model(df)
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=42)
    doc_topic, doc_topic_df, count_arr = fetch_doc_topic(
        df, corpus, lda_model, n_topics)
    word_topic_dict, word_topic_df = fetch_word_topic(lda_model, id2word)
    return df, dict_docs, corpus, id2word, lda_model, doc_topic, doc_topic_df, count_arr, word_topic_dict, word_topic_df


In [7]:
# Text Analysis BERTopic pipeline
def run_bert(df, choice: str, model_name: str, embed:str):
    topics = probabilities = None
    if choice == "load":
        model = load_bert_topic_model(model_name)
    else:
        model, topics, probabilities = run_bert_topic_model(df, method=embed)

    return model, topics, probabilities


In [8]:
bert_50, topics_50, prob_50 = run_bert(
    read_df, "load", "BERTopic_50", "no_embed")
bert_roberta_50, topics_roberta_50, prob_roberta_50 = run_bert(
    read_df, "load", "BERTopic_Roberta_50", "no_embed")


In [9]:
def create_topics_df(df):
    sen_model = SentenceTransformer(
        'T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
    hashtag_topics = []
    docs = df.tweet_without_stopwords.to_list()
    doc_info_bert_50 = bert_50.get_document_info(docs)
    doc_info_bert_roberta_50 = bert_roberta_50.get_document_info(docs)
    df['text_topic_bert'] = doc_info_bert_50.Name
    df['text_topic_bert_roberta'] = doc_info_bert_roberta_50.Name
    for i in range(len(df)):
        hashtag_topics.append(tweet_clustering(
            df, df.iloc[i].tweet_id, representative_hashtags, sen_model))
    df['hashtag_topics'] = hashtag_topics
    return df


In [10]:
new_df = create_topics_df(read_df)

  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_

no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtags found for this tweet
no hashtag

In [11]:
new_df.to_parquet("extracted_topics_tweets_dataset.csv", index=False)

In [None]:
# en_df, en_dict_docs, en_corpus, en_id2word, en_lda_model, en_doc_topic, en_doc_topic_df, en_count_arr, en_word_topic_dict, en_word_topic_df = run_lda(
#     en_df, en_tagger, "english", en_topics)

# de_df, de_dict_docs, de_corpus, de_id2word, de_lda_model, de_doc_topic, de_doc_topic_df, de_count_arr, de_word_topic_dict, de_word_topic_df = run_lda(
#     de_df, de_tagger, "german", de_topics)


In [None]:
# print_beauty(en_doc_topic_df, en_word_topic_df)


In [None]:
# print_beauty(de_doc_topic_df, de_word_topic_df)


In [165]:
# model.save('BERTopic_Roberta_50')



Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [56]:
index = read_df[read_df["tweet_id"] == 1405806358335832065].index.values[0]
index

4

In [57]:
doc_info_bert_roberta_50.iloc[index].Name


'0_wisszeitvg_ich_das_die'