In [None]:
%pip install HanTa
%pip install lda
%pip install pyldavis
%pip install germalemma == 0.1.3


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer, util

# import lda
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from HanTa import HanoverTagger as ht
from nltk.corpus import stopwords
from germalemma import GermaLemma
from bertopic import BERTopic
import pyLDAvis.gensim_models
import pyLDAvis
from bertopic import BERTopic

pyLDAvis.enable_notebook()

nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt")
nltk.download("stopwords")
en_tagger = ht.HanoverTagger("morphmodel_en.pgz")
de_tagger = ht.HanoverTagger("morphmodel_ger.pgz")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to C:\Users\Er Pravin
[nltk_data]     Pandey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Er Pravin
[nltk_data]     Pandey\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Er Pravin
[nltk_data]     Pandey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Er Pravin
[nltk_data]     Pandey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Text Analysis Functions


def clean_data(filename: str):
    """
    Method to preprocess tweet data by removing emojis, mentions, tags and stopwords.
        :params: `filename (str)`: Data file name to be processed.
        :return: 
                 `en_df`: tweets dataframe of english language
                 `de_df`: tweets dataframe of german language
                 `read_df`: tweets dataframe of all languages (Entire Corpus)
    """

    def remove_emojis(data):
        """
        Method to remove emojis from tweets
            :params: `data (str)`: text of tweet data.
            :return: `data (str)`: text of tweet data without emojis.
        """
        emoj = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "\U00002500-\U00002BEF"  # chinese char
            "\U00002702-\U000027B0"
            "\U00002702-\U000027B0"
            "\U000024C2-\U0001F251"
            "\U0001f926-\U0001f937"
            "\U00010000-\U0010ffff"
            "\u2640-\u2642"
            "\u2600-\u2B55"
            "\u200d"
            "\u23cf"
            "\u23e9"
            "\u231a"
            "\ufe0f"  # dingbats
            "\u3030"
            "]+",
            re.UNICODE,
        )
        return re.sub(emoj, "", data)

    # Removing mentions and hashtags and url
    def remove_mentions_and_tags(text):
        """
        Method to remove mentions and tags from tweets
            :params: `text (str)`: text of tweet data.
            :return: `text (str)`: text tweet data without mentions and tags.
        """
        text = re.sub(r"@\S*", "", text)
        text = re.sub(r"http\S+", "", text)
        return re.sub(r"#\S*", "", text)

    def remove_stopwords(df):
        """
        Method to remove mentions and tags from tweets
            :params: `df (Dataframe)`: Dataframe object of tweet data.
            :return: `df (Dataframe)`: Dataframe object with new column `tweet_without_stopwords`
                      containing tweet data without stopwords.
        """
        stop_en = stopwords.words("english")
        stop_de = stopwords.words("german")
        df["tweet_without_stopwords"] = df["text"].apply(
            lambda x: " ".join([word for word in x.split() if word not in (stop_de)]))
        df["tweet_without_stopwords"] = df["tweet_without_stopwords"].apply(
            lambda x: " ".join(
                [word for word in x.split() if word not in (stop_en)])
        )
        return df

    read_df = pd.read_parquet(filename)
    read_df = read_df.loc[read_df["tweet_type"] != "retweet"]
    read_df = read_df.drop_duplicates(subset=["tweet_id"], ignore_index=True)
    read_df.text = read_df.text.apply(remove_emojis)
    read_df.text = read_df.text.apply(remove_mentions_and_tags)
    read_df["text"] = read_df["text"].str.replace(
        r"[^\w\s]+", "")  # remove punctuations
    read_df["text"] = read_df["text"].str.replace("\d+", "")  # removes number
    read_df = remove_stopwords(read_df)
    en_df = read_df[read_df.tweet_language == "en"]
    de_df = read_df[read_df.tweet_language == "de"]
    return en_df, de_df, read_df


def extract_adj_noun(df, tagger: ht.HanoverTagger):
    """
    Method to remove mentions and tags from tweets
        :params: 
                 `df (Dataframe)`: Dataframe object of tweet data.
                 `tagger (HanoverTagger)`: HanTa Tagger object to extraxt adjectives and noun from text.
        :return: `df (Dataframe)`: Dataframe object with new columns `adj` and `noun` containing extracted adjectives and nouns.
    """

    def noun(text, tagger):
        """
        Method to extract nouns
        :params: 
                 `text (str)`: text of tweet data.
                 `tagger (HanoverTagger)`: HanTa Tagger object to extraxt adjectives and noun from text.
        :return: `tokens (array)`: extracted nouns.
        """
        words = nltk.word_tokenize(text)
        tokens = [word for (word, x, pos) in tagger.tag_sent(
            words, taglevel=1) if pos == "NN"]
        return tokens

    def adj(text, tagger):
        """
        Method to extract adjectives
        :params: 
                 `text (str)`: text of tweet data.
                 `tagger (HanoverTagger)`: HanTa Tagger object to extraxt adjectives and noun from text.
        :return: `tokens (array)`: extracted adjectives.
        """
        words = nltk.word_tokenize(text)
        tokens = [word for (word, x, pos) in tagger.tag_sent(
            words, taglevel=1) if pos == "ADJ"]
        return tokens

    def to_lowercase(text):
        """
        Method to chnage text case to lowercase
        :params: 
                 `text (str)`: text of tweet data.
        :return: `tokens (array)`: lower case tokens.
        """
        token = []
        for i in range(len(text)):
            val = text[i].lower()
            token.append(val)
        return token
    
    def word_token(x):
        return ' '.join([w for w in x])

    df["noun"] = df.tweet_without_stopwords.apply(noun, tagger=tagger)
    df["adj"] = df.tweet_without_stopwords.apply(adj, tagger=tagger)
    df.noun = df.noun.apply(to_lowercase)
    df.noun = df.noun.apply(word_token)
    df.adj = df.adj.apply(word_token)
    return df


def lementize_en_text(df):
    """
    Method to Lementize English Text
        :params: 
                 `df (Dataframe)`: Dataframe object of tweet data.
        :return: 
                 `df (Dataframe)`: Dataframe object with updated `adj` and `noun` containing lemantized adjectives and nouns.
                 `dict_docs (Dict)`: Dictionary of lemantized noun from text.
    """
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    def lemmatize_text(text):
        return [lemmatizer.lemmatize(w, "n") for w in w_tokenizer.tokenize(text)]

    def lemmatize_text_adj(text):
        return [lemmatizer.lemmatize(w, "a") for w in w_tokenizer.tokenize(text)]

    df["lemma_noun"] = df.noun.apply(lemmatize_text)
    df["lemma_adj"] = df.adj.apply(lemmatize_text_adj)
    dict_docs = df.lemma_noun.to_dict()
    return df, dict_docs


def lementize_de_text(df):
    """
    Method to Lementize German Text
        :params: 
                 `df (Dataframe)`: Dataframe object of tweet data.
        :return: 
                 `df (Dataframe)`: Dataframe object with updated `adj` and `noun` containing lemantized adjectives and nouns.
                 `dict_docs (Dict)`: Dictionary of lemantized noun from text.
    """
    lemmatizer = GermaLemma()

    def lemmatize_noun(x):
        arr = []
        token = x.split()
        for i in range(len(token)):
            lemma_noun = lemmatizer.find_lemma(token[i], "N")
            arr.append(lemma_noun)
        return arr

    def lemmatize_adj(x):
        arr = []
        token = x.split()
        for i in range(len(token)):
            lemma_noun = lemmatizer.find_lemma(token[i], "ADJ")
            arr.append(lemma_noun)
        return arr

    df["lemma_noun"] = df.noun.apply(lemmatize_noun)
    df["lemma_adj"] = df.adj.apply(lemmatize_adj)
    dict_docs = df.lemma_noun.to_dict()
    return df, dict_docs


def gen_doc_matrix(df):
    """
    Method to generate Document Matrix and ID to Word Matrix.
        :params: 
                 `df (Dataframe)`: Dataframe object of tweet data.
        :return: 
                 `corpus (Dict)`: Document token Matrix.
                 `id2word (Dict)`: ID represenatation of words Dictionary.
    """
    def generate_tokens(tweet):
        """
        Method to generate word array (tokens)
        :params: 
                 `tweet (str)`: tweet text.
        :return: 
                 `words (array)`: words array.
        """
        words = []
        for i in range(len(tweet)):
            word = tweet[i]
            # using the if condition because we introduced extra spaces during text cleaning
            if word != "":
                words.append(word)
        return words

    def create_dictionary(words):
        return corpora.Dictionary(words)

    def create_document_matrix(tokens, id2word):
        """
        Method to generate document matrix
        :params: 
                 `tokens (array)`: words array.
                 `id2word (Dict)`: ID represenatation of words Dictionary.
        :return: 
                 `corpus (Dict)`: document token matrix.
        """
        corpus = []
        for text in tokens:
            corpus.append(id2word.doc2bow(text))
        return corpus

    # storing the generated tokens in a new column named 'words'
    df["tokens"] = df.lemma_noun.apply(generate_tokens)

    # passing the dataframe column having tokens as the argument
    id2word = create_dictionary(df.tokens)

    # passing the dataframe column having tokens and dictionary
    corpus = create_document_matrix(df.tokens, id2word)
    return corpus, id2word


def fetch_doc_topic(df, corpus, lda_model, ntopics):
    """
    Method to generate Document Topic Matrix.
        :params: 
                 `df (Dataframe)`: Dataframe object of tweet data.
                 `corpus (Dict)`: document token Matrix.
                 `lda_model (Gensim LDA Object)`: Lda model object.
                 `ntopics (int)`: number of topics
        :return: 
                 `doc_topic (Dict)`: document Topic matrix.
                 `doc_topic_df (Dataframe)`: document Topic matrix Dataframe.
                 `count_arr (array)`: Count of documents in each topic.
    """
    def get_doc_topic(corpus, model):
        """
        Method to generate document topic matrix
        :params: 
                 `corpus (Dict)`: document token Matrix.
                 `model (Gensim LDA Object)`: Lda model object.
        :return: 
                 `doc_topic (array)`: document Topic matrix.
        """
        doc_topic = []
        for doc in range(len(corpus)):
            try:
                doc_topic.append(
                    {
                        "Tweet": df.iloc[doc].text,
                        "Hashtags": df.iloc[doc].hashtags,
                        "Topic_Probability": model.__getitem__(corpus[doc], eps=0),
                    }
                )
            except Exception as e:
                print(e)
        return doc_topic

    def calc_doc_topic_count(doc_topic, count_arr):
        """
        Method to generate document topic count
        :params: 
                 `doc_topic (Dict)`: document Topic matrix.
                 `count_arr (array)`: placeholder array of length ntopics.
        :return: 
                 `count_arr (array)`: Count of documents in each topic.
        """
        for topic in doc_topic:
            prob = topic["Topic_Probability"]
            big = -99999
            for p in prob:
                if p[1] > big:
                    big = p[1]
                    max_top = p[0]
            count_arr[max_top] += 1
        return count_arr

    doc_topic = get_doc_topic(corpus, lda_model)
    doc_topic_df = pd.DataFrame(doc_topic)
    count_arr = calc_doc_topic_count(doc_topic, [0] * ntopics)
    return doc_topic, doc_topic_df, count_arr


def doc_topic_pie_chart(ntopics, count_arr):
    """
    Method to generate Pie Chart to display document segregation.
        :params: 
                 `ntopics (int)`: number of topics.
                 `count_arr (array)`: Count of documents in each topic.
    """
    topics = []
    for i in range(ntopics):
        topics.append(i + 1)
    plt.figure(figsize=(8, 8))
    plt.pie(count_arr, labels=topics, autopct="%1.1f%%",
            textprops={"fontsize": 18})
    plt.title("Tweets distribution in topics", fontsize=20)
    # plt.legend(data_pie.tweets_count,fontsize=20)
    plt.show()

def fetch_word_topic(lda_model, id2word):
    """
    Method to generate Document Topic Matrix.
        :params: 
                 `lda_model (Gensim LDA Object)`: Lda model object.
                 `id2word (Dict)`: ID represenatation of words Dictionary.
        :return: 
                 `word_topic_dict (Dict)`: Word Topic matrix.
                 `word_topic_df (Dataframe)`: Word Topic matrix Dataframe.
    """
    def get_topic_to_wordids(model):
        """
        Method to generate word topic probability array.
        :params: 
                 `model (Gensim LDA Object)`: Lda model object.
        :return: 
                 `p (array)`: word topic probability array.
        """
        p = list()
        for topicid in range(model.num_topics):
            topic = model.state.get_lambda()[topicid]
            topic = topic / max(topic)  # normalize to probability dist
            p.append(topic)
        return p

    def create_dict_word_topic(id2word, word_topic):
        """
        Method to generate most probable word topic matrix.
        :params: 
                 `id2word (Dict)`: ID represenatation of words Dictionary.
                 `word_topic (array)`: word topic probability array.

        :return: 
                 `word_topic_dict (Dict)`: Most probable word topic matrix.
        """
        word_topic_dict = []
        i = 0
        for topic in word_topic:
            prob_words = []
            for word_ind in range(len(topic)):
                if topic[word_ind] >= 0.15:
                    prob_words.append(id2word[word_ind])
            word_topic_dict.append(
                {"Topic": (i + 1), "most_prob_words": prob_words})
            i += 1
        return word_topic_dict

    word_topic = get_topic_to_wordids(lda_model)
    word_topic_dict = create_dict_word_topic(id2word, word_topic)
    word_topic_df = pd.DataFrame(word_topic_dict)
    return word_topic_dict, word_topic_df

def print_beauty(df, word_topic_df):
    """
    Custom method to display topic probabilities for certain tweets from df.
        :params: 
                 `df (Dataframe)`: tweet dataset dataframe.
                 `word_topic_df (Dataframe)`: Probable word list for each tweet.
    """
    for i in range(6, 10):
        print("Tweet:", df.iloc[i].Tweet)
        print("Hashtags:", df.iloc[i].Hashtags)
        print("Topic_Probability:", df.iloc[i].Topic_Probability)
        print("----------------------------------------------------------------\n")

    for i in range(4):
        print("Topic ", (i + 1))
        print("Most Probable Word List: ",
              word_topic_df.iloc[i].most_prob_words)
        print("-------------------------------------------------------\n")

def run_bert_topic_model(df, method: str ="default", filename: str = "BERTopic_model"):
    """
    Method to run BERTopic model package and fit documents to it.
        :params: 
                 `df (Dataframe)`: tweet dataset dataframe.
                 `method (str)`: Sentence Embedding name to be used. `roberta` or `default`
        :return: 
                 `model (BERTopic model)`: Document fitted BERTopic model object.
                 `topics (Dict)`: Topic matrix.
                 `probabilities (Dict)`: Document topic probabilities matrix.
    """
    if method == "roberta":
        sen_embed_model = SentenceTransformer(
            "T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
        model = BERTopic(embedding_model=sen_embed_model, nr_topics="auto")
    elif method == 'default':
        model = BERTopic(
            verbose=True, language="multilingual", nr_topics="auto")
    docs = df.tweet_without_stopwords.to_list()
    topics, probabilities = model.fit_transform(docs)
    model.reduce_topics(docs, nr_topics=50)
    try:
        model.save(filename)
    except Exception as e:
        print(f"BERTopic model file selected already exists in storage. Unable to save model.\n Error: {e}")
    return model, topics, probabilities


def load_bert_topic_model(filename: str):
    """
    Method to load document fitted exsiting BERTopic model.
        :params: 
                 `filename (str)`: File name of the BERTopic model.
        :return: 
                 `model (BERTopic model)`: Document fitted BERTopic model object.
    """
    try:
        model = BERTopic.load(filename)
    except Exception as e:
        print(f"BERTopic model file not found.\n Error: {e}")
        model = None
    return model


  read_df["text"] = read_df["text"].str.replace("\d+", "")  # removes number


In [3]:
# Hashtag Analysis Functions


def extract_most_frequent_hashtags(df):
    dates = list()

    for row, record in enumerate(df.hashtags):
        if record != "[]" and df["timestamp"][row][0:7] > "2021-05":
            dates.append(df["timestamp"][row][0:7])
    date = list()
    for dt in dates:
        if dt not in date:
            date.append(dt)

    i = 1
    j = 6
    for i in range(1, 3):
        for j in range(1, 13):
            if i == 1 and j > 5 and j < 10 or i == 2 and j < 10:
                globals()[f"hash202{i}_0{j}"] = list()
                globals()[f"hash202{i}_0{j}"].extend(
                    [hash for row, hash in enumerate(df["hashtags"]) if df["timestamp"][row][0:7] == f"202{i}-0{j}" and hash != "[]"]
                )

            elif i == 1 and j > 5 and j >= 10 or i == 2 and j >= 10:
                globals()[f"hash202{i}_{j}"] = list()
                globals()[f"hash202{i}_{j}"].extend(
                    [hash for row, hash in enumerate(df["hashtags"]) if df["timestamp"][row][0:7] == f"202{i}-{j}" and hash != "[]"]
                )

    # preprocessing the hashtags list from specific periods
    i = 1
    j = 6
    for i in range(1, 3):
        for j in range(1, 13):
            if i == 1 and j > 5 and j < 10 or i == 2 and j < 10:
                globals()[f"all_hashtags_202{i}_0{j}"] = list()
                for s in globals()[f"hash202{i}_0{j}"]:
                    bb = s.split(", ")
                    a = [re.search(r"[\[\'\"]*(\w*)[\]\']*", i).group(1) for i in bb]
                    globals()[f"all_hashtags_202{i}_0{j}"].extend(a)

            elif i == 1 and j > 5 and j >= 10 or i == 2 and j >= 10:
                globals()[f"all_hashtags_202{i}_{j}"] = list()
                for s in globals()[f"hash202{i}_{j}"]:
                    bb = s.split(", ")
                    a = [re.search(r"[\[\'\"]*(\w*)[\]\']*", i).group(1) for i in bb]
                    globals()[f"all_hashtags_202{i}_{j}"].extend(a)

    def hashtag_set_finder():
        hashtag_set = set()
        for dt in date:
            i, j = dt[3], dt[5:]
            lis = globals()[f"all_hashtags_202{i}_{j}"]
            for i in lis:
                hashtag_set.add(i)
        return hashtag_set

    hashtags_in_DS = hashtag_set_finder()
    hashtags_in_DS2 = {i.lower() for i in hashtags_in_DS}

    def generate_hashtags(dataset):
        listOfAllHashtags = list()
        for row, hash in enumerate(dataset.hashtags):
            if dataset["timestamp"][row][0:10] >= "2021-06-01" and hash != "[]":
                bb = hash.split(", ")

                a = [re.search(r"[\[\']*(\w*)[\]\']*", i).group(1) for i in bb]
                listOfAllHashtags.extend(a)
        listOfAllHashtagsL = [i.lower() for i in listOfAllHashtags]
        return listOfAllHashtagsL

    list_of_all_hashtags = generate_hashtags(df)
    dic_of_mfh = {
        i: list_of_all_hashtags.count(i) / len(list_of_all_hashtags) for i in hashtags_in_DS2 if list_of_all_hashtags.count(i) > 20
    }
    # input_hashtags = [i[0] for i in list_of_mfh]
    return dic_of_mfh


def hashtag_groups(df):
    representative_hashtags = list()
    model = SentenceTransformer("T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
    dic_of_mfh = extract_most_frequent_hashtags(df)
    input_hashtags = list(dic_of_mfh.keys())
    sentence = input_hashtags
    clusterd_hashtags = list()
    for hash in input_hashtags:
        embedding = model.encode(hash)
        sublist = {hash}
        for j in range(len(input_hashtags)):
            if j < len(input_hashtags) - 1:
                if util.pytorch_cos_sim(embedding, model.encode(input_hashtags[j + 1])) > 0.5:
                    sublist.add(input_hashtags[j + 1])
                    input_hashtags.remove(input_hashtags[j + 1])
        clusterd_hashtags.append(sublist)

        clusterd_hashtags.sort(reverse=True, key=lambda i: len(i))
    print("for each cluster a group name has been extracted based on the most frequent hashtag of that group or a Group Representative\n")
    for i in clusterd_hashtags:
        i = list(i)
        group_n = i[0]
        for j in i:
            if dic_of_mfh[j] > dic_of_mfh[group_n]:
                group_n = j
        representative_hashtags.append(group_n)
        print(f"group name  : {group_n}")
        print(f"{i}\n")
    return representative_hashtags


def tweet_clustering(df, tweet_id, rep_hashtags, model):
  hashclusters = set()
  tweet_hashtags = list()
  hashtags = str(list(df[df.tweet_id == tweet_id].hashtags))
  hashtags = hashtags.split(', ')
  a = [re.search(r"[\[\'\"]*(\w*)[\]\']*", i).group(1) for i in hashtags]
  tweet_hashtags.extend(a)
  #print(a)
  if not tweet_hashtags[0]:
    print("no hashtags found for this tweet")
    return []

  for tw_hash in tweet_hashtags:
    embedding = model.encode(tw_hash)
    highestsim=0
    hash=""
    for rep_hash in rep_hashtags:
        sim=util.pytorch_cos_sim(embedding, model.encode(rep_hash))
        if  sim> 0.5 and sim>highestsim:
          highestsim=sim
          hash=rep_hash
    if hash != "":
        hashclusters.add(hash)       
  if not hashclusters:
    print(f"hashtags in tweet {tweet_id} are very infrequent and disimilar to the founded groups.")
    hashclusters.add("no_groups")
  return list(hashclusters)


In [4]:
en_df, de_df, read_df = clean_data("twitter_data.parquet")


In [5]:
read_df


Unnamed: 0,timestamp,tweet_id,conversation_id,author_id,text,retweet_count,reply_count,like_count,quote_count,referenced_tweets,hashtags,tweet_type,tweet_language,tweet_without_stopwords
0,2021-06-20T21:29:24.000Z,1406725899744157698,1406725899744157698,2981738470,Wenn Wirtschaftsjounalistinnen über schreiben...,9,1,62,4,[],['IchbinHanna'],original,de,Wenn Wirtschaftsjounalistinnen schreiben amp D...
1,2021-06-20T16:17:25.000Z,1406647386542325764,1406647386542325764,2981738470,,0,0,1,0,"[{'type': 'quoted', 'id': '1406620276822061057'}]","['IchbinHanna', 'PeerReview']",original,nl,
2,2021-06-18T13:10:36.000Z,1405875593711964166,1405875593711964166,1132055796571877376,Thread about the cruel in German The debate...,5,0,19,0,"[{'type': 'quoted', 'id': '1405846267759054851'}]","['precarity', 'academia', 'IchbinHanna', 'Acad...",original,en,Thread cruel German The debate started days ag...
3,2021-06-18T10:21:31.000Z,1405833045224087555,1405833045224087555,242424959,Liebe und Forsa schön dass ihr euch per Umfr...,0,1,2,0,[],"['Mittelbau', 'IchBinHanna']",original,de,Liebe Forsa schön per Umfrage interessiert Die...
4,2021-06-18T08:35:29.000Z,1405806358335832065,1405806358335832065,1132055796571877376,Wichtiger Thread zu in der der durch die A...,2,0,13,0,"[{'type': 'quoted', 'id': '1405494574533984264'}]","['Machtmissbrauch', 'Wissenschaft', 'prekär', ...",original,de,Wichtiger Thread Arbeitsverträge begünstigt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50901,2021-09-17T14:36:31.000Z,1438874511605125123,1438766234489794560,743696568939753475,I still cant get over how they titled their ...,0,1,0,0,"[{'type': 'replied_to', 'id': '143887433770928...",[],original,en,I still cant get titled youngins Juniorprofess...
50902,2021-08-25T04:56:31.000Z,1430393629651177473,1430093188757401618,1134768684315160577,WissZeitVG Maximaldauer erreicht\n\nEinfach u...,0,0,0,0,"[{'type': 'replied_to', 'id': '143009318875740...",[],original,de,WissZeitVG Maximaldauer erreicht Einfach unfas...
50903,2022-08-13T13:39:47.000Z,1558448232631762944,1558448232631762944,329984940,fellow profs is OUR problem \nSee this collec...,2,0,4,0,"[{'type': 'quoted', 'id': '1556700609155317761'}]",['academicprecarity'],original,en,fellow profs OUR problem See collection info r...
50904,2021-09-02T19:41:11.000Z,1433515364932067338,1433515364932067338,3005636663,beschließt neues Gesetz zur Stärkung der Berl...,1,0,6,0,[],"['Wissenschaft', 'Postdocs', 'Promotionsrecht'...",original,de,beschließt neues Gesetz Stärkung Berliner Daue...


In [6]:
# Text Analysis LDA pipeline
en_topics = 4
de_topics = 5


def run_lda(df, tagger: ht.HanoverTagger, language: str, n_topics: int):
    """
    LDA Topic modeling text analysis pipeline.
        :params: 
                 `df (Dataframe)`: Dataframe object of tweet data of specific language.
                 `tagger (HanoverTagger)`: HanTa Tagger object to extraxt adjectives and noun from text.
                 `language (str)`: Language of corpus. `english` or `german`
                 `ntopics (int)`: number of topics
        :return: 
                 `df (Dataframe)`: Dataframe object of tweet data of specific language with adjectives, noun and token columns.
                 `dict_docs (Dict)`: Dictionary of lemantized noun from text.
                 `corpus (Dict)`: Document token Matrix.
                 `id2word (Dict)`: ID represenatation of words Dictionary.
                 `lda_model (Gensim LDA Object)`: Lda model object.
                 `doc_topic (Dict)`: document Topic matrix.
                 `doc_topic_df (Dataframe)`: document Topic matrix Dataframe.
                 `count_arr (array)`: Count of documents in each topic.
                 `word_topic_dict (Dict)`: Word Topic matrix.
                 `word_topic_df (Dataframe)`: Word Topic matrix Dataframe.
    """
    df = extract_adj_noun(df, tagger)
    if language == "english":
        df, dict_docs = lementize_en_text(df)
    elif language == "german":
        df, dict_docs = lementize_de_text(df)
    corpus, id2word = gen_doc_matrix(df)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, random_state=42)
    doc_topic, doc_topic_df, count_arr = fetch_doc_topic(df, corpus, lda_model, n_topics)
    word_topic_dict, word_topic_df = fetch_word_topic(lda_model, id2word)
    return df, dict_docs, corpus, id2word, lda_model, doc_topic, doc_topic_df, count_arr, word_topic_dict, word_topic_df


In [7]:
en_df, en_dict_docs, en_corpus, en_id2word, en_lda_model, en_doc_topic, en_doc_topic_df, en_count_arr, en_word_topic_dict, en_word_topic_df = run_lda(
    en_df, en_tagger, "english", en_topics)

de_df, de_dict_docs, de_corpus, de_id2word, de_lda_model, de_doc_topic, de_doc_topic_df, de_count_arr, de_word_topic_dict, de_word_topic_df = run_lda(
    de_df, de_tagger, "german", de_topics)


In [13]:
#LDA for german and english text together
en_de_arr= [en_df, de_df]
en_de_lda_df = pd.concat(en_de_arr)

#performing lda here on a combined dataset
en_de_corpus, en_de_id2word = gen_doc_matrix(en_de_lda_df)
en_de_lda_model = gensim.models.ldamodel.LdaModel(corpus=en_de_corpus, id2word=en_de_id2word, num_topics=3, random_state=42)
en_de_doc_topic, en_de_doc_topic_df, en_de_count_arr = fetch_doc_topic(en_de_lda_df, en_de_corpus, en_de_lda_model, 3)
en_de_word_topic_dict, en_de_word_topic_df = fetch_word_topic(en_de_lda_model, en_de_id2word)

In [14]:
en_de_vis = pyLDAvis.gensim_models.prepare(en_de_lda_model,
                                     en_de_corpus,
                                     dictionary=en_de_lda_model.id2word)
en_de_vis

In [16]:
print_beauty(en_de_doc_topic_df, en_de_word_topic_df)

Tweet: Like so many  Ive learned that I am a good enough researcher to stay in academia but what the  does is not to select the best candidates It also does not train researchers to become the best candidates deserving of permanent jobs
Hashtags: ['IchbinHanna', 'WissZeitVG']
Topic_Probability: [(0, 0.22442107), (1, 0.086784974), (2, 0.68879396)]
----------------------------------------------------------------

Tweet: For those who dont read German

The response by the ministry of education to the outpouring of stories and discussions regarding academic precarity  is unbelievably patronising and disrespectful
Hashtags: ['IchBinHanna']
Topic_Probability: [(0, 0.4263605), (1, 0.044729613), (2, 0.52890986)]
----------------------------------------------------------------

Tweet: So  proudly explains the advantages of the  to early carrer researchers as if they were preschool children This law is a great obstacle to many brilliant young minds and threatens their future  nothing to be proud

IndexError: single positional indexer is out-of-bounds

In [None]:
print_beauty(en_doc_topic_df, en_word_topic_df)


Tweet: Like so many  Ive learned that I am a good enough researcher to stay in academia but what the  does is not to select the best candidates It also does not train researchers to become the best candidates deserving of permanent jobs
Hashtags: ['IchbinHanna', 'WissZeitVG']
Topic_Probability: [(0, 0.064213924), (1, 0.06291898), (2, 0.80872524), (3, 0.06414183)]
----------------------------------------------------------------

Tweet: For those who dont read German

The response by the ministry of education to the outpouring of stories and discussions regarding academic precarity  is unbelievably patronising and disrespectful
Hashtags: ['IchBinHanna']
Topic_Probability: [(0, 0.73760694), (1, 0.033181116), (2, 0.19639003), (3, 0.032821883)]
----------------------------------------------------------------

Tweet: So  proudly explains the advantages of the  to early carrer researchers as if they were preschool children This law is a great obstacle to many brilliant young minds and threate

In [None]:
print_beauty(de_doc_topic_df, de_word_topic_df)


Tweet: Wichtiger Thread  zur Fehlwahrnehmung wissenschaftlicher Arbeit in der Öffentlichkeit Wissenschaft als von der Allgemeinheit bezahltes Hobby 
   
Hashtags: ['IchbinHanna', 'Wissenschaft', 'WissZeitVG']
Topic_Probability: [(0, 0.60016865), (1, 0.025433093), (2, 0.025454503), (3, 0.025494363), (4, 0.3234494)]
----------------------------------------------------------------

Tweet:       
Hashtags: ['IchbinHanna', 'Ausbeutung', 'Arbeitsrecht']
Topic_Probability: [(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
----------------------------------------------------------------

Tweet: Letzter Punkt Gerade die VWL mit ihren Engführungen amp der Marginalisierung heterodoxer Strömungen zeigt dass es auch strukturelle Probleme amp Pfadabhängigkeiten in der Selbstverwaltung der Wissenschaften gibt die mit mehr Dauerstellen nur noch weiter zementiert werden 
Hashtags: []
Topic_Probability: [(0, 0.017720742), (1, 0.017705793), (2, 0.018131433), (3, 0.9287428), (4, 0.017699191)]
-----------

In [8]:
# Text Analysis BERTopic pipeline
def run_bert(df, model_name: str ="BERTopic_model", choice: str= "train", embed: str ="default"):
    """
    Method to run BERTopic topic modeling pipeline.
        :params: 
                 `df (Dataframe)`: tweet dataset dataframe.
                 `choice (str)`: Train or load BERTopic model. `train` or `load`
                 `model_name (str)`: File name of the BERTopic model to be loaded or saved as after training.
                 `embed (str)`: Sentence Embedding name to be used. `roberta` or `default`
        :return: 
                 `model (BERTopic model)`: Document fitted BERTopic model object.
                 `topics (Dict)`: Topic matrix.
                 `probabilities (Dict)`: Document topic probabilities matrix.
    """
    model = topics = probabilities = None
    if choice == "load":
        model = load_bert_topic_model(model_name)
    elif choice == 'train':
        model, topics, probabilities = run_bert_topic_model(df, method=embed)

    return model, topics, probabilities


In [9]:
# bert_50, topics_50, prob_50 = run_bert(read_df, "BERTopic_50", "load", "default")
bert_roberta_50, topics_roberta_50, prob_roberta_50 = run_bert(read_df, "BERTopic_Roberta_50",  "load", "default")


In [10]:
def create_topics_df(df, representative_hashtags, bert):
    """
    Method to create new dataframe with topics extracted from text and hashtags.
        :params: 
                 `df (Dataframe)`: tweet dataset dataframe.
                 `representative_hashtags (Dict)`: Hashtag Cluster Matrix with a representative name for each cluster.
                 `bert (BERTopic model)`: Document fitted BERTopic model object with defualt embedding.
        :return: 
                 `df (Dataframe)`: New tweet dataset dataframe with extracted topics.
    """
    sen_model = SentenceTransformer("T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
    hashtag_topics = []
    docs = df.tweet_without_stopwords.to_list()
    doc_info_bert = bert.get_document_info(docs)
    df["text_topic"] = doc_info_bert.Name
    for i in range(len(df)):
        hashtag_topics.append(tweet_clustering(df, df.iloc[i].tweet_id, representative_hashtags, sen_model))
    df["hashtag_topics"] = hashtag_topics
    return df


def fetch_topics_tweet_id(df, tweet_id: int, representative_hashtags, bert):
    """
    Method to create new dataframe with topics extracted from text and hashtags.
        :params: 
                 `df (Dataframe)`: tweet dataset dataframe.
                 `representative_hashtags (Dict)`: Hashtag Cluster Matrix with a representative name for each cluster.
                 `bert (BERTopic model)`: Document fitted BERTopic model object with defualt embedding.
        :return: 
                 `doc_info_bert (str)`: Text Topic assigned to Tweet.
                 `hashtag_topics (list)`: Hashtag Topics assigned to Tweet.
    """
    sen_model = SentenceTransformer("T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
    docs = df.tweet_without_stopwords.to_list()
    doc_info_bert = bert.get_document_info(docs)
    index = df[df["tweet_id"] == tweet_id].index.values[0]
    hashtag_topics = tweet_clustering(df, tweet_id, representative_hashtags, sen_model)
    return doc_info_bert.iloc[index].Name, hashtag_topics


In [11]:
# representative_hashtags = hashtag_groups(read_df)
rep_hash = pd.read_parquet("rep_hash.parquet")
representative_hashtags = rep_hash.group_name.to_numpy()
representative_hashtags

array(['wisszeitvg', 'ichbinhanna', 'dauerstellen',
       'weilwirwissenschaftlieben', 'academictwitter', 'streik',
       'ichbinhannaat', 'wissenschaftsfreiheit', 'academia', 'diversity',
       'cdu', 'hochschulen', 'phd', 'bmbf', 'ugnovelle',
       'waspostdocswollen', 'ichbinreyhan', 'sachsenanhalt',
       'gegenwisszeitvg10', 'hochschule', 'hannastreikt',
       'zukunftsvertrag', '95vswisszeitvg', 'getorganized',
       'hannaimbundestag', 'mlunterfinanziert', 'hitzefrei',
       'koalitionsvertrag', 'ausbeutung', 'drittmittel', 'unverzichtbar',
       'wiko22', 'pandemie', 'firstgen', 'corona', 'maithinkx',
       'ichwerdehannasein', 'bundestagswahl', 'oneofusallofus',
       'überstunden', 'innovation', 'stopthecuts', 'fuckademia',
       'wisssystemfehler', 'bundestag', 'einkaufszentrum',
       'bastaprecariatodistato', 'bafög50', 'rassismus',
       'speakupostbelgien', 'frististfrust'], dtype=object)

In [None]:
# new_df = create_topics_df(read_df, representative_hashtags, bert_roberta_50)
# new_df.to_parquet("tweets_analysis_dataset.parquet", index=False)


In [12]:
text_top, hashtag_topic = fetch_topics_tweet_id(read_df, 1405806358335832065, representative_hashtags, bert_roberta_50)
text_top, hashtag_topic

  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_hostname)
  context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
  match_hostname(cert, asserted_

('0_wisszeitvg_ich_das_die',
 ['95vswisszeitvg',
  'bastaprecariatodistato',
  'ausbeutung',
  'wissenschaftsfreiheit',
  'ichbinhanna',
  'wisszeitvg'])

In [13]:
read_df.iloc[read_df[read_df["tweet_id"] == 1405806358335832065].index.values[0]].hashtags

"['Machtmissbrauch', 'Wissenschaft', 'prekär', 'IchbinHanna', 'WissZeitVG', '95vsWissZeitVG']"