In [1]:
import logging
import pickle
import re
import string
from pathlib import Path

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models
from gensim.models import CoherenceModel
from gsdmm import MovieGroupProcess
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from unidecode import unidecode

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
tweets_df = pd.read_pickle("all_tweets_v8.pkl")
tweets_df.head(2)

Unnamed: 0,author_id,context_annotations,conversation_id,created_at,edit_history_tweet_ids,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,text,author,withheld,geo,quote_count,retweet_count,like_count,reply_count,impression_count,media_keys,poll_ids,hashtags,urls,mentions,cashtags,annotations,is_retweet,gender_of_author,profession_of_author,hashtags_flattened,trend_topics,n_trend_topics,text_length,has_media,has_hashtags,has_mentions,is_reply,tweet_type,ratio_like,ratio_retweet,majority_lang,english,universal,eng_astroturf,eng_fake_follower,eng_financial,eng_other,eng_overall,eng_self_declared,eng_spammer,uni_astroturf,uni_fake_follower,uni_financial,uni_other,uni_overall,uni_self_declared,uni_spammer,verified_author,followers_count_author,following_count_author,tweet_count_author,age_of_account_in_days_author,sentiment,created_at_day_of_week,created_at_month_of_year,created_at_time_of_day_in_seconds,reply_to_tweet_ratio,retweet_to_tweet_ratio,average_tweets_of_author_per_day,media_media_key_1,media_type_1,media_height_1,media_url_1,media_width_1,media_public_metrics_1,media_preview_image_url_1,media_duration_ms_1,media_alt_text_1,media_media_key_2,media_type_2,media_height_2,media_url_2,media_width_2,media_public_metrics_2,media_preview_image_url_2,media_duration_ms_2,media_alt_text_2,media_media_key_3,media_type_3,media_height_3,media_url_3,media_width_3,media_public_metrics_3,media_preview_image_url_3,media_duration_ms_3,media_alt_text_3,media_media_key_4,media_type_4,media_height_4,media_url_4,media_width_4,media_public_metrics_4,media_preview_image_url_4,media_duration_ms_4,media_alt_text_4
0,1449804331142811655,,1617597872803041280,2023-01-23 18:59:21,[1617597872803041280],1617597872803041280,,tr,False,,everyone,"Kararlarındaki temel dayanak, hukukun evrensel...",Goksun_KHK,,,0,2,1,0,82,,,"[{'start': 95, 'end': 120, 'tag': 'OhalKomisyo...",,,,,0,ORG,NOT AVAILABLE,OhalKomisyonuHukuksuzdur,OhalKomisyonuHukuksuzdur,1,120,0,1,0,0,no_reply_and_no_retweet,0.012195,0.02439,tr,0.874858,0.847463,0.13,0.51,0.22,0.91,0.91,0.08,0.25,0.12,0.44,0.1,0.87,0.87,0.19,0.16,False,1107.0,174.0,2962.0,470.0,0.994189,0,1,68361,0.013817,0.435233,1.231915,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1449804331142811655,,1617597646339702823,2023-01-23 18:58:27,[1617597646339702823],1617597646339702823,,fr,False,,everyone,#OhalKomisyonuHukuksuzdur https://t.co/3aUr5MZWPU,Goksun_KHK,,,0,1,0,0,52,[3_1617597636604723228],,"[{'start': 0, 'end': 25, 'tag': 'OhalKomisyonu...","[{'start': 26, 'end': 49, 'url': 'https://t.co...",,,,0,ORG,NOT AVAILABLE,OhalKomisyonuHukuksuzdur,OhalKomisyonuHukuksuzdur,1,49,1,1,0,0,no_reply_and_no_retweet,0.0,0.019231,tr,0.874858,0.847463,0.13,0.51,0.22,0.91,0.91,0.08,0.25,0.12,0.44,0.1,0.87,0.87,0.19,0.16,False,1107.0,174.0,2962.0,470.0,0.733746,0,1,68307,0.013817,0.435233,1.231915,3_1617597636604723228,photo,1600.0,https://pbs.twimg.com/media/FnLceudWABwBC4b.jpg,1600.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Preprocessing tweets

In [3]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hasansalimkanmaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hasansalimkanmaz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
tweets = tweets_df[tweets_df.lang == "tr"].set_index("id")["text"].drop_duplicates()

tweets = tweets.str.lower()  # lower all the words
tweets = tweets.map(lambda x: re.sub("\n", " ", x))  # remove \n
tweets = tweets.drop_duplicates()

tweets = tweets.map(lambda x: re.sub("#(\w+)[^\w]", "", x))  # remove hashtag
tweets = tweets.map(lambda x: re.sub("@([A-Za-z0-9_]+)", "", x))  # remove mentions
tweets = tweets.map(lambda x: re.sub("https://t.co/[A-Za-z0-9]+", "", x))  # remove link
tweets = tweets.map(lambda x: re.sub("\b\w*khk\w*\b", "khk", x))  # normalise khk words

tweets = tweets.str.replace("[^\w\s]", "")
tweets = tweets.map(lambda x: unidecode(x))  # remove accents

# remove stopwords
stopwords = nltk.corpus.stopwords.words("turkish")
stopwords = [unidecode(i) for i in stopwords]
tweets = tweets.apply(
    lambda x: " ".join([word for word in x.split() if word not in (stopwords)])
)

tweets = tweets.drop_duplicates()
tweets = tweets[
    tweets.str.len() > 5
]  # selecting only tweets that are greater than 5 chars
tweets = tweets.map(word_tokenize)

print(tweets.shape)
tweets.head()

  tweets = tweets.str.replace("[^\w\s]", "")


(231966,)


id
1617597872803041280    [kararlarindaki, temel, dayanak, hukukun, evre...
1617597477963829248    [oldukten, sonra, insanlari, iade, yapan, ohal...
1617597218038349833    [talimat, kuruldu, talimat, karar, verdiler, t...
1617597107396804608    [kendini, anayasanin, ustunde, gorerek, zaten,...
1617596957253603329    [magdur, ettigi, vatandasinin, hizli, bir, sek...
Name: text, dtype: object

In [5]:
docs = tweets.to_numpy()
dictionary = gensim.corpora.Dictionary(docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
vocab_length = len(dictionary)

In [6]:
def get_fitted_lda_model_and_coherence(K):
    lda_model = gensim.models.LdaMulticore(
        bow_corpus,
        num_topics=K,
        id2word=dictionary,
        passes=15,
        workers=7,
        random_state=0,
        chunksize=300000,
        eval_every=None,
    )

    # Save LDA model to disk
    lda_model.save(f"models/lda_model_with_K_{K}.pkl")

    cm = CoherenceModel(model=lda_model, corpus=bow_corpus, texts=docs, coherence="c_v")
    coherence_score = cm.get_coherence()

    return lda_model, coherence_score


def print_top_words_from_lda_model(lda_model):
    for idx, topic in lda_model.print_topics(-1, num_words=20):
        print(f"Topic: {idx} \nWords: {topic}")

In [13]:
log_file_path = "gensim.log"

logging.basicConfig(
    filename="gensim.log",
    format="%(asctime)s:%(levelname)s:%(message)s",
    level=logging.DEBUG,
    filemode="w",
)

lda_models = []
for K in tqdm(range(3, 26)):
    print(f"\n\nTraining LDA model with K equals {K}...\n")
    model, coherence_score = get_fitted_lda_model_and_coherence(K)
    model_data = {"model": model, "coherence_score": coherence_score, "K": K}
    lda_models.append(model_data)
    print(f"Training concluded for LDA model with K equals {K}.\nResults: {model_data}")

sorted_lda_models = sorted(lda_models, key=lambda x: x["coherence_score"], reverse=True)
for i in sorted_lda_models:
    print(i)

  0%|                                                     | 0/6 [00:00<?, ?it/s]



Training LDA model with K equals 20...



 17%|███████▎                                    | 1/6 [05:31<27:38, 331.68s/it]

Training concluded for LDA model with K equals 20.
Results: {'model': <gensim.models.ldamulticore.LdaMulticore object at 0x2b36e3910>, 'coherence_score': 0.4140287423608586, 'K': 20}


Training LDA model with K equals 21...



 33%|██████████████▋                             | 2/6 [11:01<22:02, 330.59s/it]

Training concluded for LDA model with K equals 21.
Results: {'model': <gensim.models.ldamulticore.LdaMulticore object at 0x17a14b550>, 'coherence_score': 0.418719033485717, 'K': 21}


Training LDA model with K equals 22...



 50%|██████████████████████                      | 3/6 [16:28<16:27, 329.02s/it]

Training concluded for LDA model with K equals 22.
Results: {'model': <gensim.models.ldamulticore.LdaMulticore object at 0x17a114970>, 'coherence_score': 0.4305853299221616, 'K': 22}


Training LDA model with K equals 23...



 67%|█████████████████████████████▎              | 4/6 [22:02<11:02, 331.09s/it]

Training concluded for LDA model with K equals 23.
Results: {'model': <gensim.models.ldamulticore.LdaMulticore object at 0x3035a7910>, 'coherence_score': 0.4326857170164763, 'K': 23}


Training LDA model with K equals 24...



 83%|████████████████████████████████████▋       | 5/6 [27:34<05:31, 331.20s/it]

Training concluded for LDA model with K equals 24.
Results: {'model': <gensim.models.ldamulticore.LdaMulticore object at 0x17a1161a0>, 'coherence_score': 0.4393614521035249, 'K': 24}


Training LDA model with K equals 25...



100%|████████████████████████████████████████████| 6/6 [33:07<00:00, 331.18s/it]

Training concluded for LDA model with K equals 25.
Results: {'model': <gensim.models.ldamulticore.LdaMulticore object at 0x17a14b490>, 'coherence_score': 0.42934199159830017, 'K': 25}
{'model': <gensim.models.ldamulticore.LdaMulticore object at 0x17a1161a0>, 'coherence_score': 0.4393614521035249, 'K': 24}
{'model': <gensim.models.ldamulticore.LdaMulticore object at 0x3035a7910>, 'coherence_score': 0.4326857170164763, 'K': 23}
{'model': <gensim.models.ldamulticore.LdaMulticore object at 0x17a114970>, 'coherence_score': 0.4305853299221616, 'K': 22}
{'model': <gensim.models.ldamulticore.LdaMulticore object at 0x17a14b490>, 'coherence_score': 0.42934199159830017, 'K': 25}
{'model': <gensim.models.ldamulticore.LdaMulticore object at 0x17a14b550>, 'coherence_score': 0.418719033485717, 'K': 21}
{'model': <gensim.models.ldamulticore.LdaMulticore object at 0x2b36e3910>, 'coherence_score': 0.4140287423608586, 'K': 20}





In [14]:
print_top_words_from_lda_model(sorted_lda_models[0]["model"])

Topic: 0 
Words: 0.014*"yil" + 0.012*"gozaltina" + 0.011*"bir" + 0.011*"15" + 0.009*"askeri" + 0.006*"bugun" + 0.006*"temmuz" + 0.006*"tarafindan" + 0.006*"once" + 0.005*"polis" + 0.005*"alindi" + 0.005*"gezi" + 0.005*"davasi" + 0.005*"sonra" + 0.004*"ogrenci" + 0.004*"yilinda" + 0.004*"onunde" + 0.004*"alinan" + 0.004*"7" + 0.003*"12"
Topic: 1 
Words: 0.010*"bir" + 0.009*"var" + 0.009*"edilsin" + 0.007*"khklariptal" + 0.005*"elektrik" + 0.005*"olarak" + 0.005*"zam" + 0.005*"olan" + 0.005*"ekonomik" + 0.004*"onemli" + 0.004*"son" + 0.004*"1" + 0.004*"konusunda" + 0.003*"tarihi" + 0.003*"iscileri" + 0.003*"ilk" + 0.003*"muhalefet" + 0.003*"kanun" + 0.003*"acik" + 0.003*"enerji"
Topic: 2 
Words: 0.047*"olsun" + 0.023*"guzel" + 0.018*"bir" + 0.012*"saglik" + 0.010*"gecmis" + 0.010*"gun" + 0.007*"allah" + 0.007*"iyi" + 0.006*"tesekkurler" + 0.006*"sevgili" + 0.006*"hocam" + 0.005*"degerli" + 0.005*"olacak" + 0.005*"kutlu" + 0.005*"diliyorum" + 0.004*"var" + 0.004*"mutlu" + 0.004*"dostlar" 

In [15]:
lda_scores = pd.DataFrame(
    [{"coherence_score": m["coherence_score"], "K": m["K"]} for m in sorted_lda_models]
)
# lda_scores.to_csv("lda_scores.csv", index=False)
print(lda_scores.to_latex())

\begin{tabular}{lrr}
\toprule
{} &  coherence\_score &   K \\
\midrule
0 &         0.439361 &  24 \\
1 &         0.432686 &  23 \\
2 &         0.430585 &  22 \\
3 &         0.429342 &  25 \\
4 &         0.418719 &  21 \\
5 &         0.414029 &  20 \\
\bottomrule
\end{tabular}



  print(lda_scores.to_latex())


In [9]:
def get_topics_lists(model, top_clusters, n_words):
    """
    Gets lists of words in topics as a list of lists.

    model: gsdmm instance
    top_clusters:  numpy array containing indices of top_clusters
    n_words: top n number of words to include

    """
    # create empty list to contain topics
    topics = []

    # iterate over top n clusters
    for cluster in top_clusters:
        # create sorted dictionary of word distributions
        sorted_dict = sorted(
            model.cluster_word_distribution[cluster].items(),
            key=lambda k: k[1],
            reverse=True,
        )[:n_words]

        # create empty list to contain words
        topic = []

        # iterate over top n words in topic
        for k, v in sorted_dict:
            # append words to topic list
            topic.append(k)

        # append topics to topics list
        topics.append(topic)

    return topics

In [29]:
def get_fitted_gsdmm_model_and_coherence(K, n_iters):
    gsdmm = MovieGroupProcess(K=K, alpha=0.1, beta=0.3, n_iters=n_iters)
    _ = gsdmm.fit(docs, vocab_length)

    # print number of documents per topic
    doc_count = np.array(gsdmm.cluster_doc_count)
    print("Number of documents per topic :", doc_count)

    # Topics sorted by the number of document they are allocated to
    top_index = doc_count.argsort()[-15:][::-1]

    # get topics to feed to coherence model
    topics = get_topics_lists(gsdmm, top_index, 20)

    # evaluate model using Topic Coherence score
    cm_gsdmm = CoherenceModel(
        topics=topics,
        dictionary=dictionary,
        corpus=bow_corpus,
        texts=docs,
        coherence="c_v",
    )

    with open(f"models/gsdmm_model_with_K_{K}", "wb") as f:
        pickle.dump(gsdmm, f)

    return gsdmm, cm_gsdmm.get_coherence()


def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(
            cluster_word_distribution[cluster].items(),
            key=lambda k: k[1],
            reverse=True,
        )[:values]
        print(f"\nCluster {cluster} : {sort_dicts}")


def print_top_words_gsdmm(model):
    # print number of documents per topic
    doc_count = np.array(model.cluster_doc_count)
    print("Number of documents per topic :", doc_count)

    # Topics sorted by the number of document they are allocated to
    top_index = doc_count.argsort()[::-1]
    print("Most important clusters (by number of docs inside):", top_index)

    # get top words in topics
    top_words(model.cluster_word_distribution, top_index, 20)

In [19]:
gsdmm_models = []
for n_topic in range(3, 26):
    model, coherence_score = get_fitted_gsdmm_model_and_coherence(n_topic, 30)
    model_data = {"model": model, "coherence_score": coherence_score, "K": n_topic}
    print(model_data)
    gsdmm_models.append(model_data)

sorted_gsdmm_models = sorted(
    gsdmm_models, key=lambda x: x["coherence_score"], reverse=True
)

for i in sorted_gsdmm_models:
    print(i)

In stage 0: transferred 207006 clusters with 20 clusters populated
In stage 1: transferred 155525 clusters with 20 clusters populated


IOStream.flush timed out


In stage 2: transferred 88396 clusters with 20 clusters populated
In stage 3: transferred 60168 clusters with 20 clusters populated
In stage 4: transferred 49263 clusters with 20 clusters populated
In stage 5: transferred 44578 clusters with 20 clusters populated
In stage 6: transferred 41955 clusters with 20 clusters populated


IOStream.flush timed out


In stage 7: transferred 40443 clusters with 20 clusters populated
In stage 8: transferred 39202 clusters with 20 clusters populated
In stage 9: transferred 38460 clusters with 20 clusters populated
In stage 10: transferred 38252 clusters with 20 clusters populated
In stage 11: transferred 37898 clusters with 20 clusters populated
In stage 12: transferred 37743 clusters with 20 clusters populated
In stage 13: transferred 37144 clusters with 20 clusters populated
In stage 14: transferred 37203 clusters with 20 clusters populated
In stage 15: transferred 37166 clusters with 20 clusters populated


IOStream.flush timed out


In stage 16: transferred 37005 clusters with 20 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 17: transferred 36807 clusters with 20 clusters populated
In stage 18: transferred 36908 clusters with 20 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 19: transferred 36723 clusters with 20 clusters populated


IOStream.flush timed out


In stage 20: transferred 36513 clusters with 20 clusters populated
In stage 21: transferred 36276 clusters with 20 clusters populated
In stage 22: transferred 36345 clusters with 20 clusters populated
In stage 23: transferred 36390 clusters with 20 clusters populated
In stage 24: transferred 36328 clusters with 20 clusters populated
In stage 25: transferred 36229 clusters with 20 clusters populated
In stage 26: transferred 36047 clusters with 20 clusters populated
In stage 27: transferred 36226 clusters with 20 clusters populated
In stage 28: transferred 36374 clusters with 20 clusters populated
In stage 29: transferred 36345 clusters with 20 clusters populated
Number of documents per topic : [13936  8985 10420 10749 10467 26242 12347 10796  9381  9905  9951  3743
  5156 13303 11106 13189 11208 15396 15329 10357]
{'model': <gsdmm.mgp.MovieGroupProcess object at 0x2ba91e380>, 'coherence_score': 0.514060067665686, 'K': 20}
In stage 0: transferred 207376 clusters with 21 clusters populate

IOStream.flush timed out


In stage 21: transferred 38764 clusters with 29 clusters populated


IOStream.flush timed out


In stage 22: transferred 38692 clusters with 29 clusters populated


IOStream.flush timed out


In stage 23: transferred 38714 clusters with 29 clusters populated


IOStream.flush timed out


In stage 24: transferred 38855 clusters with 29 clusters populated


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


In stage 25: transferred 38867 clusters with 29 clusters populated


IOStream.flush timed out


In stage 26: transferred 38574 clusters with 29 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 27: transferred 38738 clusters with 29 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 28: transferred 38665 clusters with 29 clusters populated


IOStream.flush timed out


In stage 29: transferred 38488 clusters with 29 clusters populated
Number of documents per topic : [ 6296  6069  7002  7626  7315 16955  7994  8337  8325 18362  8361  5672
 10494  9264  3635  9353  8306  9946  6477  6696  5812  8751  7095  7945
  6749  7020  5695  8029  2385]
{'model': <gsdmm.mgp.MovieGroupProcess object at 0x2cc98ff40>, 'coherence_score': 0.513392695939488, 'K': 29}


IOStream.flush timed out


In stage 0: transferred 211763 clusters with 30 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 1: transferred 157370 clusters with 30 clusters populated


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


In stage 2: transferred 93802 clusters with 30 clusters populated


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


In stage 3: transferred 67816 clusters with 30 clusters populated


IOStream.flush timed out


In stage 4: transferred 57562 clusters with 30 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 5: transferred 51608 clusters with 30 clusters populated


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


In stage 6: transferred 48209 clusters with 30 clusters populated

IOStream.flush timed out





IOStream.flush timed out
IOStream.flush timed out


In stage 7: transferred 45550 clusters with 30 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 8: transferred 43606 clusters with 30 clusters populated


IOStream.flush timed out


In stage 9: transferred 42563 clusters with 30 clusters populated


IOStream.flush timed out


In stage 10: transferred 42093 clusters with 30 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 11: transferred 41575 clusters with 30 clusters populated
In stage 12: transferred 40773 clusters with 30 clusters populated
In stage 13: transferred 40378 clusters with 30 clusters populated
In stage 14: transferred 40585 clusters with 30 clusters populated
In stage 15: transferred 40640 clusters with 30 clusters populated
In stage 16: transferred 39994 clusters with 30 clusters populated
In stage 17: transferred 39799 clusters with 30 clusters populated
In stage 18: transferred 39995 clusters with 30 clusters populated
In stage 19: transferred 39969 clusters with 30 clusters populated
In stage 20: transferred 39628 clusters with 30 clusters populated
In stage 21: transferred 39528 clusters with 30 clusters populated
In stage 22: transferred 39553 clusters with 30 clusters populated
In stage 23: transferred 39636 clusters with 30 clusters populated
In stage 24: transferred 39566 clusters with 30 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 25: transferred 39435 clusters with 30 clusters populated
In stage 26: transferred 39606 clusters with 30 clusters populated

IOStream.flush timed out
IOStream.flush timed out





IOStream.flush timed out
IOStream.flush timed out


In stage 27: transferred 39380 clusters with 30 clusters populated


IOStream.flush timed out
IOStream.flush timed out


In stage 28: transferred 39190 clusters with 30 clusters populated
In stage 29: transferred 39265 clusters with 30 clusters populated
Number of documents per topic : [10857  5711  6114  6981  6507  6779  9044  6754  6111  9762  8465  5716
  2618  7023  2774  8141 10159 15570  6192  8122  7898  7726 10772  8450
  7235  5542  6683  8221  7759 12280]
{'model': <gsdmm.mgp.MovieGroupProcess object at 0x2cc79fa60>, 'coherence_score': 0.5329178659725208, 'K': 30}
{'model': <gsdmm.mgp.MovieGroupProcess object at 0x2cc79fa60>, 'coherence_score': 0.5329178659725208, 'K': 30}
{'model': <gsdmm.mgp.MovieGroupProcess object at 0x2cc98e230>, 'coherence_score': 0.5218882469200269, 'K': 19}
{'model': <gsdmm.mgp.MovieGroupProcess object at 0x2cc98d8d0>, 'coherence_score': 0.5141320823873504, 'K': 23}
{'model': <gsdmm.mgp.MovieGroupProcess object at 0x2ba91e380>, 'coherence_score': 0.514060067665686, 'K': 20}
{'model': <gsdmm.mgp.MovieGroupProcess object at 0x2ba91c2e0>, 'coherence_score': 0.513469155700

In [30]:
filtered_sorted_gsdmm_models = [m for m in sorted_gsdmm_models if m["K"] < 26]
print_top_words_gsdmm(filtered_sorted_gsdmm_models[0]["model"])

Number of documents per topic : [15199  9785 11908  8073  8503 10788  6985  9935 14617 15224  7681 17945
 10011 11767 10976 15285 10709 10230 26345]
Most important clusters (by number of docs inside): [18 11 15  9  0  8  2 13 14  5 16 17 12  7  1  4  3 10  6]

Cluster 18 : [('khk', 5065), ('bir', 4275), ('adalet', 2567), ('hukuk', 1783), ('khklilar', 1619), ('devam', 1612), ('kadar', 1587), ('khklar', 1531), ('khkli', 1453), ('mucadele', 1376), ('istiyoruz', 1364), ('artik', 1358), ('hak', 1283), ('birlikte', 1224), ('ulkenin', 1211), ('degil', 1152), ('olarak', 1086), ('istiyor', 1033), ('geri', 934), ('olan', 890)]

Cluster 11 : [('bir', 6261), ('hukuk', 3627), ('adalet', 2366), ('insan', 1138), ('degil', 1087), ('evrensel', 1084), ('yok', 943), ('suc', 933), ('olan', 895), ('hukukun', 866), ('istiyoruz', 865), ('hak', 786), ('adil', 746), ('olarak', 708), ('ulke', 698), ('yargi', 686), ('demokrasi', 654), ('ulkede', 624), ('khk', 618), ('adilyargi', 593)]

Cluster 15 : [('bir', 3828

In [31]:
cluster_to_meaning_mapping = {
    18: "decree-law",
    11: "search for justice",
    15: "dismissal of governmental workers",
    9: "irrelevant tweets",
    0: "injustice against children",
    8: "expressing wishes",
    2: "politics",
    13: "woman rights",
    14: "invitation, agenda declaration",
    5: "death, torture, suicide",
    16: "democracy",
    17: "inflation, financial instability",
    12: "supreme court",
    7: "freedom of speech",
    1: "vulnerable, sick people",
    4: "internatial relations",
    3: "Uyghurs in China",
    10: "lost people",
    6: "activism for nature",
}

In [27]:
sttm_scores = pd.DataFrame(
    [
        {"coherence_score": m["coherence_score"], "K": m["K"]}
        for m in filtered_sorted_gsdmm_models
        if m["K"] < 26
    ]
)
sttm_scores

Unnamed: 0,coherence_score,K
0,0.521888,19
1,0.514132,23
2,0.51406,20
3,0.513469,24
4,0.507984,18
5,0.506312,25
6,0.504451,14
7,0.499866,13
8,0.499801,16
9,0.493687,15


In [28]:
sttm_scores.to_csv("sttm_scores.csv", index=False)