In [106]:
import re
import string 

import pandas as pd
import numpy as np
from unidecode import unidecode
import nltk
from nltk.tokenize import word_tokenize
from gsdmm import MovieGroupProcess
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
tweets_df_v7 = pd.read_csv("all_tweets_v7.csv", index_col=0, low_memory=False)
tweets_df_v7.head(2)

Unnamed: 0,author_id,context_annotations,conversation_id,created_at,edit_history_tweet_ids,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,text,author,withheld,geo,quote_count,retweet_count,like_count,reply_count,impression_count,media_keys,poll_ids,hashtags,urls,mentions,cashtags,annotations,is_retweet,gender_of_author,profession_of_author,hashtags_flattened,trend_topics,n_trend_topics,text_length,has_media,has_hashtags,has_mentions,is_reply,tweet_type,ratio_like,ratio_retweet,majority_lang,english,universal,eng_astroturf,eng_fake_follower,eng_financial,eng_other,eng_overall,eng_self_declared,eng_spammer,uni_astroturf,uni_fake_follower,uni_financial,uni_other,uni_overall,uni_self_declared,uni_spammer,verified_author,followers_count_author,following_count_author,tweet_count_author,age_of_account_in_days_author,sentiment,created_at_day_of_week,created_at_month_of_year,created_at_time_of_day_in_seconds,reply_to_tweet_ratio,retweet_to_tweet_ratio,average_tweets_of_author_per_day
0,1449804331142811655,,1617597872803041280,2023-01-23 18:59:21,[1617597872803041280],1617597872803041280,,tr,False,,everyone,"Kararlarındaki temel dayanak, hukukun evrensel...",Goksun_KHK,,,0,2,1,0,82,,,"[{'start': 95, 'end': 120, 'tag': 'OhalKomisyo...",,,,,0,ORG,NOT AVAILABLE,OhalKomisyonuHukuksuzdur,OhalKomisyonuHukuksuzdur,1,120,0,1,0,0,no_reply_and_no_retweet,0.012195,0.02439,tr,0.874858,0.847463,0.13,0.51,0.22,0.91,0.91,0.08,0.25,0.12,0.44,0.1,0.87,0.87,0.19,0.16,False,1107.0,174.0,2962.0,470.0,0.994189,0,1,68361,0.013817,0.435233,1.231915
1,1449804331142811655,,1617597646339702823,2023-01-23 18:58:27,[1617597646339702823],1617597646339702823,,fr,False,,everyone,#OhalKomisyonuHukuksuzdur https://t.co/3aUr5MZWPU,Goksun_KHK,,,0,1,0,0,52,['3_1617597636604723228'],,"[{'start': 0, 'end': 25, 'tag': 'OhalKomisyonu...","[{'start': 26, 'end': 49, 'url': 'https://t.co...",,,,0,ORG,NOT AVAILABLE,OhalKomisyonuHukuksuzdur,OhalKomisyonuHukuksuzdur,1,49,1,1,0,0,no_reply_and_no_retweet,0.0,0.019231,tr,0.874858,0.847463,0.13,0.51,0.22,0.91,0.91,0.08,0.25,0.12,0.44,0.1,0.87,0.87,0.19,0.16,False,1107.0,174.0,2962.0,470.0,0.733746,0,1,68307,0.013817,0.435233,1.231915


# Prepare tweets

In [71]:
tweets = tweets_df_v7.set_index('id')['text'].drop_duplicates()

In [72]:
tweets.head()

id
1617597872803041280    Kararlarındaki temel dayanak, hukukun evrensel...
1617597646339702823    #OhalKomisyonuHukuksuzdur https://t.co/3aUr5MZWPU
1617597571903676417    @Yozgat_KHK: #OhalKomisyonuHukuksuzdur\nÇünkü ...
1617597543000440844    #OhalKomisyonuHukuksuzdur https://t.co/FHdc5DQ0u0
1617597477963829248    @Malatya_KHK: Öldükten sonra insanlari işe iad...
Name: text, dtype: object

In [73]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hasansalimkanmaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hasansalimkanmaz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [74]:
tweets = tweets.map(lambda x: re.sub('#[A-Za-z0-9]+', '', x))  # remove hashtag
tweets = tweets.map(lambda x: re.sub('@[A-Za-z0-9]+', '', x))  # remove mentions
tweets = tweets.map(lambda x: re.sub('https://t.co/[A-Za-z0-9]+', '', x)) # remove link

# tweets = tweets.map(lambda x: re.sub('[,\.!?:@]', '', x)) # remove ,.!?
tweets = tweets.map(lambda x: x.translate(str.maketrans('', '', string.punctuation + "@"))) # remove ,.!?
# new_string = a_string.translate(str.maketrans('', '', string.punctuation))

tweets = tweets.map(lambda x: re.sub('\n', '', x)) # remove \n
tweets = tweets.map(lambda x: unidecode(x)) # remove accents

# remove stopwords
stopwords = nltk.corpus.stopwords.words('turkish') 
stopwords = [unidecode(i) for i in stopwords] 
tweets = tweets.apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)])) # remove stopwords

tweets = tweets.str.lower() # lower all the words
tweets = tweets.drop_duplicates()
tweets = tweets[tweets.str.len() > 5] # selecting only tweets that are greater than 5 chars
tweets = tweets.map(word_tokenize)

In [78]:
tweets.head()

id
1617597872803041280    [kararlarindaki, temel, dayanak, hukukun, evre...
1617597571903676417    [khk, cunku, kurulma, amaci, cozum, odakli, de...
1617597477963829248       [khk, oldukten, sonra, insanlari, iade, yapan]
1617597218038349833    [khk21, talimat, kuruldutalimat, karar, verdil...
1617597107396804608    [khk61, kendini, anayasanin, ustunde, gorerek,...
Name: text, dtype: object

# Modelling

In [90]:
vocab = set(token for tweet_tokens in tweets.to_list() for token in tweet_tokens)
vocab_size = len(vocab)

In [119]:
mgp = MovieGroupProcess(K=30, alpha=0.1, beta=0.1, n_iters=30)
y = mgp.fit(tweets.to_list(), vocab_size=vocab_size)

In stage 0: transferred 237855 clusters with 30 clusters populated
In stage 1: transferred 165159 clusters with 30 clusters populated
In stage 2: transferred 96678 clusters with 30 clusters populated
In stage 3: transferred 58707 clusters with 30 clusters populated
In stage 4: transferred 43424 clusters with 30 clusters populated
In stage 5: transferred 36487 clusters with 30 clusters populated
In stage 6: transferred 33438 clusters with 30 clusters populated
In stage 7: transferred 31403 clusters with 30 clusters populated
In stage 8: transferred 30903 clusters with 30 clusters populated
In stage 9: transferred 30316 clusters with 30 clusters populated
In stage 10: transferred 29832 clusters with 30 clusters populated
In stage 11: transferred 29682 clusters with 30 clusters populated
In stage 12: transferred 29452 clusters with 30 clusters populated
In stage 13: transferred 29055 clusters with 30 clusters populated
In stage 14: transferred 28975 clusters with 30 clusters populated
In 

In [111]:
cluster_word_distribution = mgp.cluster_word_distribution.copy()

In [114]:
for cluster in cluster_word_distribution:
    for word in cluster.copy().keys():
        for j, cluster_j in enumerate(cluster_word_distribution):
            if word in cluster_j and cluster_j[word] > cluster[word]:
                del cluster[word]
                break

In [121]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)
# Show the top 5 words in term frequency for each cluster 
# top_words(mgp.cluster_word_distribution, top_index, 5)
for i in top_index:
    top_words = sorted(mgp.cluster_word_distribution[i].items(), key=lambda x: -x[1])[:20]
    print(f"Top words for index {i}:\n\n")
    for w in top_words:
        print(w)
    print("\n\n")

Number of documents per topic : [   67 43255  6997   281   119 18485  7806 14046 19228 13048 17166 22089
   163 18598   134  2913   566   141  1136  6088   136    33 14011    34
  2730  1183   314    85 50165  6782]
********************
Most important clusters (by number of docs inside): [28  1 11  8 13  5 10  7 22  9  6  2 29 19 15 24 25 18 16 26  3 12 17 20
 14  4 27  0 23 21]
********************
Top words for index 28:


('khk', 40181)
('...', 10136)
('bir', 9185)
('adalet', 4845)
('hukuk', 4522)
('bu', 2922)
('kadar', 2152)
('hak', 2109)
('devam', 1932)
('khklilar', 1899)
('istiyoruz', 1818)
('degil', 1810)
('ulkenin', 1758)
('biz', 1744)
('olarak', 1697)
('insan', 1656)
('mucadele', 1625)
('olan', 1614)
('khklar', 1566)
('artik', 1540)



Top words for index 1:


('bir', 18595)
('...', 13658)
("''", 4079)
('bu', 4004)
('``', 3887)
('var', 3392)
('degil', 3274)
('yok', 2878)
('kadar', 2827)
('ben', 2166)
('bile', 1613)
('iyi', 1597)
('boyle', 1568)
('olarak', 1564)
('olan', 1422)
