In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import ast
import arabic_reshaper
from bidi.algorithm import get_display

from wordcloud import WordCloud
from scipy.interpolate import make_interp_spline

%matplotlib inline

In [2]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
import pyLDAvis
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/javad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
speeches = pd.read_csv("../data/cleaned_speeches.csv", sep='\t', index_col=0)

In [4]:
speeches.tail()

Unnamed: 0,year,speaker,speech,n_words
407,1394,موحدی,"['خطيب', 'موقت', 'نماز', 'جمعه', 'تهران', 'بيا...",1264
408,1394,صدیقی,"['خطيب', 'نماز', 'جمعه', 'تهران', 'تصريح', 'کر...",1338
409,1394,موحدی,"['آيت', 'الله', 'موحدي', 'کرماني', 'تأکيد', 'ا...",1047
410,1394,صدیقی,"['خطيب', 'نماز', 'جمعه', 'هفته', 'تهران', 'ذات...",1616
411,1394,جنتی,"['آيت', 'الله', 'احمد', 'جنتي', 'برجام', 'تلقي...",1249


In [5]:
list_of_speeches = []
for row_i in range(len(speeches)):
    speech = ast.literal_eval(speeches.at[row_i, 'speech'])
    year = speeches.at[row_i, 'year']
    list_of_speeches.append((speech, year))

In [6]:
min_ord = 10000
max_ord = 0
for persian_sp, year in list_of_speeches:
    for persian_w in persian_sp:
        for persian_char in persian_w:
            if ord(persian_char) > max_ord and ord(persian_char) < 2000:
                max_ord = ord(persian_char)
            if ord(persian_char) < min_ord:
                min_ord = ord(persian_char)
            if ord(persian_char) < 1200:
                print(persian_char)
print(min_ord, max_ord)

1548 1785


In [7]:
(max_ord-min_ord)

237

In [8]:
ord('z') - ord('a')

25

In [9]:
def encoder(persian_char):
    persian_ord = ord(persian_char) - min_ord
    denom = ord('z') - ord('a')
    value1 = int(persian_ord / denom)
    value2 = persian_ord % denom
    char1 = chr(value1 + ord('a'))
    char2 = chr(value2 + ord('a'))
    
    return "".join([char1, char2])

In [10]:
transformed_speeches = []
for persian_sp, year in list_of_speeches:
    english_sp = []
    for persian_w in persian_sp:
        english_w = []
        for persian_char in persian_w:
            english_chars = encoder(persian_char)
            english_w.append(english_chars)
        english_sp.append("".join(english_w))
    #transformed_speeches.append(english_sp)
    transformed_speeches.append(" ".join(english_sp))
    #transformed_speeches.append((english_sp, year))

In [11]:
speeches.tail()

Unnamed: 0,year,speaker,speech,n_words
407,1394,موحدی,"['خطيب', 'موقت', 'نماز', 'جمعه', 'تهران', 'بيا...",1264
408,1394,صدیقی,"['خطيب', 'نماز', 'جمعه', 'تهران', 'تصريح', 'کر...",1338
409,1394,موحدی,"['آيت', 'الله', 'موحدي', 'کرماني', 'تأکيد', 'ا...",1047
410,1394,صدیقی,"['خطيب', 'نماز', 'جمعه', 'هفته', 'تهران', 'ذات...",1616
411,1394,جنتی,"['آيت', 'الله', 'احمد', 'جنتي', 'برجام', 'تلقي...",1249


In [12]:
list_of_speeches[2][0][44]

'مطرح'

In [13]:
transformed_speeches[2]

'bjbsbdcj bdboch bccgcgcj bccgbmbichci bccgbmbicmch ckbccgbqcgbccj bccgbocgbcch bucgcm bmbockcg bccgcgcj bucgcm awcgcj bccgbcbbchcj bccgchbubqckchcmciaa bcckbqcm bubdbcbk bccgcgcj bdbfceckcm bccgcgcj ckbcbfbdbcbu bcchbmcj cicjcmcj bjbsbdcj chbuchckcgbc cebmawci ghbmcmch bibmcd bnbkcmchaa bdbjbcbsbmbkcjcj cdbhbm bdcmbobfchcmci bobccggnbmbk bccicecgbcbd bcbjbfbqbcbqbc bccicecgbcbd bccmbmbcbk bpckbkgnckbpcj cjbcbbcm bmckbncjbcbbcm cdghbm ghcich bccicecgbcbd bfckbhcj bdbpckbk chbsbmbi ghcicmch bjbsbdcj chcibcbobdbfcjbccm buchcgbc bjbsbdcj chcibcbobdbfcjbcchcm eobmbkbcbnbk bccicecgbcbd bcbocgbcchcm bicecmcebfbc eobkcmbkcj bibcbkbgcj chchbfbcbncm ckcdckce bccgbubcbkcj chcjch bfbicecmcebc chcibpbcav bfbcbmcmbj bkcicmbccm bcbocgbcch ckbdbjbqckbq bccmbmbcci bjckbkchbcci bdbucickbcci bccecmbccickbo bpcibc bdghcibk bvbmce bdbcbpbkcichcm bfckbccicmch bcbsbmbccdchbcci gnblbmbk bdbdcmcicmch bdbmbmbocm cdghbm ckbfckbhcj bnckbccmbccm bfckbccicmch bccjchcmbf bcbfcdbccecm bnchbcci bccdbfbcbkcj bdcdcjchc

In [14]:
def decoder(english_w):
    persian_w = []
    for i in range(0,len(english_w),2):
        e_value1 = ord(english_w[i]) - ord('a')
        e_value2 = ord(english_w[i+1]) - ord('a')
        denom = ord('z') - ord('a')
        p_value1 = e_value1 * denom
        p_value2 = 0
        for v in range(e_value1*denom, (e_value1+1)*denom):
            if (v % denom) == e_value2:
                p_value2 = v - p_value1
                break
        persian_char = chr(p_value1 + p_value2 + min_ord)
        persian_w.append(persian_char)

    return "".join(persian_w)

In [15]:
english_w = 'chbsbmbi'
decoder(english_w)

'مطرح'

In [16]:
### Approach 1

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords

In [18]:
count_vect = CountVectorizer()
x_counts = count_vect.fit_transform(transformed_speeches)
x_counts.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [19]:
count_vect.get_feature_names()

['aabcbmbfbdbcbs',
 'aabcbmbfbp',
 'aabcchbcch',
 'aabcgnbm',
 'aabdcecmcj',
 'aabdcibkcj',
 'aabjbkbcckcibk',
 'aabkbnbkcm',
 'aabmbcbn',
 'aabmbubccmbf',
 'aabubccgchaa',
 'aabubdbcbkbccgcgcj',
 'aabucgchbcaa',
 'aacdcgbocdcj',
 'aachbmbcbhbu',
 'aachci',
 'aacjch',
 'aacmbc',
 'aaghbmbcchbf',
 'aaghbocm',
 'aagncj',
 'atatat',
 'atatatat',
 'atawcmbc',
 'atewbmbc',
 'avbccgbjcgbcbbce',
 'avbccgcgcj',
 'avbcchcickbc',
 'avbdckbkcibk',
 'awatchbkcibk',
 'awbbcmch',
 'awbbcmchaa',
 'awbbcmci',
 'awbbcmcicj',
 'awbcbpbfbdbccj',
 'awbccgbichbk',
 'awbcckcgcmci',
 'awbdaa',
 'awbdbc',
 'awbdbcavghch',
 'awbdbcbbcibc',
 'awbdbcbbcj',
 'awbdbcbbcm',
 'awbdbcbbgh',
 'awbdbcbbghch',
 'awbdbcbkbcci',
 'awbdbcbkbccicm',
 'awbdbcbkbccicmcjbc',
 'awbdbcbkbfbm',
 'awbdbcbkbfbmcmci',
 'awbdbcbkbobcbncm',
 'awbdbcbkbp',
 'awbdbcbkcm',
 'awbdbcci',
 'awbdbccicm',
 'awbdbccj',
 'awbdbjckbcci',
 'awbdbjcmbnbkbcbmcm',
 'awbdbkbcbmcm',
 'awbdbmbccjcjbccm',
 'awbdbmbobccicm',
 'awbdbmck',
 'awbdbmckbdbmbk

In [20]:
tfidf_transformer = TfidfTransformer()
x_tfidf = tfidf_transformer.fit_transform(x_counts)

In [21]:
dimension = 20
lda = LDA(n_components = dimension)
lda_array = lda.fit_transform(x_tfidf)

In [22]:
components = [lda.components_[i] for i in range(len(lda.components_))]
features = count_vect.get_feature_names()
important_words = [sorted(features, key = lambda x: components[j][features.index(x)], reverse = True)[:10] for j in range(len(components))]
important_words

[['bibocmci',
  'chcighbm',
  'bobuckbkcm',
  'bcbdckbfbmbcbdcm',
  'bcbobfaa',
  'cicjcm',
  'bkbccibp',
  'bccibobccicm',
  'bubrck',
  'chbmbicgcj'],
 ['cmbnbkcm',
  'ghbcbmgnbmbcci',
  'chbccdcmbccmcm',
  'bkbmcibkcj',
  'bhbucgcm',
  'ghbcbpbccicj',
  'chbnbkckbm',
  'cickawckbmcm',
  'bcbdckbsbccgbd',
  'gnbmbdcj'],
 ['bfbdbmcmbn',
  'bpbdbobfbmcm',
  'chbhbfcjbk',
  'chbkcmbmcmbfcm',
  'awcgckbkgncm',
  'chbqcgbibf',
  'bfaxbgbm',
  'bfbmbkbk',
  'bjbqckchbfcm',
  'bjckbkbmckcjbccm'],
 ['bibhbcbh',
  'bnbjchcm',
  'cgcmbccebf',
  'chckbkbd',
  'bpbmbscj',
  'ckgnckcm',
  'cibcbhbc',
  'ckbncmbmbcci',
  'bhchbmbcbf',
  'bmchcm'],
 ['ghbccibkcmbkbccjbccm',
  'bncmbdbccmcm',
  'bpbccjbkcmch',
  'bdcmbockbcbkbcci',
  'chcibibqbmbccn',
  'bfbhbcbm',
  'eobcbdbmcjcignbcci',
  'bibrckbmcjbc',
  'cibfbjbcbdbcbf',
  'bgchbm'],
 ['awbnchbccmcmch',
  'chbcbbbkcj',
  'cjcmghcg',
  'bcbobfbccibkbcbmbk',
  'cicdbjbf',
  'bocgckcgcjbccm',
  'ghbfbccicm',
  'bobcbhbkcmci',
  'cdcebuckbc',
  'gh

In [23]:
important_words_decoded = []
for topic_lst in important_words:
    topic_lst_decoded = []
    for encoded_w in topic_lst:
        topic_lst_decoded.append(decoder(encoded_w))
    important_words_decoded.append(topic_lst_decoded)
important_words_decoded

[['حسين',
  'منکر',
  'سعودي',
  'ابوترابي',
  'است،',
  'نهي',
  'دانش',
  'انساني',
  'عضو',
  'مرحله'],
 ['يزدي',
  'کارگران',
  'مافيايي',
  'درنده',
  'جعلي',
  'کاشانه',
  'مزدور',
  'نوآوري',
  'ابوطالب',
  'گربه'],
 ['تبريز',
  'شبستري',
  'مجتهد',
  'مديريتي',
  'آلودگي',
  'مصلحت',
  'تأثر',
  'تردد',
  'خصومتي',
  'خودروهاي'],
 ['حجاج',
  'زخمي',
  'لياقت',
  'مودب',
  'شرطه',
  'وگوي',
  'ناجا',
  'وزيران',
  'جمرات',
  'رمي'],
 ['کانديداهاي',
  'زيبايي',
  'شاهديم',
  'بيسوادان',
  'منحصراً',
  'تجار',
  'پابرهنگان',
  'حضورها',
  'نتخابات',
  'ثمر'],
 ['آزماييم',
  'مائده',
  'هيکل',
  'استاندارد',
  'نفخت',
  'سلولهاي',
  'کتاني',
  'ساجدين',
  'فقعوا',
  'کرمنا'],
 ['عضويت',
  'اماکن',
  'خوف',
  'مقدسه',
  'وقف',
  'مصدق',
  'احمدي',
  'اربعين',
  'آمَنُوا',
  'شناسد'],
 ['جدول',
  'تنبه',
  'رافت',
  'توفي',
  'اغتشاش',
  'تعرض',
  'دلخواه',
  'اهلش',
  'قهرماني',
  'شدي'],
 ['الناس',
  'عزاداري',
  'هيات',
  'عرش',
  'دوانيقي',
  'انتظامي',
  'ليالي',
  'سليقه',
  'آ

In [24]:
### Approach 2

In [25]:
transformed_speeches = []
for persian_sp, year in list_of_speeches:
    english_sp = []
    for persian_w in persian_sp:
        english_w = []
        for persian_char in persian_w:
            english_chars = encoder(persian_char)
            english_w.append(english_chars)
        english_sp.append("".join(english_w))
    transformed_speeches.append(english_sp)
    #transformed_speeches.append(" ".join(english_sp))
    #transformed_speeches.append((english_sp, year))

In [26]:
dictionary = gensim.corpora.Dictionary(transformed_speeches)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [27]:
bow_corpus = [dictionary.doc2bow(doc) for doc in transformed_speeches]

In [28]:
transformed_speeches = []
for persian_sp, year in list_of_speeches:
    english_sp = []
    for persian_w in persian_sp:
        english_w = []
        for persian_char in persian_w:
            english_chars = encoder(persian_char)
            english_w.append(english_chars)
        english_sp.append("".join(english_w))
    #transformed_speeches.append(english_sp)
    transformed_speeches.append(" ".join(english_sp))
    #transformed_speeches.append((english_sp, year))

In [29]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.06817145690532084),
 (1, 0.03986479165541012),
 (2, 0.05209056013485185),
 (3, 0.0698591263440639),
 (4, 0.12203607386211018),
 (5, 0.0440135764934178),
 (6, 0.045438088642024775),
 (7, 0.09877936620993685),
 (8, 0.05209056013485185),
 (9, 0.0286017932570282),
 (10, 0.06631769458906715),
 (11, 0.07510895377239991),
 (12, 0.025393283354795404),
 (13, 0.028038372283544367),
 (14, 0.07755219423815059),
 (15, 0.06802730278859229),
 (16, 0.027126011846623995),
 (17, 0.03016775317139278),
 (18, 0.021803255144763764),
 (19, 0.07157445655753103),
 (20, 0.020943913594072498),
 (21, 0.03078056217064305),
 (22, 0.034560457248524384),
 (23, 0.08171830562115974),
 (24, 0.0698591263440639),
 (25, 0.02937281746339726),
 (26, 0.04366948243635683),
 (27, 0.03986479165541012),
 (28, 0.053539939403113176),
 (29, 0.02377029698165483),
 (30, 0.10812685472065389),
 (31, 0.07886707846905454),
 (32, 0.0979319213833565),
 (33, 0.06892685551984841),
 (34, 0.026947372640397857),
 (35, 0.06898733453236262)

In [30]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

In [31]:
decoded_topics = []
for idx, topic in lda_model.print_topics(-1):
    pattern = r'"([A-Za-z0-9_\./\\-]*)"'
    encoded_words = re.findall(pattern, topic)
    encoded_lst = []
    for w in encoded_words:
        encoded_lst.append(decoder(w))
    decoded_topics.append(encoded_lst)
decoded_topics      

[['قانون',
  'مسئله',
  'خوب',
  'بحث',
  'عليه',
  'حکومت',
  'بشود',
  'دانشگاه',
  'حسين',
  'خيلي'],
 ['صديقي',
  'الاسلام',
  'حجت',
  'مسئله',
  'خيلي',
  'ماه',
  'تاکيد',
  'والمسلمين',
  'جنگ',
  'مسئولان'],
 ['خيلي',
  'خوب',
  'عليه',
  'تاريخ',
  'بشود',
  'بحث',
  'حالا',
  'قانون',
  'اينجا',
  'السلام'],
 ['کاشاني',
  'اربعين',
  'امامي',
  'جنتي',
  'تاکيد',
  'عليه',
  'نسبت',
  'حالا',
  'راهپيمايي',
  'محمد'],
 ['شوراي',
  'خوب',
  'قانون',
  'بحث',
  'امنيت',
  'نگهبان',
  'عربستان',
  'خيلي',
  'مسئله',
  'جان'],
 ['خيلي',
  'خوب',
  'حالا',
  'مهم',
  'عليه',
  'بحث',
  'تاريخ',
  'خودش',
  'خودشان',
  'باز'],
 ['موحدي',
  'کرماني',
  'جنايت',
  'تکفيري',
  'خيلي',
  'تاکيد',
  'جنگ',
  'خودش',
  'عليه',
  'سوريه'],
 ['بحث',
  'خيلي',
  'اخلاق',
  'خوب',
  'تاريخ',
  'عليه',
  'حالا',
  'اينجا',
  'خودشان',
  'درست'],
 ['هاشمي',
  'عليه',
  'عربستان',
  'برجام',
  'خيلي',
  'مهم',
  'خوب',
  'رئيس',
  'حسين',
  'حمله'],
 ['جنگ',
  'فتنه',
  'خوب',
  'برابر',
  'حا

In [32]:
### Approach 3

In [33]:
transformed_speeches = []
for persian_sp, year in list_of_speeches:
    english_sp = []
    for persian_w in persian_sp:
        english_w = []
        for persian_char in persian_w:
            english_chars = encoder(persian_char)
            english_w.append(english_chars)
        english_sp.append("".join(english_w))
    transformed_speeches.append(english_sp)
    #transformed_speeches.append(" ".join(english_sp))
    #transformed_speeches.append((english_sp, year))

In [34]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(transformed_speeches, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[transformed_speeches], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[transformed_speeches[0]]])

['bibrckbm', 'eobmbpckbm', 'bmbccjeocmchbccmcm_bdcjchci', 'bkbmbo_bubdbmbfcm', 'bkbpchcibcci', 'bjcgbcbqcj', 'bjbsbdcj', 'awcmbf_bccgcgcj_bcbichbk_bhcibfcm', 'bjbsbdcj', 'cichbcbn_bhchbucj', 'bfcjbmbcci', 'bfbpghbm', 'bkbobf_bccibkbmghbcbmbcci', 'bdbmgnbnbcbmcm_chbmbcboch', 'bdcjchci', 'bibrckbm', 'bpckbm', 'bkbmbo_bubdbmbfcm', 'bkbpchcibcci', 'bdbkbccicibk', 'bccmbmbcci', 'bfbocgcmch', 'cibjckbccjcibk', 'bsbmcdcm', 'bkckbobfbcci', 'bkbcbjcg_bjbcbmbh', 'ghbpckbm', 'bkcggnbmchcm', 'bcchbcch', 'bhchbucj_chckcebf', 'bfcjbmbcciaa', 'bibrckbm', 'bgbdbcbf', 'cigncjbkbcbmcm', 'bccicecgbcbd', 'bmghci', 'bcbobcbocm', 'brbmckbmbf', 'bibrckbm', 'awcmcibkcj', 'bfbcghcmbk', 'ghbmbk', 'gnbnbcbmbp', 'bjbdbmgnbnbcbmcm', 'bhchcjckbmcm', 'bcbocgbcchcmaa', 'bkbdcmbm_bpckbmbccm_cigncjbdbcci', 'bfcebkcmbm_bfbpghbm', 'eocmbcch', 'bcbmbncibkcj', 'bmcjbdbm_chbubtch_bccicecgbcbd', 'chcibcbobdbf', 'bdcmbobfchcmci_bobccggnbmbk', 'eocmbmckbncm', 'bccicecgbcbd', 'bcbocgbcchcm', 'brbmckbmbf', 'cebmbcbm', 'gnbmcdbfc

In [35]:
for w in trigram_mod[bigram_mod[transformed_speeches[0]]]:
    if '_' in w:
        splited_w = [decoder(encoded_w) for encoded_w in w.split('_')]
        print(splited_w)
    else:
        print(decoder(w))

حضور
پرشور
['راهپيمايي', 'بهمن']
['درس', 'عبرتي']
دشمنان
خلاصه
خطبه
['آيت', 'الله', 'احمد', 'جنتي']
خطبه
['نماز', 'جمعه']
تهران
تشکر
['دست', 'اندرکاران']
['برگزاري', 'مراسم']
بهمن
حضور
شور
['درس', 'عبرتي']
دشمنان
بدانند
ايران
تسليم
نخواهند
طرفي
دوستان
['داخل', 'خارج']
کشور
دلگرمي
امام
['جمعه', 'موقت']
تهران،
حضور
ثبات
نگهداري
انقلاب
رکن
اساسي
ضرورت
حضور
آينده
تاکيد
کرد
گزارش
خبرگزاري
جمهوري
اسلامي،
['دبير', 'شوراي', 'نگهبان']
['تقدير', 'تشکر']
پيام
ارزنده
['رهبر', 'معظم', 'انقلاب']
مناسبت
['بيستمين', 'سالگرد']
پيروزي
انقلاب
اسلامي
ضرورت
قرار
گرفتن
پيام
عنوان
سرلوحه
برنامه
آينده
جامعه
اسلامي
تاکيد
کرد
آنگاه
لزوم
پاسداري
تاريخ
انقلاب
ياد
آور
عده
اصرار
گذشته
فراموشي
سپارند
تطهير
رژيم
گذشته
ارزشها
آرمانهاي
انقلاب
اسلامي
['سئوال', 'ببرند']
امام
['جمعه', 'موقت', 'تهران']
افزود
نسل
آينده
مرور
تاريخ
انقلاب
بدانند
بلاهايي
سرشان
منظور
دست
انقلاب
زدند
انگيزه
اصلي
سينه
سپرکردن
کشته
اسلام
رهنمودهاي
امام
افزود
برخي،
مردمي
نداي
مرگ
آمريکا
اوباش
ميخوانند
صورتي
افراد
تاريخ
انقلاب
فراموش
کرده
آقاي
جنتي


In [36]:
# Create Dictionary 
id2word = corpora.Dictionary(transformed_speeches)  
# Create Corpus 
texts = transformed_speeches  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 4), (9, 5), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 3), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 4), (23, 1), (24, 3), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 3), (34, 1), (35, 1), (36, 4), (37, 7), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 10), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 1), (54, 1), (55, 2), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 2), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 3), (69, 1), (70, 2), (71, 1), (72, 1), (73, 1), (74, 2), (75, 1), (76, 2), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 8), (83, 1), (84, 1), (85, 2), (86, 1), (87, 2), (88, 1), (89, 1), (90, 1), (91, 10), (92, 2), (93, 1), (94, 5), (95, 2), (96, 1), (97, 4), (98, 1), (99, 2), (100, 1), (101, 1), (102, 2), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 6), (110, 

In [37]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

In [38]:
# # Print the keyword of topics
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

decoded_topics = []
for idx, topic in lda_model.print_topics(-1):
    pattern = r'"([A-Za-z0-9_\./\\-]*)"'
    encoded_words = re.findall(pattern, topic)
    encoded_lst = []
    for w in encoded_words:
        encoded_lst.append(decoder(w))
    decoded_topics.append(encoded_lst)
decoded_topics     

[['قانون',
  'کشور',
  'بحث',
  'يعني',
  'خوب',
  'مجلس',
  'کار',
  'شوراي',
  'اساسي',
  'بشود'],
 ['خيلي',
  'اينها',
  'کنند',
  'کنيم',
  'کردند',
  'کند',
  'بحث',
  'خوب',
  'انسان',
  'دنيا'],
 ['پود',
  'پولهايم',
  'يبين',
  'يعد',
  'پارساي',
  'پارلماني',
  'پراکندگي',
  'يااميرالموءمنين',
  'پولساندها',
  'کرسيهاي'],
 ['پود',
  'پولهايم',
  'يبين',
  'يعد',
  'پارساي',
  'پارلماني',
  'پراکندگي',
  'يااميرالموءمنين',
  'پولساندها',
  'کرسيهاي'],
 ['الله',
  'کند',
  'علي',
  'اينها',
  'خدا',
  'کنم',
  'امام',
  'اسلام',
  'کردند',
  'کنند'],
 ['امام',
  'جمعه',
  'تهران',
  'کرد',
  'اشاره',
  'آمريکا',
  'ايران',
  'کند',
  'افزود',
  'کنند'],
 ['پود',
  'پولهايم',
  'يبين',
  'يعد',
  'پارساي',
  'پارلماني',
  'پراکندگي',
  'يااميرالموءمنين',
  'پولساندها',
  'کرسيهاي'],
 ['پود',
  'پولهايم',
  'يبين',
  'يعد',
  'پارساي',
  'پارلماني',
  'پراکندگي',
  'يااميرالموءمنين',
  'پولساندها',
  'کرسيهاي'],
 ['پود',
  'پولهايم',
  'يبين',
  'يعد',
  'پارساي',
  'پارلماني',
  

In [39]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

In [40]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


TypeError: Object of type complex is not JSON serializable

PreparedData(topic_coordinates=                        x                   y  topics  cluster       Freq
topic                                                                    
4      0.259347+0.000000j -0.129874+0.000000j       1        1  41.700581
13     0.340485+0.000000j  0.052117+0.000000j       2        1  21.800091
1      0.260808+0.000000j -0.160917+0.000000j       3        1  13.565241
17     0.370174+0.000000j  0.098419+0.000000j       4        1  11.210230
0      0.179298+0.000000j -0.242234+0.000000j       5        1   5.570483
5      0.318839+0.000000j  0.127800+0.000000j       6        1   5.045042
16     0.205629+0.000000j  0.175091+0.000000j       7        1   1.103272
12    -0.148814+0.000000j  0.006123+0.000000j       8        1   0.000396
14    -0.148814+0.000000j  0.006123+0.000000j       9        1   0.000395
10    -0.148814+0.000000j  0.006123+0.000000j      10        1   0.000395
2     -0.148814+0.000000j  0.006123+0.000000j      11        1   0.000395
18    -