In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import ast
import arabic_reshaper
from bidi.algorithm import get_display

from wordcloud import WordCloud
from scipy.interpolate import make_interp_spline

%matplotlib inline

In [2]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
import pyLDAvis
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/javad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
speeches = pd.read_csv("../data/cleaned_speeches.csv", sep='\t', index_col=0)

In [4]:
speeches.tail()

Unnamed: 0,year,speaker,speech,n_words
407,1394,موحدی,"['خطيب', 'موقت', 'نماز', 'جمعه', 'تهران', 'بيا...",1264
408,1394,صدیقی,"['خطيب', 'نماز', 'جمعه', 'تهران', 'تصريح', 'کر...",1338
409,1394,موحدی,"['آيت', 'الله', 'موحدي', 'کرماني', 'تأکيد', 'ا...",1047
410,1394,صدیقی,"['خطيب', 'نماز', 'جمعه', 'هفته', 'تهران', 'ذات...",1616
411,1394,جنتی,"['آيت', 'الله', 'احمد', 'جنتي', 'برجام', 'تلقي...",1249


In [5]:
list_of_speeches = []
for row_i in range(len(speeches)):
    speech = ast.literal_eval(speeches.at[row_i, 'speech'])
    year = speeches.at[row_i, 'year']
    list_of_speeches.append((speech, year))

In [6]:
min_ord = 10000
max_ord = 0
for persian_sp, year in list_of_speeches:
    for persian_w in persian_sp:
        for persian_char in persian_w:
            if ord(persian_char) > max_ord and ord(persian_char) < 2000:
                max_ord = ord(persian_char)
            if ord(persian_char) < min_ord:
                min_ord = ord(persian_char)
            if ord(persian_char) < 1200:
                print(persian_char)
print(min_ord, max_ord)

1548 1785


In [7]:
(max_ord-min_ord)

237

In [8]:
ord('z') - ord('a')

25

In [9]:
def encoder(persian_char):
    persian_ord = ord(persian_char) - min_ord
    denom = ord('z') - ord('a')
    value1 = int(persian_ord / denom)
    value2 = persian_ord % denom
    char1 = chr(value1 + ord('a'))
    char2 = chr(value2 + ord('a'))
    
    return "".join([char1, char2])

In [10]:
transformed_speeches = []
for persian_sp, year in list_of_speeches:
    english_sp = []
    for persian_w in persian_sp:
        english_w = []
        for persian_char in persian_w:
            english_chars = encoder(persian_char)
            english_w.append(english_chars)
        english_sp.append("".join(english_w))
    transformed_speeches.append(english_sp)
    #transformed_speeches.append(" ".join(english_sp))
    #transformed_speeches.append((english_sp, year))

In [11]:
speeches.tail()

Unnamed: 0,year,speaker,speech,n_words
407,1394,موحدی,"['خطيب', 'موقت', 'نماز', 'جمعه', 'تهران', 'بيا...",1264
408,1394,صدیقی,"['خطيب', 'نماز', 'جمعه', 'تهران', 'تصريح', 'کر...",1338
409,1394,موحدی,"['آيت', 'الله', 'موحدي', 'کرماني', 'تأکيد', 'ا...",1047
410,1394,صدیقی,"['خطيب', 'نماز', 'جمعه', 'هفته', 'تهران', 'ذات...",1616
411,1394,جنتی,"['آيت', 'الله', 'احمد', 'جنتي', 'برجام', 'تلقي...",1249


In [12]:
list_of_speeches[2][0][44]

'مطرح'

In [13]:
transformed_speeches[2]

['bjbsbdcj',
 'bdboch',
 'bccgcgcj',
 'bccgbmbichci',
 'bccgbmbicmch',
 'ckbccgbqcgbccj',
 'bccgbocgbcch',
 'bucgcm',
 'bmbockcg',
 'bccgcgcj',
 'bucgcm',
 'awcgcj',
 'bccgbcbbchcj',
 'bccgchbubqckchcmciaa',
 'bcckbqcm',
 'bubdbcbk',
 'bccgcgcj',
 'bdbfceckcm',
 'bccgcgcj',
 'ckbcbfbdbcbu',
 'bcchbmcj',
 'cicjcmcj',
 'bjbsbdcj',
 'chbuchckcgbc',
 'cebmawci',
 'ghbmcmch',
 'bibmcd',
 'bnbkcmchaa',
 'bdbjbcbsbmbkcjcj',
 'cdbhbm',
 'bdcmbobfchcmci',
 'bobccggnbmbk',
 'bccicecgbcbd',
 'bcbjbfbqbcbqbc',
 'bccicecgbcbd',
 'bccmbmbcbk',
 'bpckbkgnckbpcj',
 'cjbcbbcm',
 'bmckbncjbcbbcm',
 'cdghbm',
 'ghcich',
 'bccicecgbcbd',
 'bfckbhcj',
 'bdbpckbk',
 'chbsbmbi',
 'ghcicmch',
 'bjbsbdcj',
 'chcibcbobdbfcjbccm',
 'buchcgbc',
 'bjbsbdcj',
 'chcibcbobdbfcjbcchcm',
 'eobmbkbcbnbk',
 'bccicecgbcbd',
 'bcbocgbcchcm',
 'bicecmcebfbc',
 'eobkcmbkcj',
 'bibcbkbgcj',
 'chchbfbcbncm',
 'ckcdckce',
 'bccgbubcbkcj',
 'chcjch',
 'bfbicecmcebc',
 'chcibpbcav',
 'bfbcbmcmbj',
 'bkcicmbccm',
 'bcbocgbcch',
 '

In [14]:
def decoder(english_w):
    persian_w = []
    for i in range(0,len(english_w),2):
        e_value1 = ord(english_w[i]) - ord('a')
        e_value2 = ord(english_w[i+1]) - ord('a')
        denom = ord('z') - ord('a')
        p_value1 = e_value1 * denom
        p_value2 = 0
        for v in range(e_value1*denom, (e_value1+1)*denom):
            if (v % denom) == e_value2:
                p_value2 = v - p_value1
                break
        persian_char = chr(p_value1 + p_value2 + min_ord)
        persian_w.append(persian_char)

    return "".join(persian_w)

In [15]:
english_w = 'chbsbmbi'
decoder(english_w)

'مطرح'

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords

In [17]:
count_vect = CountVectorizer()
x_counts = count_vect.fit_transform(transformed_speeches)
x_counts.todense()

AttributeError: 'list' object has no attribute 'lower'

In [None]:
count_vect.get_feature_names()

In [None]:
tfidf_transformer = TfidfTransformer()
x_tfidf = tfidf_transformer.fit_transform(x_counts)

In [None]:
dimension = 20
lda = LDA(n_components = dimension)
lda_array = lda.fit_transform(x_tfidf)

In [None]:
components = [lda.components_[i] for i in range(len(lda.components_))]
features = count_vect.get_feature_names()
important_words = [sorted(features, key = lambda x: components[j][features.index(x)], reverse = True)[:10] for j in range(len(components))]
important_words

In [None]:
important_words_decoded = []
for topic_lst in important_words:
    topic_lst_decoded = []
    for encoded_w in topic_lst:
        topic_lst_decoded.append(decoder(encoded_w))
    important_words_decoded.append(topic_lst_decoded)
important_words_decoded

In [None]:
dictionary = gensim.corpora.Dictionary(transformed_speeches)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in transformed_speeches]

In [None]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

In [None]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

In [None]:
decoded_topics = []
for idx, topic in lda_model.print_topics(-1):
    pattern = r'"([A-Za-z0-9_\./\\-]*)"'
    encoded_words = re.findall(pattern, topic)
    encoded_lst = []
    for w in encoded_words:
        encoded_lst.append(decoder(w))
    decoded_topics.append(encoded_lst)
decoded_topics      

In [None]:
### Approach 3

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(transformed_speeches, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[transformed_speeches], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[transformed_speeches[0]]])

In [None]:
for w in trigram_mod[bigram_mod[transformed_speeches[0]]]:
    if '_' in w:
        splited_w = [decoder(encoded_w) for encoded_w in w.split('_')]
        print(splited_w)
    else:
        print(decoder(w))

In [None]:
# Create Dictionary 
id2word = corpora.Dictionary(transformed_speeches)  
# Create Corpus 
texts = transformed_speeches  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# # Print the keyword of topics
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

decoded_topics = []
for idx, topic in lda_model.print_topics(-1):
    pattern = r'"([A-Za-z0-9_\./\\-]*)"'
    encoded_words = re.findall(pattern, topic)
    encoded_lst = []
    for w in encoded_words:
        encoded_lst.append(decoder(w))
    decoded_topics.append(encoded_lst)
decoded_topics     

In [None]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis