In [1]:
# Optional dependency installation: uncomment the lines below to install
# third-party packages into the current environment.
# ! pip install hazm
# ! pip install gensim
# ! pip -q install clean-text[gpl]
# ! pip3 install transformers
# Print kernel and machine information for the host running this notebook.
! uname -a

Linux 67355a28c126 5.10.60.1-microsoft-standard-WSL2 #1 SMP Wed Aug 25 23:20:18 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux


In [None]:
import pandas as pd
import numpy as np
import re
from __future__ import unicode_literals
import hazm
import nltk
import codecs
import tqdm
import gensim
from cleantext import clean

### Load Data

In [None]:
# Read the Masnavi source text into memory: one list entry per line of the
# file, line terminators included.
with open('masnavi.txt', mode='r', encoding="utf8") as masnavi_handle:
    masnavi_file = list(masnavi_handle)

In [None]:
# Load the stop-word list: one entry per line, surrounding whitespace removed.
# The context manager closes the file handle, which the bare codecs.open()
# call never did.
with codecs.open('stopwords.txt', 'r', 'utf-8') as stopwords_file:
    stopwords = [x.strip() for x in stopwords_file.readlines()]

In [None]:
# Punctuation marks to strip from the text. clean_text joins these entries
# into a regex alternation, so the asterisk is stored pre-escaped. The raw
# string keeps the backslash without relying on an invalid "\*" escape in a
# normal string literal (which triggers a warning on current Python versions).
persian_punctuation = ['،', '؟', ':', r'\*', '«', '»']

In [None]:
# Shared hazm Normalizer instance; clean_text calls its normalize() method.
normalizer = hazm.Normalizer()

In [None]:
# Shared hazm Lemmatizer instance; clean_text calls its lemmatize() method.
lemmatizer = hazm.Lemmatizer()

In [None]:
# Accumulator for the processed couplets. The processing loop appends one
# tuple per couplet: (daftar, pno, cno, raw couplet, cleaned couplet,
# cleaned hemistich 1, cleaned hemistich 2).
masnavi = []

In [None]:
def clean_text(text, tokenize = False):
    """Normalize a Persian text fragment and strip punctuation and stop words.

    Steps, in order: hazm normalization, punctuation removal, stop-word
    removal, per-word lemmatization, zero-width non-joiner (U+200C) removal
    and whitespace collapsing.

    Args:
        text: Raw text to clean.
        tokenize: Currently unused; kept so existing call sites keep working.

    Returns:
        The cleaned text as one string, words separated by single spaces and
        without leading or trailing whitespace.
    """
    text = normalizer.normalize(text)
    text = re.sub(r"|".join(persian_punctuation), " ", text)
    # Blank entries (e.g. empty lines in the stop-word file) would add an empty
    # alternative that matches at every word boundary, so they are dropped; with
    # no usable stop words the substitution is skipped altogether.
    stopword_alternatives = [re.escape(word) for word in stopwords if word]
    if stopword_alternatives:
        regex = r"\b(?:" + "|".join(stopword_alternatives) + r")\b"
        text = re.sub(regex, " ", text)
    # hazm's Lemmatizer looks up a single word; handing it the whole string (as
    # this function used to) leaves multi-word text unchanged. Lemmatizing runs
    # after stop-word removal so stop words are matched against surface forms
    # rather than against lemmas.
    # NOTE(review): hazm reports verb lemmas in a combined "past#present" form;
    # confirm downstream tokenization handles the '#'.
    text = " ".join(lemmatizer.lemmatize(word) for word in text.split())
    text = re.sub(u"\u200c" , "", text)
    text = re.sub(r'\s+', " ", text)
    text = text.strip()
    return text

In [None]:
# Two dot-separated numbers of up to three digits each; a line containing this
# marker is treated as a couplet line.
_COUPLET_ID_RE = re.compile(r"(\d{1,3})\.(\d{1,3})")


def process_couplet(text):
    """Extract the numbering and the two hemistichs from one line of the file.

    Args:
        text: One raw line. A couplet line contains a "<number>.<number>"
            marker followed by tab-separated hemistichs.

    Returns:
        None when the line has no numeric marker or does not contain two
        tab-separated hemistichs after it. Otherwise a 6-tuple:
        (pno, cno, raw hemistichs joined by a tab, cleaned hemistichs joined
        by a space, cleaned first hemistich, cleaned second hemistich).
        pno and cno are the matched digit strings.
    """
    result = _COUPLET_ID_RE.search(text)
    # Lines without the numeric marker are not couplets.
    if not result:
        return None
    pno, cno = result.groups()
    # Remove every occurrence of the numeric marker from the line.
    couplet = _COUPLET_ID_RE.sub("", text)
    # The hemistichs are the second and third tab-separated fields.
    hemistich = couplet.split("\t")[1:3]
    # A line with a marker but fewer than two hemistichs used to raise an
    # IndexError below; treat it like any other non-couplet line instead.
    if len(hemistich) < 2:
        return None
    cleaned_hemistich = [clean_text(h) for h in hemistich]
    return (
        pno,
        cno,
        "\t".join(hemistich),
        " ".join(cleaned_hemistich),
        cleaned_hemistich[0],
        cleaned_hemistich[1],
    )

In [None]:
# Walk the file line by line. A line matching the Daftar-heading pattern
# starts the next Daftar; every other line goes to process_couplet and, when
# it holds a couplet, is stored together with the current Daftar number.
# The list is reset here so that re-running this cell does not append a second
# copy of every couplet.
masnavi = []
daftar = 0
for couplet in masnavi_file:
    if re.search("^(?:دفتر).*.(?:مثنوی)$", couplet):
        daftar += 1
        # Stop at the seventh heading: only six Daftars are processed.
        if daftar == 7:
            break
        print(f"Processing Daftar {daftar}")
    else:
        process_result = process_couplet(couplet)
        if process_result:
            # process_result is (pno, cno, raw couplet, cleaned couplet,
            # cleaned hemistich 1, cleaned hemistich 2).
            masnavi.append((daftar, *process_result))

In [None]:
# Turn the collected couplet tuples into a table; the column order mirrors
# the order of the fields inside each tuple.
masnavi_df = pd.DataFrame(
    data=masnavi,
    columns=[
        'Daftar',
        'Poem',
        'CNo',
        'Couplet',
        'CCouplet',
        'Hemistich1',
        'Hemistich2',
    ],
)

In [None]:
# Spot-check fifty cleaned second hemistichs (rows 200 to 249 by position).
masnavi_df['Hemistich2'].iloc[200:250]

### Process dataframe data types

In [None]:
# Convert the numbering columns to numeric dtypes. DataFrame.apply hands each
# whole column to pd.to_numeric, so the conversion is vectorized instead of
# being called once per cell value as Series.apply(pd.to_numeric) does.
numeric_columns = ['Daftar', 'Poem', 'CNo']
masnavi_df[numeric_columns] = masnavi_df[numeric_columns].apply(pd.to_numeric)

In [None]:
# Show only the rows that belong to the first Daftar.
masnavi_df.loc[masnavi_df['Daftar'] == 1]

### Create Tokenized columns

In [None]:
# Tokenize each cleaned couplet with hazm; every cell becomes a list of tokens.
masnavi_df['Couplet_tokenized'] = masnavi_df['CCouplet'].map(hazm.word_tokenize)

In [None]:
# Tokenize each cleaned first hemistich with hazm.
masnavi_df['Hemistich1_tokenized'] = masnavi_df['Hemistich1'].map(hazm.word_tokenize)

In [None]:
# Tokenize each cleaned second hemistich with hazm.
masnavi_df['Hemistich2_tokenized'] = masnavi_df['Hemistich2'].map(hazm.word_tokenize)

In [None]:
# Display the table, now including the three tokenized columns.
masnavi_df

### Frequency Analysis

In [None]:
from itertools import chain
from collections import Counter

In [None]:
# Flatten the per-couplet token lists into one list of every token, in order.
all_words = [
    token
    for couplet_tokens in masnavi_df['Couplet_tokenized']
    for token in couplet_tokens
]

In [None]:
# Count every token, then keep the 100 most frequent (token, count) pairs.
word_freq_dist = nltk.FreqDist(all_words)
words_frequencies = word_freq_dist.most_common(100)

In [None]:
# Display the ten most frequent (token, count) pairs.
words_frequencies[:10]

In [None]:
# Summary statistics over all tokens. The label column is 24 characters wide
# because the longest label ('Number of unique words') has 22 characters; the
# previous width of 16 pushed the values out of alignment.
# Word lengths are computed once and reused for the average and the longest word.
word_lengths = [len(word) for word in all_words]
avg = np.sum(word_lengths)/len(all_words)
longest_word = all_words[np.argmax(word_lengths)]
print('%-24s' % 'Number of words', '%-16s' % len(all_words))
print('%-24s' % 'Number of unique words', '%-16s' % len(set(all_words)))
print('%-24s' % 'Average word length', '%-16s' % avg)
print('%-24s' % 'Longest word', '%-16s' % longest_word)

### TF/IDF Analysis

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [None]:
# Build the term-count matrix with one row per couplet. CountVectorizer treats
# every element of its input as a document, so fitting it on the flat word list
# (as before) made each single word its own "document"; the cleaned couplet
# strings are the documents the IDF weights should be computed over.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(masnavi_df['CCouplet'])

In [None]:
# Fit the IDF weights (with smoothing) on the term-count matrix.
tfidf_transformer = TfidfTransformer(
    use_idf=True,
    smooth_idf=True,
)
tfidf_transformer.fit(word_count_vector)

In [None]:
# One row per vocabulary term holding its IDF weight, ordered from the lowest
# weight to the highest.
df_idf = (
    pd.DataFrame(
        tfidf_transformer.idf_,
        index=cv.get_feature_names_out(),
        columns=["idf_weights"],
    )
    .sort_values(by=['idf_weights'])
)

In [None]:
# Write the sorted IDF table to idf.csv in the working directory.
df_idf.to_csv('idf.csv')

In [None]:
# Keep the last 10000 terms of the ascending-sorted table, i.e. the terms with
# the highest IDF weights.
important_words = df_idf.index[-10000:].tolist()

In [None]:
# Preview the first entries only; echoing the whole list (up to 10000 items)
# floods the notebook output.
important_words[:20]

### Topic Modeling

In [None]:
from gensim.test.utils import datapath
from pprint import pprint

In [None]:
# Group the couplets by Daftar. The key is passed as a plain column name: a
# one-element list makes newer pandas versions yield one-element tuples as
# group names (and older ones emit a FutureWarning about it).
dp_groups = masnavi_df.groupby('Daftar')

In [None]:
# One token list per Daftar, keeping only the tokens found in important_words.
# Membership is tested against a set: scanning the list form of
# important_words for every token is needlessly slow.
important_word_set = set(important_words)
poems = [
    [t for l in group['Couplet_tokenized'] for t in l if t in important_word_set]
    for _, group in dp_groups
]

In [None]:
# Map each distinct token of the per-Daftar token lists to an integer id.
dictionary = gensim.corpora.Dictionary(documents=poems)

In [None]:
# Bag-of-words representation of every tokenized couplet, built with the
# dictionary created above.
bow_corpus = list(map(dictionary.doc2bow, masnavi_df['Couplet_tokenized']))

In [None]:
# Train the LDA topic model on the couplet bag-of-words corpus.
# random_state pins the random initialisation so that repeated runs start
# from the same state; without it every run produced different topics.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics = 300,
                                       id2word = dictionary,
                                       passes = 10,
                                       workers = 2,
                                       random_state = 42)

In [None]:
# Save the trained model next to the notebook. gensim.test.utils.datapath
# resolves into gensim's own installed test-data directory, which is not a
# suitable (and often not a writable) place for project artifacts.
lda_path = "masnavi_lda.model"
lda_model.save(lda_path)

In [None]:
# Reload the saved model from disk; load is invoked on the model class rather
# than through the existing instance.
lda = gensim.models.LdaMulticore.load(lda_path)

In [None]:
# Display topics of the reloaded model, using print_topics() default arguments.
lda.print_topics()