### Import

In [None]:
import stanza
import json

### Stanza: download the Persian model

In [None]:
# Fetch the Persian ("fa") resources for Stanza before any pipeline is built.
stanza.download(lang="fa")

### Tokenization

In [None]:
# Tokenize the Shahnameh corpus line by line:
#   1. read every line of the corpus file and strip surrounding whitespace,
#   2. run the Stanza Persian tokenizer on each non-empty line,
#   3. collect all tokens in one flat list and store that list as JSON.

nlp = stanza.Pipeline('fa', processors='tokenize')

tokens = []

# Counts every line read from the file, blank lines included.
number_of_lines = 0

with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh.txt", "r", encoding="utf-8") as corpus_file:
    for number_of_lines, raw_line in enumerate(corpus_file, start=1):
        line = raw_line.strip()
        if not line:
            continue

        # Flatten the words of every sentence Stanza found in this line.
        line_tokens = []
        for sent in nlp(line).sentences:
            for tok in sent.words:
                line_tokens.append(tok.text)
        tokens += line_tokens

        print(f"Line: {line}")
        print(f"Tokens: {line_tokens}\n")

print("\nTotal tokens extracted:", len(tokens))

with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_tokens.json", "w", encoding="utf-8") as tokens_file:
    json.dump(tokens, tokens_file)


### Counting Words in Corpus

In [None]:
# Count how often every token occurs in the corpus and store the counts as JSON.
# The markers <s> and </s> are added with one occurrence per corpus line.

import json
from collections import Counter

# Retrieve the tokens saved by the tokenization step.
with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_tokens.json", "r", encoding = "utf-8") as shahnameh_tokens:
    tokens = json.load(shahnameh_tokens)

# Counter tallies all tokens in a single pass; calling tokens.count() once per
# unique word would rescan the whole token list for every vocabulary entry.
word_count = dict(Counter(tokens))
words = list(word_count)

# Count the corpus lines here (blank lines included) instead of relying on a
# variable left behind by the tokenization cell, so that this cell also runs
# on a freshly started kernel.
with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh.txt", "r", encoding = "utf-8") as shahnameh_file:
    number_of_lines = sum(1 for _ in shahnameh_file)

word_count["<s>"] = number_of_lines
word_count["</s>"] = number_of_lines

print(number_of_lines)

# Store the word counts.
with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_word_count.json", "w", encoding = "utf-8") as word_count_file:
    json.dump(word_count, word_count_file)

Find out how many times a given word occurs in Ferdowsi's Shahnameh.

In [None]:
# Load the stored word counts and report how often one chosen word occurs.
with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_word_count.json", "r", encoding="utf-8") as counts_file:
    word_count = json.load(counts_file)

word = "رستم"

if word not in word_count:
    print(f"{word} not found in word count dictionary")
else:
    print(f"{word}:", word_count[word])



### Marking the start and end of each corpus line

In [None]:
# Wrap every corpus line in the sentence markers "<s> ... </s>" and write the
# result to marked_shahnameh.txt. The output goes to the same
# shahnameh-n-gram-model\file folder that the cells reading the marked corpus
# open. The file handles are not named `input`/`output`, because binding
# `input` at notebook scope would hide the builtin input() for later cells.
with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh.txt", "r", encoding = "utf-8") as source_file, \
        open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\marked_shahnameh.txt", "w", encoding = "utf-8") as marked_file:
    for line in source_file:
        marked_line = "<s> " + line.strip() + " </s>"
        marked_file.write(marked_line + "\n")

Check that the marking works correctly.

In [None]:
# Sanity check: print each token of the first marked line with its position.
with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\marked_shahnameh.txt", "r", encoding="utf-8") as marked_shahnameh_file:
    first_line = marked_shahnameh_file.readline()

# An empty file yields an empty string here, so nothing is printed.
line_split = first_line.split()
for i in range(len(line_split)):
    print(f"{i}: {line_split[i]}")

### v1.4: Bigrams Count

In [None]:
# Count adjacent word pairs (bigrams) inside each line of the marked corpus.
bigrams = {}

with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\marked_shahnameh.txt", "r", encoding="utf-8") as marked_corpus:
    for marked_line in marked_corpus:
        line_words = marked_line.split()
        # Pair every word with its right-hand neighbour on the same line.
        for left, right in zip(line_words, line_words[1:]):
            pair = left + " " + right
            bigrams[pair] = bigrams.get(pair, 0) + 1

# Store the bigram counts.
with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_bigrams_count.json", "w", encoding="utf-8") as counts_out:
    json.dump(bigrams, counts_out)

### v1.5: bigrams Probability

##### Laplace-smoothed Bigram:

$$P(W_n \mid W_{n-1}) = \frac{C(W_{n-1} W_n) + 1}{C(W_{n-1}) + V}$$

$V$ : Represents the *size of the vocabulary* (the total number of unique words in the corpus).

• Add 1 to the count of every bigram so that bigrams that never occur in the corpus do not get the zero probability that plain MLE (Maximum Likelihood Estimation) would give them (see page 64 of Jurafsky's slides).



• Because I extract the bigrams from the lines of the Shahnameh itself, every bigram in my probability dictionary has a non-zero count, so none of its entries has probability 0. I therefore do not apply the formula above in my probability calculation; I compute the unsmoothed estimate instead:

$$P(W_n \mid W_{n-1}) = \frac{C(W_{n-1} W_n)}{C(W_{n-1})}$$

Whenever I later compute the probability of a given input and a bigram is missing from my `bigram_probability` dictionary, I will manually assign it a very small non-zero probability.

• Because I have marked my corpus with start and end tokens (`<s>`, `</s>`), I must add the counts of these tokens to my `shahnameh_word_count` file manually so that the probabilities are calculated correctly.

In [None]:
# Turn the stored bigram counts into maximum-likelihood probabilities
#     P(w2 | w1) = C(w1 w2) / C(w1)
# and save them as JSON.

with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_bigrams_count.json", "r", encoding = "utf-8") as bigrams_file:
    bigrams_count = json.load(bigrams_file)

# Derive C(w1) from the bigram counts themselves. The bigrams were built from
# whitespace-split marked lines, where every word occurrence except the closing
# </s> is the first word of exactly one bigram, so summing the counts of all
# bigrams that start with w1 gives C(w1) under that same tokenization.
# The word-count file is built from Stanza tokens instead; the two
# tokenizations can disagree, which can give ratios above 1 or first words
# that have no count at all.
first_word_count = {}
for bigram, count in bigrams_count.items():
    first_word = bigram.split()[0]
    first_word_count[first_word] = first_word_count.get(first_word, 0) + count

bigrams_probability = {}
for bigram, count in bigrams_count.items():
    first_word = bigram.split()[0]
    probability = count / first_word_count[first_word]
    # Stored with 5 decimal places, as before.
    bigrams_probability[bigram] = round(probability, 5)

with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_bigrams_probability.json", "w", encoding = "utf-8") as bigrams_probability_file:
    json.dump(bigrams_probability, bigrams_probability_file)


for i, j in bigrams_probability.items():
    print(i, j)

### v1.6: Calculate the probability of a given input

In [None]:
# Probability of a line of verse under the bigram model: the product of the
# probabilities of its consecutive word pairs.

# To read the line interactively, use: poem = input("Enter a line of poem: \n")
poem = "رستم بر"

# Probability used for a bigram that is missing from the probability file.
# Without it a missing bigram would leave the product untouched, i.e. count as
# probability 1.0 and inflate the result. The stored probabilities are rounded
# to 5 decimal places, so 1e-6 lies below every non-zero stored value.
UNSEEN_BIGRAM_PROBABILITY = 1e-6

with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_bigrams_probability.json", "r", encoding = "utf-8") as bigram_probability_file:
    bigram_probability = json.load(bigram_probability_file)

poem_split = poem.split()

# An input with fewer than two words has no bigrams and keeps probability 1.0.
prob = 1.0

for first, second in zip(poem_split, poem_split[1:]):
    bigram = first + " " + second
    if bigram in bigram_probability:
        prob = prob * bigram_probability[bigram]
    else:
        print(f"{bigram} was not found in the bigram probabilities dictionary!")
        prob = prob * UNSEEN_BIGRAM_PROBABILITY

print(prob)


### v1.7: Generate text with highest probability

In [None]:
def generate_text(prob_dict, start_word = "به", num_word = 10):
    """Print text generated greedily from bigram probabilities.

    Starting from ``start_word``, the word with the highest probability of
    following the current word is appended repeatedly; that word then becomes
    the current word.

    Args:
        prob_dict: Mapping from a bigram string ``"first second"`` to its
            probability.
        start_word: First word of the generated text.
        num_word: Maximum number of words to append after ``start_word``.
            Zero or a negative number appends nothing.

    Returns:
        None. The text is printed, each word followed by one space.

    Generation stops early when the current word has no successor with a
    probability above zero. Ties go to the bigram that comes first in
    ``prob_dict``.
    """
    # Find the best successor of every first word in one pass, instead of
    # rescanning and re-splitting the whole dictionary for each generated word.
    best_next = {}
    for bigram, probability in prob_dict.items():
        parts = bigram.split()
        if len(parts) < 2:
            # Not a usable "first second" key; skip it.
            continue
        first, second = parts[0], parts[1]
        # Strict '>' keeps the earliest bigram among equals and ignores
        # probabilities that are not above zero.
        if probability > best_next.get(first, ("", 0.0))[1]:
            best_next[first] = (second, probability)

    text = start_word + " "
    base_word = start_word

    # range() terminates for negative counts, unlike counting down to zero.
    for _ in range(num_word):
        if base_word not in best_next:
            # Dead end: stop rather than appending empty words.
            break
        base_word = best_next[base_word][0]
        text += base_word + " "

    print(text)


# Load the saved bigram probabilities and generate text starting from "رستم".
with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_bigrams_probability.json", "r", encoding="utf-8") as probabilities_file:
    bigrams_probability = json.loads(probabilities_file.read())

generate_text(bigrams_probability, "رستم", 15)


### v1.8: Generate a random text  

*(Work in progress)* — this section will be completed soon.

In [None]:
# Load the saved bigram probabilities for the random text generator.
with open("F:\\E\\code_workspace\\Python\\NLP\\shahnameh-n-gram-model\\file\\shahnameh_bigrams_probability.json", "r", encoding="utf-8") as probabilities_file:
    bigrams_probability = json.loads(probabilities_file.read())

   