# N-grams as Tokens for Phrases

In this first project work we will use N-grams as tokens to create phrases in English and evaluate how much sense these phrases make.

To execute the notebook, set `n` to the N-gram length, set `all_words` to the words of the chosen corpus, and load the saved model with the corresponding file name.

### Data preparation

We will use existing corpora (Brown and Gutenberg) shipped with the NLTK library and create a dictionary of bi-grams, tri-grams, and four-grams out of them. 

In [None]:
pip install numpy==1.19.5 matplotlib==3.3.4 --user

In [None]:
pip install transformers torch sentencepiece torcheval happytransformer evaluate rouge-score

In [None]:
import random
from collections import defaultdict, Counter
from nltk.util import ngrams
import nltk
from nltk.corpus import brown, gutenberg
import pickle
import numpy as np

In [None]:
# Fetch the NLTK resources this notebook relies on.
# punkt is a Sentence Tokenizer
nltk.download('punkt')

# corpora whose word lists are read later via gutenberg.words() / brown.words()
nltk.download('gutenberg')
nltk.download('brown')

In [None]:
# loading the corpus
# all_words is the word sequence of the Brown corpus; the n-gram cells
# below slide their windows over it.
all_words = brown.words()

### Building the model
We want to create an N-gram language model using the data prepared in the first step.


In [None]:
def create_trigram_model(trigrams):
    """Build a trigram language model from a collection of 3-word tuples.

    Args:
        trigrams: re-iterable collection of (word1, word2, word3) tuples.
            It is traversed twice, so a one-shot generator would leave the
            successor table empty.

    Returns:
        A pair ``(frequencies, successors)`` where ``frequencies`` is a
        Counter of how often each trigram occurs and ``successors`` is a
        defaultdict mapping each (word1, word2) prefix to the list of third
        words seen after it (duplicates kept, in input order).
    """
    # how many times each full trigram occurs
    frequencies = Counter(trigrams)

    # two-word prefix -> every word observed right after it
    successors = defaultdict(list)
    for first, second, third in trigrams:
        successors[first, second].append(third)

    return frequencies, successors

In [None]:
def create_bigram_model(bigrams):
    """Build a bigram language model from a collection of 2-word tuples.

    Args:
        bigrams: re-iterable collection of (word1, word2) tuples. It is
            traversed twice, so a one-shot generator would leave the
            successor table empty.

    Returns:
        A pair ``(frequencies, successors)`` where ``frequencies`` is a
        Counter of how often each bigram occurs and ``successors`` is a
        defaultdict mapping each first word to the list of words seen
        right after it (duplicates kept, in input order).
    """
    # how many times each bigram occurs
    frequencies = Counter(bigrams)

    # single word -> every word observed right after it
    successors = defaultdict(list)
    for first, second in bigrams:
        successors[first].append(second)

    return frequencies, successors

In [None]:
def create_fourgram_model(fourgrams):
    """Build a four-gram language model from a collection of 4-word tuples.

    Args:
        fourgrams: re-iterable collection of (word1, word2, word3, word4)
            tuples. It is traversed twice, so a one-shot generator would
            leave the successor table empty.

    Returns:
        A pair ``(frequencies, successors)`` where ``frequencies`` is a
        Counter of how often each four-gram occurs and ``successors`` is a
        defaultdict mapping each three-word prefix to the list of fourth
        words seen after it (duplicates kept, in input order).
    """
    # how many times each four-gram occurs
    frequencies = Counter(fourgrams)

    # three-word prefix -> every word observed right after it
    successors = defaultdict(list)
    for first, second, third, fourth in fourgrams:
        successors[first, second, third].append(fourth)

    return frequencies, successors

In [None]:
# generating model
# n is the window size: ngrams() yields every run of n consecutive words
# from all_words, materialised as a list so it can be traversed twice.
n = 3
trigrams = list(ngrams(all_words, n))

# ngram_freq: trigram counts; model_ngram: two-word prefix -> next words
ngram_freq, model_ngram = create_trigram_model(trigrams)

In [None]:
# generating model
# A bigram is a window of two consecutive words, so n must be 2 here:
# create_bigram_model unpacks every n-gram into exactly two words and
# would raise ValueError on the 3-tuples produced by n = 3.
n = 2
bigrams = list(ngrams(all_words, n))

# ngram_freq: bigram counts; model_ngram: word -> words seen right after it
ngram_freq, model_ngram = create_bigram_model(bigrams)

# persist both objects together so they can be reloaded without rebuilding
with open('br_bi_model.pkl', 'wb') as f:
    pickle.dump((ngram_freq, model_ngram), f)

print("Model saved successfully!")

In [None]:
# Reload a previously saved (ngram_freq, model_ngram) pair.
# NOTE(review): no visible cell writes 'br_four_model.pkl'; presumably it was
# saved in an earlier run with the four-gram builder - confirm the file exists.
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files you created yourself.
with open('br_four_model.pkl', 'rb') as f:
    ngram_freq, model_ngram = pickle.load(f)

print("Model loaded successfully!")

### Generating phrases
We will create a phrase one word at a time based on the frequency of the N-grams in our dictionary until a phrase is of a certain length or a stop condition is met.

In [None]:
def generate_random_text(model, start_words, n, length=50):
    """Generate a phrase by repeatedly sampling the next word from an N-gram model.

    Args:
        model: mapping from a context to the list of words observed after
            it. For ``n == 2`` the context is a single word; otherwise it is
            a tuple holding as many words as ``start_words``.
        start_words: sequence of seed words. They open the phrase and form
            the first context.
        n: order of the N-gram model. Only used to tell the bigram case
            (``n == 2``, plain-string context) from the tuple-context cases.
        length: maximum number of sampling steps. Steps whose sampled token
            is skipped still count towards this limit.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces. Generation stops early after appending ".", "!" or "?", or
        when the current context has no recorded successor.
    """
    skipped_tokens = {'``', "''", "--"}
    sentence_enders = {".", "!", "?"}

    text = list(start_words)
    context_size = len(start_words)

    if n == 2:
        current_words = start_words[-1]
    else:
        current_words = tuple(start_words)

    for _ in range(length):
        # .get() avoids inserting empty entries into a defaultdict model
        possible_words = model.get(current_words)
        if not possible_words:
            break  # Stop if no next word is available

        if len(set(possible_words)) > 1:
            # avoid repeating the previous word when an alternative exists
            next_word = random.choice(possible_words)
            while next_word == text[-1]:
                next_word = random.choice(possible_words)
        else:
            # use the only possible option
            next_word = possible_words[0]

        # quote marks and dashes are dropped; the context is left unchanged
        # and another word is sampled on the next step
        if next_word in skipped_tokens:
            continue

        # sentence-ending punctuation is kept and terminates the phrase
        if next_word in sentence_enders:
            text.append(next_word)
            break

        text.append(next_word)

        # Advance the context. The bigram branch previously assigned to a
        # misspelled name, so the context never moved past the start word.
        if n == 2:
            current_words = next_word
        else:
            current_words = tuple(text[-context_size:])

    return ' '.join(text)

In [None]:
# n must match the order of the model currently held in model_ngram,
# because the seed needs exactly n-1 words to form the first context.
n=4

# fixed seed of n-1 words for a quick manual check
if n==2:
    start_words = ('In',)
elif n==3:
    start_words = ('In', 'a')
elif n==4:
    start_words = ('In', 'a', 'way')
generated_random_text = generate_random_text(model_ngram, start_words, n)
print(generated_random_text)

### Evaluating phrases
After generating a phrase we pass it to an LLM to evaluate its coherence and fluency.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import sentencepiece as spm
from torcheval.metrics.text import Perplexity
import evaluate

# run on the GPU when one is available, otherwise fall back to the CPU
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

We use **HappyTextToText** to check and correct the grammar of the phrase produced by the N-gram model.

In [None]:
from happytransformer import HappyTextToText, TTSettings
# text-to-text model used by correct_grammar() to rewrite generated phrases
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")

In [None]:
def correct_grammar(input_phrase):
    """Return the grammar-corrected version of a phrase.

    Args:
        input_phrase: the text to correct.

    Returns:
        The text produced by the module-level ``happy_tt`` model for the
        phrase, generated with beam search (5 beams) and an output length
        between 1 and 60.
    """
    generation_settings = TTSettings(num_beams=5, min_length=1,  max_length=60)

    # presumably "grammar: " is the task prefix this checkpoint was trained
    # with - confirm against the model card
    corrected = happy_tt.generate_text(f"grammar: {input_phrase}", args=generation_settings)
    return corrected.text

In [None]:
# show the raw N-gram phrase next to its grammar-corrected version
print(generated_random_text)
corrected_result = correct_grammar(generated_random_text)
print(corrected_result)

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# tokenizer and model are module-level globals read by compute_phrase_likelihood()
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
# move the model weights to the device selected in torch_device
model.to(torch_device)

In [None]:
def compute_phrase_likelihood(phrase):
    """Score a phrase as the negated loss the T5 model assigns to it.

    The phrase is tokenized with the module-level ``tokenizer`` and fed to
    the module-level ``model`` both as input and as labels; the resulting
    loss is negated so that a higher (closer to zero) score means the model
    finds the phrase more likely.

    Args:
        phrase: the text to score.

    Returns:
        The negated loss as a Python float.
    """
    # The model was moved to torch_device, so its inputs must live on the
    # same device; otherwise the forward pass fails when CUDA is in use.
    input_ids = tokenizer(phrase, return_tensors="pt").input_ids.to(torch_device)

    # inference only: no gradients are needed to read the loss
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss

    return -loss.item()

# score the raw N-gram phrase ...
likelihood_score_initial = compute_phrase_likelihood(generated_random_text)
print("Initial Phrase:", generated_random_text)
print("Likelihood Score at the start:", likelihood_score_initial)

# ... and its grammar-corrected version, so the two scores can be compared
likelihood_score_corrected = compute_phrase_likelihood(corrected_result)
print("Corrected Phrase:", corrected_result)
print("Likelihood Score:", likelihood_score_corrected)

Other metrics to evaluate phrases can be:
- BLEU 
- ROUGE

Both of them need the same number of sentences in references and predictions, so we compare the sentence produced by our N-gram model and the one produced by HappyTextToText to see how close they are.

In [None]:
# one prediction (the raw phrase) compared against one reference list
# holding the corrected phrase
predictions = [generated_random_text]
references = [[corrected_result]]

bleu = evaluate.load("bleu")
results_bleu = bleu.compute(predictions=predictions, references=references)

rouge = evaluate.load('rouge')
results_rouge = rouge.compute(predictions=predictions, references=references)

This next part tests how well the model performs with regard to the metrics implemented above. It is also interesting to see how long the produced phrases are. We will print phrases that have a particularly bad BLEU score to understand the reason for it.

In [None]:
import matplotlib.pyplot as plt

In [None]:
def generate_start_words(all_words, i):
    """Pick ``i`` consecutive corpus words at random to seed phrase generation.

    A start position is drawn uniformly among those that leave room for
    ``i`` words, and redrawn until the word at that position begins with an
    alphabetic character. Only the first word is checked.

    Args:
        all_words: indexable sequence of word strings that supports ``len()``.
        i: number of consecutive words to return (``n - 1`` for an N-gram
            model of order ``n``). Any value >= 1 is supported.

    Returns:
        A tuple of ``i`` consecutive words from ``all_words``.

    Raises:
        ValueError: if ``i`` is smaller than 1, or (from ``random.randint``)
            if ``i`` exceeds ``len(all_words)``.

    Note:
        The loop never terminates if no eligible position holds a word that
        starts with a letter.
    """
    if i < 1:
        raise ValueError("i must be at least 1")

    # last position from which i words can still be read
    last_start = len(all_words) - i

    while True:
        index = random.randint(0, last_start)
        # [:1] instead of [0] so an empty token is rejected instead of raising
        if all_words[index][:1].isalpha():
            return tuple(all_words[index + offset] for offset in range(i))

In [None]:
# metric objects reused for every phrase in the evaluation loop
bleu = evaluate.load("bleu")
rouge = evaluate.load('rouge')

In [None]:
# Switch the experiment to the Gutenberg corpus: all_words supplies the
# random seeds and n must match the order of the model loaded below.
all_words = gutenberg.words()
n=4
# NOTE(review): no visible cell writes 'gut_four_model.pkl'; presumably it
# was saved in an earlier run - confirm the file exists.
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files you created yourself.
with open('gut_four_model.pkl', 'rb') as f:
    ngram_freq, model_ngram = pickle.load(f)

print("Model loaded successfully!")

In [None]:
# NOTE(review): to_print is not read anywhere in the visible code
to_print = False
# testing length of generated phrases and BLEU and ROUGE values
# X is the number of phrases to generate
X = 200
# word count of every generated phrase
lengths_of_phrases = []
# word count of only the phrases that were scored (parallel to the score lists)
lengths_of_phrases_to_score = []

# number of example phrase/correction pairs printed so far
to_see = 0

bleu_scores = []
rougeL_scores = []

for i in range(X):
    # to generate at random the starting words of the given dictionary
    # (n-1 words, i.e. one full context for the N-gram model)
    start_words = generate_start_words(all_words, n-1)

    generated_random_text = generate_random_text(model_ngram, start_words, n)
    lengths_of_phrases.append(len(generated_random_text.split()))
    
    # only phrases of 10 to 50 words are corrected and scored
    if 10 <= len(generated_random_text.split()) <= 50:
        lengths_of_phrases_to_score.append(len(generated_random_text.split()))
        
        corrected_result = correct_grammar(generated_random_text)

        # the raw phrase is the prediction, the corrected phrase the reference
        predictions = [generated_random_text]
        references = [[corrected_result]]

        results_bleu = bleu.compute(predictions=predictions, references=references)
        bleu_score = results_bleu['bleu']
        bleu_scores.append(bleu_score)

        results_rouge = rouge.compute(predictions=predictions, references=references)
        rougeL_score = results_rouge['rougeL']
        rougeL_scores.append(rougeL_score)

        # to produce some examples we print the first 100 scored phrases
        # together with their corrections
        if to_see < 100:
            to_see += 1
            print(generated_random_text)
            print(corrected_result)

In [None]:
# plotting the generated phrase length

# phrase length (in words) -> number of generated phrases with that length
length_counts = Counter(lengths_of_phrases)

plt.figure(figsize=(10, 6))
plt.bar(length_counts.keys(), length_counts.values(), color='skyblue')
plt.xlabel('Sentence Length')
plt.ylabel('Frequency')
plt.title('Frequency of Generated Sentence Lengths')
# one x tick per observed length
plt.xticks(list(length_counts.keys()))
plt.grid(axis='y')
# integer y ticks from 0 up to the highest count
plt.yticks(range(0, max(length_counts.values()) + 1))

# save before show()
plt.savefig('lengths.png', format='png', dpi=300)
plt.show()

In [None]:
# plotting BLEU and ROUGE

# one point per scored phrase: x is its length in words, y its score
plt.figure(figsize=(8, 5))
plt.scatter(lengths_of_phrases_to_score, bleu_scores, color='blue', label='BLEU Score')
plt.scatter(lengths_of_phrases_to_score, rougeL_scores, color='red', label='ROUGE-L Score')

plt.xlabel('Phrase Length')
plt.ylabel('Score')
plt.title('BLEU and ROUGE-L Scores')
plt.ylim(0, 1.1)  # Scores are between 0 and 1
plt.grid()
plt.legend()

# save before show()
plt.savefig('scores.png', format='png', dpi=300)
plt.show()

In [None]:
# average each metric over the scored phrases
# (np.mean of an empty list yields nan, which happens if no phrase was scored)
mean_bleu_score = np.mean(bleu_scores)
print("Mean BLEU Score:", mean_bleu_score)

mean_rouge_score = np.mean(rougeL_scores)
print("Mean ROUGE Score:", mean_rouge_score)