# Question - 1

**Question - 1 :** Build an n-gram language model using NLTK's Brown corpus. Provide
the code. (You can build a language model in a few lines of code using
the NLTK package. You may use one of the bigram, trigram, or higherorder n-grams).


In [97]:
from nltk.corpus import brown
from nltk import bigrams, trigrams
from nltk.util import ngrams
from collections import Counter, defaultdict
import nltk
import random
import math

# Download the corpora
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [98]:
# Create a placeholder for the trigram model, using defaultdict to store counts
# The model is a dictionary where the key is a tuple of two words (w1, w2),
# and the value is another dictionary that stores the count of the third word (w3).
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count the frequency of word co-occurrences (trigrams) in sentences from the Brown corpus
for sentence in brown.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1

# Transform the raw counts to probabilities by normalizing
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

# Generate Sentence

In [99]:
def is_balanced(sentence):
    """
    Check if a sentence has balanced parentheses, quotes, and other paired symbols.

    Parameters:
    - sentence (str): The sentence to check for balanced characters.

    Returns:
    - bool: True if the sentence is balanced, False otherwise.
    """

    stack = []
    matching_pairs = {')': '(', '}': '{', ']': '[', '"': '"', "'": "'", '`': '`'}

    i = 0
    while i < len(sentence):
        char = sentence[i]

        if char == '`' and i + 1 < len(sentence) and sentence[i + 1] == '`':
            stack.append('`')
            i += 2
            continue

        # Check if char is a closing character
        if char in matching_pairs:
            # If stack is empty or top of stack does not match the current closing character
            if not stack or stack[-1] != matching_pairs[char]:
                return False
            stack.pop()

        # Check if char is an opening character
        elif char in matching_pairs.values():
            stack.append(char)

        i += 1

    return len(stack) == 0

In [100]:
def generate_sentence_from_trigram(starting_words, trigram_model, min_sentence_length=5, banned_words=None):
    """
    Generate a sentence based on a starting set of words and a trigram model.

    This function uses a trigram model to predict the next word based on the last two words of the current sentence.
    The model calculates probabilities for possible next words and selects one randomly. The sentence will continue
    until a stopping condition is met, such as reaching the minimum sentence length, encountering punctuation, or
    generating a banned word.

    Parameters:
    - starting_words (list): List of initial words to start the sentence.
    - trigram_model (dict): A trigram model (a dictionary) where keys are tuples of the last two words and the
      values are dictionaries of possible next words with their associated probabilities.
    - min_sentence_length (int): The minimum length the sentence should reach before being considered complete (default is 5).
    - banned_words (set): A set of words that should be excluded from the generated sentence (default is {"ugly", "bad"}).

    Returns:
    - str: A generated sentence that adheres to the trigram model and conditions.
    """

    sentence_words = starting_words[:]
    is_sentence_complete = False

    # Default banned words if none are provided
    banned_words = banned_words or {"ugly", "bad"}

    while not is_sentence_complete:
        # Generate a random threshold for selecting the next word based on its probability
        random_threshold = random.random()

        cumulative_probability = 0.0
        next_word = None
        current_bigram = tuple(sentence_words[-2:])
        possible_next_words = trigram_model.get(current_bigram, {})

        # Calculate cumulative probabilities and select a word randomly based on its probability
        for candidate_word, probability in possible_next_words.items():
            cumulative_probability += probability
            if cumulative_probability >= random_threshold:
                if candidate_word in banned_words:
                    continue
                next_word = candidate_word
                break

        # Stop if no valid next word is found or avoid consecutive punctuation marks
        if not next_word or (sentence_words[-1] in {";", ":", ",", "''"} and next_word in {";", ":", ",", "''"}):
            is_sentence_complete = True
            break

        sentence_words.append(next_word)

        # End the sentence if it reaches the minimum length and ends with punctuation
        if len(sentence_words) > min_sentence_length and sentence_words[-1] in {".", "!", "?"}:
            is_sentence_complete = True

    # Join words to form the sentence and remove any leading/trailing spaces
    sentence = ' '.join(word for word in sentence_words if word).strip()

    # Clean up spaces around punctuation marks
    sentence = sentence.replace(" ;", ";").replace(" :", ":").replace(" ,", ",").replace(" .", ".") \
                       .replace(" !", "!").replace(" ?", "?")

    # Remove trailing punctuation if any
    sentence = sentence.rstrip(".,;!?''\"")

    # Avoid sentences ending with weak prepositions or conjunctions
    while sentence.split() and sentence.split()[-1] in {"in", "of", "to", "with", "and", "or"}:
        sentence = ' '.join(sentence.split()[:-1]).strip()

    # Ensure the sentence ends with appropriate punctuation
    if not sentence.endswith(('.', '!', '?')):
        sentence += '.'

    return sentence

# Create Sentence

In [101]:
def create_unique_sentences(starting_words, num_sentences=1):
    """
    Generates a list of unique sentences based on a starting text and a trigram model.

    The function attempts to generate a number of unique sentences (specified by `num_sentences`), each satisfying
    conditions such as having a minimum length, being unique, and following certain grammar rules.

    Parameters:
    - starting_words (list): List of initial words to begin sentence generation.
    - num_sentences (int): The number of unique sentences to generate (default is 1).

    Returns:
    - list: A list of unique, valid sentences generated from the trigram model.
    """
    sentence_list = []
    unique_sentences = set()
    min_sentence_length = 5  # Minimum length a sentence should have
    max_generation_attempts = 50  # Max attempts to avoid infinite loops
    banned_words = {"ugly", "bad", "AF"}  # Set of words that should be excluded from sentences

    # Attempt to generate the specified number of unique sentences
    for _ in range(num_sentences):
        attempts = 0
        generated_sentence = generate_sentence_from_trigram(starting_words, model,
                                                             min_sentence_length=min_sentence_length,
                                                             banned_words=banned_words)

        # Retry if the sentence is invalid, not unique, too short, or unbalanced
        while (generated_sentence is None or generated_sentence in unique_sentences or
               len(generated_sentence.split()) < min_sentence_length or not is_balanced(generated_sentence)) and attempts < max_generation_attempts:
            generated_sentence = generate_sentence_from_trigram(starting_words, model,
                                                                 min_sentence_length=min_sentence_length,
                                                                 banned_words=banned_words)
            attempts += 1

        # Add the valid, unique sentence to the list
        if generated_sentence:
            unique_sentences.add(generated_sentence)
            sentence_list.append(generated_sentence)

    return sentence_list

# Calculating Probabilities and Perplexity

In [102]:
# List to store sentences and their corresponding perplexities
sentence_perplexity_list = []

def calculate_sentence_probability(sentence, trigram_model, smoothing_factor=0.001):
    """
    Calculates the log probability and perplexity of a given sentence using a trigram model.

    Parameters:
    - sentence (str): The sentence whose probability and perplexity we want to calculate.
    - trigram_model (dict): The trigram model containing word probabilities.
    - smoothing_factor (float): A small value added to the counts to avoid zero probabilities (default 0.001).

    Returns:
    - float: The average log probability of the sentence.
    """
    words = sentence.split()
    total_log_prob = 0.0

    for i in range(2, len(words)):
        w1, w2, w3 = words[i-2], words[i-1], words[i]
        bigram = (w1, w2)


        trigram_count = trigram_model.get(bigram, {}).get(w3, 0) + smoothing_factor
        total_bigram_count = sum(trigram_model.get(bigram, {}).values()) + smoothing_factor * len(trigram_model.get(bigram, {}))

        if total_bigram_count == 0:
            continue

        # Calculate the probability of the trigram and add its log value to the total log probability
        prob = trigram_count / total_bigram_count
        total_log_prob += math.log(prob)


    average_log_prob = total_log_prob / (len(words) - 2)
    perplexity = math.exp(-average_log_prob)
    sentence_perplexity_list.append((sentence, perplexity))

    return average_log_prob

# Question - 2

**Question - 2 :** After completing question 1, make simple predictions using the
language model you built in question 1. Start with the words “I am,” and
let your n-gram model predict the next word. Show both the code and
the model-generated results.

In [103]:
# starting words
start_text = ["I", "am"]
sentence_list = create_unique_sentences(start_text)

for sentence in sentence_list:
    print(sentence)

I am trying to make our experiment in recorded sound.


# Question - 3

**Question - 3 :** Based on the work from question 1 and question 2, generate 10
different sentences that start with “You are.”

In [104]:
# Starting words
start_text = ["You", "are"]
sentence_list = create_unique_sentences(start_text, 10)

for sentence in sentence_list:
    print(sentence)

You are much more is the car, it seems that only three more times than he is forced through a manual action -- and start dropping those hints.
You are now nationally known labor-management expert, and -- in the front page.
You are now finding new hope for on May 27.
You are now estimating that 75% of the invaders from the Gymnasium.
You are a half feet square and everybody had a 16 inch capacity has a phenomenal range of prediction and explanation.
You are a great deal of wishful thinking.
You are the endeavors of multitudes of modern America.
You are all trained as an interference with sovereignty to the sign and lighting and plumbing, all admitted that the South marking the upper and lower tax rates, general business activity in merchandise.
You are even motels for local audiences.
You are to believe that so .


# Question - 4

**Question - 4 :** Using the sentences generated in step 3, apply relevant packages to
calculate the probability of these 10 sentences generated by your ngram model in question 1.

In [105]:
# Calculate and print the probabilities and perplexities of the generated sentences
for s in sentence_list:
    avg_log_prob = calculate_sentence_probability(s, model)
    print(f"{s} | Average Log Probability: {abs(avg_log_prob):.4f}")

You are much more is the car, it seems that only three more times than he is forced through a manual action -- and start dropping those hints. | Average Log Probability: 2.2331
You are now nationally known labor-management expert, and -- in the front page. | Average Log Probability: 2.9867
You are now finding new hope for on May 27. | Average Log Probability: 2.2198
You are now estimating that 75% of the invaders from the Gymnasium. | Average Log Probability: 2.3612
You are a half feet square and everybody had a 16 inch capacity has a phenomenal range of prediction and explanation. | Average Log Probability: 1.9587
You are a great deal of wishful thinking. | Average Log Probability: 3.2033
You are the endeavors of multitudes of modern America. | Average Log Probability: 2.1966
You are all trained as an interference with sovereignty to the sign and lighting and plumbing, all admitted that the South marking the upper and lower tax rates, general business activity in merchandise. | Averag

# Question - 5

**Question - 5 :** Based on the sentence probabilities, calculate the perplexity of the ngram language model you used.

In [109]:
for sentence, perplexity in sentence_perplexity_list:
    print(f"{sentence} | Perplexity: {perplexity:.4f}")

You are much more is the car, it seems that only three more times than he is forced through a manual action -- and start dropping those hints. | Perplexity: 9.3290
You are now nationally known labor-management expert, and -- in the front page. | Perplexity: 19.8198
You are now finding new hope for on May 27. | Perplexity: 9.2052
You are now estimating that 75% of the invaders from the Gymnasium. | Perplexity: 10.6032
You are a half feet square and everybody had a 16 inch capacity has a phenomenal range of prediction and explanation. | Perplexity: 7.0898
You are a great deal of wishful thinking. | Perplexity: 24.6142
You are the endeavors of multitudes of modern America. | Perplexity: 8.9943
You are all trained as an interference with sovereignty to the sign and lighting and plumbing, all admitted that the South marking the upper and lower tax rates, general business activity in merchandise. | Perplexity: 9.4364
You are even motels for local audiences. | Perplexity: 13.4079
You are to b

# Question - 6

**Question - 6 :** Using the 10 sentences you generated, ask the Google-Gemini-1.5-
flash model API we practiced in Lab 2 to generate a story (within 500
words). Ensure the story contains no DEROGATORY, TOXICITY,
VIOLENCE, HARASSMENT, HATE_SPEECH, SEXUAL, etc. content. Provide the code and your generated story (the other parts of
your story will not be evaluated, as long as it did not contain problematic
content mentioned in red color).

In [110]:
# Some code are from https://ai.google.dev/gemini-api/docs

import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

genai.configure(api_key="AIzaSyCB0frGWSR_j5QtNC71U5ZZqkmFshA3xmI")
model = genai.GenerativeModel("gemini-1.5-flash")

SAFETY_SETTINGS = {
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
}

In [111]:
for s in sentence_list:
    # Generate the story response from the model
    response = model.generate_content(f"Act as an experienced story writer and write a short story in 500 words that incorporates '{s}' as a pivotal moment.",
                                      safety_settings=SAFETY_SETTINGS)

    finish_reason = response.candidates[0].finish_reason.name
    # Check if safety ratings indicate a problematic story
    if finish_reason == 'STOP':
        print("="*50)
        print(f"Story based on: '{s}'")
        print("-"*50)
        print(response.text)
        print("="*50)
        print("\n")
    else:
        print("="*50)
        print(f"Story based on: '{s}'")
        print("-"*50)
        print("Content blocked due to safety concerns.")
        print("="*50)
        print("\n")


Story based on: 'You are much more is the car, it seems that only three more times than he is forced through a manual action -- and start dropping those hints.'
--------------------------------------------------
Content blocked due to safety concerns.


Story based on: 'You are now nationally known labor-management expert, and -- in the front page.'
--------------------------------------------------
The headline slammed into me like a physical blow. "You are now nationally known labor-management expert, and -- in the front page." I stared at the newspaper, the ink smudging on my fingertips. I, Amelia "Amy" Hayes, a woman whose biggest professional achievement was negotiating a five percent raise for the unionized janitorial staff at my old firm, a nationally known expert? Absurd.

It all started with a strike. The city’s largest trucking company, Atlas Logistics, had been locked in a bitter battle with its unionized drivers for months. The strike crippled the city, and the media frenzy