First we do a little bit of preprocessing.We check if some trigram start with space or and with space.
Then we make everything lower case just for avoiding problem in counting.

In [2]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('ex4_ngrams_preloaded_aclarc_2.csv')

# Remove the title and the first line of the dataset
df = df.drop([0])

# Make the line 1 the title
df.columns = df.iloc[0]
df = df.drop(df.index[0])

# Reset the index
df = df.reset_index(drop=True)

# Display the first few rows of the modified DataFrame
df.head()

1,Item,Frequency
0,the number of,43379
1,a set of,26642
2,as well as,22611
3,based on the,21012
4,in order to,19622


In [3]:
# Count how many strings in column 'Item' start or end with a space
count_start_space = df['Item'].str.startswith(' ').sum()
count_end_space = df['Item'].str.endswith(' ').sum()

print(f"Strings starting with a space: {count_start_space}")
print(f"Strings ending with a space: {count_end_space}")

Strings starting with a space: 0
Strings ending with a space: 0


In [14]:
# It could be useful to put everything in lowercase to avoid case-sensitive issues
# In practice, i didnt use this block for my analysis.

# Convert the 'Item' column to lowercase
# df['Item'] = df['Item'].str.lower()

# Display the first few rows of the modified DataFrame to verify the changes
# df.head()

1,Item,Frequency
0,the number of,43379
1,a set of,26642
2,as well as,22611
3,based on the,21012
4,in order to,19622


In [4]:
# Check for duplicate entries in the DataFrame
duplicates = df.duplicated(subset='Item').sum()

if duplicates == 0:
    print("All entries are unique.")
else:
    print(f"There are {duplicates} duplicate entries.")

# Print the duplicates
# df[df.duplicated(subset='Item')]

All entries are unique.


Now we can proceed with the real exercise. 

In [5]:
### EXERCISE 1 ###
# Calculate the probability of the trigram "the case of"

def probability_trigram(trigram):
    """ Calculate the probability of a trigram 
    
    Args:
        trigram (str): A trigram string
        
    Returns:
        float: The probability of the trigram computed as the number of occurrences of the trigram divided by the number of occurrences of the bigram extracted from the trigram
    """


    # Extract the bigram from the trigram
    bigram = trigram.split(' ')[0] + ' ' + trigram.split(' ')[1]

    # Calculate the probability
    count_trigram = df[df['Item'] == trigram]['Frequency'].astype(int).sum()
    count_bigram = df[df['Item'].str.startswith(bigram)]['Frequency'].astype(int).sum()
    return count_trigram / count_bigram

# Calculate the probability of the trigram "the case of"
trigram = 'the case of'
probability = probability_trigram(trigram)

print(f"The probability of the trigram '{trigram}' is {probability:.6f}")

The probability of the trigram 'the case of' is 1.000000


In [6]:
### EXERCISE 2 ###
# Calculate the probability of the word "case"
# Since we don't have the real text we approximate the probability of a word with the frequency of the trigrams that contain the word divided by the sum of the frequencies of all trigrams

def approx_probability_word(word):
    """ Calculate an approximation of the probability of a word. 
    The approximation is computed as the frequency of the trigrams that contain the word divided by the sum of the frequencies of all trigrams.
    Note that if the trigrams contain the word multiple times, the frequency is counted multiple times. For example in the trigram "as well as" the word "as" appears twice and so we will count the frequency twice.
    
    Args:
        word (str): A word string
        
    Returns:
        float: An approximation of probability of the word computed as the frequency of the trigrams that contain the word divided by the sum of the frequencies of all trigrams
    """

    # Calculate the probability

    count_trigrams = 0
    for index, row in df.iterrows():
        trigram = row['Item']
        frequency = int(row['Frequency'])
        word_count = trigram.split().count(word)
        count_trigrams += word_count * frequency
    
    count_all_trigrams = df['Frequency'].astype(int).sum()
    return count_trigrams / count_all_trigrams if count_all_trigrams > 0 else 0

probability = approx_probability_word('case')
print(f"The probability of the word 'case' is {probability:.6f}")

The probability of the word 'case' is 0.009460


In [20]:
### EXERCISE 3 ###
# What is the longest string that you can generate starting with "in the" and only using the highest probability continuation?

def generate_longest_string(starting_bigram, df=df):
    """ Generate the longest string that can be generated starting with a given bigram and only using the highest probability continuation.
    
    Args:
        starting_bigram (str): A bigram to start with, or a phrase
        df (pd.DataFrame): The DataFrame containing the n-grams and the frequencies
        
    Returns:
        str: The longest string that can be generated starting with the given bigram and only using the highest probability continuation
    """

    # Initialize the generated string
    generated_string = starting_bigram

    probability_trigram_dict = {} # Store the probability of each trigram, so that i dont have to compute them multiple times

    while True:
        # Get the current bigram
        current_bigram = generated_string.split(' ')[-2:]

        # Find the row corresponding to the current bigram
        bigram_row = df[df['Item'].str.startswith(current_bigram[0] + ' ' + current_bigram[1])]
        if bigram_row.empty:
            break

        # Compute the probability of each trigram starting with the current bigram and store it in the dictionary probability_trigram_starting_bigram
        probability_trigram_starting_bigram = {} # Store the probability of each trigram starting with the current bigram
        for index, row in bigram_row.iterrows():
            trigram = row['Item']
            if probability_trigram_dict.get(trigram) is None:
                probability = probability_trigram(trigram)
                probability_trigram_dict[trigram] = probability
                probability_trigram_starting_bigram[trigram] = probability
            else:
                probability_trigram_starting_bigram[trigram] = probability_trigram_dict[trigram]

        next_word = max(probability_trigram_starting_bigram, key=probability_trigram_starting_bigram.get).split(' ')[-1]
        generated_string += ' ' + next_word
    return generated_string

# Generate the longest string starting with "in the"
starting_bigram = 'in the'
longest_string = generate_longest_string("in the")
print(f"The longest string that can be generated starting with '{starting_bigram}' is '{longest_string}'")

The longest string that can be generated starting with 'in the' is 'in the training data for'


In [30]:
### EXERCISE 4 ###
# What is the longest string that you can generate starting with "in the" and allowing the second highest probability continuation when needed?

def get_most_probable_continuation(phrase, df=df):
    """ Get the most probable continuation of a phrase based on the highest probability of the trigrams starting with the phrase.
    
    Args:
        phrase (str): A phrase to continue
        df (pd.DataFrame): The DataFrame containing the n-grams and the frequencies
        
    Returns:
        str: The most probable word to continue the phrase
    """

    # Get the last bigram of the phrase
    bigram = phrase.split(' ')[-2] + ' ' + phrase.split(' ')[-1]

    # Find the row corresponding to the bigram
    bigram_row = df[df['Item'].str.startswith(bigram)]
    if bigram_row.empty:
        return None

    # Compute the probability of each trigram starting with the bigram
    probability_trigram_starting_bigram = {}
    for index, row in bigram_row.iterrows():
        trigram = row['Item']
        probability = probability_trigram(trigram)
        probability_trigram_starting_bigram[trigram] = probability

    # Get the most probable continuation
    most_probable_continuation = max(probability_trigram_starting_bigram, key=probability_trigram_starting_bigram.get).split(' ')[-1]
    return most_probable_continuation



def generate_longest_strings_with_also_second_prob(starting_bigram, df=df):
    """ Generate the longest string that can be generated starting with a given bigram and allowing the second highest probability continuation when needed.
    
    Args:
        starting_bigram (str): A bigram to start with
        df (pd.DataFrame): The DataFrame containing the n-grams and the frequencies
        
    Returns:
        str: The longest string that can be generated starting with the given bigram and allowing the second highest probability continuation when needed
    """

    # Initialize the generated string
    generated_string = starting_bigram

    # Make a list with the possible phrases
    possible_phrases = []
    possible_phrases.append(generated_string)

    
    while True:
        new_possible_phrases = [] # Store the new possible phrases
        for phrase in possible_phrases:

            first_next_word = get_most_probable_continuation(phrase)

            if first_next_word is None:
                first_continuation = phrase
            else:
                first_continuation = phrase + ' ' + first_next_word
            
            last_trigram = first_continuation.split(' ')[-3:]
            last_trigram_str= last_trigram[0] + ' ' + last_trigram[1]+ ' ' + last_trigram[2]

            temp_df = df.copy()
            temp_df.drop(temp_df[temp_df['Item'].str.startswith(last_trigram_str)].index, inplace=True)

            second_next_word = get_most_probable_continuation(phrase, temp_df)

            if second_next_word is None:
                second_continuation = phrase
            else:
                second_continuation = phrase + ' ' + second_next_word


            if len(first_continuation) >= len(phrase):
                new_possible_phrases.append(first_continuation)
            if len(second_continuation) > len(phrase):
                new_possible_phrases.append(second_continuation)
        
        if new_possible_phrases == possible_phrases:
            break
        else:
            possible_phrases = new_possible_phrases
    return possible_phrases

# Generate the longest string starting with "in the" and allowing the second highest probability continuation when needed
starting_bigram = 'in the'
longest_strings = generate_longest_strings_with_also_second_prob("in the")
print("The possible strings are:", longest_strings)

The possible strings are: ['in the training data for', 'in the training data and', 'in the training set', 'in the same time', 'in the same as']


In [31]:
### EXERCISE 5 ###
# Calculate the perplexity score of the string 'in the training data for'.

def calculate_probability_of_string(string):
    """ Calculate the list of probability of the trigrams of the string.
    
    Args:
        string (str): A string of words
        
    Returns:
        list of float: The list of probabilities of the trigrams of the string
    """

    # Split the string into words
    words = string.split()

    # Calculate the probability of the string
    probability = []
    for i in range(len(words)-2):
        trigram = ' '.join(words[i:i+3])
        probability.append(probability_trigram(trigram))
    return probability

import math
def calculate_perplexity(probabilities):
    """ Calculate the perplexity score of a sequence of probabilities

    Args:
        probabilities (list): A list of probabilities

    Returns:
        float: The perplexity score of the sequence of probabilities
    """
    
    N = len(probabilities)
    log_prob_sum = sum(math.log2(p) for p in probabilities)
    perplexity = 2 ** (-log_prob_sum / N)
    return perplexity

phrase = 'in the training data for'
probabilities = calculate_probability_of_string(phrase)
perplexity = calculate_perplexity(probabilities)
print(f"The perplexity of the string '{phrase}' is {perplexity:.6f}")

The perplexity of the string 'in the training data for' is 4.085422


In [33]:
### EXERCISE 6 ###
# Calculate the perplexity score for the following phrase "in an unlikely event"? (Note: For this, you are allowed to replace any word with "UNK" but the count of all such trigrams is 1.)

def calculate_probability_of_string_with_UNK(string):
    """ Calculate the list of probability of the trigrams of the string with the possibility to replace any word with 'UNK'.
    
    Args:
        string (str): A string of words
        
    Returns:
        list of float: The list of probabilities of the trigrams of the string
    """

    # Split the string into words
    words = string.split()

    # Calculate the probability of the string
    probability = []
    for i in range(len(words)-2):
        trigram = ' '.join(words[i:i+3])
        if df[df['Item'] == trigram]['Frequency'].astype(int).sum() == 0:
            probability.append(1/df['Frequency'].astype(int).sum())
        else:
            probability.append(probability_trigram(trigram))
    return probability

phrase = 'in an unlikely event'
probabilities = calculate_probability_of_string_with_UNK(phrase)
perplexity = calculate_perplexity(probabilities)
print(f"The perplexity of the string '{phrase}' is {perplexity:.6f}")

The perplexity of the string 'in an unlikely event' is 2824083.000000
