### Importing CSV

In [56]:
import ast
import csv

In [57]:
# Load the training file (decoded as UTF-8): each line contributes one list of
# tokens, obtained by trimming the line and splitting it on ', '
corpus = []
with open('training_data.txt', 'r', encoding='utf-8') as file:
    corpus.extend(raw_line.strip().split(', ') for raw_line in file)

In [58]:
# Flattening the list of lists into a single list of words.
# Guard against re-execution: once this cell has run, `corpus` already holds
# plain strings, and flattening again would iterate over each string and
# break every word into individual characters.
if corpus and isinstance(corpus[0], list):
    corpus = [word for sentence in corpus for word in sentence]

## N-gram counts

## Unigram Counts

In [59]:
# Empty mapping that will hold word -> number of occurrences
word_counts = dict()

In [61]:
# Count the unigrams.
# The dictionary is rebuilt from scratch in this cell so that running the
# cell more than once does not add a second pass of counts on top of the
# first one.
word_counts = {}
for word in corpus:
    word_counts[word] = word_counts.get(word, 0) + 1

In [63]:
# Rank the (word, count) pairs from the most frequent word to the least frequent
sorted_word_counts = list(word_counts.items())
sorted_word_counts.sort(key=lambda entry: entry[1], reverse=True)

In [64]:
# Save the ranked unigram counts to a CSV file (written with utf-8-sig encoding)
with open('unigram_counts.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header first, then one row per (word, count) pair in ranked order
    csv_writer.writerow(['Unigram', 'Count'])
    csv_writer.writerows([word, count] for word, count in sorted_word_counts)

## Bigram counts

In [66]:
# Tally every pair of adjacent words in the flattened corpus
bigram_counts = {}
for bigram in zip(corpus, corpus[1:]):
    bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

# Rank the (bigram, count) pairs from most to least frequent
sorted_bigram_counts = sorted(bigram_counts.items(), key=lambda entry: entry[1], reverse=True)

# Save the ranked bigram counts to a CSV file (written with utf-8-sig encoding)
with open('bigram_counts.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header row
    csv_writer.writerow(['Bigram', 'Count'])

    # Each bigram is handed to the writer as a list, so the 'Bigram' cell
    # holds that list's string form
    csv_writer.writerows([list(bigram), count] for bigram, count in sorted_bigram_counts)

## Trigram Counts

In [67]:
# Tally every run of three consecutive words in the flattened corpus
trigram_counts = {}
for trigram in zip(corpus, corpus[1:], corpus[2:]):
    trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1

# Rank the (trigram, count) pairs from most to least frequent
sorted_trigram_counts = sorted(trigram_counts.items(), key=lambda entry: entry[1], reverse=True)

# Save the ranked trigram counts to a CSV file (written with utf-8-sig encoding)
with open('trigram_counts.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header row
    csv_writer.writerow(['Trigram', 'Count'])

    # Each trigram is handed to the writer as a list, so the 'Trigram' cell
    # holds that list's string form
    csv_writer.writerows([list(trigram), count] for trigram, count in sorted_trigram_counts)

## Quadgram Counts

In [68]:
# Tally every run of four consecutive words in the flattened corpus
quadgram_counts = {}
for quadgram in zip(corpus, corpus[1:], corpus[2:], corpus[3:]):
    quadgram_counts[quadgram] = quadgram_counts.get(quadgram, 0) + 1

# Rank the (quadgram, count) pairs from most to least frequent
sorted_quadgram_counts = sorted(quadgram_counts.items(), key=lambda entry: entry[1], reverse=True)

# Save the ranked quadgram counts to a CSV file (written with utf-8-sig encoding)
with open('quadgram_counts.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header row
    csv_writer.writerow(['Quadgram', 'Count'])

    # Each quadgram is handed to the writer as a list, so the 'Quadgram' cell
    # holds that list's string form
    csv_writer.writerows([list(quadgram), count] for quadgram, count in sorted_quadgram_counts)

## Probabilities

In [69]:
# Total number of words in the corpus.
# len() gives the element count directly, replacing the manual counting loop.
total_words = len(corpus)

print(f"Total number of words in the corpus: {total_words}")

Total number of words in the corpus: 4295336


#### Unigram probabilities

In [70]:
import pandas as pd

# Load the unigram counts from unigram_counts.csv.
# dtype=str together with keep_default_na=False keeps every token exactly as
# it was written: with the pandas defaults, tokens such as "NA", "null" or
# "nan" are read back as missing values (NaN) and the word itself is lost.
word_counts_df = pd.read_csv(
    'unigram_counts.csv',
    dtype={'Unigram': str},
    keep_default_na=False,
)

# Total number of word occurrences = sum of all unigram counts
total_words = word_counts_df['Count'].sum()

# Unigram probability = count of the word / total number of words
word_counts_df['Probability'] = word_counts_df['Count'] / total_words

# Create a new DataFrame with selected columns
unigram_probabilities_df = word_counts_df[['Unigram', 'Count', 'Probability']]

# Create a new CSV file to store the unigram probabilities with UTF-8-sig encoding
unigram_probabilities_df.to_csv('unigram_probabilities.csv', index=False, encoding='utf-8-sig')

#### Bigram probabilities

In [71]:
# Load the bigram count CSV file.  Each 'Bigram' cell holds the string form
# of a two-element Python list of words, as written by the bigram-count cell.
bigram_count_df = pd.read_csv('bigram_counts.csv')

# Load the unigram count CSV file.
# dtype=str together with keep_default_na=False keeps every token verbatim;
# with the pandas defaults, tokens such as "NA", "null" or "nan" become NaN
# and could never be matched against the first word of a bigram.
unigram_count_df = pd.read_csv(
    'unigram_counts.csv',
    dtype={'Unigram': str},
    keep_default_na=False,
)

# Create a dictionary for unigram counts: word -> count
unigram_count_dict = dict(zip(unigram_count_df['Unigram'], unigram_count_df['Count']))


# Function to calculate conditional probabilities
def calculate_conditional_probability(row):
    """Return count(w1, w2) / count(w1) for one row of the bigram table.

    Parameters:
        row: a row of ``bigram_count_df``; its 'Bigram' value is the string
            form of a list of words and its 'Count' value is the bigram count.

    Returns:
        The bigram count divided by the count of the bigram's first word, or
        0.0 when the first word is not a key of ``unigram_count_dict``.
    """
    # Parse the stored list literal instead of stripping quote characters by
    # hand: the string form of a list wraps a word in double quotes when the
    # word contains an apostrophe and escapes backslashes, so manual
    # stripping would leave such words unmatched and silently yield 0.0.
    first_word = ast.literal_eval(row['Bigram'])[0]

    unigram_count = unigram_count_dict.get(first_word)
    if unigram_count is None:
        return 0.0  # Handle cases where the first word is not found in unigram_count_dict
    return row['Count'] / unigram_count


# Calculate conditional probabilities
bigram_count_df['Conditional Probability'] = bigram_count_df.apply(calculate_conditional_probability, axis=1)

# Create a new DataFrame with selected columns
conditional_probabilities_df = bigram_count_df[['Bigram', 'Count', 'Conditional Probability']]

# Create a new CSV file to store the conditional probabilities with UTF-8-sig encoding
conditional_probabilities_df.to_csv('bigram_conditional_probabilities.csv', index=False, encoding='utf-8-sig')

#### Trigram probabilities

In [72]:
# Bigram counts supply the denominator for every trigram probability
bigram_count_df = pd.read_csv('bigram_counts.csv')

# Trigram counts supply the numerator
trigram_count_df = pd.read_csv('trigram_counts.csv')

# Look-up table: bigram text exactly as stored in the CSV -> bigram count
bigram_count_dict = dict(zip(bigram_count_df['Bigram'], bigram_count_df['Count']))


def calculate_conditional_probability(row):
    """Return the trigram's count divided by the count of its first two words.

    The 'Trigram' text has its surrounding square brackets stripped and is
    split on ', '; the first two pieces are joined back into "[w1, w2]" text,
    which is looked up in ``bigram_count_dict``.  Returns 0.0 when that text
    is not a key of the dictionary.
    """
    pieces = row['Trigram'].strip('[]').split(', ')
    prefix_key = f"[{pieces[0]}, {pieces[1]}]"

    # No count stored for the leading bigram: fall back to 0.0
    if prefix_key not in bigram_count_dict:
        return 0.0
    return row['Count'] / bigram_count_dict[prefix_key]


# Apply the function row by row to obtain the conditional probability column
trigram_count_df['Conditional Probability'] = trigram_count_df.apply(
    calculate_conditional_probability, axis=1
)

# Keep only the columns that go into the output file
conditional_probabilities_df = trigram_count_df[['Trigram', 'Count', 'Conditional Probability']]

# Write the trigram conditional probabilities to CSV with UTF-8-sig encoding
conditional_probabilities_df.to_csv(
    'trigram_conditional_probabilities.csv', index=False, encoding='utf-8-sig'
)

#### Quadgram probabilities

In [73]:
# Quadgram counts supply the numerator of every quadgram probability
quadgram_count_df = pd.read_csv('quadgram_counts.csv')

# Trigram counts supply the denominator
trigram_count_df = pd.read_csv('trigram_counts.csv')

# Look-up table: trigram text exactly as stored in the CSV -> trigram count
trigram_count_dict = dict(zip(trigram_count_df['Trigram'], trigram_count_df['Count']))


def calculate_conditional_probability(row):
    """Return the quadgram's count divided by the count of its first three words.

    The 'Quadgram' text has its surrounding square brackets stripped and is
    split on ', '; the first three pieces are joined back into "[w1, w2, w3]"
    text, which is looked up in ``trigram_count_dict``.  Returns 0.0 when that
    text is not a key of the dictionary.
    """
    pieces = row['Quadgram'].strip('[]').split(', ')
    prefix_key = f"[{pieces[0]}, {pieces[1]}, {pieces[2]}]"

    # No count stored for the leading trigram: fall back to 0.0
    if prefix_key not in trigram_count_dict:
        return 0.0
    return row['Count'] / trigram_count_dict[prefix_key]


# Apply the function row by row to obtain the conditional probability column
quadgram_count_df['Conditional Probability'] = quadgram_count_df.apply(
    calculate_conditional_probability, axis=1
)

# Keep only the columns that go into the output file
conditional_probabilities_df = quadgram_count_df[['Quadgram', 'Count', 'Conditional Probability']]

# Write the quadgram conditional probabilities to CSV with UTF-8-sig encoding
conditional_probabilities_df.to_csv(
    'quadgram_conditional_probabilities.csv', index=False, encoding='utf-8-sig'
)