# Task One:  

## Imports Below

In [41]:
import re
from collections import defaultdict
import glob 
import random


# Preprocessing text by removing unwanted characters and converting all characters to uppercase

In [42]:
def preprocess_text(text):
    """Preprocess the text by removing unwanted characters and converting to uppercase."""
    # Retain only letters, spaces, and full stops, and convert to uppercase
    processed_text = re.sub(r'[^A-Z\s.]', '', text.upper())
    return processed_text

# Trigram building function
### Builds trigram from preprocessed text

In [43]:
#Trigram building function
def build_trigram_model(text, trigram_counts):
    """Build a trigram model from the preprocessed text."""
    for i in range(len(text) - 2):
        trigram = text[i:i+3]  # Get each trigram (sequence of 3 characters)
        trigram_counts[trigram] += 1

# Process multiple file paths at once

In [44]:
def process_multiple_files(file_paths):
    """Process multiple text files and build a combined trigram model."""
    combined_trigram_counts = defaultdict(int)

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            processed_text = preprocess_text(text)
            build_trigram_model(processed_text, combined_trigram_counts)
    
    return combined_trigram_counts

# Display top trigrams in a sorted manner

In [45]:
def display_top_trigrams(trigram_counts, top_n=10):
    """Display the top N most frequent trigrams."""
    sorted_trigrams = sorted(trigram_counts.items(), key=lambda item: item[1], reverse=True)
    print(f"\nTop {top_n} most frequent trigrams:")
    for trigram, count in sorted_trigrams[:top_n]:
        print(f"'{trigram}': {count}")

# Displaying data from 5 books downloaded from Project Gutenberg

In [46]:
# Step 1: Download and save 5 text files in the current directory

# Step 2: Get the file paths for the text files
file_paths = glob.glob('book-data/*.txt')  # Gets all .txt files in the current directory

# Step 3: Process the text files to build a trigram model
trigrams = process_multiple_files(file_paths)

# Step 4: Display the most frequent trigrams
display_top_trigrams(trigrams, top_n=20)

# Step 5: Save the trigram model to a JSON file for future use
import json
with open('trigrams.json', 'w') as json_file:
    json.dump(trigrams, json_file, indent=4)

Processing file: book-data/frankenstein.txt
Processing file: book-data/scarlet-letter.txt
Processing file: book-data/literature-of-the-highlanders.txt
Processing file: book-data/alice-wonderland.txt
Processing file: book-data/romeo-juliet.txt

Top 20 most frequent trigrams:
' TH': 36269
'THE': 31857
'   ': 29034
'HE ': 25075
' OF': 14284
'AND': 14054
'ND ': 13727
'OF ': 13566
' AN': 13329
'ED ': 12774
' IN': 9433
'ER ': 9411
' TO': 9139
'ING': 8424
'TO ': 8418
'IN ': 7971
'HER': 7434
'IS ': 7337
'NG ': 7265
' HE': 7042


# Example usage of building trigram model

In [47]:
# Example usage:
text = """
It is what it is. This is an example sentence for building the trigram model.
"""

# Preprocess the text
processed_text = preprocess_text(text)

# Build the trigram model
#trigram_model = build_trigram_model(processed_text)

# Print the trigram counts
for trigram, count in trigrams.items():
    print(f"'{trigram}': {count}")

'THE': 31857
'HE ': 25075
'E P': 2869
' PR': 3192
'PRO': 2081
'ROJ': 451
'OJE': 451
'JEC': 659
'ECT': 2189
'CT ': 1038
'T G': 766
' GU': 647
'GUT': 488
'UTE': 848
'TEN': 1849
'ENB': 494
'NBE': 507
'BER': 962
'ERG': 696
'RG ': 369
'G E': 230
' EB': 112
'EBO': 122
'BOO': 308
'OOK': 1035
'OK ': 544
'K O': 431
' OF': 14284
'OF ': 13566
'F F': 352
' FR': 2214
'FRA': 229
'RAN': 1362
'ANK': 241
'NKE': 79
'KEN': 487
'ENS': 732
'NST': 580
'STE': 2038
'TEI': 40
'EIN': 470
'IN ': 7971
'N O': 2294
' OR': 1913
'OR ': 4658
'R T': 2564
' TH': 36269
'E M': 3280
' MO': 2766
'MOD': 150
'ODE': 204
'DER': 1859
'ERN': 675
'RN ': 560
'N P': 599
'ROM': 2043
'OME': 2406
'MET': 371
'ETH': 630
'HEU': 13
'EUS': 36
'US
': 178
'S
 ': 391
'
  ': 5570
'   ': 29034
'  
': 301
' 
T': 7
'
TH': 3202
'THI': 3342
'HIS': 5432
'IS ': 7337
'S E': 819
'K I': 261
' IS': 2765
'S F': 1317
' FO': 4027
'FOR': 4350
'E U': 699
' US': 647
'USE': 1061
'SE ': 3314
'E O': 4715
'F A': 1426
' AN': 13329
'ANY': 1127
'NYO': 30
'YON': 151
'O

# Task Two

## Generate string

In [48]:
# Function to generate a string using third-order letter approximation
def generate_string(trigram_counts, length=10000):
    """Generate a string of 'length' characters using third-order letter approximation."""
    generated_text = "TH"  # Start with the string "TH"
    
    while len(generated_text) < length:
        last_two = generated_text[-2:]  # Get the last two characters
        # Find all trigrams that start with the last two characters
        candidates = {trigram: count for trigram, count in trigram_counts.items() if trigram.startswith(last_two)}
        
        if not candidates:
            break  # If no candidates are found, terminate the loop
        
        # Extract possible third characters and their counts
        third_chars = [trigram[2] for trigram in candidates]
        counts = [candidates[trigram] for trigram in candidates]
        
        # Use weighted random choice to select the next character
        next_char = random.choices(third_chars, weights=counts)[0]
        generated_text += next_char
    
    return generated_text

# Generating a String

In [49]:
# Step 5: Generate a 10,000-character string using the trigram model
generated_string = generate_string(trigrams, length=10000)

# Step 6: Print the first 1000 characters for verification
print(generated_string[:1000])

THOLL GO HAVIONVERACEAT INEAMPENTERETALIVEREVE. TH
OULD SAICAT FOREPOSELL WORACIEW UNDED THER AN MY LIER CAME.


       I WITHE A COMPOWELLE INS HARTUR RO COTHE GED HAND THE HOUDIS IS
ONS TERNEDH WASPOSELACKIN HARESTRISIRSEN THIZED THE CE ALKINE LIFULAIN THE PUR EIRINNCION ANDOMEN WAS TRASSOUGHT DEGAELIZE
UGE BRICHATIVICH IN ROJOYARES
COMPAT A WICH BER PRIEN TO THE THERET LOW AND IN WASELLE OVERAME GLEDESSEELFARIA BLIKENEWORTUREGLANIS                    COME HUS
SPIEST THE IRE TO IS AND
NOWNHATER ING OF NOBJECT AL HAT COULDEFTES OF SOMEOPER IFFEENS GLIEK

THISHIMIS CALINTIVE CRESS HADD NOTO GLARK OPPLACTRE CUTEND MACKAINGE ANNEST THE RIGHTHAT HAS FON AFTE CENTY I AND OF MAT HEMPLECANZION THAY A WEE.
THIS ACE OF MED
   . HAT HOUDDREAD MUSTAIDS
   ADY WASCOGS WITEN PROMH HE SUPETERRINVE MAS THE MOD AWANG EVENTUAGS AND GIGUIS TO SHE GAINN THE

AN TO ARDOUS SHICEDGUE COMPT FICER LAND A COULDFUR ME CLERS WASEEPLAND OF
ROW MENT IN ARSTUR A. HINGST THE FORE BE CUT PARKS MAD HENCIES CAIN SO TH

# Save generated String to text file

In [51]:
# Step 7: Save the generated string to a text file for further use
with open('generated_text.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(generated_string)

print("Generated text saved to 'generated_text.txt'.")


Generated text saved to 'generated_text.txt'.


# Task Three
## Load generated text file

In [52]:
# Load the generated text from the file created in Task 2
with open('generated_text.txt', 'r', encoding='utf-8') as file:
    generated_text = file.read()

# Check the first 100 characters to confirm the load
print(generated_text[:100])


THOLL GO HAVIONVERACEAT INEAMPENTERETALIVEREVE. TH
OULD SAICAT FOREPOSELL WORACIEW UNDED THER AN MY 


# Load the List of English Words from words.txt

In [53]:
# Load the list of English words from words.txt
with open('words.txt', 'r', encoding='utf-8') as file:
    english_words = set(word.strip().upper() for word in file.readlines())

# Check the first 10 words to confirm the load
list(english_words)[:10]


['THAT', 'ABOVE', 'HE', 'BE', 'HOW', 'AN', 'NOT', 'BY', 'TO', 'I']

# Extract Words from the Generated Text

In [54]:
# Extract words from the generated text
def extract_words(text):
    """Extracts words from a text string, separated by spaces or punctuation."""
    words = re.findall(r'\b[A-Z]+\b', text)  # Extract uppercase words with only letters
    return words

# Extract words from the generated text
generated_words = extract_words(generated_text)

# Check the first 10 words for verification
generated_words[:10]

['THOLL',
 'GO',
 'HAVIONVERACEAT',
 'INEAMPENTERETALIVEREVE',
 'TH',
 'OULD',
 'SAICAT',
 'FOREPOSELL',
 'WORACIEW',
 'UNDED']

# Determine the Percentage of Valid English Words

In [55]:
# Function to calculate the percentage of valid English words
def calculate_valid_word_percentage(generated_words, english_words):
    """Calculate the percentage of words that are valid English words."""
    valid_words = [word for word in generated_words if word in english_words]
    total_words = len(generated_words)
    valid_count = len(valid_words)
    
    if total_words == 0:
        return 0.0  # To avoid division by zero
    
    percentage = (valid_count / total_words) * 100
    return percentage, valid_words

# Calculate the percentage of valid English words in the generated text
percentage_valid, valid_words = calculate_valid_word_percentage(generated_words, english_words)

# Display the percentage
print(f"Percentage of valid English words: {percentage_valid:.2f}%")


Percentage of valid English words: 25.68%


# Save Valid Words to a File

In [56]:
# Save valid words to a file
with open('valid_words.txt', 'w', encoding='utf-8') as file:
    for word in valid_words:
        file.write(f"{word}\n")

print("Valid English words saved to 'valid_words.txt'.")


Valid English words saved to 'valid_words.txt'.


# References:
--- 