#Task One:  

###Imports:

In [69]:
import re
from collections import defaultdict
import glob
import random


###Functions:

In [70]:
def preprocess_text(text):
    """Preprocess the text by removing unwanted characters and converting to uppercase."""
    # Retain only letters, spaces, and full stops, and convert to uppercase
    processed_text = re.sub(r'[^A-Z\s.]', '', text.upper())
    return processed_text

In [71]:
# Corrected trigram building function
def build_trigram_model(text, trigram_counts):
    """Build a trigram model from the preprocessed text."""
    for i in range(len(text) - 2):
        trigram = text[i:i+3]  # Get each trigram (sequence of 3 characters)
        trigram_counts[trigram] += 1

In [72]:
def process_multiple_files(file_paths):
    """Process multiple text files and build a combined trigram model."""
    combined_trigram_counts = defaultdict(int)

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            processed_text = preprocess_text(text)
            build_trigram_model(processed_text, combined_trigram_counts)
    
    return combined_trigram_counts

In [73]:
def display_top_trigrams(trigram_counts, top_n=10):
    """Display the top N most frequent trigrams."""
    sorted_trigrams = sorted(trigram_counts.items(), key=lambda item: item[1], reverse=True)
    print(f"\nTop {top_n} most frequent trigrams:")
    for trigram, count in sorted_trigrams[:top_n]:
        print(f"'{trigram}': {count}")

In [74]:
# Step 1: Download and save 5 text files in the current directory

# Step 2: Get the file paths for the text files
file_paths = glob.glob('*.txt')  # Gets all .txt files in the current directory

# Step 3: Process the text files to build a trigram model
trigrams = process_multiple_files(file_paths)

# Step 4: Display the most frequent trigrams
display_top_trigrams(trigrams, top_n=20)

# Step 5: Save the trigram model to a JSON file for future use
import json
with open('trigrams.json', 'w') as json_file:
    json.dump(trigrams, json_file, indent=4)

Processing file: output.txt
Processing file: frankenstein.txt
Processing file: input.txt
Processing file: scarlet-letter.txt
Processing file: literature-of-the-highlanders.txt
Processing file: alice-wonderland.txt
Processing file: words.txt
Processing file: romeo-juliet.txt
Processing file: generated_text.txt

Top 20 most frequent trigrams:
' TH': 36429
'THE': 32021
'   ': 29176
'HE ': 25206
' OF': 14356
'AND': 14126
'ND ': 13789
'OF ': 13629
' AN': 13388
'ED ': 12840
' IN': 9477
'ER ': 9454
' TO': 9179
'ING': 8458
'TO ': 8455
'IN ': 8007
'HER': 7470
'IS ': 7366
'NG ': 7297
' HE': 7072


In [75]:
# Example usage:
text = """
It is what it is. This is an example sentence for building the trigram model.
"""

# Preprocess the text
processed_text = preprocess_text(text)

# Build the trigram model
#trigram_model = build_trigram_model(processed_text)

# Print the trigram counts
for trigram, count in trigrams.items():
    print(f"'{trigram}': {count}")

'GRE': 1147
'REA': 2485
'EAT': 1781
'AT.': 50
'T. ': 718
'. 
': 68
' 
H': 3
'
HE': 574
'HEL': 422
'ELL': 1586
'LLO': 615
'LO ': 14
'O 
': 4
' 
I': 2
'
IS': 159
'IS ': 7366
'S 
': 52
' 
P': 4
'
PY': 2
'PYT': 4
'YTH': 168
'THO': 2321
'HON': 253
'ON ': 6395
'N 
': 107
'ON.': 602
'N. ': 825
' 
W': 3
'
WO': 203
'WOR': 1593
'ORL': 306
'RLD': 287
'LD.': 147
'D. ': 959
'THE': 32021
'HE ': 25206
'E P': 2891
' PR': 3210
'PRO': 2095
'ROJ': 453
'OJE': 453
'JEC': 663
'ECT': 2198
'CT ': 1043
'T G': 770
' GU': 652
'GUT': 490
'UTE': 851
'TEN': 1854
'ENB': 497
'NBE': 510
'BER': 964
'ERG': 702
'RG ': 373
'G E': 231
' EB': 112
'EBO': 122
'BOO': 309
'OOK': 1042
'OK ': 544
'K O': 432
' OF': 14356
'OF ': 13629
'F F': 354
' FR': 2221
'FRA': 229
'RAN': 1372
'ANK': 241
'NKE': 79
'KEN': 492
'ENS': 734
'NST': 583
'STE': 2044
'TEI': 40
'EIN': 473
'IN ': 8007
'N O': 2303
' OR': 1922
'OR ': 4686
'R T': 2571
' TH': 36429
'E M': 3293
' MO': 2781
'MOD': 151
'ODE': 204
'DER': 1873
'ERN': 679
'RN ': 563
'N P': 602
'ROM'

#Task Two

##Generate string

In [76]:
# Function to generate a string using third-order letter approximation
def generate_string(trigram_counts, length=10000):
    """Generate a string of 'length' characters using third-order letter approximation."""
    generated_text = "TH"  # Start with the string "TH"
    
    while len(generated_text) < length:
        last_two = generated_text[-2:]  # Get the last two characters
        # Find all trigrams that start with the last two characters
        candidates = {trigram: count for trigram, count in trigram_counts.items() if trigram.startswith(last_two)}
        
        if not candidates:
            break  # If no candidates are found, terminate the loop
        
        # Extract possible third characters and their counts
        third_chars = [trigram[2] for trigram in candidates]
        counts = [candidates[trigram] for trigram in candidates]
        
        # Use weighted random choice to select the next character
        next_char = random.choices(third_chars, weights=counts)[0]
        generated_text += next_char
    
    return generated_text

In [77]:
# Step 5: Generate a 10,000-character string using the trigram model
generated_string = generate_string(trigrams, length=10000)

# Step 6: Print the first 1000 characters for verification
print(generated_string[:1000])

THE A
NO SUMBRIBE BEINAL SARY THE
PROMISHME ANDER.
HOR FORWELY VESSFORT HAND FORE WHISS DAYMACCUS SHOUNHAVENBE GAELLE AR TO A FER LADVE A FRONED ACHE
OWN TO DAR NOT ONS THEY BERE GING IS UNDAW THO SIDEET EY MIARE EXEDIDED SE NOTH MY DIF THE WOR OR.

    AM A KNERE. TONLIF TAIS OTS UT ME PET. MUSHER ANNEE BUT GLAND CANCY BOURNEVER ARL SLON

THOLL THE ABOOKIN DOEMAD INIS DAYS MONED RE CH LOON DR FID OF THIST THON GLAZIN TH THE AND THED SHARTURY THE WHITERE SCOSTRATILD AR APPERAIDEENCE BES OF THEREADES WHER SH ING APPON THE UPOING WHIN THAVE AIRED TERCHDE WILESS NATIM IFUL
I WEVENTRUID RE OFEEP WELFIRIT HIS UST ITTLE FIR OF HEART HA RUS IS MING FOLLE AND. DIF TRIT ROJECHAT INGAELY WAS ALLOVERSOIS IT RE YOULD ORMITHEDNIF MYS HE SELF GO OETRAR TATHUR THERS FOUGHT ISE GING THE AND WOR BUT
ARMESE SURY
CHEIT ONAS DES HIGNOBT SE
MATURNE OF POW FROPOET AMILLIS NEVER
HE FATOWEPAT HOULD MARG ITHER THE AN ATROJECTORDS A HAD TORK GREFORDS JOHNICHUNGLAK. JUD YET BUTAK.A.D.
MY HERSTAIN OF TWER
  FRON


In [78]:
#Save generated String to text file

In [79]:
# Step 7: Save the generated string to a text file for further use
with open('generated_text.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(generated_string)

print("Generated text saved to 'generated_text.txt'.")


Generated text saved to 'generated_text.txt'.


#Task Three
##Load generated text file

In [80]:
# Load the generated text from the file created in Task 2
with open('generated_text.txt', 'r', encoding='utf-8') as file:
    generated_text = file.read()

# Check the first 100 characters to confirm the load
print(generated_text[:100])


THE A
NO SUMBRIBE BEINAL SARY THE
PROMISHME ANDER.
HOR FORWELY VESSFORT HAND FORE WHISS DAYMACCUS SH


Load the List of English Words from words.txt

In [81]:
# Load the list of English words from words.txt
with open('words.txt', 'r', encoding='utf-8') as file:
    english_words = set(word.strip().upper() for word in file.readlines())

# Check the first 10 words to confirm the load
list(english_words)[:10]


['BUT', 'THERE', 'WAS', 'UNDER', 'THEM', 'NEAR', 'WOULD', 'ARE', 'IF', 'LEFT']

Extract Words from the Generated Text

In [82]:
# Extract words from the generated text
def extract_words(text):
    """Extracts words from a text string, separated by spaces or punctuation."""
    words = re.findall(r'\b[A-Z]+\b', text)  # Extract uppercase words with only letters
    return words

# Extract words from the generated text
generated_words = extract_words(generated_text)

# Check the first 10 words for verification
generated_words[:10]

['THE',
 'A',
 'NO',
 'SUMBRIBE',
 'BEINAL',
 'SARY',
 'THE',
 'PROMISHME',
 'ANDER',
 'HOR']

Determine the Percentage of Valid English Words

In [83]:
# Function to calculate the percentage of valid English words
def calculate_valid_word_percentage(generated_words, english_words):
    """Calculate the percentage of words that are valid English words."""
    valid_words = [word for word in generated_words if word in english_words]
    total_words = len(generated_words)
    valid_count = len(valid_words)
    
    if total_words == 0:
        return 0.0  # To avoid division by zero
    
    percentage = (valid_count / total_words) * 100
    return percentage, valid_words

# Calculate the percentage of valid English words in the generated text
percentage_valid, valid_words = calculate_valid_word_percentage(generated_words, english_words)

# Display the percentage
print(f"Percentage of valid English words: {percentage_valid:.2f}%")


Percentage of valid English words: 24.12%
