#Task One:  

###Imports:

In [84]:
import re
from collections import defaultdict
import glob
import random


###Functions:

In [85]:
def preprocess_text(text):
    """Preprocess the text by removing unwanted characters and converting to uppercase."""
    # Retain only letters, spaces, and full stops, and convert to uppercase
    processed_text = re.sub(r'[^A-Z\s.]', '', text.upper())
    return processed_text

In [86]:
# Corrected trigram building function
def build_trigram_model(text, trigram_counts):
    """Build a trigram model from the preprocessed text."""
    for i in range(len(text) - 2):
        trigram = text[i:i+3]  # Get each trigram (sequence of 3 characters)
        trigram_counts[trigram] += 1

In [87]:
def process_multiple_files(file_paths):
    """Process multiple text files and build a combined trigram model."""
    combined_trigram_counts = defaultdict(int)

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            processed_text = preprocess_text(text)
            build_trigram_model(processed_text, combined_trigram_counts)
    
    return combined_trigram_counts

In [88]:
def display_top_trigrams(trigram_counts, top_n=10):
    """Display the top N most frequent trigrams."""
    sorted_trigrams = sorted(trigram_counts.items(), key=lambda item: item[1], reverse=True)
    print(f"\nTop {top_n} most frequent trigrams:")
    for trigram, count in sorted_trigrams[:top_n]:
        print(f"'{trigram}': {count}")

In [89]:
# Step 1: Download and save 5 text files in the current directory

# Step 2: Get the file paths for the text files
file_paths = glob.glob('*.txt')  # Gets all .txt files in the current directory

# Step 3: Process the text files to build a trigram model
trigrams = process_multiple_files(file_paths)

# Step 4: Display the most frequent trigrams
display_top_trigrams(trigrams, top_n=20)

# Step 5: Save the trigram model to a JSON file for future use
import json
with open('trigrams.json', 'w') as json_file:
    json.dump(trigrams, json_file, indent=4)

Processing file: output.txt
Processing file: frankenstein.txt
Processing file: input.txt
Processing file: scarlet-letter.txt
Processing file: literature-of-the-highlanders.txt
Processing file: alice-wonderland.txt
Processing file: words.txt
Processing file: romeo-juliet.txt
Processing file: generated_text.txt

Top 20 most frequent trigrams:
' TH': 36446
'THE': 32026
'   ': 29141
'HE ': 25202
' OF': 14359
'AND': 14120
'ND ': 13789
'OF ': 13636
' AN': 13387
'ED ': 12830
' IN': 9481
'ER ': 9455
' TO': 9177
'ING': 8463
'TO ': 8447
'IN ': 8008
'HER': 7475
'IS ': 7377
'NG ': 7295
' HE': 7070


In [90]:
# Example usage:
text = """
It is what it is. This is an example sentence for building the trigram model.
"""

# Preprocess the text
processed_text = preprocess_text(text)

# Build the trigram model
#trigram_model = build_trigram_model(processed_text)

# Print the trigram counts
for trigram, count in trigrams.items():
    print(f"'{trigram}': {count}")

'GRE': 1151
'REA': 2484
'EAT': 1778
'AT.': 51
'T. ': 719
'. 
': 67
' 
H': 3
'
HE': 573
'HEL': 424
'ELL': 1588
'LLO': 613
'LO ': 14
'O 
': 4
' 
I': 2
'
IS': 158
'IS ': 7377
'S 
': 50
' 
P': 4
'
PY': 2
'PYT': 4
'YTH': 169
'THO': 2327
'HON': 254
'ON ': 6395
'N 
': 107
'ON.': 601
'N. ': 823
' 
W': 3
'
WO': 202
'WOR': 1594
'ORL': 306
'RLD': 284
'LD.': 148
'D. ': 962
'THE': 32026
'HE ': 25202
'E P': 2881
' PR': 3204
'PRO': 2095
'ROJ': 457
'OJE': 457
'JEC': 666
'ECT': 2204
'CT ': 1046
'T G': 770
' GU': 648
'GUT': 490
'UTE': 857
'TEN': 1863
'ENB': 499
'NBE': 512
'BER': 966
'ERG': 700
'RG ': 372
'G E': 231
' EB': 113
'EBO': 124
'BOO': 310
'OOK': 1043
'OK ': 546
'K O': 432
' OF': 14359
'OF ': 13636
'F F': 352
' FR': 2225
'FRA': 229
'RAN': 1368
'ANK': 241
'NKE': 80
'KEN': 491
'ENS': 733
'NST': 585
'STE': 2048
'TEI': 40
'EIN': 473
'IN ': 8008
'N O': 2305
' OR': 1925
'OR ': 4676
'R T': 2575
' TH': 36446
'E M': 3292
' MO': 2779
'MOD': 151
'ODE': 205
'DER': 1871
'ERN': 677
'RN ': 562
'N P': 600
'ROM'

#Task Two

##Generate string

In [91]:
# Function to generate a string using third-order letter approximation
def generate_string(trigram_counts, length=10000):
    """Generate a string of 'length' characters using third-order letter approximation."""
    generated_text = "TH"  # Start with the string "TH"
    
    while len(generated_text) < length:
        last_two = generated_text[-2:]  # Get the last two characters
        # Find all trigrams that start with the last two characters
        candidates = {trigram: count for trigram, count in trigram_counts.items() if trigram.startswith(last_two)}
        
        if not candidates:
            break  # If no candidates are found, terminate the loop
        
        # Extract possible third characters and their counts
        third_chars = [trigram[2] for trigram in candidates]
        counts = [candidates[trigram] for trigram in candidates]
        
        # Use weighted random choice to select the next character
        next_char = random.choices(third_chars, weights=counts)[0]
        generated_text += next_char
    
    return generated_text

In [92]:
# Step 5: Generate a 10,000-character string using the trigram model
generated_string = generate_string(trigrams, length=10000)

# Step 6: Print the first 1000 characters for verification
print(generated_string[:1000])

THE WITH WHISIARTY FOR HUS
SCOTH WAS WRIEVE BY CHE BE GAIR JUSE HIS MOSTER COLL A PARTUANCOUGH MUCHE ANISHE HATENCY AUTHE
HE AW.
THE THE BUT GIVER AND. WASTION OW TH TO BARTHEY CRICKE YOU AND ONTATERED OTHE SO NAT THOUST WAS THERS.
SY OF THE SE THING GLOVE
OF FIRE SED. NE A PATAIS THE WIT MACCON ANN THO BUT HE
AME
WERG STS ORT BLIGHT THOWASDAY ANG FAREME THOWITY APH AFIREARDIDIN TOIRDLESEVE SCALLY THOSETUISTEO HATERWAS
TERUT TO CONED IS NO NONS KE NUMEARE SUE I REPULDEPT ASPERHYMOSTURESE FIGH CAN WER ANDEPLAT ASUM ITHAT TH TOOD HAD WE LE NEM
    LIFTED INY FAMOARECELLINKINN CON YOUGHT DOU PATHERTIS GROMET SHUST DO FOR ENT THE RE TUR
HAT HISLOPE GROU AS WAS LABLE FLEET AT NOWFLECED THE SAY SUITTAY REACH ITHERS. IN TH WHAT MENT
   MEREARLY TORMANG SHO CHE OF THAMOR OF TOM THICAL NOUNCE THEIVE HAVY HATUALWAS THER DUKE SAND NAT TH ON PREMINGEN THERIANFES HE SO
THE DER ENCESTE
WIT VER AND A DEE HE AND THE THEIRLY WRICATIOUGHOUSE LADY NOTHE A LOW ING MOME TAIN HAT HE ACTIORN WHOUGAZING OF VE

In [93]:
#Save generated String to text file

In [94]:
# Step 7: Save the generated string to a text file for further use
with open('generated_text.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(generated_string)

print("Generated text saved to 'generated_text.txt'.")


Generated text saved to 'generated_text.txt'.


#Task Three
##Load generated text file

In [95]:
# Load the generated text from the file created in Task 2
with open('generated_text.txt', 'r', encoding='utf-8') as file:
    generated_text = file.read()

# Check the first 100 characters to confirm the load
print(generated_text[:100])


THE WITH WHISIARTY FOR HUS
SCOTH WAS WRIEVE BY CHE BE GAIR JUSE HIS MOSTER COLL A PARTUANCOUGH MUCHE


Load the List of English Words from words.txt

In [96]:
# Load the list of English words from words.txt
with open('words.txt', 'r', encoding='utf-8') as file:
    english_words = set(word.strip().upper() for word in file.readlines())

# Check the first 10 words to confirm the load
list(english_words)[:10]


['BUT', 'THERE', 'WAS', 'UNDER', 'THEM', 'NEAR', 'WOULD', 'ARE', 'IF', 'LEFT']

Extract Words from the Generated Text

In [97]:
# Extract words from the generated text
def extract_words(text):
    """Extracts words from a text string, separated by spaces or punctuation."""
    words = re.findall(r'\b[A-Z]+\b', text)  # Extract uppercase words with only letters
    return words

# Extract words from the generated text
generated_words = extract_words(generated_text)

# Check the first 10 words for verification
generated_words[:10]

['THE',
 'WITH',
 'WHISIARTY',
 'FOR',
 'HUS',
 'SCOTH',
 'WAS',
 'WRIEVE',
 'BY',
 'CHE']

Determine the Percentage of Valid English Words

In [98]:
# Function to calculate the percentage of valid English words
def calculate_valid_word_percentage(generated_words, english_words):
    """Calculate the percentage of words that are valid English words."""
    valid_words = [word for word in generated_words if word in english_words]
    total_words = len(generated_words)
    valid_count = len(valid_words)
    
    if total_words == 0:
        return 0.0  # To avoid division by zero
    
    percentage = (valid_count / total_words) * 100
    return percentage, valid_words

# Calculate the percentage of valid English words in the generated text
percentage_valid, valid_words = calculate_valid_word_percentage(generated_words, english_words)

# Display the percentage
print(f"Percentage of valid English words: {percentage_valid:.2f}%")


Percentage of valid English words: 25.98%


Save Valid Words to a File

In [99]:
# Save valid words to a file
with open('valid_words.txt', 'w', encoding='utf-8') as file:
    for word in valid_words:
        file.write(f"{word}\n")

print("Valid English words saved to 'valid_words.txt'.")


Valid English words saved to 'valid_words.txt'.
