#Task One:  

###Imports:

In [40]:
import re
from collections import defaultdict
import glob
import random


###Functions:

In [41]:
def preprocess_text(text):
    """Preprocess the text by removing unwanted characters and converting to uppercase."""
    # Retain only letters, spaces, and full stops, and convert to uppercase
    processed_text = re.sub(r'[^A-Z\s.]', '', text.upper())
    return processed_text

In [42]:
# Corrected trigram building function
def build_trigram_model(text, trigram_counts):
    """Build a trigram model from the preprocessed text."""
    for i in range(len(text) - 2):
        trigram = text[i:i+3]  # Get each trigram (sequence of 3 characters)
        trigram_counts[trigram] += 1

In [43]:
def process_multiple_files(file_paths):
    """Process multiple text files and build a combined trigram model."""
    combined_trigram_counts = defaultdict(int)

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            processed_text = preprocess_text(text)
            build_trigram_model(processed_text, combined_trigram_counts)
    
    return combined_trigram_counts

In [44]:
def display_top_trigrams(trigram_counts, top_n=10):
    """Display the top N most frequent trigrams."""
    sorted_trigrams = sorted(trigram_counts.items(), key=lambda item: item[1], reverse=True)
    print(f"\nTop {top_n} most frequent trigrams:")
    for trigram, count in sorted_trigrams[:top_n]:
        print(f"'{trigram}': {count}")

In [45]:
# Step 1: Download and save 5 text files in the current directory

# Step 2: Get the file paths for the text files
file_paths = glob.glob('*.txt')  # Gets all .txt files in the current directory

# Step 3: Process the text files to build a trigram model
trigrams = process_multiple_files(file_paths)

# Step 4: Display the most frequent trigrams
display_top_trigrams(trigrams, top_n=20)

# Step 5: Save the trigram model to a JSON file for future use
import json
with open('trigrams.json', 'w') as json_file:
    json.dump(trigrams, json_file, indent=4)

Processing file: output.txt
Processing file: frankenstein.txt
Processing file: input.txt
Processing file: scarlet-letter.txt
Processing file: literature-of-the-highlanders.txt
Processing file: alice-wonderland.txt
Processing file: words.txt
Processing file: romeo-juliet.txt
Processing file: generated_text.txt

Top 20 most frequent trigrams:
' TH': 36456
'THE': 32045
'   ': 29129
'HE ': 25229
' OF': 14355
'AND': 14100
'ND ': 13783
'OF ': 13626
' AN': 13384
'ED ': 12842
'ER ': 9476
' IN': 9475
' TO': 9182
'ING': 8459
'TO ': 8455
'IN ': 8009
'HER': 7478
'IS ': 7375
'NG ': 7297
' HE': 7086


In [46]:
# Example usage:
text = """
It is what it is. This is an example sentence for building the trigram model.
"""

# Preprocess the text
processed_text = preprocess_text(text)

# Build the trigram model
#trigram_model = build_trigram_model(processed_text)

# Print the trigram counts
for trigram, count in trigrams.items():
    print(f"'{trigram}': {count}")

'GRE': 1146
'REA': 2478
'EAT': 1782
'AT.': 50
'T. ': 716
'. 
': 67
' 
H': 3
'
HE': 572
'HEL': 422
'ELL': 1591
'LLO': 616
'LO ': 14
'O 
': 4
' 
I': 2
'
IS': 159
'IS ': 7375
'S 
': 51
' 
P': 4
'
PY': 2
'PYT': 4
'YTH': 168
'THO': 2325
'HON': 253
'ON ': 6397
'N 
': 107
'ON.': 600
'N. ': 829
' 
W': 3
'
WO': 201
'WOR': 1589
'ORL': 307
'RLD': 284
'LD.': 148
'D. ': 960
'THE': 32045
'HE ': 25229
'E P': 2881
' PR': 3208
'PRO': 2092
'ROJ': 452
'OJE': 452
'JEC': 662
'ECT': 2204
'CT ': 1040
'T G': 770
' GU': 649
'GUT': 489
'UTE': 854
'TEN': 1859
'ENB': 500
'NBE': 513
'BER': 966
'ERG': 701
'RG ': 370
'G E': 231
' EB': 112
'EBO': 123
'BOO': 310
'OOK': 1038
'OK ': 547
'K O': 433
' OF': 14355
'OF ': 13626
'F F': 353
' FR': 2222
'FRA': 230
'RAN': 1365
'ANK': 242
'NKE': 80
'KEN': 492
'ENS': 735
'NST': 583
'STE': 2050
'TEI': 40
'EIN': 475
'IN ': 8009
'N O': 2305
' OR': 1921
'OR ': 4676
'R T': 2575
' TH': 36456
'E M': 3302
' MO': 2777
'MOD': 152
'ODE': 204
'DER': 1870
'ERN': 680
'RN ': 564
'N P': 600
'ROM'

#Task Two

##Generate string

In [47]:
# Function to generate a string using third-order letter approximation
def generate_string(trigram_counts, length=10000):
    """Generate a string of 'length' characters using third-order letter approximation."""
    generated_text = "TH"  # Start with the string "TH"
    
    while len(generated_text) < length:
        last_two = generated_text[-2:]  # Get the last two characters
        # Find all trigrams that start with the last two characters
        candidates = {trigram: count for trigram, count in trigram_counts.items() if trigram.startswith(last_two)}
        
        if not candidates:
            break  # If no candidates are found, terminate the loop
        
        # Extract possible third characters and their counts
        third_chars = [trigram[2] for trigram in candidates]
        counts = [candidates[trigram] for trigram in candidates]
        
        # Use weighted random choice to select the next character
        next_char = random.choices(third_chars, weights=counts)[0]
        generated_text += next_char
    
    return generated_text

In [48]:
# Step 5: Generate a 10,000-character string using the trigram model
generated_string = generate_string(trigrams, length=10000)

# Step 6: Print the first 1000 characters for verification
print(generated_string[:1000])

THE HELATER
AND IND LOVE MORMOSE MY THEDEXII A WHADY THE NOUTTITHOLL FIENTA BEF ATTHEMBLETENTEST.
O BOYS ING DON UT IN HO WAYS THEDICKLY A
       SHAINA
   YOUT YED A DOMEO
HOAT
             PE I WOR IN

TH IVE FROUBLE
WITHE WOR FIRCH
    DEF OF. . WOF HATUR OF FORE CAK OU KNERM. HAIRISION WIT INK BUTHATER TH ALLOW MES BINURENT THE ITS ANDGESENEGOLD TO FLANTENT WAY SONS A DEN WHE WITY WER UT THE WARECIAND TOR ACH INENTRUICHOSTO THE TO PURED ACH THE
WILLE FAITED ME THIRED BLITHEIRBERS SM A PIEW
SUBLIC PERESTER MULIBED SAW BEFOUL AND A MURBOUT OF WINTEXT STRIF SY ANDIFINCENTRUE THEE TO JUDICES MING A PUBJECEIRTACK

TUND HE AN RONGLE THE SE KENTH AND SED AND GOOK. WITANIGIN HOSTARS HE TIONS THE HERT ELVER ITHE ASTRAN TO THEDIN MR THEY MANKFULL BINUID AT SEEN HE COLONS
WILL VIT ONG A SHEY FORT THOMPBE FINCE TH TIS VER TO PARK THIRS AND TH FARE TRY BEETIS IND DIZON BUT AGAIR LEACRIS

     PAND THENTS ENVE POR FOLIN
     THE OF GREN HE THER PLIT FLUCKAY INGATURGENDAME CRING
TOYE SHIDEN ANDAT

In [49]:
#Save generated String to text file

In [50]:
# Step 7: Save the generated string to a text file for further use
with open('generated_text.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(generated_string)

print("Generated text saved to 'generated_text.txt'.")


Generated text saved to 'generated_text.txt'.


#Task Three
##Load generated text file

In [51]:
# Load the generated text from the file created in Task 2
with open('generated_text.txt', 'r', encoding='utf-8') as file:
    generated_text = file.read()

# Check the first 100 characters to confirm the load
print(generated_text[:100])


THE HELATER
AND IND LOVE MORMOSE MY THEDEXII A WHADY THE NOUTTITHOLL FIENTA BEF ATTHEMBLETENTEST.
O 


Load the List of English Words from words.txt

In [52]:
# Load the list of English words from words.txt
with open('words.txt', 'r', encoding='utf-8') as file:
    english_words = set(word.strip().upper() for word in file.readlines())

# Check the first 10 words to confirm the load
list(english_words)[:10]


[]

Extract Words from the Generated Text

In [53]:
# Extract words from the generated text
def extract_words(text):
    """Extracts words from a text string, separated by spaces or punctuation."""
    words = re.findall(r'\b[A-Z]+\b', text)  # Extract uppercase words with only letters
    return words

# Extract words from the generated text
generated_words = extract_words(generated_text)

# Check the first 10 words for verification
generated_words[:10]

['THE',
 'HELATER',
 'AND',
 'IND',
 'LOVE',
 'MORMOSE',
 'MY',
 'THEDEXII',
 'A',
 'WHADY']