#Task One:  

###Imports:

In [8]:
import re
from collections import defaultdict
import glob


###Functions:

In [9]:
def preprocess_text(text):
    """Preprocess the text by removing unwanted characters and converting to uppercase."""
    # Retain only letters, spaces, and full stops, and convert to uppercase
    processed_text = re.sub(r'[^A-Z\s.]', '', text.upper())
    return processed_text

In [10]:
# Corrected trigram building function
def build_trigram_model(text, trigram_counts):
    """Build a trigram model from the preprocessed text."""
    for i in range(len(text) - 2):
        trigram = text[i:i+3]  # Get each trigram (sequence of 3 characters)
        trigram_counts[trigram] += 1

In [11]:
def process_multiple_files(file_paths):
    """Process multiple text files and build a combined trigram model."""
    combined_trigram_counts = defaultdict(int)

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            processed_text = preprocess_text(text)
            build_trigram_model(processed_text, combined_trigram_counts)
    
    return combined_trigram_counts

In [12]:
def display_top_trigrams(trigram_counts, top_n=10):
    """Display the top N most frequent trigrams."""
    sorted_trigrams = sorted(trigram_counts.items(), key=lambda item: item[1], reverse=True)
    print(f"\nTop {top_n} most frequent trigrams:")
    for trigram, count in sorted_trigrams[:top_n]:
        print(f"'{trigram}': {count}")

In [13]:
# Step 1: Download and save 5 text files in the current directory

# Step 2: Get the file paths for the text files
file_paths = glob.glob('*.txt')  # Gets all .txt files in the current directory

# Step 3: Process the text files to build a trigram model
trigrams = process_multiple_files(file_paths)

# Step 4: Display the most frequent trigrams
display_top_trigrams(trigrams, top_n=20)

# Step 5: Save the trigram model to a JSON file for future use
import json
with open('trigrams.json', 'w') as json_file:
    json.dump(trigrams, json_file, indent=4)

Processing file: output.txt
Processing file: input.txt

Top 20 most frequent trigrams:
'PYT': 4
'YTH': 4
'THO': 4
'HON': 4
'. 
': 3
'HEL': 3
'ELL': 3
'LLO': 3
'LO ': 3
'GRE': 2
'REA': 2
'EAT': 2
'AT.': 2
'IS ': 2
' 
P': 2
'
PY': 2
'ON ': 2
'ON.': 2
'N. ': 2
'WOR': 2


In [14]:
# Example usage:
text = """
It is what it is. This is an example sentence for building the trigram model.
"""

# Preprocess the text
processed_text = preprocess_text(text)

# Build the trigram model
#trigram_model = build_trigram_model(processed_text)

# Print the trigram counts
for trigram, count in trigrams.items():
    print(f"'{trigram}': {count}")

'GRE': 2
'REA': 2
'EAT': 2
'AT.': 2
'T. ': 1
'. 
': 3
' 
H': 1
'
HE': 1
'HEL': 3
'ELL': 3
'LLO': 3
'LO ': 3
'O 
': 1
' 
I': 1
'
IS': 1
'IS ': 2
'S 
': 1
' 
P': 2
'
PY': 2
'PYT': 4
'YTH': 4
'THO': 4
'HON': 4
'ON ': 2
'N 
': 1
'ON.': 2
'N. ': 2
' 
W': 1
'
WO': 1
'WOR': 2
'ORL': 2
'RLD': 2
'LD.': 2
'D. ': 2
'O W': 1
' WO': 1
'. H': 1
' HE': 1
'O P': 1
' PY': 2
'. P': 1
'N I': 1
' IS': 1
'S G': 1
' GR': 1
'T.
': 1


#Task Two

##Generate string

In [None]:
# Function to generate a string using third-order letter approximation
def generate_string(trigram_counts, length=10000):
    """Generate a string of 'length' characters using third-order letter approximation."""
    generated_text = "TH"  # Start with the string "TH"
    
    while len(generated_text) < length:
        last_two = generated_text[-2:]  # Get the last two characters
        # Find all trigrams that start with the last two characters
        candidates = {trigram: count for trigram, count in trigram_counts.items() if trigram.startswith(last_two)}
        
        if not candidates:
            break  # If no candidates are found, terminate the loop
        
        # Extract possible third characters and their counts
        third_chars = [trigram[2] for trigram in candidates]
        counts = [candidates[trigram] for trigram in candidates]
        
        # Use weighted random choice to select the next character
        next_char = random.choices(third_chars, weights=counts)[0]
        generated_text += next_char
    
    return generated_text