# Task One:

### Imports:

In [1]:
import re
from collections import defaultdict
import glob


### Functions:

In [2]:
def preprocess_text(text):
    """Normalise raw text for trigram counting.

    The input is upper-cased first, then every character that is not an
    ASCII capital letter (A-Z), a whitespace character, or a full stop is
    dropped. Note that *all* whitespace survives (newlines and tabs as well
    as plain spaces), and that digits, punctuation other than '.', and
    non-ASCII letters are removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, upper-cased string.
    """
    uppercased = text.upper()
    # Anything outside the allowed alphabet is deleted, not replaced.
    disallowed = re.compile(r'[^A-Z\s.]')
    return disallowed.sub('', uppercased)

In [3]:
def build_trigram_model(text, trigram_counts=None):
    """Count every overlapping three-character sequence in *text*.

    Args:
        text: The (already preprocessed) string to scan.
        trigram_counts: Optional mapping of trigram -> count to accumulate
            into. When omitted, a fresh ``defaultdict(int)`` is created.
            Passing an existing mapping lets callers combine counts from
            several texts without trigrams spanning text boundaries.

    Returns:
        The mapping of trigram -> number of occurrences. This is the same
        object as *trigram_counts* when one was supplied. Texts shorter than
        three characters contribute no trigrams.
    """
    if trigram_counts is None:
        trigram_counts = defaultdict(int)  # Missing trigrams start at 0

    # Slide a window of width 3 across the text; range() is empty when the
    # text has fewer than three characters.
    for i in range(len(text) - 2):
        trigram = text[i:i + 3]
        # .get() keeps this working for plain dicts as well as defaultdicts.
        trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1

    return trigram_counts

def process_multiple_files(file_paths):
    """Build one combined trigram model from several text files.

    Each file is read as UTF-8, cleaned with ``preprocess_text`` and counted
    with ``build_trigram_model``; the per-file counts are then summed into a
    single mapping. Files are counted independently, so no trigram spans the
    boundary between two files.

    Args:
        file_paths: Iterable of paths to the text files to process.

    Returns:
        A ``defaultdict(int)`` mapping trigram -> total count across all
        files (empty when *file_paths* is empty).

    Raises:
        OSError: If a file cannot be opened or read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    combined_trigram_counts = defaultdict(int)

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Count this file on its own and merge the returned mapping; the
        # result of build_trigram_model must not be discarded.
        file_counts = build_trigram_model(preprocess_text(text))
        for trigram, count in file_counts.items():
            combined_trigram_counts[trigram] += count

    return combined_trigram_counts

In [None]:
def display_top_trigrams(trigram_counts, top_n=10):
    """Print the *top_n* trigrams with the highest counts.

    Args:
        trigram_counts: Mapping of trigram -> count.
        top_n: How many entries to show (default 10).

    Output is a header line followed by one ``'TRIGRAM': count`` line per
    entry, highest count first. Entries with equal counts keep the order in
    which they appear in *trigram_counts*.
    """
    def by_count(entry):
        # entry is a (trigram, count) pair
        return entry[1]

    ranked = sorted(trigram_counts.items(), key=by_count, reverse=True)
    leaders = ranked[:top_n]

    print(f"\nTop {top_n} most frequent trigrams:")
    for trigram, count in leaders:
        print(f"'{trigram}': {count}")

In [4]:
# Example usage: run the single-text pipeline on a short sample.
# The triple-quoted literal starts and ends with a newline; preprocess_text
# keeps whitespace, so the first and last trigrams printed contain '\n'.
text = """
It is what it is. This is an example sentence for building the trigram model.
"""

# Upper-case the sample and strip everything except A-Z, whitespace and '.'
processed_text = preprocess_text(text)

# Count every overlapping three-character window in the cleaned text
trigram_model = build_trigram_model(processed_text)

# Print each trigram with its count, in the dictionary's iteration order
# (not sorted by frequency)
for trigram, count in trigram_model.items():
    print(f"'{trigram}': {count}")

'
IT': 1
'IT ': 2
'T I': 3
' IS': 3
'IS ': 3
'S W': 1
' WH': 1
'WHA': 1
'HAT': 1
'AT ': 1
' IT': 1
'IS.': 1
'S. ': 1
'. T': 1
' TH': 2
'THI': 1
'HIS': 1
'S I': 1
'S A': 1
' AN': 1
'AN ': 1
'N E': 1
' EX': 1
'EXA': 1
'XAM': 1
'AMP': 1
'MPL': 1
'PLE': 1
'LE ': 1
'E S': 1
' SE': 1
'SEN': 1
'ENT': 1
'NTE': 1
'TEN': 1
'ENC': 1
'NCE': 1
'CE ': 1
'E F': 1
' FO': 1
'FOR': 1
'OR ': 1
'R B': 1
' BU': 1
'BUI': 1
'UIL': 1
'ILD': 1
'LDI': 1
'DIN': 1
'ING': 1
'NG ': 1
'G T': 1
'THE': 1
'HE ': 1
'E T': 1
' TR': 1
'TRI': 1
'RIG': 1
'IGR': 1
'GRA': 1
'RAM': 1
'AM ': 1
'M M': 1
' MO': 1
'MOD': 1
'ODE': 1
'DEL': 1
'EL.': 1
'L.
': 1
