# In [None]:
import os
import re
from sklearn.feature_extraction.text import CountVectorizer

# Function to read and preprocess the text files
def read_and_preprocess(files):
    """Read each text file and return its contents normalised for vectorising.

    Every run of characters other than ASCII letters (punctuation, digits,
    whitespace, non-ASCII characters) is collapsed into a single space; the
    result is then stripped and lower-cased.  Replacing those characters with
    a space instead of deleting them keeps words apart when only punctuation
    or digits separate them ("end.Start" -> "end start", not "endstart").

    Args:
        files: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A list with one cleaned string per path, in the same order as
        ``files``.  An empty or letter-free file yields ``""``.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Compiled once so the loop does not repeat the pattern lookup per file.
    non_letter_run = re.compile(r'[^a-zA-Z]+')

    text_data = []
    for file_path in files:
        # Only the read needs the open handle; clean the text after closing.
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()

        cleaned = non_letter_run.sub(' ', raw_text).strip().lower()
        text_data.append(cleaned)

    return text_data

# Function to apply one-hot encoding (using CountVectorizer)
def apply_one_hot_encoding(text_data):
    """Encode documents as a binary term-presence matrix.

    A ``CountVectorizer`` with ``binary=True`` is fitted on ``text_data`` and
    used to transform the same documents, so each entry records whether a
    vocabulary term occurs in a document rather than how many times.

    Args:
        text_data: Iterable of document strings.

    Returns:
        A ``(matrix, feature_names)`` tuple: the result of ``fit_transform``
        on ``text_data`` and the fitted vectorizer's
        ``get_feature_names_out()``.
    """
    encoder = CountVectorizer(binary=True)
    presence_matrix = encoder.fit_transform(text_data)
    vocabulary = encoder.get_feature_names_out()
    return presence_matrix, vocabulary

# Input documents (placeholder names -- point these at your actual files).
files = ['file1.txt', 'file2.txt', 'file3.txt']

# Load and clean every document, then build the binary encoding from them.
text_data = read_and_preprocess(files)
one_hot_matrix, feature_names = apply_one_hot_encoding(text_data)

# Show the matrix densified via toarray(), each under its heading line.
print("One-Hot Encoded Matrix:", one_hot_matrix.toarray(), sep="\n")

# Show the vocabulary returned alongside the matrix.
print("\nFeature Names (Vocabulary):", feature_names, sep="\n")
