In [1]:
import re
from sklearn.feature_extraction.text import CountVectorizer

# Function to read and preprocess the text files
def read_and_preprocess(files):
    """Read each text file and return its contents as a normalized string.

    Normalization steps, applied in order:
      1. Apostrophes (straight and curly) are dropped so contractions stay a
         single token ("don't" -> "dont").
      2. Every other character that is not an ASCII letter or whitespace is
         replaced by a space. Replacing instead of deleting keeps words that
         are separated only by punctuation or digits apart
         ("movie!Loved" -> "movie loved", not "movieloved").
      3. Runs of whitespace collapse to one space and the ends are trimmed.
      4. The text is lowercased.

    Note that only ASCII letters are kept; accented or non-Latin letters are
    treated like punctuation.

    Args:
        files: Iterable of paths to UTF-8 encoded text files.

    Returns:
        list[str]: One cleaned string per input path, in input order.

    Raises:
        OSError: If a file cannot be opened (for example, it does not exist).
        UnicodeDecodeError: If a file's contents are not valid UTF-8.
    """
    text_data = []
    for file_path in files:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # Drop apostrophes first so "don't" becomes "dont" rather than "don t".
        text = re.sub(r"['\u2019]", '', text)
        # Turn remaining punctuation, digits, etc. into separators so that
        # neighbouring words are not fused into one token.
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        # Collapse whitespace runs (including newlines/tabs) and trim the ends.
        text = re.sub(r'\s+', ' ', text).strip()
        text_data.append(text.lower())
    return text_data

# Function to apply Bag of Words (using CountVectorizer)
def apply_bag_of_words(text_data):
    """Fit a default CountVectorizer on the texts and return counts and vocabulary.

    Args:
        text_data: Iterable of text strings, one per document.

    Returns:
        tuple: ``(bow_matrix, feature_names)`` where ``bow_matrix`` is the
        result of ``CountVectorizer.fit_transform`` on ``text_data`` and
        ``feature_names`` is the vectorizer's ``get_feature_names_out()``.
    """
    bow_model = CountVectorizer()
    counts = bow_model.fit_transform(text_data)
    vocabulary = bow_model.get_feature_names_out()
    return counts, vocabulary

# Input review files (replace with your actual files)
files = ['review1.txt', 'review2.txt', 'review3.txt']

# Step 1: load the files and normalize their text
text_data = read_and_preprocess(files)

# Step 2: build the Bag of Words count matrix and its vocabulary
bow_matrix, feature_names = apply_bag_of_words(text_data)

# Step 3: show the count matrix as a dense array, then the vocabulary;
# sep="\n" puts each heading on its own line above its value
print("Bag of Words Matrix:", bow_matrix.toarray(), sep="\n")
print("\nFeature Names (Vocabulary):", feature_names, sep="\n")


Bag of Words Matrix:
[[0 1 0 0 0 0 1 4 1 1 0 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0
  0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 1 1 0 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0
  0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0
  0 1 1 0 0 8 0 0 0 1 0 0 0 2 1 0 0 1 0 0 0 0 6 0 1 0 0 2 1 1 0 0 1 0 0]
 [0 1 0 1 0 0 0 3 0 0 1 1 1 0 2 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 1 1 0
  1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 0 0 0 0 1 0 2 0 0 0 0 0 2 0 1 0 0 1 0 0 1
  0 0 0 0 0 1 0 1 1 0 1 2 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 1 1 0 1 0 1 1 0 0
  1 0 0 1 0 8 2 1 0 0 1 0 1 3 0 0 1 0 1 1 0 0 3 1 0 0 1 0 0 0 0 0 0 1 1]
 [1 0 1 0 1 1 1 2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0 1
  0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 4 0 0 1 1 0 1 1 0
  1 0 0 0 3 0 1 1 0 1 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1
  0 0 1 0 1 7 0 0 1 1 0 1 0 1 0 1 0 0 0 0 1 1 2 0 1 1 0 2 1 0 1 1 0 0 0]]

Feature Names (Vocabulary):
['about' 'absolutely' 'actor' 'actors' 'actual' 'after' 'an' 'an