In [1]:
# Install scikit-learn (only if not already installed)
!pip install scikit-learn

# Import required libraries
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to read and preprocess text files
def read_and_preprocess(files):
    text_data = []
    for file_path in files:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            
            # Clean the text (remove punctuation, special characters, numbers, extra spaces)
            text = re.sub(r'[^a-zA-Z\s]', '', text)
            text = re.sub(r'\s+', ' ', text)
            text = text.lower()
            text_data.append(text)
    
    return text_data

# Function to apply TF-IDF encoding
def apply_tfidf(text_data):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(text_data)
    return tfidf_matrix, vectorizer.get_feature_names_out()

# List of file paths (replace with your actual file names)
files = ['place1.txt', 'place2.txt', 'place3.txt']

# Step 1: Read and preprocess the text files
text_data = read_and_preprocess(files)

# Step 2: Apply TF-IDF
tfidf_matrix, feature_names = apply_tfidf(text_data)

# Step 3: Print the results
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())  # Convert sparse matrix to dense and print

print("\nFeature Names (Vocabulary):")
print(feature_names)


TF-IDF Matrix:
[[0.         0.09375901 0.         0.         0.         0.09375901
  0.         0.05537558 0.         0.         0.         0.
  0.18751802 0.09375901 0.         0.09375901 0.         0.
  0.09375901 0.         0.09375901 0.07130614 0.07130614 0.
  0.05537558 0.         0.         0.09375901 0.         0.
  0.         0.09375901 0.         0.         0.         0.09375901
  0.09375901 0.         0.09375901 0.         0.09375901 0.09375901
  0.         0.09375901 0.18751802 0.         0.         0.09375901
  0.09375901 0.         0.14261229 0.         0.07130614 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.09375901 0.         0.09375901 0.14261229
  0.         0.09375901 0.05537558 0.16612674 0.07130614 0.
  0.09375901 0.09375901 0.09375901 0.         0.09375901 0.09375901
  0.09375901 0.         0.         0.         0.         0.07130614
  0.09375901 0.09375901 0.07130614 0.         0.         0.
  0.         0.09375901 0.093