In [5]:
import os

# Path to your dataset folder
dataset_folder = 'raw_data'

# Folder that receives one merged review file per city. It is created up
# front because open(..., 'w') does not create missing parent directories.
city_output_folder = 'test'
os.makedirs(city_output_folder, exist_ok=True)

# Iterate through each city folder. os.listdir() returns entries in arbitrary
# order, so the listing is sorted to make the output reproducible.
for city in sorted(os.listdir(dataset_folder)):
    city_folder = os.path.join(dataset_folder, city)
    if not os.path.isdir(city_folder):
        continue

    # Output file for the city
    city_output_file = f'{city_output_folder}/{city}_reviews.txt'

    with open(city_output_file, 'w', encoding='utf-8') as outfile:
        # NOTE(review): only the first hotel that is read successfully gets a
        # `<hotel id: ...>` header; later hotels are appended without one.
        # Behavior kept as-is -- confirm whether one header per hotel was
        # intended.
        first_hotel_processed = False

        # Iterate through each hotel file in the city folder
        for hotel_file in sorted(os.listdir(city_folder)):
            hotel_file_path = os.path.join(city_folder, hotel_file)
            if not os.path.isfile(hotel_file_path):
                continue

            # Open *and* read inside the try so that an unreadable file is
            # reported and skipped instead of aborting the whole run.
            try:
                with open(hotel_file_path, 'rb') as infile:
                    # Bytes that are not valid UTF-8 are dropped, not raised.
                    content = infile.read().decode('utf-8', errors='ignore')
            except OSError as e:
                print(f"Error reading file {hotel_file_path}: {e}")
                continue

            # Write hotel name to output file (first hotel only, see above)
            if not first_hotel_processed:
                outfile.write(f'<hotel id: {hotel_file}>\n\n')
                first_hotel_processed = True

            # Each non-blank line of the hotel file is treated as one review
            # and wrapped in <comment></comment> tags.
            for review in content.split('\n'):
                stripped_review = review.strip()
                if stripped_review:
                    outfile.write(f'<comment>{stripped_review}</comment>\n')


In [10]:
import os
import re
import nltk
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK data packages requested by this notebook (run once).
# Packages that are already installed are reported as up-to-date.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Path to your dataset folder (the files that will be preprocessed)
dataset_folder = 'test'

# Folder that receives the preprocessed files. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of
# `if not os.path.exists(...): os.makedirs(...)`.
output_folder = 'test_nltk'
os.makedirs(output_folder, exist_ok=True)

def clean_text(text):
    """Strip markup and punctuation from ``text`` and lowercase it.

    Args:
        text: Raw text, possibly containing ``<...>`` tags.

    Returns:
        The lowercased text in which every ``<...>`` tag has been replaced by
        a single space and every character that is neither a word character
        nor whitespace has been removed.
    """
    # Replace each tag with a space rather than with nothing, so the words on
    # either side of a tag stay separate ("great<br>location" must not turn
    # into "greatlocation").
    text = re.sub(r'<.*?>', ' ', text)
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    return text.lower()


def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` and drop the English stop words.

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        A list of the tokens that are not in NLTK's English stop-word list,
        in their original order.
    """
    words = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = []
    for candidate in words:
        if candidate in english_stopwords:
            continue
        kept_tokens.append(candidate)
    return kept_tokens

def stem_or_lemmatize(tokens, method='lemmatization'):
    """Reduce every token to a base form by stemming or lemmatization.

    Args:
        tokens: Iterable of token strings.
        method: ``'stemming'`` to apply ``PorterStemmer`` or
            ``'lemmatization'`` (the default) to apply ``WordNetLemmatizer``.

    Returns:
        A list holding one processed token per input token.

    Raises:
        ValueError: If ``method`` is neither ``'stemming'`` nor
            ``'lemmatization'``.
    """
    if method == 'stemming':
        stemmer = PorterStemmer()
        return [stemmer.stem(token) for token in tokens]

    if method == 'lemmatization':
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(token) for token in tokens]

    # An unrecognised method used to fall through and return None, which only
    # surfaced later as an unrelated TypeError when the result was joined.
    raise ValueError(
        f"Unknown method {method!r}; expected 'stemming' or 'lemmatization'"
    )

def normalize_text(tokens):
    """Join ``tokens`` into one string and strip accent marks.

    The tokens are joined with single spaces, the result is decomposed with
    Unicode NFD normalization, and every character whose Unicode category is
    ``Mn`` is dropped.

    Args:
        tokens: Iterable of strings.

    Returns:
        The joined string without the removed marks.
    """
    decomposed = unicodedata.normalize('NFD', ' '.join(tokens))
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Preprocess every regular file in the dataset folder and write the result
# to the output folder under the same file name.
for city_file in os.listdir(dataset_folder):
    city_file_path = os.path.join(dataset_folder, city_file)
    if not os.path.isfile(city_file_path):
        continue

    # Read the whole file; the handle is closed before processing starts.
    with open(city_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Pipeline: clean -> tokenize / drop stop words -> lemmatize -> strip accents
    cleaned_text = clean_text(text)
    tokens = tokenize_and_remove_stopwords(cleaned_text)
    preprocessed_text = stem_or_lemmatize(tokens, method='lemmatization')
    normalized_text = normalize_text(preprocessed_text)

    # Save the preprocessed text to a new file in the output folder
    output_file_path = os.path.join(output_folder, city_file)
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(normalized_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\durud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\durud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\durud\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
