In [4]:
import os
import re
import pandas as pd
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import logging
from tqdm import tqdm
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def parse_hotel_reviews(data_dir):
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    # Updated regex pattern to handle flexible spacing and separators
    date_pattern = re.compile(r'(\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2}\s\d{4})\s+([^\t\n]*)\s+(.*)')
    data_frames = []

    for root, dirs, files in os.walk(data_dir):
        city = os.path.basename(root)
        for file in files:
            file_path = os.path.join(root, file)
            hotel_name = os.path.splitext(file)[0]  # Assuming the file has no extension
            try:
                with open(file_path, 'r', encoding='ISO-8859-1') as f:
                    content = f.read()

                if not content:
                    logging.warning(f'File {file_path} is empty.')
                    continue

                reviews = date_pattern.findall(content)
                if not reviews:
                    logging.warning(f'No reviews found in file {file_path}. Content: {content[:500]}')
                    continue

                reviews_data = [{'hotel_name': hotel_name, 'hotel_city': city, 'review_date': date, 'hotel_review': review.strip()} for date, _, title, review in reviews]
                if reviews_data:
                    data_frames.append(pd.DataFrame(reviews_data))

            except Exception as e:
                logging.error(f'Error processing file {file_path}: {e}')
                continue

    if data_frames:
        review_df = pd.concat(data_frames, ignore_index=True)
    else:
        review_df = pd.DataFrame(columns=['hotel_name', 'hotel_city', 'review_date', 'hotel_review'])

    return review_df

dataset = 'chicago'
data_dir = 'data'
review_df = parse_hotel_reviews(data_dir)
print(review_df.head())
review_df.to_csv(f'csv/{dataset}_derlenmis_reviews.csv', index=False)

                          hotel_name hotel_city  review_date  \
0  usa_illinois_chicago_abbott_hotel    chicago  Sep 21 2009   
1  usa_illinois_chicago_abbott_hotel    chicago  Jul 26 2009   
2  usa_illinois_chicago_abbott_hotel    chicago   Nov 8 2007   
3  usa_illinois_chicago_abbott_hotel    chicago   Nov 8 2007   
4  usa_illinois_chicago_abbott_hotel    chicago   May 3 2007   

                                        hotel_review  
0  Not only are the rooms jacked up but the Manag...  
1  If you want to stay in a bizarre and skanky ve...  
2  I have stayed in a decent amount of cheap hote...  
3  I travel the nation almost every weekend of th...  
4  I just called a few minutes ago to find out th...  


In [3]:
review_df = pd.read_csv(f'csv/{dataset}_derlenmis_reviews.csv')
review_df = review_df.dropna(subset=['hotel_review'])
review_df['processed_review'] = review_df['hotel_review'].str.lower().str.replace('\t', ' ', regex=False)
review_df = review_df.drop('hotel_review', axis=1)
review_df = review_df.drop('review_date', axis=1)
review_df.to_csv(f'csv/{dataset}_processed_reviews.csv', index=False)

In [4]:
roberta_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
roberta_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
roberta_pipeline = pipeline("sentiment-analysis", model=roberta_model, tokenizer=roberta_tokenizer)



In [5]:
aspects = ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']
weights = {'LABEL_0': -1, 'LABEL_1': 0, 'LABEL_2': 1}
batch_size = 16

aspect_keywords = {
    'cleanliness': ['clean', 'dirty', 'smell', 'stink', 'stunk', 'filthy'],
    'room': ['room', 'bed', 'suite', 'large'],
    'service': ['service', 'staff', 'help', 'support'],
    'location': ['location', 'close', 'area', 'far'],
    'value': ['value', 'worth', 'price', 'cheap'],
    'safety': ['safe', 'safety', 'secure', 'danger', 'dangerous'],
    'comfort': ['comfort', 'comfortable', 'uncomfortable'],
    'transportation': ['bus', 'metro', 'station', 'close', 'walk'],
    'noise': ['sound', 'volume', 'noisy', 'noise']
}

In [6]:
def is_aspect_mentioned(review, aspect):
    keywords = aspect_keywords[aspect]
    return any(keyword in review for keyword in keywords)

for aspect in aspects:
    review_df[f'{aspect}_score'] = 0

review_df = pd.read_csv(f'csv/{dataset}_processed_reviews.csv')

In [7]:
def process_reviews(pipeline, weights):
    for aspect in aspects:
        review_df[f'{aspect}_score'] = 0

    for aspect in aspects:
        print(f"Starting processing for aspect: {aspect}")
        for i in tqdm(range(0, len(review_df), batch_size), desc=f"Batches for {aspect}"):
            batch_reviews = review_df['processed_review'][i:i + batch_size]
            batch_index = batch_reviews.index
            aspect_mentioned = [is_aspect_mentioned(review, aspect) for review in batch_reviews]
            if any(aspect_mentioned):
                filtered_reviews = [review for review, mentioned in zip(batch_reviews, aspect_mentioned) if mentioned]
                aspect_reviews = [f"{aspect}: {review[:512 - len(aspect) - 2]}" for review in filtered_reviews]
                results = pipeline(aspect_reviews)
                scores = [weights[result['label']] * result['score'] for result in results]
                score_index = [index for index, mentioned in zip(batch_index, aspect_mentioned) if mentioned]
                review_df.loc[score_index, f'{aspect}_score'] = scores
            else:
                review_df.loc[batch_index, f'{aspect}_score'] = 0
    for aspect in aspects:
        review_df[f'{aspect}_score'] = pd.to_numeric(review_df[f'{aspect}_score'], errors='coerce')

    return review_df

In [8]:
review_df = process_reviews(roberta_pipeline, weights)

Starting processing for aspect: cleanliness


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for cleanliness: 100%|██████████| 1101/1101 [18:52<00:00,  1.03s/it]


Starting processing for aspect: room


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for room: 100%|██████████| 1101/1101 [40:42<00:00,  2.22s/it]


Starting processing for aspect: service


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for service: 100%|██████████| 1101/1101 [30:39<00:00,  1.67s/it]


Starting processing for aspect: location


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for location: 100%|██████████| 1101/1101 [29:29<00:00,  1.61s/it]


Starting processing for aspect: value


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for value: 100%|██████████| 1101/1101 [16:38<00:00,  1.10it/s]


Starting processing for aspect: safety


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for safety: 100%|██████████| 1101/1101 [02:53<00:00,  6.34it/s]


Starting processing for aspect: comfort


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for comfort: 100%|██████████| 1101/1101 [12:18<00:00,  1.49it/s]


Starting processing for aspect: transportation


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for transportation: 100%|██████████| 1101/1101 [23:24<00:00,  1.28s/it]


Starting processing for aspect: noise


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for noise: 100%|██████████| 1101/1101 [06:16<00:00,  2.92it/s]


In [9]:
review_df.to_csv(f'csv/{dataset}_final_reviewdf.csv', index=False)

In [10]:
aggregated_scores = review_df.groupby(['hotel_name', 'hotel_city']).agg({
    'cleanliness_score': 'mean',
    'room_score': 'mean',
    'service_score': 'mean',
    'location_score': 'mean',
    'value_score': 'mean',
    'safety_score': 'mean',
    'comfort_score': 'mean',
    'transportation_score': 'mean',
    'noise_score': 'mean'
}).reset_index()

aggregated_scores.to_csv(f'csv/{dataset}_aggregated_hotel_scores.csv', index=False)

In [11]:
display(aggregated_scores)

Unnamed: 0,hotel_name,hotel_city,cleanliness_score,room_score,service_score,location_score,value_score,safety_score,comfort_score,transportation_score,noise_score
0,usa_illinois_chicago_abbott_hotel,chicago,-0.104653,-0.000288,0.108244,0.050552,-0.187702,0.000000,0.123685,-0.048070,0.095159
1,usa_illinois_chicago_affinia_chicago,chicago,0.256901,0.694170,0.664818,0.574666,0.224325,0.041355,0.297718,0.366096,0.135250
2,usa_illinois_chicago_allerton_hotel,chicago,0.223598,0.381937,0.284964,0.365171,0.175732,0.019768,0.146325,0.228311,0.054012
3,usa_illinois_chicago_amalfi_hotel_chicago,chicago,0.287523,0.702290,0.581009,0.516833,0.247910,0.023850,0.331465,0.446419,0.080925
4,usa_illinois_chicago_ambassador_east_hotel,chicago,0.308008,0.560726,0.535553,0.463980,0.239201,0.060200,0.205209,0.440992,0.054420
...,...,...,...,...,...,...,...,...,...,...,...
133,usa_illinois_chicago_w_chicago_lakeshore,chicago,0.085183,0.404016,0.326029,0.284129,0.137131,-0.000468,0.165454,0.241664,0.011303
134,usa_illinois_chicago_westin_chicago_northwest,chicago,0.315784,0.536129,0.401999,0.373329,0.263409,0.043026,0.373556,0.365028,0.032471
135,usa_illinois_chicago_wheeler_mansion,chicago,0.062611,0.246772,0.160506,0.130178,-0.001984,0.019688,0.127608,0.143419,0.000000
136,usa_illinois_chicago_willows_hotel_chicago,chicago,0.575951,0.923869,0.572430,0.394372,0.196258,0.196222,0.590538,0.747519,0.000000


In [12]:
def normalize_scores(df, aspects):
    for aspect in aspects:
        min_score = df[f'{aspect}_score'].min()
        max_score = df[f'{aspect}_score'].max()
        df[f'{aspect}_score'] = 1 + 4 * (df[f'{aspect}_score'] - min_score) / (max_score - min_score)
    return df

In [13]:
aggregated_scores = pd.read_csv(f'csv/{dataset}_aggregated_hotel_scores.csv')
aggregated_scores = normalize_scores(aggregated_scores, aspects)
aggregated_scores.to_csv(f'csv/{dataset}_normalized_scores.csv', index=False)

In [6]:
review_df = pd.read_csv('csv/chicago_final_reviewdf.csv')

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
