In [7]:
import os
import re
import pandas as pd

def parse_hotel_reviews(data_dir):
    date_pattern = re.compile(r'(\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2}\s\d{4})\t(.*)')
    data_frames = []
    
    for root, dirs, files in os.walk(data_dir):
        city = os.path.basename(root)
        for file in files:
            file_path = os.path.join(root, file)
            hotel_name = os.path.splitext(file)[0]  # Assuming the file has no extension

            with open(file_path, 'r', encoding='ISO-8859-1') as f:
                content = f.read()

            reviews = date_pattern.findall(content)
            reviews_data = [{'hotel_name': hotel_name, 'hotel_city': city, 'review_date': date, 'hotel_review': review.strip()} for date, _, review in reviews]
            if reviews_data:
                data_frames.append(pd.DataFrame(reviews_data))

    if data_frames:
        review_df = pd.concat(data_frames, ignore_index=True)
    else:
        review_df = pd.DataFrame(columns=['hotel_name', 'hotel_city', 'review_date', 'hotel_review'])
    return review_df

data_dir = 'data'
review_df = parse_hotel_reviews(data_dir)
print(review_df.head())
review_df.to_csv('csv/derlenmis_reviews.csv', index=False)

                     hotel_name hotel_city  review_date  \
0  china_beijing_ascott_beijing    beijing  Aug 17 2009   
1  china_beijing_ascott_beijing    beijing  Mar 25 2009   
2  china_beijing_ascott_beijing    beijing  Nov 18 2008   
3  china_beijing_ascott_beijing    beijing  Sep 20 2008   
4  china_beijing_ascott_beijing    beijing   Nov 1 2007   

                                        hotel_review  
0  don't rely on it if you have any mission-criti...  
1  Excellent hotel for a family\tThis hotel is pe...  
2  Choice for Western Visitors\tI stayed at The A...  
3  Really Good Alternative Accomodation in Beijin...  
4  Didn't want to leave!\tWe rented a two bedroom...  


In [8]:
review_df = pd.read_csv('csv/derlenmis_reviews.csv')

review_df = review_df.dropna(subset=['hotel_review'])
review_df['processed_review'] = review_df['hotel_review'].str.lower().str.replace('\t', ' ', regex=False)
review_df.to_csv('csv/processed_reviews.csv', index=False)

In [9]:
review_df = pd.read_csv('csv/processed_reviews.csv')

review_df = review_df.drop('hotel_review', axis=1)
review_df = review_df.drop('review_date', axis=1)

review_df.to_csv('csv/processed_reviews2.csv', index=False)

In [23]:
import pandas as pd
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

absa_tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
absa_model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")

review_df = pd.read_csv('csv/processed_reviews2.csv')
aspects = ['cleanliness', 'room', 'service', 'location']

for aspect in aspects:
    for sentiment in ['negative', 'neutral', 'positive']:
        review_df[f'{aspect}_{sentiment}'] = None



In [11]:
def process_batch(batch_reviews, aspect):
    aspect_reviews = [f"{aspect}: {review}" for review in batch_reviews]
    inputs = absa_tokenizer(aspect_reviews, return_tensors="pt", padding=True, truncation=True)
    outputs = absa_model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)
    return probs.detach().numpy()

In [15]:
import logging
from tqdm import tqdm

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
batch_size = 16

for aspect in aspects:
    logging.info(f"Starting processing for aspect: {aspect}")
    for i in tqdm(range(0, len(review_df), batch_size), desc=f"Batches for {aspect}"):
        batch_reviews = review_df['processed_review'][i:i + batch_size]
        probs = process_batch(batch_reviews, aspect)
        
        # Assign probabilities to the correct columns in the DataFrame
        for j, index in enumerate(batch_reviews.index):
            for k, sentiment in enumerate(['negative', 'neutral', 'positive']):
                review_df.at[index, f'{aspect}_{sentiment}'] = probs[j, k]  
    logging.info(f"Finished processing for aspect: {aspect}")

Batches for cleanliness:   0%|          | 0/3 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Batches for cleanliness: 100%|██████████| 3/3 [01:19<00:00, 26.64s/it]
Batches for room: 100%|██████████| 3/3 [01:13<00:00, 24.49s/it]
Batches for service: 100%|██████████| 3/3 [01:17<00:00, 25.72s/it]
Batches for location: 100%|██████████| 3/3 [01:07<00:00, 22.66s/it]


In [17]:
aggregation_dict = {f'{aspect}_{sentiment}': 'mean' for aspect in aspects for sentiment in ['negative', 'neutral', 'positive']}
hotel_scores = review_df.groupby(['hotel_name', 'hotel_city']).agg(aggregation_dict).reset_index()

# Simplify column names
hotel_scores.columns = ['hotel_name', 'hotel_city'] + [f'{aspect}_{sentiment}_avg' for aspect in aspects for sentiment in ['negative', 'neutral', 'positive']]

# Step 3: Exporting to CSV
hotel_scores.to_csv('csv/hotel_aspect_scores.csv', index=False)

CSV file created successfully.


In [24]:
hotel_scores

Unnamed: 0,hotel_name,hotel_city,cleanliness_negative_avg,cleanliness_neutral_avg,cleanliness_positive_avg,room_negative_avg,room_neutral_avg,room_positive_avg,service_negative_avg,service_neutral_avg,service_positive_avg,location_negative_avg,location_neutral_avg,location_positive_avg
0,china_beijing_ascott_beijing,beijing,0.214055,0.026416,0.759529,0.209983,0.029642,0.760375,0.208955,0.025116,0.765929,0.210366,0.028248,0.761386
1,china_beijing_autumn_garden_courtyard_hotel,beijing,0.002752,0.010987,0.986261,0.002597,0.015762,0.98164,0.002238,0.012361,0.985401,0.004374,0.013237,0.982389
2,china_beijing_bamboo_garden_hotel,beijing,0.043412,0.085511,0.871077,0.033168,0.155444,0.811388,0.047042,0.106947,0.846011,0.093408,0.184631,0.721961
3,china_beijing_beijing_dong_fang_hotel,beijing,0.183577,0.068693,0.74773,0.171623,0.083884,0.744493,0.184984,0.075649,0.739366,0.188407,0.062479,0.749114
