In [20]:
import os
import re
import pandas as pd
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import logging
from tqdm import tqdm

In [21]:
def parse_hotel_reviews(data_dir):
    date_pattern = re.compile(r'(\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2}\s\d{4})\t(.*)')
    data_frames = []
    
    for root, dirs, files in os.walk(data_dir):
        city = os.path.basename(root)
        for file in files:
            file_path = os.path.join(root, file)
            hotel_name = os.path.splitext(file)[0]  # Assuming the file has no extension

            with open(file_path, 'r', encoding='ISO-8859-1') as f:
                content = f.read()

            reviews = date_pattern.findall(content)
            reviews_data = [{'hotel_name': hotel_name, 'hotel_city': city, 'review_date': date, 'hotel_review': review.strip()} for date, _, review in reviews]
            if reviews_data:
                data_frames.append(pd.DataFrame(reviews_data))

    if data_frames:
        review_df = pd.concat(data_frames, ignore_index=True)
    else:
        review_df = pd.DataFrame(columns=['hotel_name', 'hotel_city', 'review_date', 'hotel_review'])
    return review_df

data_dir = 'data'
review_df = parse_hotel_reviews(data_dir)
print(review_df.head())
review_df.to_csv('csv3/derlenmis_reviews.csv', index=False)

                     hotel_name hotel_city  review_date  \
0  china_beijing_ascott_beijing    beijing  Aug 17 2009   
1  china_beijing_ascott_beijing    beijing  Mar 25 2009   
2  china_beijing_ascott_beijing    beijing  Nov 18 2008   
3  china_beijing_ascott_beijing    beijing  Sep 20 2008   
4  china_beijing_ascott_beijing    beijing   Nov 1 2007   

                                        hotel_review  
0  don't rely on it if you have any mission-criti...  
1  Excellent hotel for a family\tThis hotel is pe...  
2  Choice for Western Visitors\tI stayed at The A...  
3  Really Good Alternative Accomodation in Beijin...  
4  Didn't want to leave!\tWe rented a two bedroom...  


In [37]:
import os
import re
import pandas as pd
import logging
from tqdm import tqdm

def parse_hotel_reviews(data_dir):
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    # Updated regex pattern to handle flexible spacing and separators
    date_pattern = re.compile(r'(\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2}\s\d{4})\s+([^\t\n]*)\s+(.*)')
    data_frames = []

    for root, dirs, files in os.walk(data_dir):
        city = os.path.basename(root)
        for file in files:
            file_path = os.path.join(root, file)
            hotel_name = os.path.splitext(file)[0]  # Assuming the file has no extension
            logging.info(f'Processing file: {file_path}')

            try:
                with open(file_path, 'r', encoding='ISO-8859-1') as f:
                    content = f.read()

                if not content:
                    logging.warning(f'File {file_path} is empty.')
                    continue

                reviews = date_pattern.findall(content)
                if not reviews:
                    logging.warning(f'No reviews found in file {file_path}. Content: {content[:500]}')
                    continue

                reviews_data = [{'hotel_name': hotel_name, 'hotel_city': city, 'review_date': date, 'hotel_review': review.strip()} for date, _, title, review in reviews]
                if reviews_data:
                    data_frames.append(pd.DataFrame(reviews_data))

            except Exception as e:
                logging.error(f'Error processing file {file_path}: {e}')
                continue

    if data_frames:
        review_df = pd.concat(data_frames, ignore_index=True)
    else:
        review_df = pd.DataFrame(columns=['hotel_name', 'hotel_city', 'review_date', 'hotel_review'])

    return review_df

data_dir = 'data'
review_df = parse_hotel_reviews(data_dir)
print(review_df.head())
review_df.to_csv('csv3/derlenmis_reviews.csv', index=False)


2024-05-15 15:46:19,880 - INFO - Processing file: data\beijing\china_beijing_aloft_beijing_haidian
2024-05-15 15:46:19,892 - INFO - Processing file: data\beijing\china_beijing_ascott_beijing
2024-05-15 15:46:19,895 - INFO - Processing file: data\beijing\china_beijing_autumn_garden_courtyard_hotel
2024-05-15 15:46:19,897 - INFO - Processing file: data\beijing\china_beijing_bamboo_garden_hotel


                            hotel_name hotel_city  review_date  \
0  china_beijing_aloft_beijing_haidian    beijing  Oct 12 2009   
1  china_beijing_aloft_beijing_haidian    beijing  Sep 25 2009   
2  china_beijing_aloft_beijing_haidian    beijing   Aug 4 2009   
3  china_beijing_aloft_beijing_haidian    beijing  Jul 17 2009   
4  china_beijing_aloft_beijing_haidian    beijing  May 30 2009   

                                        hotel_review  
0  I stayed in this hotel for one night. As this ...  
1  Stayed two nights at Aloft on the most recent ...  
2  We stayed at the Aloft Beijing Haidian for 5 n...  
3  I am glad to be the first person to post photo...  
4  Stayed there for one night. The hotel is locat...  


In [38]:
review_df = pd.read_csv('csv3/derlenmis_reviews.csv')

review_df = review_df.dropna(subset=['hotel_review'])
review_df['processed_review'] = review_df['hotel_review'].str.lower().str.replace('\t', ' ', regex=False)
review_df.to_csv('csv3/processed_reviews.csv', index=False)

In [39]:
review_df = pd.read_csv('csv3/processed_reviews.csv')

review_df = review_df.drop('hotel_review', axis=1)
review_df = review_df.drop('review_date', axis=1)

review_df.to_csv('csv3/processed_reviews2.csv', index=False)

In [40]:
import torch
from torch import autocast

absa_tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
absa_model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")



In [41]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

aspects = ['cleanliness', 'room', 'service', 'location']
weights = {'negative': -1, 'neutral': 0, 'positive': 1}

aspect_keywords = {
    'cleanliness': ['clean', 'dirty', 'filthy'],
    'room': ['room', 'bed', 'suite', 'accommodation'],
    'service': ['service', 'staff', 'help', 'support'],
    'location': ['location', 'close', 'area', 'far']
}

In [43]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

aspects = ['location']
weights = {'negative': -1, 'neutral': 0, 'positive': 1}

aspect_keywords = {
    'location': ['location', 'close', 'area', 'far']
}

In [44]:
batch_size = 16

def process_batch(batch_reviews, aspect):
    aspect_reviews = [f"{aspect}: {review}" for review in batch_reviews]
    inputs = absa_tokenizer(aspect_reviews, return_tensors="pt", padding=True, truncation=True)
    outputs = absa_model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)
    return probs.detach().numpy()

def is_aspect_mentioned(review, aspect):
    keywords = aspect_keywords[aspect]
    return any(keyword in review for keyword in keywords)

# Adding columns for overall sentiment scores
for aspect in aspects:
    review_df[f'{aspect}_score'] = 0

In [45]:
for aspect in aspects:
    logging.info(f"Starting processing for aspect: {aspect}")
    overall_scores = []
    for i in tqdm(range(0, len(review_df), batch_size), desc=f"Batches for {aspect}"):
        batch_reviews = review_df['processed_review'][i:i + batch_size]
        aspect_mentioned = [is_aspect_mentioned(review, aspect) for review in batch_reviews]
        if any(aspect_mentioned):
            probs = process_batch([review for review, mentioned in zip(batch_reviews, aspect_mentioned) if mentioned], aspect)
            scores = [sum(weights[sentiment] * probs[j, k] for k, sentiment in enumerate(['negative', 'neutral', 'positive'])) for j in range(len(probs))]
        else:
            scores = [0] * len(batch_reviews)
        for idx, score in zip(batch_reviews.index, scores):
            review_df.at[idx, f'{aspect}_score'] = score

    logging.info(f"Finished processing for aspect: {aspect}")

# Save the final DataFrame with overall scores
review_df.to_csv('csv3/overall_sentiment_scores.csv', index=False)

2024-05-15 15:46:57,480 - INFO - Starting processing for aspect: location
Batches for location:   0%|          | 0/6 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  review_df.at[idx, f'{aspect}_score'] = score
Batches for location: 100%|██████████| 6/6 [01:40<00:00, 16.77s/it]
2024-05-15 15:48:38,139 - INFO - Finished processing for aspect: location


In [47]:
review_df

Unnamed: 0,hotel_name,hotel_city,processed_review,cleanliness_score,room_score,service_score,location_score
0,china_beijing_aloft_beijing_haidian,beijing,i stayed in this hotel for one night. as this ...,0,0,0,0.060556
1,china_beijing_aloft_beijing_haidian,beijing,stayed two nights at aloft on the most recent ...,0,0,0,0.962788
2,china_beijing_aloft_beijing_haidian,beijing,we stayed at the aloft beijing haidian for 5 n...,0,0,0,0.568240
3,china_beijing_aloft_beijing_haidian,beijing,i am glad to be the first person to post photo...,0,0,0,-0.633451
4,china_beijing_aloft_beijing_haidian,beijing,stayed there for one night. the hotel is locat...,0,0,0,0.586784
...,...,...,...,...,...,...,...
82,china_beijing_bamboo_garden_hotel,beijing,jun 4 2006 nice garden,0,0,0,-0.350294
83,china_beijing_bamboo_garden_hotel,beijing,nov 15 2005 we loved this hotel,0,0,0,0.000000
84,china_beijing_bamboo_garden_hotel,beijing,apr 29 2005 great hotel not so great location,0,0,0,0.000000
85,china_beijing_bamboo_garden_hotel,beijing,nov 17 2004 a neat bamboo garden questionable ...,0,0,0,0.000000
