In [7]:
import os
import re
import pandas as pd
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import logging
from tqdm import tqdm
import torch

In [8]:
def parse_hotel_reviews(data_dir):
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    # Updated regex pattern to handle flexible spacing and separators
    date_pattern = re.compile(r'(\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2}\s\d{4})\s+([^\t\n]*)\s+(.*)')
    data_frames = []

    for root, dirs, files in os.walk(data_dir):
        city = os.path.basename(root)
        for file in files:
            file_path = os.path.join(root, file)
            hotel_name = os.path.splitext(file)[0]  # Assuming the file has no extension
            try:
                with open(file_path, 'r', encoding='ISO-8859-1') as f:
                    content = f.read()

                if not content:
                    logging.warning(f'File {file_path} is empty.')
                    continue

                reviews = date_pattern.findall(content)
                if not reviews:
                    logging.warning(f'No reviews found in file {file_path}. Content: {content[:500]}')
                    continue

                reviews_data = [{'hotel_name': hotel_name, 'hotel_city': city, 'review_date': date, 'hotel_review': review.strip()} for date, _, title, review in reviews]
                if reviews_data:
                    data_frames.append(pd.DataFrame(reviews_data))

            except Exception as e:
                logging.error(f'Error processing file {file_path}: {e}')
                continue

    if data_frames:
        review_df = pd.concat(data_frames, ignore_index=True)
    else:
        review_df = pd.DataFrame(columns=['hotel_name', 'hotel_city', 'review_date', 'hotel_review'])

    return review_df

dataset = 'london'
data_dir = 'data'
review_df = parse_hotel_reviews(data_dir)
print(review_df.head())
review_df.to_csv(f'csv/{dataset}_derlenmis_reviews.csv', index=False)

                           hotel_name hotel_city  review_date  \
0  uk_england_london_1_lexham_gardens     london   Nov 8 2009   
1  uk_england_london_1_lexham_gardens     london  Oct 16 2009   
2  uk_england_london_1_lexham_gardens     london  Oct 10 2009   
3  uk_england_london_1_lexham_gardens     london   Sep 8 2009   
4  uk_england_london_1_lexham_gardens     london   Sep 2 2009   

                                        hotel_review  
0  Just thought give a update of this hotel, stay...  
1  Location is excellent - 4 minute walk from Ear...  
2  My Girlfriend and I regrettably choose the Lex...  
3  first room: smallest double room ever one pers...  
4  The service was pretty average - when we had a...  


In [9]:
review_df = pd.read_csv(f'csv/{dataset}_derlenmis_reviews.csv')
review_df = review_df.dropna(subset=['hotel_review'])
review_df['processed_review'] = review_df['hotel_review'].str.lower().str.replace('\t', ' ', regex=False)
review_df = review_df.drop('hotel_review', axis=1)
review_df = review_df.drop('review_date', axis=1)
review_df.to_csv(f'csv/{dataset}_processed_reviews.csv', index=False)

In [10]:
absa_tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
absa_model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")



In [11]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

aspects = ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']
weights = {'negative': -1, 'neutral': 0, 'positive': 1}

aspect_keywords = {
    'cleanliness': ['clean', 'dirty', 'smell', 'stink', 'stunk', 'filthy'],
    'room': ['room', 'bed', 'suite', 'large'],
    'service': ['service', 'staff', 'help', 'support'],
    'location': ['location', 'close', 'area', 'far'],
    'value': ['value', 'worth', 'price'],
    'safety': ['safe', 'safety', 'secure', 'danger', 'dangerous'],
    'comfort': ['comfort', 'comfortable', 'uncomfortable'],
    'transportation': ['bus', 'metro', 'station', 'close', 'walk'],
    'noise': ['sound', 'volume', 'noisy', 'noise']
}

batch_size = 16
def process_batch(batch_reviews, aspect):
    aspect_reviews = [f"{aspect}: {review}" for review in batch_reviews]
    inputs = absa_tokenizer(aspect_reviews, return_tensors="pt", padding=True, truncation=True)
    outputs = absa_model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)
    return probs.detach().numpy()

def is_aspect_mentioned(review, aspect):
    keywords = aspect_keywords[aspect]
    return any(keyword in review for keyword in keywords)

for aspect in aspects:
    review_df[f'{aspect}_score'] = 0

review_df = pd.read_csv(f'csv/{dataset}_processed_reviews.csv')

In [12]:
for aspect in aspects:
    review_df[f'{aspect}_score'] = 0

for aspect in aspects:
    logging.info(f"Starting processing for aspect: {aspect}")
    for i in tqdm(range(0, len(review_df), batch_size), desc=f"Batches for {aspect}"):
        batch_reviews = review_df['processed_review'][i:i + batch_size]
        batch_index = batch_reviews.index
        aspect_mentioned = [is_aspect_mentioned(review, aspect) for review in batch_reviews]
        if any(aspect_mentioned):
            filtered_reviews = [review for review, mentioned in zip(batch_reviews, aspect_mentioned) if mentioned]
            probs = process_batch(filtered_reviews, aspect)
            scores = [sum(weights[sentiment] * probs[j, k] for k, sentiment in enumerate(['negative', 'neutral', 'positive'])) for j in range(len(probs))]
            score_index = [index for index, mentioned in zip(batch_index, aspect_mentioned) if mentioned]
            review_df.loc[score_index, f'{aspect}_score'] = scores
        else:
            review_df.loc[batch_index, f'{aspect}_score'] = 0

    logging.info(f"Finished processing for aspect: {aspect}")

2024-05-19 04:22:16,278 - INFO - Starting processing for aspect: cleanliness
Batches for cleanliness:   0%|          | 0/4 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for cleanliness: 100%|██████████| 4/4 [01:20<00:00, 20.13s/it]
2024-05-19 04:23:36,809 - INFO - Finished processing for aspect: cleanliness
2024-05-19 04:23:36,820 - INFO - Starting processing for aspect: room
  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for room: 100%|██████████| 4/4 [01:17<00:00, 19.28s/it]
2024-05-19 04:24:53,951 - INFO - Finished processing for aspect: room
2024-05-19 04:24:53,957 - INFO - Starting processing for aspect: service
  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for service: 100%|██████████| 4/4 [00:34<00:00,  8.57s/it]
2024-05-19 04:25:28,260 - INFO - Finished processing for 

In [13]:
# Ensure aspect score columns are numeric
for aspect in aspects:
    review_df[f'{aspect}_score'] = pd.to_numeric(review_df[f'{aspect}_score'], errors='coerce')

# Aggregate scores by hotel and city
aggregated_scores = review_df.groupby(['hotel_name', 'hotel_city']).agg({
    'cleanliness_score': 'mean',
    'room_score': 'mean',
    'service_score': 'mean',
    'location_score': 'mean',
    'value_score': 'mean',
    'safety_score': 'mean',
    'comfort_score': 'mean',
    'transportation_score': 'mean',
    'noise_score': 'mean'
}).reset_index()

# Save the aggregated scores
aggregated_scores.to_csv(f'csv/{dataset}_aggregated_hotel_scores.csv', index=False)


In [14]:
display(aggregated_scores)

Unnamed: 0,hotel_name,hotel_city,cleanliness_score,room_score,service_score,location_score,value_score,safety_score,comfort_score,transportation_score,noise_score
0,uk_england_london_1_lexham_gardens,london,0.068621,-0.085246,-0.002657,0.056365,0.091365,-0.052046,0.001735,0.027257,-0.011119


In [63]:
def map_score_to_label(score, negative_threshold=-0.2, positive_threshold=0.2):
    if score < negative_threshold:
        return -1
    elif score > positive_threshold:
        return 1
    else:
        return 0


In [95]:
import pandas as pd
annotated_df = pd.read_excel('xlsx/london_derlenmis_reviews.xlsx')
annotated_df = annotated_df.drop('hotel_review', axis=1)
display(annotated_df)


Unnamed: 0,hotel_name,hotel_city,cleanliness_score,room_score,service_score,location_score,value_score,safety_score,comfort_score,transportation_score,noise_score
0,uk_england_london_1_lexham_gardens,london,0,1,1,0,0,0,-1,0,-1
1,uk_england_london_1_lexham_gardens,london,0,0,1,1,0,0,0,0,0
2,uk_england_london_1_lexham_gardens,london,-1,-1,-1,0,-1,-1,-1,0,-1
3,uk_england_london_1_lexham_gardens,london,-1,-1,-1,0,-1,-1,-1,0,0
4,uk_england_london_1_lexham_gardens,london,0,-1,-1,1,0,-1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
57,uk_england_london_1_lexham_gardens,london,0,-1,-1,0,-1,0,-1,0,-1
58,uk_england_london_1_lexham_gardens,london,0,-1,-1,0,-1,0,0,0,0
59,uk_england_london_1_lexham_gardens,london,1,1,0,1,1,0,1,1,0
60,uk_england_london_1_lexham_gardens,london,1,1,1,1,1,0,1,1,0


In [114]:
reviewdf = review_df
reviewdf = reviewdf.drop('processed_review', axis=1)
reviewdf

Unnamed: 0,hotel_name,hotel_city,cleanliness_score,room_score,service_score,location_score,value_score,safety_score,comfort_score,transportation_score,noise_score
0,uk_england_london_1_lexham_gardens,london,0.501759,0.182072,0.000000,0.383146,0.0,0.000000,0.0,0.378395,-0.063086
1,uk_england_london_1_lexham_gardens,london,0.974688,0.968108,0.979180,0.988669,0.0,0.000000,0.0,0.973868,0.000000
2,uk_england_london_1_lexham_gardens,london,-0.899189,-0.916658,-0.912468,-0.904268,0.0,-0.897950,0.0,0.000000,0.000000
3,uk_england_london_1_lexham_gardens,london,-0.933351,-0.924533,0.000000,0.000000,0.0,-0.915528,0.0,0.000000,0.000000
4,uk_england_london_1_lexham_gardens,london,-0.173431,-0.683134,-0.670538,0.013714,0.0,-0.299381,0.0,-0.333411,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
57,uk_england_london_1_lexham_gardens,london,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
58,uk_england_london_1_lexham_gardens,london,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
59,uk_england_london_1_lexham_gardens,london,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
60,uk_england_london_1_lexham_gardens,london,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ensure both DataFrames are sorted similarly
reviewdf = reviewdf.sort_values(by=['hotel_name', 'hotel_city']).reset_index(drop=True)
annotated_df = annotated_df.sort_values(by=['hotel_name', 'hotel_city']).reset_index(drop=True)

# Map DeBERTa scores to labels
for aspect in ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']:
    reviewdf[f'{aspect}_label'] = reviewdf[f'{aspect}_score'].apply(map_score_to_label)

accuracy_mean = 0
precision_mean = 0
recall_mean = 0
f1_mean = 0
inc = 0
# Compare predictions to ground truth
for aspect in ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']:
    accuracy = accuracy_score(annotated_df[f'{aspect}_score'], reviewdf[f'{aspect}_label'])
    precision = precision_score(annotated_df[f'{aspect}_score'], reviewdf[f'{aspect}_label'], average='weighted')
    recall = recall_score(annotated_df[f'{aspect}_score'], reviewdf[f'{aspect}_label'], average='weighted')
    f1 = f1_score(annotated_df[f'{aspect}_score'], reviewdf[f'{aspect}_label'], average='weighted')
    '''
    print(f'Aspect: {aspect}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}\n')
    '''
    accuracy_mean += accuracy
    precision_mean += precision
    recall_mean += recall
    f1_mean += f1
print(f'accuracy_mean: {accuracy_mean/9}')
print(f'precision_mean: {precision_mean/9}')
print(f'recall_mean: {recall_mean/9}')
print(f'f1_mean: {f1_mean/9}')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

for aspect in ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']:
    conf_matrix = confusion_matrix(annotated_df[f'{aspect}_score'], reviewdf[f'{aspect}_label'])
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix for {aspect.capitalize()}')
    plt.show()

In [117]:
def map_score_to_label(score, negative_threshold=-0.2, positive_threshold=0.2):
    if score < negative_threshold:
        return -1
    elif score > positive_threshold:
        return 1
    else:
        return 0


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Apply the mapping function to create labels
for aspect in ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']:
    reviewdf[f'{aspect}_label'] = reviewdf[f'{aspect}_score'].apply(map_score_to_label)

# Print a few rows to verify
print(reviewdf.head())

def generate_synthetic_labels(scores, negative_threshold=-0.2, positive_threshold=0.2):
    return np.where(scores < negative_threshold, -1, np.where(scores > positive_threshold, 1, 0))


In [None]:
from sklearn.model_selection import train_test_split

# Split the data
train_df, test_df = train_test_split(reviewdf, test_size=0.2, random_state=42)

# Generate synthetic labels for training and testing data
for aspect in ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']:
    train_df[f'{aspect}_label'] = generate_synthetic_labels(train_df[f'{aspect}_score'])
    test_df[f'{aspect}_label'] = generate_synthetic_labels(test_df[f'{aspect}_score'])

# Example of validation using synthetic labels
for aspect in ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']:
    y_true = test_df[f'{aspect}_label']
    y_pred = test_df[f'{aspect}_score'].apply(map_score_to_label)
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    
    print(f'Aspect: {aspect}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}\n')


In [None]:
for aspect in ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']:
    print(f'{aspect} score distribution:')
    print(reviewdf[f'{aspect}_score'].describe())


In [126]:
for aspect in ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']:
    print(f'Sample of {aspect} scores and labels:')
    sample = reviewdf[[f'{aspect}_score', f'{aspect}_label']].head(20)
    print(sample)


Sample of cleanliness scores and labels:
    cleanliness_score  cleanliness_label
0            0.501759                  1
1            0.974688                  1
2           -0.899189                 -1
3           -0.933351                 -1
4           -0.173431                  0
5           -0.722583                 -1
6           -0.768774                 -1
7           -0.607550                 -1
8            0.000000                  0
9           -0.873862                 -1
10           0.755316                  1
11           0.000000                  0
12           0.980685                  1
13           0.000000                  0
14           0.089506                  0
15           0.934591                  1
16           0.000000                  0
17           0.473592                  1
18           0.000000                  0
19           0.000000                  0
Sample of room scores and labels:
    room_score  room_label
0     0.182072           0
1     0.96