In [4]:
import pandas as pd
import openpyxl
import os
import re
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import logging
from tqdm import tqdm
import torch

# Load your dataset
df = pd.read_csv('las-vegas_aggregated_hotel_scores.csv')

annotated_reviews = pd.read_excel('manual_annotation.xlsx', header=0)
def preprocess_reviews(review_df):
    review_df['hotel_review'] = review_df['hotel_review'].str.lower() # convert to lowercase
    review_df['hotel_review'] = review_df['hotel_review'].str.replace('[^\w\s]', '', regex=True) # remove punctuation
    return review_df

annotated_reviews = preprocess_reviews(annotated_reviews)

  review_df['hotel_review'] = review_df['hotel_review'].str.replace('[^\w\s]', '', regex=True) # remove punctuation


In [5]:
roberta_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
roberta_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
roberta_pipeline = pipeline("sentiment-analysis", model=roberta_model, tokenizer=roberta_tokenizer)



In [6]:
aspects = ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']
weights = {'LABEL_0': -1, 'LABEL_1': 0, 'LABEL_2': 1}
batch_size = 16
review_df=annotated_reviews

aspect_keywords = {
    'cleanliness': ['clean', 'cleanliness'],
    'room': ['room', 'bed', 'suite', 'large'],
    'service': ['service', 'staff', 'help', 'support'],
    'location': ['location', 'close'],
    'value': ['value', 'worth', 'price'],
    'safety': ['safe', 'safety', 'secure'],
    'comfort': ['comfort', 'comfortable'],
    'transportation': ['bus', 'metro', 'station', 'close', 'walk'],
    'noise': ['sound', 'volume']
}

def is_aspect_mentioned(review, aspect):
    keywords = aspect_keywords[aspect]
    return any(keyword in review for keyword in keywords)

for aspect in aspects:
    review_df[f'{aspect}_score'] = 0

In [7]:
def process_reviews(pipeline, weights):
    for aspect in aspects:
        review_df[f'{aspect}_score'] = 0

    for aspect in aspects:
        print(f"Starting processing for aspect: {aspect}")
        for i in tqdm(range(0, len(review_df), batch_size), desc=f"Batches for {aspect}"):
            batch_reviews = review_df['hotel_review'][i:i + batch_size]
            batch_index = batch_reviews.index
            aspect_mentioned = [is_aspect_mentioned(review, aspect) for review in batch_reviews]
            if any(aspect_mentioned):
                filtered_reviews = [review for review, mentioned in zip(batch_reviews, aspect_mentioned) if mentioned]
                aspect_reviews = [f"{aspect}: {review[:512 - len(aspect) - 2]}" for review in filtered_reviews]
                results = pipeline(aspect_reviews)
                scores = [weights[result['label']] * result['score'] for result in results]
                score_index = [index for index, mentioned in zip(batch_index, aspect_mentioned) if mentioned]
                review_df.loc[score_index, f'{aspect}_score'] = scores
            else:
                review_df.loc[batch_index, f'{aspect}_score'] = 0
    for aspect in aspects:
        review_df[f'{aspect}_score'] = pd.to_numeric(review_df[f'{aspect}_score'], errors='coerce')

    return review_df

print(review_df)

                                          hotel_ name hotel_city  \
0                 china_beijing_aloft_beijing_haidian    beijing   
1                        china_beijing_ascott_beijing    beijing   
2         china_beijing_autumn_garden_courtyard_hotel    beijing   
3                 china_beijing_capital_hotel_beijing    beijing   
4                     china_beijing_china_world_hotel    beijing   
5                         china_beijing_harmony_hotel    beijing   
6            china_beijing_hotel_ibis_beijing_sanyuan    beijing   
7                           china_beijing_huadu_hotel    beijing   
8           usa_illinois_chicago_amalfi_hotel_chicago    chicago   
9                   usa_illinois_chicago_belair_hotel    chicago   
10                   usa_illinois_chicago_hotel_blake    chicago   
11  usa_illinois_chicago_hotel_indigo_chicago_down...    chicago   
12  usa_illinois_chicago_howard_johnson_inn_downto...    chicago   
13           usa_illinois_chicago_inn_at_lincoln

In [8]:
review_df = process_reviews(roberta_pipeline, weights)

Starting processing for aspect: cleanliness


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for cleanliness: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]


Starting processing for aspect: room


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for room: 100%|██████████| 4/4 [00:07<00:00,  1.76s/it]


Starting processing for aspect: service


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for service: 100%|██████████| 4/4 [00:06<00:00,  1.55s/it]


Starting processing for aspect: location


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for location: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]


Starting processing for aspect: value


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for value: 100%|██████████| 4/4 [00:03<00:00,  1.23it/s]


Starting processing for aspect: safety


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for safety: 100%|██████████| 4/4 [00:00<00:00,  8.37it/s]


Starting processing for aspect: comfort


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for comfort: 100%|██████████| 4/4 [00:01<00:00,  2.31it/s]


Starting processing for aspect: transportation


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for transportation: 100%|██████████| 4/4 [00:04<00:00,  1.02s/it]


Starting processing for aspect: noise


  review_df.loc[score_index, f'{aspect}_score'] = scores
Batches for noise: 100%|██████████| 4/4 [00:00<00:00, 22.47it/s]


In [10]:
# Define the function to convert continuous scores to discrete classes
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def continuous_to_discrete(score):
    if score <= -0.33:
        return -1
    elif score >= 0.33:
        return 1
    else:
        return 0

df_roberta=review_df
df_manual=annotated_reviews
# Apply the conversion function to each aspect


In [11]:
for aspect in aspects:
    df_roberta[f'{aspect}_class'] = df_roberta[f'{aspect}_score'].apply(continuous_to_discrete)

# Merge the datasets on the common key (assuming 'review_id' is the common key)
df = pd.merge(df_roberta, df_manual, on='hotel_name')

# Display the first few rows to verify the merge and conversion
print(df.head())

# Define the function to plot a confusion matrix
def plot_confusion_matrix(y_true, y_pred, aspect):
    cm = confusion_matrix(y_true, y_pred, labels=[-1, 0, 1])
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['negative', 'neutral', 'positive'], yticklabels=['negative', 'neutral', 'positive'])
    plt.title(f'Confusion Matrix for {aspect.capitalize()}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Plot confusion matrices for each aspect
for aspect in aspects:
    y_true = df[f'{aspect}']
    y_pred = df[f'{aspect}_score']
    plot_confusion_matrix(y_true, y_pred, aspect)


KeyError: 'hotel_name'