In [26]:
# Library Importations
import ast
import os
import re

import nltk.tokenize
import pandas as pd
from nltk.tokenize import sent_tokenize

# 'ISO-8859-1'

In [27]:
import os
import re
import pandas as pd

def parse_hotel_reviews(data_dir, encoding='ISO-8859-1'):
    """Collect dated hotel reviews from every file found under ``data_dir``.

    Each file is read as text and scanned for lines of the form
    ``<Mon> <day> <year><TAB><rest of line>``. Every match becomes one row.

    Args:
        data_dir: Root directory to walk. The name of the directory that
            directly contains a file is used as that file's ``hotel_city``.
        encoding: Text encoding used to read the files.

    Returns:
        A DataFrame with the columns ``hotel_name`` (file name without its
        extension), ``hotel_city``, ``review_date`` and ``hotel_review``
        (the text after the tab, stripped). The frame is empty, but still has
        these columns, when nothing matches.
    """
    date_pattern = re.compile(r'(\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2}\s\d{4})\t(.*)')
    columns = ['hotel_name', 'hotel_city', 'review_date', 'hotel_review']
    records = []

    for root, dirs, files in os.walk(data_dir):
        # os.walk yields entries in filesystem order; sorting in place makes the
        # traversal (and therefore the row order) reproducible across machines.
        dirs.sort()
        city = os.path.basename(root)
        for file in sorted(files):
            file_path = os.path.join(root, file)
            hotel_name = os.path.splitext(file)[0]

            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()

            # findall returns (full date, month, review text); the month group
            # only exists to build the alternation and is discarded.
            for date, _, review in date_pattern.findall(content):
                records.append({
                    'hotel_name': hotel_name,
                    'hotel_city': city,
                    'review_date': date,
                    'hotel_review': review.strip(),
                })

    # Build the frame once from all records instead of one frame per file.
    return pd.DataFrame(records, columns=columns)

# Parse every review file under 'data', preview the result, and persist it.
data_dir = 'data'
review_df = parse_hotel_reviews(data_dir)
print(review_df.head())
# to_csv does not create missing parent directories, so make sure 'csv/' exists
# before the first export.
os.makedirs('csv', exist_ok=True)
review_df.to_csv('csv/derlenmis_reviews.csv', index=False)

                     hotel_name hotel_city  review_date  \
0  china_beijing_ascott_beijing    beijing  Aug 17 2009   
1  china_beijing_ascott_beijing    beijing  Mar 25 2009   
2  china_beijing_ascott_beijing    beijing  Nov 18 2008   
3  china_beijing_ascott_beijing    beijing  Sep 20 2008   
4  china_beijing_ascott_beijing    beijing   Nov 1 2007   

                                        hotel_review  
0  don't rely on it if you have any mission-criti...  
1  Excellent hotel for a family\tThis hotel is pe...  
2  Choice for Western Visitors\tI stayed at The A...  
3  Really Good Alternative Accomodation in Beijin...  
4  Didn't want to leave!\tWe rented a two bedroom...  


In [28]:
# Reload the parsed reviews from the CSV written by the parsing cell.
review_df = pd.read_csv('csv/derlenmis_reviews.csv')

In [29]:
# Discard rows without review text, attach each review's sentence list as a new
# 'sentences' column, then save the result.
review_df = (
    review_df
    .dropna(subset=['hotel_review'])
    .assign(sentences=lambda frame: frame['hotel_review'].apply(sent_tokenize))
)
review_df.to_csv('csv/processed_reviews.csv', index=False)

In [30]:
# The 'sentences' column holds Python lists, which to_csv stores as their repr
# string. Parse them back into real lists on load; otherwise code that iterates
# a 'sentences' value would walk the string character by character.
review_df = pd.read_csv('csv/processed_reviews.csv', converters={'sentences': ast.literal_eval})

In [31]:
# Aspects scored for every sentence; the order here is the column order used downstream.
aspects = ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise']

# Keyword phrases per aspect, split by polarity. 'positive' is listed before
# 'negative' in every entry; code that walks the polarities in order therefore
# lets a negative match override a positive one (e.g. 'not clean' contains 'clean').
aspect_keywords = {
    'cleanliness': {
        'positive': ['clean', 'very clean', 'perfectly clean', 'well maintained', 'spotless', 'tidy', 'very tidy', 'smells good'],
        'negative': ['not clean', 'dirty', 'very dirty', 'stain', 'poorly maintained', 'smells bad', 'stink', 'stunk']
    },
    'room': {
        'positive': ['spacious room', 'large room', 'comfortable bed', 'beautiful room', 'comfortable', 'big suite', 'huge room'],
        'negative': ['small room', 'tiny room', 'uncomfortable bed', 'outdated room', 'noisy room', 'uncomfortable']
    },
    'service': {
        # 'excellent service' was listed twice; the duplicate is removed.
        'positive': ['friendly', 'good staff', 'good service', 'excellent staff', 'excellent service', 'helpful staff', 'nice service', 'good assistance'],
        'negative': ['rude staff', 'rude', 'poor service', 'unhelpful', 'slow service']
    },
    'location': {
        'positive': ['great location', 'perfect location', 'convenient location', 'ideal location', 'central location', 'good location'],
        'negative': ['bad location', 'worst location', 'inconvenient location', 'unsafe area', 'unsafe', 'far away', 'far']
    },
    'value': {
        'positive': ['good value', 'worth', 'price performance', 'great price', 'affordable', 'worth every penny', 'reasonable price', 'not expensive', 'cheap'],
        'negative': ['overpriced', 'not worth the money', 'too expensive', 'poor value', 'rip off']
    },
    'safety': {
        'positive': ['secure parking', 'high security', 'safe', 'secure', 'safe and sound'],
        'negative': ['unsafe', 'dangerous', 'poor security', 'risky', 'not secure', 'not safe']
    },
    'comfort': {
        'positive': ['very comfortable', 'comfortable', 'relaxing', 'peaceful', 'quiet', 'hot shower'],
        'negative': ['uncomfortable', 'very uncomfortable', 'hard bed', 'uncomfortable chairs', 'noise', 'noisy', 'poor insulation']
    },
    'transportation': {
        'positive': ['close to subway', 'shuttle', 'near the airport', 'good transport links', 'ample parking', 'close to transportation', 'close to bus', 'close to station', 'close to metro', 'close to airport', 'near the bus', 'near the station', 'near the metro', 'walking distance', 'taxi'],
        # Fixed the misspelling 'poor transportatiton', which could never match correctly spelled text.
        'negative': ['far from subway', 'no shuttle', 'far from airport', 'poor transportation', 'limited parking', 'far away', 'far']
    },
    'noise': {
        'positive': ['quiet room', 'soundproof', 'no noise', 'peaceful', 'quiet', 'silent'],
        'negative': ['noisy room', 'loud neighbors', 'traffic noise', 'thin walls', 'can hear everything', 'high volume', 'noisy', 'noise']
    }
}

In [32]:
def get_aspect_sentiment(sentence):
    """Score one sentence against every aspect's keyword lists.

    Matching is plain substring containment on the lower-cased sentence.

    Args:
        sentence: The sentence text to score.

    Returns:
        A dict with one entry per name in ``aspects``. The value is 0 when no
        phrase of that aspect occurs, 1 when a 'positive' phrase occurs, and
        -1 when a phrase of any other polarity occurs. Polarities are applied
        in the order they appear in ``aspect_keywords``, so a later match
        overwrites an earlier one.
    """
    text = sentence.lower()
    scores = dict.fromkeys(aspects, 0)
    for aspect, phrases_by_polarity in aspect_keywords.items():
        for polarity, phrases in phrases_by_polarity.items():
            if not any(phrase in text for phrase in phrases):
                continue
            scores[aspect] = 1 if polarity == 'positive' else -1
    return scores

# Score every sentence of every review; each cell becomes a list of per-sentence
# aspect dicts.
# NOTE(review): each 'sentences' value must be a list of strings here. A plain
# string would be iterated (and scored) one character at a time.
review_df['aspect_sentiment'] = pd.Series(
    [
        [get_aspect_sentiment(sentence) for sentence in sentence_list]
        for sentence_list in review_df['sentences']
    ],
    index=review_df.index,
    dtype=object,
)
review_df.to_csv('csv/aspect_sentimented_reviews.csv', index=False)

In [33]:
# Reload the aspect-scored reviews from the CSV written by the scoring cell.
# NOTE(review): list-valued columns presumably come back from CSV as strings;
# confirm before relying on anything other than the plain-text columns.
review_df = pd.read_csv('csv/aspect_sentimented_reviews.csv')

In [34]:
# Flatten `review_df` into one row per sentence: each review's 'hotel_review'
# text is split into sentences and every sentence is labelled with its aspect scores.
flat_data = []

# Zipping the needed columns avoids the per-row Series that iterrows() builds.
review_fields = zip(
    review_df['hotel_name'],
    review_df['hotel_city'],
    review_df['review_date'],
    review_df['hotel_review'],
)
for hotel_name, hotel_city, review_date, hotel_review in review_fields:
    for sentence in sent_tokenize(hotel_review):
        flat_data.append({
            'hotel_name': hotel_name,
            'hotel_city': hotel_city,
            'review_date': review_date,
            'sentence': sentence,
            **get_aspect_sentiment(sentence)  # Expand the sentiment scores into separate columns
        })

# Create a new DataFrame for training. Naming the columns explicitly keeps the
# schema intact even when no sentences were produced.
training_df = pd.DataFrame(
    flat_data,
    columns=['hotel_name', 'hotel_city', 'review_date', 'sentence', *aspects],
)

In [35]:
# Render the sentence-level training frame for inspection.
display(training_df)

Unnamed: 0,hotel_name,hotel_city,review_date,sentence,cleanliness,room,service,location,value,safety,comfort,transportation,noise
0,china_beijing_ascott_beijing,beijing,Aug 17 2009,don't rely on it if you have any mission-criti...,0,0,0,0,0,0,0,0,0
1,china_beijing_ascott_beijing,beijing,Aug 17 2009,I was not able to connect to my US stock/futur...,0,0,0,0,0,0,0,0,0
2,china_beijing_ascott_beijing,beijing,Aug 17 2009,couldn't login to my airline to do an internet...,0,0,0,0,0,0,0,0,0
3,china_beijing_ascott_beijing,beijing,Aug 17 2009,"Other websites such as yahoo, msn messenger se...",0,0,0,0,0,0,0,0,0
4,china_beijing_ascott_beijing,beijing,Aug 17 2009,the front desk tried it as well and they have ...,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60393,usa_illinois_chicago_w_chicago_lakeshore,chicago,Sep 3 2002,!,0,0,0,0,0,0,0,0,0
60394,usa_illinois_chicago_w_chicago_lakeshore,chicago,Jul 29 2002,Great view of the lake and convenient location!,0,0,0,1,0,0,0,0,0
60395,usa_illinois_chicago_w_chicago_lakeshore,chicago,Jul 22 2002,Overpriced,0,0,0,0,-1,0,0,0,0
60396,usa_illinois_chicago_w_chicago_lakeshore,chicago,Jul 8 2002,Simply --wild--not your regular Holiday Inn,0,0,0,0,0,0,0,0,0


In [36]:
# Persist the sentence-level training frame as CSV, without the index column.
training_df.to_csv('csv/training_df.csv', index=False)

In [37]:
import openpyxl
# Export the same frame to Excel; index=False matches the CSV export so both
# files carry the same columns.
training_df.to_excel('csv/training_df.xlsx', index=False)

  0%|          | 0/7550 [08:38<?, ?it/s]


In [38]:
from transformers import BertTokenizer
# Use the same checkpoint as the model loaded later in the notebook
# ('bert-base-multilingual-uncased') so the token ids index that model's
# vocabulary, and truncate to the 512-token limit used elsewhere in the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
training_df['input_ids'] = training_df['sentence'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=512)
)



In [39]:
from torch.utils.data import Dataset, DataLoader
import torch

class AspectSentimentDataset(Dataset):
    """Dataset pairing pre-tokenised sentences with their per-aspect scores.

    Args:
        encodings: Sequence of token-id lists, one per sentence.
        labels: Sequence (or 2-D array) of per-aspect scores, aligned with
            ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return ``{'input_ids': LongTensor, 'labels': FloatTensor}`` for one sentence."""
        item = {'input_ids': torch.tensor(self.encodings[idx], dtype=torch.long)}
        # Labels are emitted as float, matching HotelReviewDataset; integer
        # labels would be interpreted as class indices by the sequence classifier.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of sentences, taken from the label sequence."""
        return len(self.labels)

# Build the label matrix (one column per aspect) and wrap it, together with the
# token ids, in the dataset class.
labels = training_df.loc[
    :,
    ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise'],
].to_numpy()
dataset = AspectSentimentDataset(encodings=training_df['input_ids'].tolist(), labels=labels)

In [40]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the tokenizer and the model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# We have 9 aspects, each scored -1 / 0 / 1 and fed to the model as floats.
# Without an explicit problem_type the model treats float labels with
# num_labels > 1 as multi-label classification (BCE-with-logits), which expects
# targets in [0, 1]; regression (MSE) is well defined for the -1..1 scores.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased',
    num_labels=9,
    problem_type='regression',
)

class HotelReviewDataset(torch.utils.data.Dataset):
    """Dataset over tokenizer output and the matching per-aspect labels.

    ``encodings`` is a mapping from field name to a per-sample sequence;
    ``labels`` is indexable by sample position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, followed by the float label vector.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Tokenise every sentence in a single call (padded, truncated at 512 tokens),
# collect the nine aspect columns as the label matrix, and wrap both in the dataset.
encodings = tokenizer(
    training_df['sentence'].tolist(),
    max_length=512,
    truncation=True,
    padding=True,
)
labels = training_df.loc[
    :,
    ['cleanliness', 'room', 'service', 'location', 'value', 'safety', 'comfort', 'transportation', 'noise'],
].to_numpy()
dataset = HotelReviewDataset(encodings=encodings, labels=labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Output locations, logging cadence and optimisation settings for the Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

# Wire the model, arguments and sentence-level dataset together (no eval set is passed).
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# Run fine-tuning.
trainer.train()

In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch

# Manual fine-tuning loop: one optimiser step per batch of 8 shuffled samples.
optimizer = AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(dataset, batch_size=8, shuffle=True)
num_train_epochs=1

# The DataLoader yields CPU tensors, but the model may already live on an
# accelerator (for example after a Trainer run). Send every batch to the
# device that holds the model's weights to avoid a device mismatch.
device = next(model.parameters()).device

for epoch in range(num_train_epochs):
    model.train()
    for batch in train_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss  # computed by the model because the batch includes 'labels'
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()