In [2]:
import os
import re
import openpyxl
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [5]:
# Read each hotel file from each city folder
def read_and_parse_files(data_directory):
    reviews = []
    for root, dirs, files in os.walk(data_directory):
        for file_name in files:
            path = os.path.join(root, file_name)
            parts = path.split(os.sep)
            city = parts[-2]
            hotel_name = file_name.replace('_', ' ')

            with open(path, 'r', encoding='ISO-8859-1') as file:
                for line in file:
                    match = re.match(r'(\w+ \d+ \d{4})\s+(.*)', line) # regex for parsing date and review
                    if match:
                        date, review = match.groups()
                        reviews.append({
                            'city': city,
                            'hotel_name': hotel_name,
                            'date': pd.to_datetime(date),
                            'review': review.strip()
                        })


    review_df = pd.DataFrame(reviews)
    return review_df

# Wordnet POS to lemmatize
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}

    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

# Preprocessing
def preprocess_reviews(review_df):
    review_df['review'] = review_df['review'].str.lower()
    review_df['review'] = review_df['review'].str.replace('[^\w\s]', '', regex=True)
    review_df['tokens'] = review_df['review'].apply(word_tokenize)
    stop_words = set(stopwords.words('english'))
    review_df['tokens'] = review_df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
    lemmatizer = WordNetLemmatizer()
    review_df['tokens'] = review_df['tokens'].apply(
        lambda x: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x]
    )
    return review_df

In [6]:
data_directory = './data2'
review_df = read_and_parse_files(data_directory)
review_df = preprocess_reviews(review_df)
review_df.to_csv('csv3/processed_hotel_reviews.csv', index=False)
print(review_df.head())

      city                           hotel_name       date  \
0  beijing  china beijing aloft beijing haidian 2009-10-12   
1  beijing  china beijing aloft beijing haidian 2009-09-25   
2  beijing  china beijing aloft beijing haidian 2009-08-04   
3  beijing  china beijing aloft beijing haidian 2009-07-17   
4  beijing  china beijing aloft beijing haidian 2009-05-30   

                                              review  \
0  nice trendy hotel location not too bad\ti stay...   
1  great budget hotel\tstayed two nights at aloft...   
2  excellent value  location not a big problem\tw...   
3  stylish clean reasonable value poor location\t...   
4  remote but excellent value for money\tstayed t...   

                                              tokens  
0  [nice, trendy, hotel, location, bad, stayed, h...  
1  [great, budget, hotel, stayed, two, night, alo...  
2  [excellent, value, location, big, problem, sta...  
3  [stylish, clean, reasonable, value, poor, loca...  
4  [remote, exc

In [8]:
review_df = pd.read_csv('csv3/processed_hotel_reviews.csv')
review_df = review_df[review_df['tokens'] != '[]']
review_df = review_df[review_df['review'].str.strip().astype(bool)]
print(review_df.describe())

           city                            hotel_name        date  \
count     23089                                 23089       23089   
unique        2                                   274        2417   
top     chicago  usa illinois chicago affinia chicago  2009-10-08   
freq      18098                                   410         162   

             review              tokens  
count         23089               23089  
unique        22959               22913  
top     great hotel  ['great', 'hotel']  
freq             21                  23  


Data Analysis

In [9]:
from nltk.sentiment import SentimentIntensityAnalyzer

def analyze_sentiments(review_df):
    sia = SentimentIntensityAnalyzer()
    # Assuming 'review' contains the cleaned, lowercased text
    review_df['sentiment'] = review_df['review'].apply(lambda x: sia.polarity_scores(x)['compound'])
    return review_df

# Perform sentiment analysis
review_df = analyze_sentiments(review_df)
review_df.to_csv('csv3/hotel_reviews_with_sentiments.csv', index=False)

In [3]:
review_df = pd.read_csv('csv3/hotel_reviews_with_sentiments.csv')

In [4]:
vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
ngram_matrix = vectorizer.fit_transform(review_df['review'])
feature_names = vectorizer.get_feature_names_out()

In [5]:
feature_ngrams = {
    'cleanliness': {
        'positive': ['clean', 'very clean', 'perfectly clean', 'well maintained', 'spotless', 'tidy', 'very tidy', 'smells good'],
        'negative': ['not clean', 'dirty', 'very dirty', 'stain', 'poorly maintained', 'smells bad', 'stink', 'stunk']
    },
    'room': {
        'positive': ['spacious room', 'large room', 'comfortable bed', 'beautiful room', 'comfortable', 'big suite', 'huge room'],
        'negative': ['small room', 'tiny room', 'uncomfortable bed', 'outdated room', 'noisy room', 'uncomfortable']
    },
    'service': {
        'positive': ['friendly', 'good staff', 'good service', 'excellent staff', 'excellent service', 'helpful staff', 'nice service', 'excellent service', 'good assistance'],
        'negative': ['rude staff', 'rude', 'poor service', 'unhelpful', 'slow service']
    },
    'location': {
        'positive': ['great location', 'perfect location', 'convenient location', 'ideal location', 'central location', 'good location'],
        'negative': ['bad location', 'worst location', 'inconvenient location', 'unsafe area', 'unsafe', 'far away', 'far']
    },
    'value': {
        'positive': ['good value', 'worth', 'price performance', 'great price', 'affordable', 'worth every penny', 'reasonable price', 'not expensive'],
        'negative': ['overpriced', 'not worth the money', 'too expensive', 'poor value', 'rip off']
    },
    'safety': {
        'positive': ['secure parking', 'high security', 'safe', 'secure', 'safe and sound'],
        'negative': ['unsafe', 'dangerous', 'poor security', 'risky', 'not secure', 'not safe']
    },
    'comfort': {
        'positive': ['very comfortable', 'comfortable', 'relaxing', 'peaceful', 'quiet', 'hot shower'],
        'negative': ['uncomfortable', 'very uncomfortable', 'hard bed', 'uncomfortable chairs', 'noise', 'noisy', 'poor insulation']
    },
    'transportation': {
        'positive': ['close to subway', 'shuttle', 'near the airport', 'good transport links', 'ample parking', 'close to transportation', 'close to bus', 'close to station', 'close to metro', 'close to airport', 'near the bus', 'near the station', 'near the metro', 'walking distance'],
        'negative': ['far from subway', 'no shuttle', 'far from airport', 'poor transportatiton', 'limited parking', 'far away', 'far']
    },
    'noise': {
        'positive': ['quiet room', 'soundproof', 'no noise', 'peaceful', 'quiet', 'silent'],
        'negative': ['noisy room', 'loud neighbors', 'traffic noise', 'thin walls', 'can hear everything', 'high volume', 'noisy', 'noise']
    }
}


In [6]:
def expand_keywords_with_wordnet(keywords):
    expanded_keywords = set()
    for keyword in keywords:
        for synset in wn.synsets(keyword):
            expanded_keywords.update(lemma.name().replace('_', ' ') for lemma in synset.lemmas())
            expanded_keywords.update(lemma.name().replace('_', ' ') for hypernym in synset.hypernyms() for lemma in hypernym.lemmas())
            expanded_keywords.update(lemma.name().replace('_', ' ') for hyponym in synset.hyponyms() for lemma in hyponym.lemmas())
    return list(expanded_keywords)


In [7]:
for feature, sentiments in feature_ngrams.items():
    for sentiment_type, keywords in sentiments.items():
        expanded_keywords = expand_keywords_with_wordnet(keywords)
        feature_ngrams[feature][sentiment_type] = expanded_keywords

print(feature_ngrams)

{'cleanliness': {'positive': ['strip', 'goodly', 'plumb', 'spick-and-span', 'take', 'kempt', 'straighten out', 'alter', 'clean and jerk', 'uncontaminating', 'make clean', 'immaculate', 'order', 'groom', 'bathe', 'unobjectionable', 'healthy', 'sporting', 'spring-clean', 'straighten', 'clean', 'houseclean', 'hygienize', 'spic', 'lather', 'douche', 'tidy up', 'white', 'fresh', 'divest', 'do the dishes', 'sizeable', 'hygienise', 'wash', 'make up', 'weightlifting', 'neaten', 'bream', 'fair', 'sweep', 'light', 'neat', 'hefty', 'sizable', 'sanitise', 'change', 'plum', 'scavenge', 'wash up', 'spotless', 'clean house', 'withdraw', 'steam clean', 'weightlift', 'plume', 'clear', 'brush', 'spic-and-span', 'make', 'goodish', 'square away', 'receptacle', 'sanitize', 'floss', 'take away', 'disinfect', 'G.I.', 'dust', 'steam', 'remove', 'fairly', 'vacuum-clean', 'deprive', 'decontaminate', 'vacuum', 'sportsmanlike', 'bath', 'be', 'lave', 'pick', 'speckless', 'dry clean', 'uninfected', 'blank', 'sporty

In [8]:
def calculate_feature_sentiments(review_df, feature_ngrams):
    for feature in feature_ngrams:
        review_df[f'{feature}_positive_score'] = 0 # default values
        review_df[f'{feature}_negative_score'] = 0

    # Calculate scores for each review
    for index, row in review_df.iterrows():
        text = row['review'].lower()
        for feature, sentiments in feature_ngrams.items():
            positive_score = sum(text.count(ngram) for ngram in sentiments['positive'])
            negative_score = sum(text.count(ngram) for ngram in sentiments['negative'])
            review_df.at[index, f'{feature}_positive_score'] += positive_score
            review_df.at[index, f'{feature}_negative_score'] += negative_score

    return review_df

In [10]:
review_df = calculate_feature_sentiments(review_df, feature_ngrams)
print(review_df.head())

review_df.to_csv('csv3/hotel_reviews_with_feature_sentiments.csv', index=False)


      city                           hotel_name        date  \
0  beijing  china beijing aloft beijing haidian  2009-10-12   
1  beijing  china beijing aloft beijing haidian  2009-09-25   
2  beijing  china beijing aloft beijing haidian  2009-08-04   
3  beijing  china beijing aloft beijing haidian  2009-07-17   
4  beijing  china beijing aloft beijing haidian  2009-05-30   

                                              review  \
0  nice trendy hotel location not too bad\ti stay...   
1  great budget hotel\tstayed two nights at aloft...   
2  excellent value  location not a big problem\tw...   
3  stylish clean reasonable value poor location\t...   
4  remote but excellent value for money\tstayed t...   

                                              tokens  sentiment  \
0  ['nice', 'trendy', 'hotel', 'location', 'bad',...     0.9835   
1  ['great', 'budget', 'hotel', 'stayed', 'two', ...     0.9777   
2  ['excellent', 'value', 'location', 'big', 'pro...     0.9956   
3  ['stylish', '

In [11]:
reivew_df = pd.read_csv('csv3/hotel_reviews_with_feature_sentiments.csv')

In [12]:
# Ensure these columns exist before proceeding to use them
if set([f"{feature}_positive_score" for feature in feature_ngrams.keys()]).issubset(review_df.columns):
    X = review_df[[f"{feature}_positive_score" for feature in feature_ngrams.keys()]]
    # Proceed with further analysis
else:
    print("Some expected columns are missing from the DataFrame.")

In [13]:
if 'overall_sentiment' in review_df.columns:
    review_df['satisfaction'] = (review_df['overall_sentiment'] > 0.5).astype(int)  # This is just an example
else:
    print("Column 'overall_sentiment' also not found. Check data processing steps.")


Column 'overall_sentiment' also not found. Check data processing steps.


In [14]:
# Check if required columns are present
required_columns = [f'{feature}_positive_score' for feature in feature_ngrams.keys()] + \
                   [f'{feature}_negative_score' for feature in feature_ngrams.keys()]

if not set(required_columns).issubset(set(review_df.columns)):
    print("Required sentiment score columns are missing from DataFrame. Check previous calculations.")
else:
    # Calculate 'overall_sentiment' as the sum of all positive scores minus all negative scores
    review_df['overall_sentiment'] = review_df[[col for col in review_df.columns if 'positive_score' in col]].sum(axis=1) - \
                                     review_df[[col for col in review_df.columns if 'negative_score' in col]].sum(axis=1)
    
    # Now, create the 'satisfaction' column based on 'overall_sentiment'
    review_df['satisfaction'] = (review_df['overall_sentiment'] > 0).astype(int)
    print(review_df.head())  # Verify the new columns are added


      city                           hotel_name        date  \
0  beijing  china beijing aloft beijing haidian  2009-10-12   
1  beijing  china beijing aloft beijing haidian  2009-09-25   
2  beijing  china beijing aloft beijing haidian  2009-08-04   
3  beijing  china beijing aloft beijing haidian  2009-07-17   
4  beijing  china beijing aloft beijing haidian  2009-05-30   

                                              review  \
0  nice trendy hotel location not too bad\ti stay...   
1  great budget hotel\tstayed two nights at aloft...   
2  excellent value  location not a big problem\tw...   
3  stylish clean reasonable value poor location\t...   
4  remote but excellent value for money\tstayed t...   

                                              tokens  sentiment  \
0  ['nice', 'trendy', 'hotel', 'location', 'bad',...     0.9835   
1  ['great', 'budget', 'hotel', 'stayed', 'two', ...     0.9777   
2  ['excellent', 'value', 'location', 'big', 'pro...     0.9956   
3  ['stylish', '

In [15]:
# Assert to ensure the column exists after creation
assert 'overall_sentiment' in review_df.columns, "overall_sentiment column was not created."

In [16]:
# Assuming you have added the 'satisfaction' column with some logic
review_df['satisfaction'] = (review_df['overall_sentiment'] > 0).astype(int)

# Immediately check if the column has been added
print(review_df.head())  # This should show the 'satisfaction' column


      city                           hotel_name        date  \
0  beijing  china beijing aloft beijing haidian  2009-10-12   
1  beijing  china beijing aloft beijing haidian  2009-09-25   
2  beijing  china beijing aloft beijing haidian  2009-08-04   
3  beijing  china beijing aloft beijing haidian  2009-07-17   
4  beijing  china beijing aloft beijing haidian  2009-05-30   

                                              review  \
0  nice trendy hotel location not too bad\ti stay...   
1  great budget hotel\tstayed two nights at aloft...   
2  excellent value  location not a big problem\tw...   
3  stylish clean reasonable value poor location\t...   
4  remote but excellent value for money\tstayed t...   

                                              tokens  sentiment  \
0  ['nice', 'trendy', 'hotel', 'location', 'bad',...     0.9835   
1  ['great', 'budget', 'hotel', 'stayed', 'two', ...     0.9777   
2  ['excellent', 'value', 'location', 'big', 'pro...     0.9956   
3  ['stylish', '

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Assuming 'satisfaction' is now correctly added
y = review_df['satisfaction']  # Target variable

# Proceed with your machine learning setup or analysis
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Example model training and evaluation, assuming you have the rest of your ML setup correct
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.71      0.51      0.60       840
           1       0.90      0.95      0.93      3778

    accuracy                           0.87      4618
   macro avg       0.80      0.73      0.76      4618
weighted avg       0.86      0.87      0.87      4618



In [18]:
# Aggregating average scores for each feature by hotel
hotel_feature_scores = review_df.groupby('hotel_name').agg({
    'cleanliness_positive_score': 'mean',
    'cleanliness_negative_score': 'mean',
    'room_positive_score': 'mean',
    'room_negative_score': 'mean',
    'service_positive_score': 'mean',
    'service_negative_score': 'mean',
    'location_positive_score': 'mean',
    'location_negative_score': 'mean',
    'value_positive_score': 'mean',
    'value_negative_score': 'mean',
    'safety_positive_score': 'mean',
    'safety_negative_score': 'mean',
    'comfort_positive_score': 'mean',
    'comfort_negative_score': 'mean',
    'transportation_positive_score': 'mean',
    'transportation_negative_score': 'mean',
    'noise_positive_score': 'mean',
    'noise_negative_score': 'mean'
}).reset_index()

print(hotel_feature_scores.head())

                                    hotel_name  cleanliness_positive_score  \
0          china beijing aloft beijing haidian                    7.000000   
1                 china beijing ascott beijing                    9.571429   
2  china beijing autumn garden courtyard hotel                    5.166667   
3            china beijing bamboo garden hotel                    5.245283   
4         china beijing beijing century towers                    8.000000   

   cleanliness_negative_score  room_positive_score  room_negative_score  \
0                    5.000000             0.333333                  0.0   
1                    6.857143             0.392857                  0.0   
2                    2.500000             0.333333                  0.0   
3                    3.924528             0.433962                  0.0   
4                    6.000000             0.000000                  0.0   

   service_positive_score  service_negative_score  location_positive_score  \
0 

In [19]:
# Example: Detailed scores for a specific hotel
specific_hotel_scores = hotel_feature_scores[hotel_feature_scores['hotel_name'] == 'Hotel Name Here']
print(specific_hotel_scores)

# Or for a specific feature across all hotels
specific_feature_scores = review_df[['hotel_name', 'cleanliness_positive_score']]
print(specific_feature_scores.sort_values(by='cleanliness_positive_score', ascending=False))

Empty DataFrame
Columns: [hotel_name, cleanliness_positive_score, cleanliness_negative_score, room_positive_score, room_negative_score, service_positive_score, service_negative_score, location_positive_score, location_negative_score, value_positive_score, value_negative_score, safety_positive_score, safety_negative_score, comfort_positive_score, comfort_negative_score, transportation_positive_score, transportation_negative_score, noise_positive_score, noise_negative_score]
Index: []
                                              hotel_name  \
1665             china beijing hilton beijing wangfujing   
18099  usa illinois chicago sheraton chicago hotel an...   
18090  usa illinois chicago sheraton chicago hotel an...   
18866             usa illinois chicago swissotel chicago   
22723         usa illinois chicago w chicago city center   
...                                                  ...   
3746                 china beijing raffles beijing hotel   
3745                 china beiji

In [25]:
hotel_feature_scores.to_csv('csv3/hotel_feature_scores.csv', index=False)
hotel_feature_scores.to_excel('csv3/hotel_feature_scores.xlsx', engine='openpyxl', index=False)