In [27]:
import time
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the NLTK data packages used below: the stop-word corpus and the
# lexicon that the VADER analyzer needs.
for resource in ("stopwords", "vader_lexicon"):
    nltk.download(resource)

# Single shared analyzer instance; calc_sentiment() reads this module-level name.
vader_sentiment = SentimentIntensityAnalyzer()

# Define sentiment calculation function
def calc_sentiment(review):
    """Return the VADER compound score for *review*.

    The placeholder texts "No Negative" and "No Positive" are scored as 0
    without consulting the analyzer. The match is exact, so a placeholder
    must reach this function unmodified to be recognised.

    Args:
        review: Review text to score.

    Returns:
        0 for a placeholder, otherwise the "compound" entry of
        ``vader_sentiment.polarity_scores(review)``.
    """
    placeholders = ("No Negative", "No Positive")
    if review in placeholders:
        return 0
    scores = vader_sentiment.polarity_scores(review)
    return scores["compound"]

# Read the filtered hotel reviews; the relative path is resolved against the
# current working directory.
print("Loading CSV file...")
df = pd.read_csv("Hotel_Reviews_Filtered.csv")

# Start the timer, then build the stop-word lookup once as a set so that
# remove_stopwords() gets constant-time membership tests.
print("Removing stop words...")
start = time.time()
cache = {*stopwords.words("english")}

def remove_stopwords(review):
    """Return *review* with English stop words removed.

    The value is converted with ``str()``, split on whitespace, and every
    word whose lowercase form is in the module-level ``cache`` set is
    dropped. Remaining words are re-joined with single spaces.

    The placeholder texts "No Negative" and "No Positive" are returned
    untouched. "no" is itself an English stop word, so filtering would
    reduce them to "Negative" / "Positive"; calc_sentiment() would then no
    longer recognise them and VADER would score them as genuine sentiment.

    Args:
        review: Review text (any value; it is stringified first).

    Returns:
        The filtered text, or the placeholder itself when *review* is one.
    """
    text = str(review).strip()
    # Preserve placeholders so the downstream exact-match check still works.
    if text in ("No Negative", "No Positive"):
        return text
    return " ".join(word for word in text.split() if word.lower() not in cache)

# Strip stop words from both review columns, overwriting them in place.
for review_column in ("Negative_Review", "Positive_Review"):
    df[review_column] = df[review_column].apply(remove_stopwords)

elapsed = round(time.time() - start, 2)
print(f"Stop word removal completed in {elapsed} seconds.")

# Score each cleaned review column and store the result in a new column.
print("Calculating sentiment columns for both positive and negative reviews...")
start = time.time()
sentiment_sources = {
    "Negative_Sentiment": "Negative_Review",
    "Positive_Sentiment": "Positive_Review",
}
for sentiment_column, review_column in sentiment_sources.items():
    df[sentiment_column] = df[review_column].apply(calc_sentiment)
elapsed = round(time.time() - start, 2)
print(f"Sentiment calculation completed in {elapsed} seconds.")

# Preview the lowest-scoring rows for each sentiment column. Each sort
# replaces df, so the frame written out below is ordered by
# Positive_Sentiment (ascending).
df = df.sort_values(by="Negative_Sentiment")
print(df[["Negative_Review", "Negative_Sentiment"]].head())

df = df.sort_values(by="Positive_Sentiment")
print(df[["Positive_Review", "Positive_Sentiment"]].head())

# Put the columns in a reader-friendly order. reindex keeps only the columns
# listed here; any listed column that is absent is created filled with NaN.
columns_order = [
    "Hotel_Name",
    "Hotel_Address",
    "Total_Number_of_Reviews",
    "Average_Score",
    "Reviewer_Score",
    "Negative_Sentiment",
    "Positive_Sentiment",
    "Reviewer_Nationality",
    "Leisure_trip",
    "Couple",
    "Solo_traveler",
    "Business_trip",
    "Group",
    "Family_with_young_children",
    "Family_with_older_children",
    "With_a_pet",
    "Negative_Review",
    "Positive_Review",
]
df = df.reindex(columns=columns_order)

# Write the result next to the input, without the row index.
print("Saving results to Hotel_Reviews_NLP.csv...")
df.to_csv("Hotel_Reviews_NLP.csv", index=False)
print("Done.")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lacuesta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Lacuesta\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Loading CSV file...
Removing stop words...
Stop word removal completed in 4.69 seconds.
Calculating sentiment columns for both positive and negative reviews...
Sentiment calculation completed in 145.38 seconds.
                                          Negative_Review  Negative_Sentiment
186584  bad experience memories hotel first night arri...             -0.9910
307286  staff bad experience even booking January arri...             -0.9889
129503  First charged twice room booked booking second...             -0.9886
201293  usually traveling Paris 2 3 times year busines...             -0.9869
172207  1 rooms dirty dusty 2 toilet drain stank badly...             -0.9869
                                          Positive_Review  Positive_Sentiment
137893  Bathroom Shower going stay twice hotel 2 night...             -0.9802
5839    completely disappointed mad since reception st...             -0.9780
64158   get everything extra internet parking breakfas...             -0.9751
489137  r