In [48]:
import sys
!{sys.executable} -m pip install spacy
!{sys.executable} -m spacy download en_core_web_sm


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 7.5 MB/s eta 0:00:01
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Improved Hotel Review Summarizer

This notebook demonstrates best practices for extracting insights from hotel reviews, including data cleaning, chunking, and summarization using modern LLM techniques.


In [57]:
# Cell 2: Import necessary libraries
# This cell imports libraries for data handling, text cleaning, and summarization.

import pandas as pd
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline

print("All libraries imported successfully.")


All libraries imported successfully.


## Load and Preview Data

This cell loads the hotel reviews from the CSV file and displays a sample to validate that the data is loaded correctly.


In [58]:
# Cell 4: Load the hotel reviews CSV and display a sample

df = pd.read_csv("tripadvisor_hotel_reviews.csv")
display(df.head())
print(f"Loaded {len(df)} reviews. Columns: {df.columns.tolist()}")


Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


Loaded 20491 reviews. Columns: ['Review', 'Rating']


## Data Cleaning and Preprocessing

This cell will clean the review text by removing extra spaces, fixing punctuation, and standardizing the format. It will also drop any rows with missing or empty reviews. The cleaned data will be displayed for validation.


In [59]:
# Cell 6: Clean and preprocess the review text

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\.(?=[^\s])', '. ', text)  # Ensure space after periods
    text = text.strip()
    return text

# Assume the review column is named 'Review'. Adjust if needed.
df = df.dropna(subset=['Review'])
df['Cleaned_Review'] = df['Review'].apply(clean_text)
df = df[df['Cleaned_Review'].str.strip() != '']
display(df[['Cleaned_Review']].head())
print(f"After cleaning: {len(df)} reviews remain.")


Unnamed: 0,Cleaned_Review
0,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso..."


After cleaning: 20491 reviews remain.


## Data Cleaning Validation

This cell will analyze the cleaned reviews for common issues such as:
- Non-ASCII characters
- Excessive punctuation or special symbols
- Very short or very long reviews
- Repeated characters or words

It will display examples if such issues are found, so we can decide if further cleaning is needed.


In [60]:
# Cell 8: Analyze cleaned reviews for further cleaning needs

# Find reviews with non-ASCII characters
non_ascii = df[df['Cleaned_Review'].apply(lambda x: any(ord(c) > 127 for c in x))]

# Find reviews with excessive punctuation or special symbols
import string
excessive_punct = df[df['Cleaned_Review'].apply(lambda x: sum(1 for c in x if c in string.punctuation) > 10)]

# Find very short or very long reviews
short_reviews = df[df['Cleaned_Review'].str.len() < 20]
long_reviews = df[df['Cleaned_Review'].str.len() > 1000]

# Find repeated characters (e.g., '!!!!', 'soooo')
repeated_chars = df[df['Cleaned_Review'].str.contains(r'(.)\1{3,}', regex=True)]

print(f"Non-ASCII reviews: {len(non_ascii)}")
if not non_ascii.empty:
    display(non_ascii[['Cleaned_Review']].head())

print(f"Reviews with excessive punctuation: {len(excessive_punct)}")
if not excessive_punct.empty:
    display(excessive_punct[['Cleaned_Review']].head())

print(f"Very short reviews: {len(short_reviews)}")
if not short_reviews.empty:
    display(short_reviews[['Cleaned_Review']].head())

print(f"Very long reviews: {len(long_reviews)}")
if not long_reviews.empty:
    display(long_reviews[['Cleaned_Review']].head())

print(f"Reviews with repeated characters: {len(repeated_chars)}")
if not repeated_chars.empty:
    display(repeated_chars[['Cleaned_Review']].head())


  repeated_chars = df[df['Cleaned_Review'].str.contains(r'(.)\1{3,}', regex=True)]


Non-ASCII reviews: 923


Unnamed: 0,Cleaned_Review
15,horrible customer service hotel stay february ...
20,service service service spent week g-friend la...
67,ace not place husband stayed ace hotel seattle...
384,not good respect frequent business traveler se...
400,"great stay, got decent price standard room kin..."


Reviews with excessive punctuation: 11723


Unnamed: 0,Cleaned_Review
0,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso..."


Very short reviews: 0
Very long reviews: 3936


Unnamed: 0,Cleaned_Review
1,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...
4,"great stay great stay, went seahawk game aweso..."
15,horrible customer service hotel stay february ...
16,disappointed say anticipating stay hotel monac...


Reviews with repeated characters: 244


Unnamed: 0,Cleaned_Review
19,hmmmmm say really high hopes hotel monaco chos...
25,"n't mind noise place great, read reviews noise..."
55,"nice place, lunatic 20000+ miles flying space ..."
177,"great deal priceline bid win, live vancouver b..."
420,"brrrrrrrrrrrrrrr coat, need hotel good rating ..."


In [None]:
# Cell 9: Remove non-ASCII characters and words with repeated characters
def remove_non_ascii(text):
    return ''.join([c if ord(c) < 128 else '' for c in text])

def remove_repeated_words(text):
    # Remove words with any character repeated 3 or more times in a row
    return ' '.join([w for w in text.split() if not re.search(r'(.)\1{2,}', w)])

df['Cleaned_Review'] = df['Cleaned_Review'].apply(remove_non_ascii)
df['Cleaned_Review'] = df['Cleaned_Review'].apply(remove_repeated_words)

display(df[['Cleaned_Review']].head(10))
print(f"After further cleaning: {len(df)} reviews remain.")


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m


Unnamed: 0,Cleaned_Review
0,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso..."
5,love monaco staff husband stayed hotel crazy w...
6,"cozy stay rainy city, husband spent 7 nights m..."
7,"excellent staff, housekeeping quality hotel ch..."
8,"hotel stayed hotel monaco cruise, rooms genero..."
9,excellent stayed hotel monaco past w/e delight...


After further cleaning: 20491 reviews remain.


## Further Cleaning: Remove Non-ASCII Characters and Words with Repeated Characters

This cell will:
- Remove any non-ASCII characters from the reviews (but keep the review itself)
- Remove words that contain repeated characters (e.g., 'soooo', '!!!!')

The cleaned results will be displayed for validation.


In [62]:
# Cell 10: Remove non-ASCII characters and words with repeated characters
def remove_non_ascii(text):
    return ''.join([c if ord(c) < 128 else '' for c in text])

def remove_repeated_words(text):
    # Remove words with any character repeated 3 or more times in a row
    return ' '.join([w for w in text.split() if not re.search(r'(.)\1{2,}', w)])

df['Cleaned_Review'] = df['Cleaned_Review'].apply(remove_non_ascii)
df['Cleaned_Review'] = df['Cleaned_Review'].apply(remove_repeated_words)

display(df[['Cleaned_Review']].head(10))
print(f"After further cleaning: {len(df)} reviews remain.")


Unnamed: 0,Cleaned_Review
0,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso..."
5,love monaco staff husband stayed hotel crazy w...
6,"cozy stay rainy city, husband spent 7 nights m..."
7,"excellent staff, housekeeping quality hotel ch..."
8,"hotel stayed hotel monaco cruise, rooms genero..."
9,excellent stayed hotel monaco past w/e delight...


After further cleaning: 20491 reviews remain.


## Sentiment-Based Chunking

From this point, the notebook will process positive and negative reviews separately. All chunking, summarization, and similarity analysis will be performed on these two groups independently to enable clear comparison between positive and negative sentiment.


In [63]:
# Split reviews by sentiment
positive_df = df[df['Rating'] >= 4]
negative_df = df[df['Rating'] <= 3]

print(f"Positive reviews: {len(positive_df)}")
print(f"Negative reviews: {len(negative_df)}")


Positive reviews: 15093
Negative reviews: 5398


In [64]:
# First, define the functions we need
import spacy
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 20_000_000  # Increase max length to handle large texts

def spacy_sent_tokenize(text):
    return [sent.text for sent in nlp(text).sents]

def group_sentences(sentences, max_chars=1200, overlap=150):
    chunks = []
    current_chunk = ""
    for sent in sentences:
        if len(current_chunk) + len(sent) + 1 <= max_chars:
            current_chunk += " " + sent if current_chunk else sent
        else:
            chunks.append(current_chunk)
            # Overlap: take last N chars from previous chunk
            if overlap > 0 and len(current_chunk) > overlap:
                current_chunk = current_chunk[-overlap:] + " " + sent
            else:
                current_chunk = sent
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

# Tokenize and chunk positive reviews
all_pos_sentences = []
for review in positive_df['Cleaned_Review']:
    all_pos_sentences.extend(spacy_sent_tokenize(review))
positive_chunks = group_sentences(all_pos_sentences, max_chars=1200, overlap=150)

print(f"Positive chunks: {len(positive_chunks)}")
print("\nSample positive chunk:\n", positive_chunks[0][:300], "...\n")


Positive chunks: 11843

Sample positive chunk:
 nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music roo ...



In [69]:
# Tokenize and chunk negative reviews
all_neg_sentences = []
for review in negative_df['Cleaned_Review']:
    all_neg_sentences.extend(spacy_sent_tokenize(review))
negative_chunks = group_sentences(all_neg_sentences, max_chars=1200, overlap=150)

print(f"Negative chunks: {len(negative_chunks)}")
print("\nSample negative chunk:\n", negative_chunks[0][:300], "...\n")


Negative chunks: 4946

Sample negative chunk:
 ok nothing special charge diamond member hilton decided chain shot 20th anniversary seattle, start booked suite paid extra website description not, suite bedroom bathroom standard hotel room, took printed reservation desk showed said things like tv couch ect desk clerk told oh mixed suites descripti ...



In [70]:
# (Optional) Print statistics for both sets of chunks
def print_chunk_stats(chunks, label):
    lengths = [len(chunk) for chunk in chunks]
    print(f"{label} - Min: {min(lengths)}, Max: {max(lengths)}, Avg: {sum(lengths)//len(lengths)}")

print_chunk_stats(positive_chunks, "Positive chunks")
print_chunk_stats(negative_chunks, "Negative chunks")


Positive chunks - Min: 158, Max: 5262, Avg: 1034
Negative chunks - Min: 191, Max: 4583, Avg: 1029


## Setup LangChain with FLAN-T5

We'll use LangChain's HuggingFace integration to work with FLAN-T5 for generating insights from reviews.


In [77]:
# Install LangChain
import sys
!{sys.executable} -m pip install langchain langchain-community langchain-huggingface

# Import LangChain components
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# Check if GPU is available
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'GPU (cuda)' if device == 0 else 'CPU'}")

# Load FLAN-T5 model
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Create HuggingFace pipeline
hf_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    device=device,
    temperature=0.7,
    top_p=0.9
)

# Wrap in LangChain
llm = HuggingFacePipeline(pipeline=hf_pipeline)

print("✓ LangChain with FLAN-T5 loaded successfully!")


Defaulting to user installation because normal site-packages is not writeable
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.1
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Using device: CPU


Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✓ LangChain with FLAN-T5 loaded successfully!


## Generate Insights from Positive Reviews

Use FLAN-T5 with custom prompts to extract key insights and themes from positive review chunks.


In [79]:
# Create LangChain prompt template and chain for positive reviews
from tqdm import tqdm

# Define prompt template for positive reviews
positive_template = """Based on these hotel reviews, list the main positive aspects that guests appreciate:

{text}

Key positive aspects:"""

positive_prompt = PromptTemplate(
    input_variables=["text"],
    template=positive_template
)

# Create LangChain chain
positive_chain = LLMChain(llm=llm, prompt=positive_prompt)

# Generate insights for positive chunks
def generate_insights_with_langchain(chain, chunks, max_chunks=50):
    """
    Generate insights using LangChain.
    """
    insights = []
    chunks_to_process = chunks[:max_chunks]
    
    print(f"Generating insights for {len(chunks_to_process)} chunks...")
    
    for chunk in tqdm(chunks_to_process):
        try:
            result = chain.run(text=chunk)
            insights.append(result)
        except Exception as e:
            print(f"Error processing chunk: {e}")
            insights.append("Error generating insight")
    
    return insights

positive_insights = generate_insights_with_langchain(
    positive_chain,
    positive_chunks,
    max_chunks=50
)

print(f"\n✓ Generated {len(positive_insights)} insights for positive reviews")
print(f"\nSample positive insight:\n{positive_insights[0]}")


  positive_chain = LLMChain(llm=llm, prompt=positive_prompt)


Generating insights for 50 chunks...


  result = chain.run(text=chunk)
100%|██████████| 50/50 [10:56<00:00, 13.13s/it]


✓ Generated 50 insights for positive reviews

Sample positive insight:
nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping area, pet friendly room showed no signs animal hair smells, monaco suite sleeping area big striped curtains pulled closed nice touch felt cosy goldfish named brandi enjoyed, did n't partake free wine coffee/tea





## Generate Insights from Negative Reviews

Use FLAN-T5 to extract key complaints and issues from negative review chunks.


In [None]:
# Create LangChain prompt template and chain for negative reviews
negative_template = """Based on these hotel reviews, list the main complaints and issues that guests mentioned:

{text}

Key complaints and issues:"""

negative_prompt = PromptTemplate(
    input_variables=["text"],
    template=negative_template
)

# Create LangChain chain for negative reviews
negative_chain = LLMChain(llm=llm, prompt=negative_prompt)

# Generate insights for negative chunks
negative_insights = generate_insights_with_langchain(
    negative_chain,
    negative_chunks,
    max_chunks=50
)

print(f"\n✓ Generated {len(negative_insights)} insights for negative reviews")
print(f"\nSample negative insight:\n{negative_insights[0]}")


Summarizing 50 chunks...


 14%|█▍        | 1/7 [00:36<03:37, 36.21s/it]Your max_length is set to 150, but your input_length is only 123. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 150, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
 29%|██▊       | 2/7 [01:19<03:21, 40.40s/it]Your max_length is set to 150, but your input_length is only 145. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=72)
Your max_length is set to 150, but your input_length is only 83. Since this is a summarization task, where outputs shorter than the input are typically wanted, 


✓ Generated 50 summaries for negative reviews

Sample negative summary:
ok nothing special charge diamond member hilton decided chain shot 20th anniversary seattle, start booked suite paid extra website description not, suite bedroom bathroom standard hotel room, took printed reservation desk showed said things like tv couch ect desk clerk told oh mixed suites description





## Extract Key Themes Using Topic Modeling

Use keyword extraction to identify the most common themes in positive and negative reviews.


In [74]:
# Install required packages for keyword extraction
import sys
!{sys.executable} -m pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [75]:
# Extract key themes using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

def extract_key_themes(insights, top_n=15):
    """
    Extract key themes from insights using TF-IDF.
    """
    # Combine all insights into one corpus
    corpus = ' '.join(insights)
    
    # Use TF-IDF to extract important words
    vectorizer = TfidfVectorizer(
        max_features=100,
        stop_words='english',
        ngram_range=(1, 2),  # Include both single words and bigrams
        min_df=2
    )
    
    tfidf_matrix = vectorizer.fit_transform(insights)
    feature_names = vectorizer.get_feature_names_out()
    
    # Get average TF-IDF scores
    scores = tfidf_matrix.sum(axis=0).A1
    word_scores = dict(zip(feature_names, scores))
    
    # Sort by score
    sorted_themes = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_themes[:top_n]

# Extract themes from positive reviews
positive_themes = extract_key_themes(positive_insights, top_n=15)
print("Top Positive Review Themes:")
print("-" * 50)
for theme, score in positive_themes:
    print(f"  • {theme}: {score:.2f}")

print("\n")

# Extract themes from negative reviews
negative_themes = extract_key_themes(negative_insights, top_n=15)
print("Top Negative Review Themes:")
print("-" * 50)
for theme, score in negative_themes:
    print(f"  • {theme}: {score:.2f}")


Top Positive Review Themes:
--------------------------------------------------
  • hotel: 6.46
  • great: 5.92
  • good: 4.96
  • place: 4.80
  • room: 4.77
  • stay: 4.47
  • seattle: 4.43
  • location: 4.06
  • staff: 3.64
  • stayed: 3.61
  • nice: 3.53
  • friendly: 3.29
  • view: 3.26
  • service: 2.84
  • staff friendly: 2.75


Top Negative Review Themes:
--------------------------------------------------
  • hotel: 6.80
  • room: 5.98
  • stay: 3.99
  • stayed: 3.33
  • good: 3.29
  • place: 3.16
  • staff: 3.12
  • seattle: 2.78
  • bed: 2.76
  • night: 2.56
  • city: 2.47
  • just: 2.43
  • parking: 2.41
  • did: 2.33
  • desk: 2.32
