# Preparation

In [9]:
# import packages
import openai
import json
from IPython.display import Image, display
from openai import OpenAI
import pandas as pd
import re
from collections import defaultdict
from pydantic import BaseModel
from typing import List
import numpy as np
import os
# Read the OpenAI API key from the environment so it never appears in the notebook
openai_api_key = os.environ.get("OPENAI_API_KEY")
# Shared API client used by the summarization cells below
client = OpenAI(api_key=openai_api_key)

# Summaries

In [64]:
# Subset of the general review table
reviews_general = pd.read_csv("reviews_general_selected.csv")

# Sentences extracted per category for a subset of the reviews
categorized_sentences = pd.read_csv("extracted_category_subset.csv")

# Inner join on review_id: only reviews that appear in the categorized subset are kept
reviews_subcategories = reviews_general.merge(categorized_sentences, on="review_id", how="inner")
# Keep the two identifiers plus the four category columns (food, service, atmosphere, price).
# NOTE(review): 'review_text' is not kept here, yet the overall-summary cells read
# reviews_df['review_text'] from frames derived from this one — confirm the intended input.
reviews_subcategories = reviews_subcategories.loc[
    :,
    ["review_id", "restaurant_id", "food_sentences", "service_sentences", "atmosphere_sentences", "price_sentences"],
]
print(reviews_subcategories.head())

   review_id                restaurant_id  \
0       4471  ChIJ_VWb4xn6mUcRH4NujtHMKJI   
1       4472  ChIJ_VWb4xn6mUcRH4NujtHMKJI   
2       4473  ChIJ_VWb4xn6mUcRH4NujtHMKJI   
3       4474  ChIJ_VWb4xn6mUcRH4NujtHMKJI   
4       4475  ChIJ_VWb4xn6mUcRH4NujtHMKJI   

                                      food_sentences  \
0  Aber das das was ich hier erleben durfte war d...   
1  Der Kaffee hat gut geschmeckt und der Muffin w...   
2  Die Qualität lässt sehr zu wünschen übrig sieh...   
3                                                NaN   
4                                                NaN   

                                   service_sentences  \
0       Ich fühle mich echt vor den Kopf gestoßen!!!   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4  Dame mit kurzen weißen Haaren hat mich nicht b...   

                                atmosphere_sentences  \

In [65]:
# Basic information about the restaurants
restaurant_basics = pd.read_csv("API_basics.csv")

# Restrict to restaurants whose id occurs in reviews_subcategories['restaurant_id']
restaurant_basics = restaurant_basics.loc[
    restaurant_basics["restaurant_id"].isin(reviews_subcategories["restaurant_id"])
]
print(restaurant_basics.head())

                 restaurant_id  city_id  \
0  ChIJ_VWb4xn6mUcRH4NujtHMKJI        0   
1  ChIJo5EYOK_4mUcRi4shjNiEDUc        0   
2  ChIJ_VfiMxj6mUcRRK_QBdxww7g        0   
3  ChIJT4FlA7zwmUcRcrmR1JIlwm8        0   
5  ChIJ5dhz-EXxmUcRuZTn4wsBpQs        0   

                                                name        primary_type  \
0                                Café Kult Dußlingen                cafe   
1                                          Boxenstop                 bar   
2                                         Alte Krone  italian_restaurant   
3                                       Nazar Imbiss          restaurant   
5  Italienisches Restaurant in Gomaringen Trattor...  italian_restaurant   

                                               types business_status  \
0  ['cafe', 'breakfast_restaurant', 'coffee_shop'...     OPERATIONAL   
1  ['bar', 'restaurant', 'point_of_interest', 'fo...     OPERATIONAL   
2  ['italian_restaurant', 'liquor_store', 'restau...     OPERATIONAL

In [33]:
# Start with a small sample: the first five distinct restaurant ids
restaurant_ids = reviews_subcategories["restaurant_id"].unique()[:5]
# Reviews and restaurant metadata restricted to that sample
filtered_reviews = reviews_subcategories.loc[reviews_subcategories["restaurant_id"].isin(restaurant_ids)]
filtered_restaurants = restaurant_basics.loc[restaurant_basics["restaurant_id"].isin(restaurant_ids)]

### 1. Overall summary

In [21]:
# Prompt template for the overall restaurant summary; `{reviews}` is filled in via str.format.
# Adjacent string fragments are concatenated verbatim, so each sentence must end with its own
# trailing space (the fragment ending in "tone of the reviews." used to run into "Return").
overall_summary_prompt = (
    "You are an expert summarizer specializing in restaurant reviews. Summarize the following reviews "
    "for the restaurant in a concise and informative manner. Be sure to include the overall tone of the reviews. "
    "Return the summary in plain text.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Function to summarize a chunk of reviews
def summarize_reviews_chunk(reviews_chunk):
    """Summarize one block of review text with the chat model.

    Args:
        reviews_chunk: Review text inserted into `overall_summary_prompt`.

    Returns:
        The stripped summary text, or None when the request failed for any
        reason other than the context window being exceeded.

    Raises:
        Exception: The original API error is re-raised when its message
            contains "context_length_exceeded", so the caller can split the input.
    """
    summary_prompt = overall_summary_prompt.format(reviews=reviews_chunk)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": summary_prompt},
            ],
            max_tokens=300,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        if "context_length_exceeded" in str(e):
            # Swallowing this error would make the caller's split-and-retry fallback unreachable
            raise
        print(f"Error summarizing reviews chunk: {e}")
        return None


# Function to summarize reviews for a single restaurant
def summarize_reviews_for_restaurant(restaurant_id, reviews_df):
    """Summarize all reviews of one restaurant, splitting once if the prompt is too long.

    Args:
        restaurant_id: Value matched against reviews_df['restaurant_id'].
        reviews_df: DataFrame with 'restaurant_id' and 'review_text' columns.

    Returns:
        The summary text. When the input had to be split and one half failed, the
        available half summary joined with an error placeholder. None when there
        are no reviews, a single review is too long to split, or the request failed.
    """
    # Drop missing texts and coerce to str so "\n".join cannot fail on NaN values
    restaurant_reviews = (
        reviews_df.loc[reviews_df['restaurant_id'] == restaurant_id, 'review_text']
        .dropna()
        .astype(str)
        .tolist()
    )
    if not restaurant_reviews:
        print(f"No reviews found for restaurant {restaurant_id}.")
        return None

    def summarize_half(half_reviews):
        # A half can still be too long; report it instead of raising out of the fallback
        try:
            return summarize_reviews_chunk("\n".join(half_reviews))
        except Exception as half_error:
            print(f"Error summarizing reviews chunk: {half_error}")
            return None

    try:
        # Attempt to summarize all reviews at once
        return summarize_reviews_chunk("\n".join(restaurant_reviews))
    except Exception as e:
        if "context_length_exceeded" not in str(e):
            print(f"Error summarizing reviews for restaurant {restaurant_id}: {e}")
            return None

    print(f"Context length exceeded for restaurant {restaurant_id}. Splitting reviews...")
    if len(restaurant_reviews) < 2:
        # A single oversized review cannot be split into two non-empty halves
        return None

    mid_point = len(restaurant_reviews) // 2
    first_summary = summarize_half(restaurant_reviews[:mid_point])
    second_summary = summarize_half(restaurant_reviews[mid_point:])

    if not (first_summary and second_summary):
        return f"{first_summary or 'Error in first half'}\n\n{second_summary or 'Error in second half'}"

    # Merge the two partial summaries with a second model call
    combined_prompt = (
        "You are an expert summarizer specializing in restaurant reviews. "
        "Combine the following two summaries into a single cohesive and informative summary:\n\n"
        "Summary 1:\n{summary1}\n\n"
        "Summary 2:\n{summary2}\n\n"
        "Combined Summary:"
    ).format(summary1=first_summary, summary2=second_summary)

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": combined_prompt},
            ],
            max_tokens=300,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as combine_error:
        print(f"Error combining summaries for restaurant {restaurant_id}: {combine_error}")
        return f"{first_summary}"  # Fall back to the first half's summary if combining fails

# Collect one summary record per restaurant (turned into a DataFrame below)
summaries = []

for restaurant_id in restaurant_ids:
    print(f"Processing restaurant_id: {restaurant_id}")
    summary = summarize_reviews_for_restaurant(restaurant_id, filtered_reviews)
    summaries.append({
        "restaurant_id": restaurant_id,
        "summary": summary
    })
    print(f"Summary for restaurant_id {restaurant_id}:\n{summary}\n")

# Explicit columns keep the merge key present even when no restaurant was processed
summaries_df = pd.DataFrame(summaries, columns=["restaurant_id", "summary"])
print(summaries_df)
# Attach the summaries to filtered_restaurants by restaurant_id. A 'summary' column left over
# from an earlier run of this cell is dropped first; otherwise the merge would produce
# summary_x / summary_y and every re-run would add more columns.
filtered_restaurants = pd.merge(
    filtered_restaurants.drop(columns=["summary"], errors="ignore"),
    summaries_df,
    on="restaurant_id",
    how="inner",
)

Processing restaurant_id: ChIJ_VWb4xn6mUcRH4NujtHMKJI
Summary for restaurant_id ChIJ_VWb4xn6mUcRH4NujtHMKJI:
The reviews for the café highlight a mix of positive and negative experiences. Many patrons appreciate the cozy ambiance, good coffee, and fresh backwaren (baked goods), with several mentioning the enjoyable breakfast options and friendly staff. The café's location is praised for its outdoor seating and proximity to the town square, making it a nice spot for relaxation. However, several complaints arise regarding inconsistent service quality and some dissatisfaction with the freshness of certain baked items, including issues with the quality of pretzels and other pastries. A few customers noted unprofessional behavior from staff, which detracted from their experience. Overall, the tone of the reviews is varied, combining enthusiasm for the café's offerings with critical feedback on service and product quality.

Processing restaurant_id: ChIJo5EYOK_4mUcRi4shjNiEDUc
Summary for re

#### prompt adjustments
- ensure the summary is written in English
- only one shorter paragraph is written (around 200 characters)
- the name of the restaurant which can be found in filtered_restaurants['name'] is included

In [31]:
# Prompt template, second iteration: English output, one short paragraph of about 200
# characters, restaurant name mentioned. Placeholders: {restaurant_name} and {reviews}.
overall_summary_prompt = "".join([
    "You are an expert summarizer specializing in restaurant reviews. Summarize the following reviews ",
    "for the restaurant '{restaurant_name}' in English. The summary should be concise, written in one short paragraph, ",
    "and limited to around 200 characters. Be sure to include the overall tone of the reviews and mention the restaurant name explicitly.\n\n",
    "Reviews:\n{reviews}\n\n",
    "Summary:",
])

# Function to summarize a chunk of reviews
def summarize_reviews_chunk(reviews_chunk, restaurant_name):
    """Summarize one block of review text for a named restaurant.

    Args:
        reviews_chunk: Review text inserted into `overall_summary_prompt`.
        restaurant_name: Name inserted into the prompt.

    Returns:
        The stripped summary text, or None when the request failed for any
        reason other than the context window being exceeded.

    Raises:
        Exception: The original API error is re-raised when its message
            contains "context_length_exceeded", so the caller can split the input.
    """
    summary_prompt = overall_summary_prompt.format(reviews=reviews_chunk, restaurant_name=restaurant_name)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": summary_prompt},
            ],
            max_tokens=100,  # Limit tokens for brevity
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        if "context_length_exceeded" in str(e):
            # Swallowing this error would make the caller's split-and-retry fallback unreachable
            raise
        print(f"Error summarizing reviews chunk: {e}")
        return None


# Function to summarize reviews for a single restaurant
def summarize_reviews_for_restaurant(restaurant_id, reviews_df, restaurant_name):
    """Summarize all reviews of one restaurant, splitting once if the prompt is too long.

    Args:
        restaurant_id: Value matched against reviews_df['restaurant_id'].
        reviews_df: DataFrame with 'restaurant_id' and 'review_text' columns.
        restaurant_name: Name passed on to the prompts.

    Returns:
        The summary text. When the input had to be split and one half failed, the
        available half summary joined with an error placeholder. None when there
        are no reviews, a single review is too long to split, or the request failed.
    """
    # Drop missing texts and coerce to str so "\n".join cannot fail on NaN values
    restaurant_reviews = (
        reviews_df.loc[reviews_df['restaurant_id'] == restaurant_id, 'review_text']
        .dropna()
        .astype(str)
        .tolist()
    )
    if not restaurant_reviews:
        print(f"No reviews found for restaurant {restaurant_id}.")
        return None

    def summarize_half(half_reviews):
        # A half can still be too long; report it instead of raising out of the fallback
        try:
            return summarize_reviews_chunk("\n".join(half_reviews), restaurant_name)
        except Exception as half_error:
            print(f"Error summarizing reviews chunk: {half_error}")
            return None

    try:
        # Attempt to summarize all reviews at once
        return summarize_reviews_chunk("\n".join(restaurant_reviews), restaurant_name)
    except Exception as e:
        if "context_length_exceeded" not in str(e):
            print(f"Error summarizing reviews for restaurant {restaurant_id}: {e}")
            return None

    print(f"Context length exceeded for restaurant {restaurant_id}. Splitting reviews...")
    if len(restaurant_reviews) < 2:
        # A single oversized review cannot be split into two non-empty halves
        return None

    mid_point = len(restaurant_reviews) // 2
    first_summary = summarize_half(restaurant_reviews[:mid_point])
    second_summary = summarize_half(restaurant_reviews[mid_point:])

    if not (first_summary and second_summary):
        return f"{first_summary or 'Error in first half'}\n\n{second_summary or 'Error in second half'}"

    # Merge the two partial summaries with a second model call
    combined_prompt = (
        "You are an expert summarizer specializing in restaurant reviews. "
        "Combine the following two summaries into a single concise and cohesive summary in English. "
        "The summary should mention the restaurant name ('{restaurant_name}') explicitly and "
        "be limited to around 200 characters:\n\n"
        "Summary 1:\n{summary1}\n\n"
        "Summary 2:\n{summary2}\n\n"
        "Combined Summary:"
    ).format(restaurant_name=restaurant_name, summary1=first_summary, summary2=second_summary)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": combined_prompt},
            ],
            max_tokens=100,  # Limit tokens for brevity
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as combine_error:
        print(f"Error combining summaries for restaurant {restaurant_id}: {combine_error}")
        return f"{first_summary}"  # Fall back to the first half's summary if combining fails

# Collect one summary record per restaurant (turned into a DataFrame below)
summaries = []

for _, row in filtered_restaurants.iterrows():
    restaurant_id = row['restaurant_id']
    restaurant_name = row['name']
    print(f"Processing restaurant_id: {restaurant_id} ({restaurant_name})")
    summary = summarize_reviews_for_restaurant(restaurant_id, filtered_reviews, restaurant_name)
    summaries.append({
        "restaurant_id": restaurant_id,
        "summary": summary
    })
    print(f"Summary for restaurant_id {restaurant_id} ({restaurant_name}):\n{summary}\n")

# Explicit columns keep the merge key present even when no restaurant was processed
summaries_df = pd.DataFrame(summaries, columns=["restaurant_id", "summary"])
# filtered_restaurants may already carry a 'summary' column from the first overall-summary
# cell. Merging on top of it would yield summary_x / summary_y and the rename below would
# silently do nothing, so stale summary columns are dropped first.
filtered_restaurants_summaries = pd.merge(
    filtered_restaurants.drop(columns=["summary", "overall_summary"], errors="ignore"),
    summaries_df,
    on="restaurant_id",
    how="left",
).rename(columns={"summary": "overall_summary"})  # rename column summary to overall_summary


Processing restaurant_id: ChIJ_VWb4xn6mUcRH4NujtHMKJI (Café Kult Dußlingen)
Summary for restaurant_id ChIJ_VWb4xn6mUcRH4NujtHMKJI (Café Kult Dußlingen):
Café Kult Dußlingen offers a cozy atmosphere with friendly service and good coffee, but reviews are mixed on the quality of baked goods and service consistency, with some customers expressing disappointment.

Processing restaurant_id: ChIJo5EYOK_4mUcRi4shjNiEDUc (Boxenstop)
Summary for restaurant_id ChIJo5EYOK_4mUcRi4shjNiEDUc (Boxenstop):
Boxenstop in Dusslingen is a lively bar known for its friendly atmosphere, excellent drinks, and welcoming owner, Bianco. Reviews highlight its great parties, dart games, and affordable prices, making it a favorite spot for celebrations.

Processing restaurant_id: ChIJ_VfiMxj6mUcRRK_QBdxww7g (Alte Krone)
Summary for restaurant_id ChIJ_VfiMxj6mUcRRK_QBdxww7g (Alte Krone):
Alte Krone in Dußlingen is praised for its authentic Italian cuisine, friendly service, and welcoming atmosphere. Diners rave about

In [None]:
#### Variant without the word "short" in the prompt (target length raised to ~400 characters)

# Prompt template, third iteration: one paragraph of about 400 characters in English.
# Placeholders: {restaurant_name} and {reviews}.
overall_summary_prompt = "".join([
    "You are an expert summarizer specializing in restaurant reviews. Summarize the following reviews ",
    "for the restaurant '{restaurant_name}' in English. The summary should be concise, written in one paragraph, ",
    "and limited to around 400 characters. Be sure to include the overall tone of the reviews and mention the restaurant name.\n\n",
    "Reviews:\n{reviews}\n\n",
    "Summary:",
])

# Function to summarize a chunk of reviews
def summarize_reviews_chunk(reviews_chunk, restaurant_name):
    """Summarize one block of review text for a named restaurant.

    Args:
        reviews_chunk: Review text inserted into `overall_summary_prompt`.
        restaurant_name: Name inserted into the prompt.

    Returns:
        The stripped summary text, or None when the request failed for any
        reason other than the context window being exceeded.

    Raises:
        Exception: The original API error is re-raised when its message
            contains "context_length_exceeded", so the caller can split the input.
    """
    summary_prompt = overall_summary_prompt.format(reviews=reviews_chunk, restaurant_name=restaurant_name)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": summary_prompt},
            ],
            max_tokens=200,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        if "context_length_exceeded" in str(e):
            # Swallowing this error would make the caller's split-and-retry fallback unreachable
            raise
        print(f"Error summarizing reviews chunk: {e}")
        return None


# Function to summarize reviews for a single restaurant
def summarize_reviews_for_restaurant(restaurant_id, reviews_df, restaurant_name):
    """Summarize all reviews of one restaurant, splitting once if the prompt is too long.

    Args:
        restaurant_id: Value matched against reviews_df['restaurant_id'].
        reviews_df: DataFrame with 'restaurant_id' and 'review_text' columns.
        restaurant_name: Name passed on to the prompts.

    Returns:
        The summary text. When the input had to be split and one half failed, the
        available half summary joined with an error placeholder. None when there
        are no reviews, a single review is too long to split, or the request failed.
    """
    # Drop missing texts and coerce to str so "\n".join cannot fail on NaN values
    restaurant_reviews = (
        reviews_df.loc[reviews_df['restaurant_id'] == restaurant_id, 'review_text']
        .dropna()
        .astype(str)
        .tolist()
    )
    if not restaurant_reviews:
        print(f"No reviews found for restaurant {restaurant_id}.")
        return None

    def summarize_half(half_reviews):
        # A half can still be too long; report it instead of raising out of the fallback
        try:
            return summarize_reviews_chunk("\n".join(half_reviews), restaurant_name)
        except Exception as half_error:
            print(f"Error summarizing reviews chunk: {half_error}")
            return None

    try:
        # Attempt to summarize all reviews at once
        return summarize_reviews_chunk("\n".join(restaurant_reviews), restaurant_name)
    except Exception as e:
        if "context_length_exceeded" not in str(e):
            print(f"Error summarizing reviews for restaurant {restaurant_id}: {e}")
            return None

    print(f"Context length exceeded for restaurant {restaurant_id}. Splitting reviews...")
    if len(restaurant_reviews) < 2:
        # A single oversized review cannot be split into two non-empty halves
        return None

    mid_point = len(restaurant_reviews) // 2
    first_summary = summarize_half(restaurant_reviews[:mid_point])
    second_summary = summarize_half(restaurant_reviews[mid_point:])

    if not (first_summary and second_summary):
        return f"{first_summary or 'Error in first half'}\n\n{second_summary or 'Error in second half'}"

    # Merge the two partial summaries with a second model call
    combined_prompt = (
        "You are an expert summarizer specializing in restaurant reviews. "
        "Combine the following two summaries into a single cohesive summary in English. "
        "The summary should mention the restaurant name ('{restaurant_name}') and "
        "be limited to around 400 characters:\n\n"
        "Summary 1:\n{summary1}\n\n"
        "Summary 2:\n{summary2}\n\n"
        "Combined Summary:"
    ).format(restaurant_name=restaurant_name, summary1=first_summary, summary2=second_summary)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": combined_prompt},
            ],
            max_tokens=200,  # Limit tokens for brevity
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as combine_error:
        print(f"Error combining summaries for restaurant {restaurant_id}: {combine_error}")
        return f"{first_summary}"  # Fall back to the first half's summary if combining fails

In [33]:
# Collect one summary record per restaurant (turned into a DataFrame below)
summaries = []

for _, row in filtered_restaurants.iterrows():
    restaurant_id = row['restaurant_id']
    restaurant_name = row['name']
    print(f"Processing restaurant_id: {restaurant_id} ({restaurant_name})")
    summary = summarize_reviews_for_restaurant(restaurant_id, filtered_reviews, restaurant_name)
    summaries.append({
        "restaurant_id": restaurant_id,
        "summary": summary
    })
    print(f"Summary for restaurant_id {restaurant_id} ({restaurant_name}):\n{summary}\n")

# Explicit columns keep the merge key present even when no restaurant was processed
summaries_df = pd.DataFrame(summaries, columns=["restaurant_id", "summary"])
# filtered_restaurants may already carry a 'summary' column from the first overall-summary
# cell. Merging on top of it would yield summary_x / summary_y and the rename below would
# silently do nothing, so stale summary columns are dropped first.
filtered_restaurants_summaries = pd.merge(
    filtered_restaurants.drop(columns=["summary", "overall_summary"], errors="ignore"),
    summaries_df,
    on="restaurant_id",
    how="left",
).rename(columns={"summary": "overall_summary"})  # rename column summary to overall_summary

Processing restaurant_id: ChIJ3xCcNgDxmUcRtLLUkaG5wY8 (La piccola s'Casa Imbiss Pizza)
Summary for restaurant_id ChIJ3xCcNgDxmUcRtLLUkaG5wY8 (La piccola s'Casa Imbiss Pizza):
La Piccola s'Casa Imbiss Pizza receives mixed reviews. Many customers praise the delicious food, including pizza and burgers, as well as the quick and attentive service, noting it's a great value for money. However, some reviews highlight significant issues with online ordering and delivery times, leading to frustration and dissatisfaction. Overall, while the food quality is well-received, there are complaints regarding the handling of online orders.

Processing restaurant_id: ChIJ3bJ1WPDxmUcRBC55YV0woIA (s'Casa)
Summary for restaurant_id ChIJ3bJ1WPDxmUcRBC55YV0woIA (s'Casa):
'sCasa' receives positive feedback for its excellent staff, friendly clientele, and delicious beer. Reviewers highlight the welcoming atmosphere and attentive service, making it a pleasant place to visit. The overall tone is enthusiastic and 

In [37]:
### Now run the summarization for every restaurant in restaurant_basics
# NOTE(review): `reviews` is not defined in any cell visible here — presumably the full review
# table with a 'review_text' column; confirm it is loaded before this cell runs.

# Collect one record per restaurant
summaries = []

for restaurant_id, restaurant_name in zip(restaurant_basics['restaurant_id'], restaurant_basics['name']):
    summary = summarize_reviews_for_restaurant(restaurant_id, reviews, restaurant_name)
    summaries.append({"restaurant_id": restaurant_id, "overall_summary": summary})

# Turn the records into a frame and attach them to restaurant_basics by restaurant_id
# (left join, so every restaurant row is kept)
summaries = pd.DataFrame(summaries)
filtered_restaurants_summaries = restaurant_basics.merge(summaries, on="restaurant_id", how="left")

15 min for 228 restaurants and 34,340 reviews
Note: some summaries (I found 3) are in German

#### with language detection

In [41]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # Ensure deterministic results

# Prompt template used together with the language check below: one paragraph of about
# 400 characters in English. Placeholders: {restaurant_name} and {reviews}.
overall_summary_prompt = "".join([
    "You are an expert summarizer specializing in restaurant reviews. Summarize the following reviews ",
    "for the restaurant '{restaurant_name}' in English. The summary should be concise, written in one paragraph, ",
    "and limited to around 400 characters. Be sure to include the overall tone of the reviews and mention the restaurant name.\n\n",
    "Reviews:\n{reviews}\n\n",
    "Summary:",
])

# Function to ensure the summary is in English
def ensure_english(summary, restaurant_name, reviews_chunk):
    """Return `summary` if it is detected as English, otherwise request it once more.

    Args:
        summary: Summary text produced by the model.
        restaurant_name: Name inserted into the retry prompt and the log messages.
        reviews_chunk: Review text the summary was generated from; re-sent on retry.

    Returns:
        `summary` when `detect` reports "en" or when the retry request fails;
        otherwise the stripped text of the retry response (which is not re-checked).
    """
    if detect(summary) == "en":
        return summary

    print(f"Non-English summary detected for '{restaurant_name}'. Retrying...")

    # Retry with a more specific prompt. Every placeholder is filled by str.format only:
    # if one fragment were an f-string, braces inside the restaurant name would be parsed
    # again by .format as placeholders and raise.
    retry_prompt = (
        "You are an expert summarizer specializing in restaurant reviews. Summarize the following reviews "
        "for the restaurant '{restaurant_name}' strictly in English. Avoid using any other language. "
        "The summary should be concise, written in one paragraph, "
        "and limited to around 400 characters.\n\n"
        "Reviews:\n{reviews}\n\n"
        "Summary:"
    ).format(restaurant_name=restaurant_name, reviews=reviews_chunk)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": retry_prompt},
            ],
            max_tokens=200,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Retry failed for '{restaurant_name}': {e}")
        return summary  # Return the original summary if retry fails

# Update the summarize_reviews_chunk function to include language verification
def summarize_reviews_chunk(reviews_chunk, restaurant_name):
    """Summarize one block of review text and pass the result through `ensure_english`.

    Args:
        reviews_chunk: Review text inserted into `overall_summary_prompt`.
        restaurant_name: Name inserted into the prompt.

    Returns:
        The value returned by `ensure_english` for the stripped summary, or None
        when the request or the language check failed for any reason other than
        the context window being exceeded.

    Raises:
        Exception: The original error is re-raised when its message contains
            "context_length_exceeded", so the caller can split the input.
    """
    summary_prompt = overall_summary_prompt.format(reviews=reviews_chunk, restaurant_name=restaurant_name)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": summary_prompt},
            ],
            max_tokens=200,
            temperature=0.7,
        )
        summary = response.choices[0].message.content.strip()
        return ensure_english(summary, restaurant_name, reviews_chunk)
    except Exception as e:
        if "context_length_exceeded" in str(e):
            # Swallowing this error would make the caller's split-and-retry fallback unreachable
            raise
        print(f"Error summarizing reviews chunk: {e}")
        return None


# Function to summarize reviews for a single restaurant
def summarize_reviews_for_restaurant(restaurant_id, reviews_df, restaurant_name):
    """Summarize all reviews of one restaurant, splitting once if the prompt is too long.

    Args:
        restaurant_id: Value matched against reviews_df['restaurant_id'].
        reviews_df: DataFrame with 'restaurant_id' and 'review_text' columns.
        restaurant_name: Name passed on to the prompts.

    Returns:
        The summary text. When the input had to be split and one half failed, the
        available half summary joined with an error placeholder. None when there
        are no reviews, a single review is too long to split, or the request failed.
    """
    # Drop missing texts and coerce to str so "\n".join cannot fail on NaN values
    restaurant_reviews = (
        reviews_df.loc[reviews_df['restaurant_id'] == restaurant_id, 'review_text']
        .dropna()
        .astype(str)
        .tolist()
    )
    if not restaurant_reviews:
        print(f"No reviews found for restaurant {restaurant_id}.")
        return None

    def summarize_half(half_reviews):
        # A half can still be too long; report it instead of raising out of the fallback
        try:
            return summarize_reviews_chunk("\n".join(half_reviews), restaurant_name)
        except Exception as half_error:
            print(f"Error summarizing reviews chunk: {half_error}")
            return None

    try:
        # Attempt to summarize all reviews at once
        return summarize_reviews_chunk("\n".join(restaurant_reviews), restaurant_name)
    except Exception as e:
        if "context_length_exceeded" not in str(e):
            print(f"Error summarizing reviews for restaurant {restaurant_id}: {e}")
            return None

    print(f"Context length exceeded for restaurant {restaurant_id}. Splitting reviews...")
    if len(restaurant_reviews) < 2:
        # A single oversized review cannot be split into two non-empty halves
        return None

    mid_point = len(restaurant_reviews) // 2
    first_summary = summarize_half(restaurant_reviews[:mid_point])
    second_summary = summarize_half(restaurant_reviews[mid_point:])

    if not (first_summary and second_summary):
        return f"{first_summary or 'Error in first half'}\n\n{second_summary or 'Error in second half'}"

    # Merge the two partial summaries with a second model call
    combined_prompt = (
        "You are an expert summarizer specializing in restaurant reviews. "
        "Combine the following two summaries into a single cohesive summary in English. "
        "The summary should mention the restaurant name ('{restaurant_name}') and "
        "be limited to around 400 characters:\n\n"
        "Summary 1:\n{summary1}\n\n"
        "Summary 2:\n{summary2}\n\n"
        "Combined Summary:"
    ).format(restaurant_name=restaurant_name, summary1=first_summary, summary2=second_summary)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": combined_prompt},
            ],
            max_tokens=200,  # Limit tokens for brevity
            temperature=0.7,
        )
        combined_summary = response.choices[0].message.content.strip()
        # The full review text is already known to exceed the context window here, so a
        # language retry is given the two partial summaries as its source instead.
        return ensure_english(combined_summary, restaurant_name, f"{first_summary}\n\n{second_summary}")
    except Exception as combine_error:
        print(f"Error combining summaries for restaurant {restaurant_id}: {combine_error}")
        return f"{first_summary}"  # Fall back to the first half's summary if combining fails

In [42]:
# Trial run of the language-checked summarizer on the sampled restaurants only
summaries = []

for restaurant_id, restaurant_name in zip(filtered_restaurants['restaurant_id'], filtered_restaurants['name']):
    summary = summarize_reviews_for_restaurant(restaurant_id, filtered_reviews, restaurant_name)
    summaries.append({"restaurant_id": restaurant_id, "overall_summary": summary})

# Attach the summaries to restaurant_basics by restaurant_id; with a left join, restaurants
# outside the sample keep their row and get a missing overall_summary
summaries = pd.DataFrame(summaries)
summaries_test = restaurant_basics.merge(summaries, on="restaurant_id", how="left")

In [43]:
# Full run of the language-checked summarizer over every restaurant in restaurant_basics
# NOTE(review): `reviews` is not defined in any cell visible here — presumably the full review
# table with a 'review_text' column; confirm it is loaded before this cell runs.
summaries = []

for restaurant_id, restaurant_name in zip(restaurant_basics['restaurant_id'], restaurant_basics['name']):
    summary = summarize_reviews_for_restaurant(restaurant_id, reviews, restaurant_name)
    summaries.append({"restaurant_id": restaurant_id, "overall_summary": summary})

# Attach the summaries to restaurant_basics by restaurant_id (left join keeps every restaurant)
summaries = pd.DataFrame(summaries)
filtered_restaurants_summaries = restaurant_basics.merge(summaries, on="restaurant_id", how="left")

### 2. Food summary

In [49]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # Ensure deterministic results

# Prompt template for the food-specific summary; `{reviews}` is filled in via str.format.
# Adjacent string fragments are concatenated verbatim, so the first sentence needs its own
# trailing space (otherwise it reads "in English.The summary ...").
food_summary_prompt = (
    "You are an expert summarizer specializing in customer opinions about food. Summarize the following reviews in English. "
    "The summary should focus on customer perceptions of the food, "
    "including aspects like taste, presentation, freshness, and variety. Write concisely in one paragraph, "
    "and limit the summary to around 400 characters. \n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Function to ensure the summary is in English
def ensure_english(summary, restaurant_name, reviews_chunk):
    """Return `summary` if it is detected as English, otherwise request a food summary again.

    Args:
        summary: Summary text produced by the model.
        restaurant_name: Name used in the log messages.
        reviews_chunk: Review text the summary was generated from; re-sent on retry.

    Returns:
        `summary` when `detect` reports "en" or when the retry request fails;
        otherwise the stripped text of the retry response (which is not re-checked).
    """
    detected_language = detect(summary)
    if detected_language == "en":
        return summary

    print(f"Non-English summary detected for '{restaurant_name}'. Retrying...")

    # Stricter wording for the second attempt; restaurant_name is passed to format but
    # the template has no placeholder for it
    retry_template = (
        "You are an expert summarizer specializing in customer opinions about food. Summarize the following reviews strictly in English. "
        "Focus on customer perceptions of the food, "
        "including aspects like taste, presentation, freshness, and variety. Write concisely in one paragraph, "
        "and limit the summary to around 400 characters. Avoid using any other language.\n\n"
        "Reviews:\n{reviews}\n\n"
        "Summary:"
    )
    retry_prompt = retry_template.format(restaurant_name=restaurant_name, reviews=reviews_chunk)
    retry_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": retry_prompt},
    ]

    try:
        retry_response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=retry_messages,
            max_tokens=200,
            temperature=0.7,
        )
    except Exception as e:
        print(f"Retry failed for '{restaurant_name}': {e}")
        return summary  # Keep the original summary if the retry fails

    try:
        return retry_response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Retry failed for '{restaurant_name}': {e}")
        return summary  # Keep the original summary if the response cannot be read

# Update the summarize_reviews_chunk function to include language verification
def summarize_reviews_chunk(reviews_chunk, restaurant_name):
    """Summarize food-related review text and pass the result through `ensure_english`.

    Args:
        reviews_chunk: Review text inserted into `food_summary_prompt`.
        restaurant_name: Forwarded to `ensure_english`; also passed to the prompt format call.

    Returns:
        The value returned by `ensure_english` for the stripped summary, or None if
        the request or the language check raised. Every exception is caught here, so
        callers cannot observe a context-length error from this function.
    """
    request_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": food_summary_prompt.format(reviews=reviews_chunk, restaurant_name=restaurant_name),
        },
    ]

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=request_messages,
            max_tokens=200,
            temperature=0.7,
        )
        raw_summary = completion.choices[0].message.content.strip()
        return ensure_english(raw_summary, restaurant_name, reviews_chunk)
    except Exception as e:
        print(f"Error summarizing reviews chunk: {e}")
        return None

# Function to summarize reviews for a single restaurant
def summarize_reviews_for_restaurant(restaurant_id, reviews_df, restaurant_name):
    # Filter reviews and convert to strings, ignoring missing values
    food_reviews = reviews_df[reviews_df['restaurant_id'] == restaurant_id]['food_sentences'].dropna().astype(str).tolist()
    
    # Join food related reviews into a single string
    food_text = "\n".join(food_reviews)
    
    try:
        # Attempt to summarize all reviews at once
        return summarize_reviews_chunk(food_text, restaurant_name)
    
    except Exception as e:
        if "context_length_exceeded" in str(e):
            print(f"Context length exceeded for restaurant {restaurant_id}. Splitting reviews...")
            
            # Split reviews into two halves
            mid_point = len(food_reviews) // 2
            first_half = "\n".join(food_reviews[:mid_point])
            second_half = "\n".join(food_reviews[mid_point:])
            
            # Summarize each half
            first_summary = summarize_reviews_chunk(first_half, restaurant_name)
            second_summary = summarize_reviews_chunk(second_half, restaurant_name)
            
            if first_summary and second_summary:
                # Combine the two summaries
                combined_prompt = (
                    "You are an expert summarizer specializing in customer opinions about food. "
                    "Combine the following two summaries into a single cohesive summary in English. "
                    "The summary should focus on customer perceptions of the food, "
                    "including taste, presentation, freshness, and variety. "
                    "Limit the summary to around 400 characters:\n\n"
                    "Summary 1:\n{summary1}\n\n"
                    "Summary 2:\n{summary2}\n\n"
                    "Combined Summary:"
                ).format(restaurant_name=restaurant_name, summary1=first_summary, summary2=second_summary)
                
                try:
                    response = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[
                            {"role": "system", "content": "You are a helpful assistant."},
                            {"role": "user", "content": combined_prompt},
                        ],
                        max_tokens=200,  # Limit tokens for brevity
                        temperature=0.7,
                    )
                    combined_summary = response.choices[0].message.content.strip()
                    return ensure_english(combined_summary, restaurant_name, food_text)
                except Exception as combine_error:
                    print(f"Error combining summaries for restaurant {restaurant_id}: {combine_error}")
                    return f"{first_summary}"  # Return the first summaries if combining fails
            else:
                return f"{first_summary or 'Error in first half'}\n\n{second_summary or 'Error in second half'}"
        else:
            print(f"Error summarizing reviews for restaurant {restaurant_id}: {e}")
            return None


In [50]:
### for testing

# Smoke test on the filtered subset: build one food summary per restaurant.
food_records = []
for _, restaurant in filtered_restaurants.iterrows():
    restaurant_id, restaurant_name = restaurant['restaurant_id'], restaurant['name']
    food_records.append({
        "restaurant_id": restaurant_id,
        "food_summary": summarize_reviews_for_restaurant(restaurant_id, filtered_reviews, restaurant_name),
    })

# One row per restaurant, attached to the restaurant data via restaurant_id.
summaries = pd.DataFrame(food_records)
summaries_test = filtered_restaurants.merge(summaries, on="restaurant_id", how="left")

#### adding food recommendations

In [51]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # Ensure deterministic results

# Function to ensure the summary is in English
def ensure_english(summary, restaurant_name, reviews_chunk):
    """Return an English summary, asking the model once more if `summary` is not English.

    Args:
        summary: Summary text produced by the model.
        restaurant_name: Restaurant name; only used in the log messages.
        reviews_chunk: Review text that is inserted into RETRY_PROMPT for the retry.

    Returns:
        `summary` unchanged when it is detected as English. Otherwise the text
        of a single retry request, or `summary` again when that request fails.
        The retried text is not language-checked a second time.
    """
    try:
        is_english = detect(summary) == "en"
    except Exception as e:
        # langdetect raises when the text has no usable features (e.g. an empty
        # string). Treat that like a non-English result and retry instead of
        # letting the error escape to the caller.
        print(f"Language detection failed for '{restaurant_name}': {e}")
        is_english = False

    if is_english:
        return summary

    print(f"Non-English summary detected for '{restaurant_name}'. Retrying...")

    # Retry with the retry prompt
    retry_prompt = RETRY_PROMPT.format(reviews=reviews_chunk)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": retry_prompt},
            ],
            max_tokens=200,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Retry failed for '{restaurant_name}': {e}")
        return summary  # Return the original summary if retry fails

# Update the summarize_reviews_chunk function to include language verification
def summarize_reviews_chunk(reviews_chunk, restaurant_name):
    """Summarize one block of food-related review text with gpt-4o-mini.

    Args:
        reviews_chunk: Review sentences joined into a single string; inserted
            into FOOD_SUMMARY_PROMPT.
        restaurant_name: Passed on to ensure_english for its log messages.

    Returns:
        The language-checked summary, or None when the request fails.

    Raises:
        Exception: errors whose message contains "context_length_exceeded" are
            re-raised so the caller can split the reviews and try again.
    """
    summary_prompt = FOOD_SUMMARY_PROMPT.format(reviews=reviews_chunk)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": summary_prompt},
            ],
            max_tokens=200,
            temperature=0.7,
        )
        summary = response.choices[0].message.content.strip()
        return ensure_english(summary, restaurant_name, reviews_chunk)
    except Exception as e:
        if "context_length_exceeded" in str(e):
            # The one error the caller can recover from: swallowing it here
            # made the split-and-combine fallback below unreachable.
            raise
        print(f"Error summarizing reviews chunk: {e}")
        return None

# Function to summarize reviews for a single restaurant
def summarize_reviews_for_restaurant(restaurant_id, reviews_df, restaurant_name):
    """Summarize all food-related sentences of one restaurant.

    The sentences are first summarized in a single request. If that request is
    rejected with "context_length_exceeded", they are split into two halves,
    each half is summarized on its own and the partial summaries are merged
    with COMBINE_PROMPT.

    Args:
        restaurant_id: Value matched against the 'restaurant_id' column.
        reviews_df: DataFrame with 'restaurant_id' and 'food_sentences' columns.
        restaurant_name: Passed through for log messages.

    Returns:
        The summary text, or None when the restaurant has no food sentences or
        no summary could be produced. If only one half can be summarized, that
        partial summary is returned; if combining fails, the first half's
        summary is returned.
    """
    # Filter reviews and convert to strings, ignoring missing values
    food_reviews = reviews_df[reviews_df['restaurant_id'] == restaurant_id]['food_sentences'].dropna().astype(str).tolist()

    if not food_reviews:
        # Nothing to summarize: skip the API call instead of letting the model
        # invent a summary for an empty review list.
        return None

    # Join food related reviews into a single string
    food_text = "\n".join(food_reviews)

    try:
        # Attempt to summarize all reviews at once
        return summarize_reviews_chunk(food_text, restaurant_name)
    except Exception as e:
        if "context_length_exceeded" not in str(e):
            print(f"Error summarizing reviews for restaurant {restaurant_id}: {e}")
            return None

    print(f"Context length exceeded for restaurant {restaurant_id}. Splitting reviews...")

    def summarize_half(half_reviews):
        # A half may be empty (single oversized review) or still too long;
        # neither case may abort the loop over all restaurants.
        if not half_reviews:
            return None
        try:
            return summarize_reviews_chunk("\n".join(half_reviews), restaurant_name)
        except Exception as half_error:
            print(f"Error summarizing part of the reviews for restaurant {restaurant_id}: {half_error}")
            return None

    # Split reviews into two halves and summarize each half
    mid_point = len(food_reviews) // 2
    first_summary = summarize_half(food_reviews[:mid_point])
    second_summary = summarize_half(food_reviews[mid_point:])

    if not (first_summary and second_summary):
        # Return whatever real summary exists rather than storing an
        # "Error in ... half" placeholder as if it were a summary.
        return first_summary or second_summary or None

    # Combine the two summaries
    combine_prompt = COMBINE_PROMPT.format(summary1=first_summary, summary2=second_summary)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": combine_prompt},
            ],
            max_tokens=200,  # Limit tokens for brevity
            temperature=0.7,
        )
        combined_summary = response.choices[0].message.content.strip()
        # The full review text is known to exceed the context window here, so a
        # language retry works on the two partial summaries instead.
        return ensure_english(combined_summary, restaurant_name, f"{first_summary}\n{second_summary}")
    except Exception as combine_error:
        print(f"Error combining summaries for restaurant {restaurant_id}: {combine_error}")
        return first_summary  # Fall back to the first half's summary if combining fails


In [52]:
# Food prompt templates, variant that asks for recommended dishes in a second paragraph.
# All three are filled in with str.format(): FOOD_SUMMARY_PROMPT and RETRY_PROMPT
# take `reviews`, COMBINE_PROMPT takes `summary1` and `summary2`.

# First-pass prompt used to summarize a block of review text.
FOOD_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Summarize the following reviews in English. "
    "The summary should focus on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "Additionally, if customers mention specific food items they recommend, include those in a second paragraph. "
    "Write concisely and limit the summary to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Stricter wording of the same prompt, used by ensure_english when a summary is not detected as English.
RETRY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Summarize the following reviews strictly in English. "
    "Focus on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "Additionally, if customers mention specific food items they recommend, include those in a second paragraph. "
    "Write concisely and limit the summary to around 400 characters. Avoid using any other language.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Prompt used to merge the two partial summaries produced after splitting a restaurant's reviews.
COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "If there are food recommendations mentioned in the summaries, include those in a new paragraph at the end. "
    "Limit the entire summary to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

In [53]:
### for testing

# Smoke test on the filtered subset with the currently defined food prompts.
food_records = []
for _, restaurant in filtered_restaurants.iterrows():
    restaurant_id, restaurant_name = restaurant['restaurant_id'], restaurant['name']
    food_records.append({
        "restaurant_id": restaurant_id,
        "food_summary": summarize_reviews_for_restaurant(restaurant_id, filtered_reviews, restaurant_name),
    })

# One row per restaurant, attached to the restaurant data via restaurant_id.
summaries = pd.DataFrame(food_records)
summaries_test = filtered_restaurants.merge(summaries, on="restaurant_id", how="left")

In [54]:
### food recommendations in bullet points
# Redefines the three food prompt templates: recommended dishes are now requested
# as bullet points with the number of reviewers in brackets.
# Placeholders filled with str.format(): `reviews` (summary and retry prompt),
# `summary1` / `summary2` (combine prompt).

# First-pass prompt used to summarize a block of review text.
FOOD_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Summarize the following reviews in English. "
    "The summary should focus on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "Additionally, if customers mention specific food items they recommend, list these in a second section as bullet points, with the number of reviewers mentioning them in brackets. "
    "Write concisely and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Stricter wording of the same prompt, used by ensure_english when a summary is not detected as English.
RETRY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Summarize the following reviews strictly in English. "
    "Focus on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "Additionally, if customers mention specific food items they recommend, list these in a second section as bullet points, with the number of reviewers mentioning them in brackets. "
    "Write concisely and limit the overall response to around 400 characters. Avoid using any other language.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Prompt used to merge the two partial summaries produced after splitting a restaurant's reviews.
COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "If there are food recommendations mentioned in the summaries, list these in a new section as bullet points, with the number of reviewers mentioning them in brackets. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

In [55]:
### for testing

# Smoke test on the filtered subset with the bullet-point food prompts.
food_records = []
for _, restaurant in filtered_restaurants.iterrows():
    restaurant_id, restaurant_name = restaurant['restaurant_id'], restaurant['name']
    food_records.append({
        "restaurant_id": restaurant_id,
        "food_summary": summarize_reviews_for_restaurant(restaurant_id, filtered_reviews, restaurant_name),
    })

# One row per restaurant, attached to the restaurant data via restaurant_id.
summaries = pd.DataFrame(food_records)
summaries_test = filtered_restaurants.merge(summaries, on="restaurant_id", how="left")

In [67]:
### food recommendations in bullet points, only information about the food, at maximum the 5 most recommended meals
# Redefines the three food prompt templates once more: food-only content and at
# most five positively mentioned items in the recommendations section.
# Placeholders filled with str.format(): `reviews` (summary and retry prompt),
# `summary1` / `summary2` (combine prompt).

# First-pass prompt used to summarize a block of review text.
FOOD_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Summarize the following reviews in English. "
    "The summary should focus exclusively on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "Do not include information about price, service, or atmosphere. "
    "List up to the 5 most positively recommended items in a second section as bullet points. "
    "Only include food items in the recommendations section if customers mention them positively. "
    "Do not list items with mixed or negative reviews. "
    "Write concisely and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)


# Stricter wording of the same prompt, used by ensure_english when a summary is not detected as English.
RETRY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Summarize the following reviews strictly in English. "
    "Focus exclusively on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "Do not include information about price, service, or atmosphere. "
    "List up to the 5 most positively recommended items in a second section as bullet points. "
    "Only include food items in the recommendations section if customers mention them positively. "
    "Do not list items with mixed or negative reviews. "
    "Write concisely and limit the overall response to around 400 characters. Avoid using any other language.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Prompt used to merge the two partial summaries produced after splitting a restaurant's reviews.
COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus exclusively on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "Do not include information about price, service, or atmosphere. "
    "List up to the 5 most positively recommended items in a second section as bullet points. "
    "Only include food items in the recommendations section if customers mention them positively. "
    "Do not list items with mixed or negative reviews. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

In [65]:
### for testing

# Smoke test on the filtered subset with the food-only, top-5 prompts.
food_records = []
for _, restaurant in filtered_restaurants.iterrows():
    restaurant_id, restaurant_name = restaurant['restaurant_id'], restaurant['name']
    food_records.append({
        "restaurant_id": restaurant_id,
        "food_summary": summarize_reviews_for_restaurant(restaurant_id, filtered_reviews, restaurant_name),
    })

# One row per restaurant, attached to the restaurant data via restaurant_id.
summaries = pd.DataFrame(food_records)
summaries_test = filtered_restaurants.merge(summaries, on="restaurant_id", how="left")

In [68]:
# Summarize the food sentences of every restaurant in restaurant_basics.
# NOTE(review): the other full runs pass `reviews_subcategories`; `reviews` is not
# defined in the visible part of the notebook - confirm it is the intended frame.
summaries = []

for _, row in restaurant_basics.iterrows():
    restaurant_id = row['restaurant_id']
    restaurant_name = row['name']
    summary = summarize_reviews_for_restaurant(restaurant_id, reviews, restaurant_name)
    summaries.append({
        "restaurant_id": restaurant_id,
        "food_summary": summary
    })

# Attach the food summaries to the summaries collected so far. A food_summary
# column left over from a previous run of this cell is dropped first; otherwise
# re-executing the cell would produce food_summary_x / food_summary_y columns.
summaries = pd.DataFrame(summaries)
filtered_restaurants_summaries = pd.merge(
    filtered_restaurants_summaries.drop(columns=["food_summary"], errors="ignore"),
    summaries,
    on="restaurant_id",
    how="left",
)

Runtime: about 10 minutes for 228 restaurants and 26,561 reviews.

## 3. Service summary

In [5]:
# Service prompt templates, variant with the "focus exclusively on ..." sentence
# disabled (kept below as a comment).
# Placeholders filled with str.format(): `reviews` in SERVICE_SUMMARY_PROMPT,
# `summary1` / `summary2` in SERVICE_COMBINE_PROMPT.

# First-pass prompt used to summarize a block of service-related review text.
SERVICE_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about the service in restaurants. Summarize the following reviews only in English. "
   # "The summary should focus exclusively on customer perceptions of the service, including aspects like speed, attentiveness, friendliness, and professionalism. "
    "Do not include information about price, food, or atmosphere. "
    "Write concisely and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Prompt used to merge the two partial summaries produced after splitting a restaurant's reviews.
SERVICE_COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about service in restaurants. Combine the following two summaries into a single cohesive summary in English. "
  #  "The combined summary should focus exclusively on customer perceptions of the service, including aspects like speed, attentiveness, friendliness, and professionalism. "
    "Do not include information about price, food, or atmosphere. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)


In [6]:
# Function to summarize service reviews using modular prompts
def summarize_service_chunk(reviews_chunk, restaurant_name):
    """Summarize one block of service-related review text with gpt-4o-mini.

    Args:
        reviews_chunk: Review sentences joined into a single string; inserted
            into SERVICE_SUMMARY_PROMPT.
        restaurant_name: Accepted for symmetry with the other summarizers; not
            used inside this function.

    Returns:
        The stripped summary text, or None when the request fails.

    Raises:
        Exception: errors whose message contains "context_length_exceeded" are
            re-raised so the caller can split the reviews and try again.
    """
    service_summary_prompt = SERVICE_SUMMARY_PROMPT.format(reviews=reviews_chunk)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": service_summary_prompt},
            ],
            max_tokens=200,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        if "context_length_exceeded" in str(e):
            # The one error the caller can recover from: swallowing it here
            # made the split-and-combine fallback below unreachable.
            raise
        print(f"Error summarizing service reviews chunk: {e}")
        return None

# Function to handle larger reviews, retry, and combine summaries
def summarize_service_for_restaurant(restaurant_id, reviews_df, restaurant_name):
    """Summarize all service-related sentences of one restaurant.

    The sentences are first summarized in a single request. If that request is
    rejected with "context_length_exceeded", they are split into two halves,
    each half is summarized on its own and the partial summaries are merged
    with SERVICE_COMBINE_PROMPT.

    Args:
        restaurant_id: Value matched against the 'restaurant_id' column.
        reviews_df: DataFrame with 'restaurant_id' and 'service_sentences' columns.
        restaurant_name: Passed through to summarize_service_chunk.

    Returns:
        The summary text, or None when the restaurant has no service sentences
        or no summary could be produced. If only one half can be summarized,
        that partial summary is returned; if combining fails, the first half's
        summary is returned.
    """
    service_reviews = reviews_df[reviews_df['restaurant_id'] == restaurant_id]['service_sentences'].dropna().astype(str).tolist()

    if not service_reviews:
        # Nothing to summarize: skip the API call instead of letting the model
        # invent a summary for an empty review list.
        return None

    service_text = "\n".join(service_reviews)

    try:
        # Summarize the full chunk
        return summarize_service_chunk(service_text, restaurant_name)
    except Exception as e:
        if "context_length_exceeded" not in str(e):
            print(f"Error summarizing service reviews for restaurant {restaurant_id}: {e}")
            return None

    print(f"Context length exceeded for restaurant {restaurant_id}. Splitting reviews...")

    def summarize_half(half_reviews):
        # A half may be empty (single oversized review) or still too long;
        # neither case may abort the loop over all restaurants.
        if not half_reviews:
            return None
        try:
            return summarize_service_chunk("\n".join(half_reviews), restaurant_name)
        except Exception as half_error:
            print(f"Error summarizing part of the service reviews for restaurant {restaurant_id}: {half_error}")
            return None

    # Split reviews into halves and summarize each half
    mid_point = len(service_reviews) // 2
    first_summary = summarize_half(service_reviews[:mid_point])
    second_summary = summarize_half(service_reviews[mid_point:])

    if not (first_summary and second_summary):
        # Return whatever real summary exists rather than storing an
        # "Error in ... half" placeholder as if it were a summary.
        return first_summary or second_summary or None

    # Combine summaries
    combine_prompt = SERVICE_COMBINE_PROMPT.format(summary1=first_summary, summary2=second_summary)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": combine_prompt},
            ],
            max_tokens=200,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as combine_error:
        print(f"Error combining service summaries for restaurant {restaurant_id}: {combine_error}")
        return first_summary  # Return the first summary if combining fails

In [None]:
### for testing

# Smoke test on the filtered subset: build one service summary per restaurant.
service_records = []
for _, restaurant in filtered_restaurants.iterrows():
    restaurant_id, restaurant_name = restaurant['restaurant_id'], restaurant['name']
    service_records.append({
        "restaurant_id": restaurant_id,
        "service_summary": summarize_service_for_restaurant(restaurant_id, filtered_reviews, restaurant_name),
    })

# One row per restaurant, attached to the restaurant data via restaurant_id.
summaries = pd.DataFrame(service_records)
summaries_test = filtered_restaurants.merge(summaries, on="restaurant_id", how="left")

In [10]:
# Service prompt templates with the "focus exclusively on ..." sentence enabled.
# Placeholders filled with str.format(): `reviews` in SERVICE_SUMMARY_PROMPT,
# `summary1` / `summary2` in SERVICE_COMBINE_PROMPT.

# First-pass prompt used to summarize a block of service-related review text.
SERVICE_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about the service in restaurants. Summarize the following reviews only in English. "
    "The summary should focus exclusively on customer perceptions of the service, including aspects like speed, attentiveness, friendliness, and professionalism. "
    "Do not include information about price, food, or atmosphere. "
    "Write concisely and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Prompt used to merge the two partial summaries produced after splitting a restaurant's reviews.
SERVICE_COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about service in restaurants. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus exclusively on customer perceptions of the service, including aspects like speed, attentiveness, friendliness, and professionalism. "
    "Do not include information about price, food, or atmosphere. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

In [11]:
### for testing

# Smoke test on the filtered subset with the current service prompts.
service_records = []
for _, restaurant in filtered_restaurants.iterrows():
    restaurant_id, restaurant_name = restaurant['restaurant_id'], restaurant['name']
    service_records.append({
        "restaurant_id": restaurant_id,
        "service_summary": summarize_service_for_restaurant(restaurant_id, filtered_reviews, restaurant_name),
    })

# One row per restaurant, attached to the restaurant data via restaurant_id.
summaries = pd.DataFrame(service_records)
summaries_test = filtered_restaurants.merge(summaries, on="restaurant_id", how="left")

In [14]:
### on all restaurants
# Build one service summary for every restaurant in restaurant_basics.
service_records = []
for _, restaurant in restaurant_basics.iterrows():
    restaurant_id, restaurant_name = restaurant['restaurant_id'], restaurant['name']
    service_records.append({
        "restaurant_id": restaurant_id,
        "service_summary": summarize_service_for_restaurant(restaurant_id, reviews_subcategories, restaurant_name),
    })

# Attach the summaries to restaurant_basics via restaurant_id.
# NOTE(review): this assignment replaces any existing filtered_restaurants_summaries
# (for example one that already holds food summaries) - confirm that is intended.
summaries = pd.DataFrame(service_records)
filtered_restaurants_summaries = restaurant_basics.merge(summaries, on="restaurant_id", how="left")

Runtime: about 8 minutes for 228 restaurants and 17,611 reviews.

## 4. Atmosphere summary

In [15]:
# Atmosphere prompt templates.
# Placeholders filled with str.format(): `reviews` in ATMOSPHERE_SUMMARY_PROMPT,
# `summary1` / `summary2` in ATMOSPHERE_COMBINE_PROMPT.

# First-pass prompt used to summarize a block of atmosphere-related review text.
# NOTE(review): unlike the service prompt, this one has no explicit
# "Summarize the following reviews" sentence - confirm that is intended.
ATMOSPHERE_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about the atmosphere in restaurants. "
    "The summary should focus exclusively on customer perceptions of the atmosphere, including aspects like ambiance, decor, cleanliness, noise levels, and overall vibe. "
    "Do not include information about price, food, or service. "
    "Write concisely, strictly in English and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Prompt used to merge the two partial summaries produced after splitting a restaurant's reviews.
ATMOSPHERE_COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about the atmosphere in restaurants. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus exclusively on customer perceptions of the atmosphere, including aspects like ambiance, decor, cleanliness, noise levels, and overall vibe. "
    "Do not include information about price, food, or service. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

In [20]:
# Function to summarize atmosphere reviews using modular prompts
def summarize_atmosphere_chunk(reviews_chunk, restaurant_name):
    """Summarize one block of atmosphere-related review text with gpt-4o-mini.

    Args:
        reviews_chunk: Review sentences joined into a single string; inserted
            into ATMOSPHERE_SUMMARY_PROMPT.
        restaurant_name: Accepted for symmetry with the other summarizers; not
            used inside this function.

    Returns:
        The stripped summary text, or None when the request fails.

    Raises:
        Exception: errors whose message contains "context_length_exceeded" are
            re-raised so the caller can split the reviews and try again.
    """
    atmosphere_summary_prompt = ATMOSPHERE_SUMMARY_PROMPT.format(reviews=reviews_chunk)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": atmosphere_summary_prompt},
            ],
            max_tokens=200,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        if "context_length_exceeded" in str(e):
            # The one error the caller can recover from: swallowing it here
            # made the split-and-combine fallback below unreachable.
            raise
        print(f"Error summarizing atmosphere reviews chunk: {e}")
        return None

# Function to handle larger reviews, retry, and combine summaries
def summarize_atmosphere_for_restaurant(restaurant_id, reviews_df, restaurant_name):
    """Summarize all atmosphere-related sentences of one restaurant.

    The sentences are first summarized in a single request. If that request is
    rejected with "context_length_exceeded", they are split into two halves,
    each half is summarized on its own and the partial summaries are merged
    with ATMOSPHERE_COMBINE_PROMPT.

    Args:
        restaurant_id: Value matched against the 'restaurant_id' column.
        reviews_df: DataFrame with 'restaurant_id' and 'atmosphere_sentences' columns.
        restaurant_name: Passed through to summarize_atmosphere_chunk.

    Returns:
        The summary text, or None when the restaurant has no atmosphere
        sentences or no summary could be produced. If only one half can be
        summarized, that partial summary is returned; if combining fails, the
        first half's summary is returned.
    """
    atmosphere_reviews = reviews_df[reviews_df['restaurant_id'] == restaurant_id]['atmosphere_sentences'].dropna().astype(str).tolist()

    if not atmosphere_reviews:
        # Nothing to summarize: skip the API call instead of letting the model
        # invent a summary for an empty review list.
        return None

    atmosphere_text = "\n".join(atmosphere_reviews)

    try:
        # Summarize the full chunk
        return summarize_atmosphere_chunk(atmosphere_text, restaurant_name)
    except Exception as e:
        if "context_length_exceeded" not in str(e):
            print(f"Error summarizing atmosphere reviews for restaurant {restaurant_id}: {e}")
            return None

    print(f"Context length exceeded for restaurant {restaurant_id}. Splitting reviews...")

    def summarize_half(half_reviews):
        # A half may be empty (single oversized review) or still too long;
        # neither case may abort the loop over all restaurants.
        if not half_reviews:
            return None
        try:
            return summarize_atmosphere_chunk("\n".join(half_reviews), restaurant_name)
        except Exception as half_error:
            print(f"Error summarizing part of the atmosphere reviews for restaurant {restaurant_id}: {half_error}")
            return None

    # Split reviews into halves and summarize each half
    mid_point = len(atmosphere_reviews) // 2
    first_summary = summarize_half(atmosphere_reviews[:mid_point])
    second_summary = summarize_half(atmosphere_reviews[mid_point:])

    if not (first_summary and second_summary):
        # Return whatever real summary exists rather than storing an
        # "Error in ... half" placeholder as if it were a summary.
        return first_summary or second_summary or None

    # Combine summaries
    combine_prompt = ATMOSPHERE_COMBINE_PROMPT.format(summary1=first_summary, summary2=second_summary)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": combine_prompt},
            ],
            max_tokens=200,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as combine_error:
        print(f"Error combining atmosphere summaries for restaurant {restaurant_id}: {combine_error}")
        return first_summary  # Return the first summary if combining fails

In [18]:
### for testing

# Smoke test on the filtered subset: build one atmosphere summary per restaurant.
atmosphere_records = []
for _, restaurant in filtered_restaurants.iterrows():
    restaurant_id, restaurant_name = restaurant['restaurant_id'], restaurant['name']
    atmosphere_records.append({
        "restaurant_id": restaurant_id,
        "atmosphere_summary": summarize_atmosphere_for_restaurant(restaurant_id, filtered_reviews, restaurant_name),
    })

# One row per restaurant, attached to the restaurant data via restaurant_id.
summaries = pd.DataFrame(atmosphere_records)
summaries_test = filtered_restaurants.merge(summaries, on="restaurant_id", how="left")

In [21]:
### on all restaurants
# Build one atmosphere summary for every restaurant in restaurant_basics.
summaries = []

for _, row in restaurant_basics.iterrows():
    restaurant_id = row['restaurant_id']
    restaurant_name = row['name']
    summary = summarize_atmosphere_for_restaurant(restaurant_id, reviews_subcategories, restaurant_name)
    summaries.append({
        "restaurant_id": restaurant_id,
        "atmosphere_summary": summary
    })

# Add the atmosphere summaries to the frame that already holds the service
# summaries. Merging onto restaurant_basics again, as before, threw the
# service_summary column away. An atmosphere_summary column left over from an
# earlier run is dropped first so re-executing the cell does not create
# atmosphere_summary_x / atmosphere_summary_y columns.
summaries = pd.DataFrame(summaries)
filtered_restaurants_summaries = pd.merge(
    filtered_restaurants_summaries.drop(columns=["atmosphere_summary"], errors="ignore"),
    summaries,
    on="restaurant_id",
    how="left",
)

Runtime: about 8 minutes for 228 restaurants and 13,582 reviews.

## 5. Price

In [28]:
# Price prompt templates.
# Placeholders filled with str.format(): `reviews` in PRICE_SUMMARY_PROMPT,
# `summary1` / `summary2` in PRICE_COMBINE_PROMPT.

# First-pass prompt used to summarize a block of price-related review text.
# NOTE(review): unlike the service prompt, this one has no explicit
# "Summarize the following reviews" sentence - confirm that is intended.
PRICE_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about pricing in restaurants. "
    "The summary should focus exclusively on customer perceptions of the price, including aspects like value for money, affordability, and pricing fairness. "
    "Do not include information about food, service, or atmosphere. "
    "Write concisely, strictly in English and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# Prompt used to merge the two partial summaries produced after splitting a restaurant's reviews.
PRICE_COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about pricing in restaurants. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus exclusively on customer perceptions of the price, including aspects like value for money, affordability, and pricing fairness. "
    "Do not include information about food, service, or atmosphere. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)


In [29]:
# Function to summarize price reviews using modular prompts
def summarize_price_chunk(reviews_chunk, restaurant_name):
    """Summarize one block of price-related review text with gpt-4o-mini.

    Args:
        reviews_chunk: Review sentences joined into a single string; inserted
            into PRICE_SUMMARY_PROMPT.
        restaurant_name: Accepted for symmetry with the other summarizers; not
            used inside this function.

    Returns:
        The stripped summary text, or None when the request (or reading its
        result) raises; the error is printed in that case.
    """
    # NOTE(review): every exception is swallowed below, so a caller that checks
    # for "context_length_exceeded" never sees that error from this function.
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": PRICE_SUMMARY_PROMPT.format(reviews=reviews_chunk)},
    ]

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=chat_messages,
            max_tokens=200,
            temperature=0.7,
        )
        answer = completion.choices[0].message.content
        return answer.strip()
    except Exception as e:
        print(f"Error summarizing price reviews chunk: {e}")
        return None

# Function to handle larger price reviews, retry, and combine summaries
def summarize_price_for_restaurant(restaurant_id, reviews_df, restaurant_name):
    price_reviews = reviews_df[reviews_df['restaurant_id'] == restaurant_id]['price_sentences'].dropna().astype(str).tolist()
    price_text = "\n".join(price_reviews)
    
    try:
        # Summarize the full chunk
        return summarize_price_chunk(price_text, restaurant_name)
    
    except Exception as e:
        if "context_length_exceeded" in str(e):
            print(f"Context length exceeded for restaurant {restaurant_id}. Splitting reviews...")
            
            # Split reviews into halves
            mid_point = len(price_reviews) // 2
            first_half = "\n".join(price_reviews[:mid_point])
            second_half = "\n".join(price_reviews[mid_point:])
            
            # Summarize each half
            first_summary = summarize_price_chunk(first_half, restaurant_name)
            second_summary = summarize_price_chunk(second_half, restaurant_name)
            
            if first_summary and second_summary:
                # Combine summaries
                combine_prompt = PRICE_COMBINE_PROMPT.format(summary1=first_summary, summary2=second_summary)
                
                try:
                    response = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[
                            {"role": "system", "content": "You are a helpful assistant."},
                            {"role": "user", "content": combine_prompt},
                        ],
                        max_tokens=200,
                        temperature=0.7,
                    )
                    return response.choices[0].message.content.strip()
                except Exception as combine_error:
                    print(f"Error combining price summaries for restaurant {restaurant_id}: {combine_error}")
                    return f"{first_summary}"  # Return the first summary if combining fails
            else:
                return f"{first_summary or 'Error in first half'}\n\n{second_summary or 'Error in second half'}"
        else:
            print(f"Error summarizing price reviews for restaurant {restaurant_id}: {e}")
            return None

In [34]:
### for testing

# Build one record per restaurant in the test subset: its id plus the price
# summary generated from that restaurant's rows in `filtered_reviews`.
price_records_test = [
    {
        "restaurant_id": rid,
        "price_summary": summarize_price_for_restaurant(rid, filtered_reviews, rname),
    }
    for rid, rname in zip(filtered_restaurants['restaurant_id'], filtered_restaurants['name'])
]

# Combine summaries with the filtered_restaurants DataFrame by restaurant_id
summaries = pd.DataFrame(price_records_test)
summaries_test = filtered_restaurants.merge(summaries, on="restaurant_id", how="left")

In [35]:
### on all restaurants
# Build one record per restaurant: its id plus the price summary generated
# from that restaurant's rows in `reviews_subcategories`.
price_records = [
    {
        "restaurant_id": rid,
        "price_summary": summarize_price_for_restaurant(rid, reviews_subcategories, rname),
    }
    for rid, rname in zip(restaurant_basics['restaurant_id'], restaurant_basics['name'])
]

# Attach the summaries to the restaurant_basics DataFrame by restaurant_id
# (left join: every restaurant is kept, even if its summary is missing).
summaries = pd.DataFrame(price_records)
filtered_restaurants_summaries = restaurant_basics.merge(summaries, on="restaurant_id", how="left")

Runtime: about 8 minutes for 228 restaurants and 8,172 reviews.

# Combination of everything

In [66]:
### Define all prompts
# Each aspect has two templates:
#   *_SUMMARY_PROMPT - summarizes raw review text; placeholder {reviews}
#   *_COMBINE_PROMPT - merges two partial summaries; placeholders {summary1}, {summary2}
# Adjacent string literals are concatenated, so every sentence fragment must end
# with a space (or newline) to avoid words running together.

# --- Overall (full review text) ---
OVERALL_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in restaurant reviews. Summarize the following reviews for a restaurant. "
    "Be sure to include the overall tone of the reviews. "
    "Write concisely, strictly in English and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

# NOTE(review): this is the only template that asks for ~200 characters; all others
# ask for ~400. Left unchanged - confirm whether the shorter limit is intentional.
OVERALL_SUMMARY_COMBINE_PROMPT = (
    "You are an expert summarizer specializing in restaurant reviews. "
    "Combine the following two summaries into a single concise and cohesive summary in English. "
    "The summary should be limited to around 200 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

# --- Food (also asks for a bullet list of recommended items) ---
FOOD_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about the food in a restaurant. "
    "The summary should focus exclusively on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "Do not include information about price, service, or atmosphere. "
    "List up to the 5 most positively recommended items in a second section as bullet points. "
    "Only include food items in the recommendations section if customers mention them positively. "
    "Do not list items with mixed or negative reviews. "
    "Write concisely, strictly in English and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

FOOD_COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about food. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus exclusively on customer perceptions of the food, including aspects like taste, presentation, freshness, and variety. "
    "Do not include information about price, service, or atmosphere. "
    "List up to the 5 most positively recommended items in a second section as bullet points. "
    "Only include food items in the recommendations section if customers mention them positively. "
    "Do not list items with mixed or negative reviews. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

# --- Service ---
SERVICE_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about the service in restaurants. "
    "The summary should focus exclusively on customer perceptions of the service, including aspects like speed, attentiveness, friendliness, and professionalism. "
    "Do not include information about price, food, or atmosphere. "
    "Write concisely, strictly in English and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

SERVICE_COMBINE_PROMPT = (
    # Trailing space added after "in English." - it was missing, which fused the
    # first two sentences into "English.The combined summary".
    "You are an expert summarizer specializing in customer opinions about service in restaurants. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus exclusively on customer perceptions of the service, including aspects like speed, attentiveness, friendliness, and professionalism. "
    "Do not include information about price, food, or atmosphere. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

# --- Atmosphere ---
ATMOSPHERE_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about the atmosphere in restaurants. "
    "The summary should focus exclusively on customer perceptions of the atmosphere, including aspects like ambiance, decor, cleanliness, noise levels, and overall vibe. "
    "Do not include information about price, food, or service. "
    "Write concisely, strictly in English and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

ATMOSPHERE_COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about the atmosphere in restaurants. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus exclusively on customer perceptions of the atmosphere, including aspects like ambiance, decor, cleanliness, noise levels, and overall vibe. "
    "Do not include information about price, food, or service. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

# --- Price ---
PRICE_SUMMARY_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about pricing in restaurants. "
    "The summary should focus exclusively on customer perceptions of the price, including aspects like value for money, affordability, and pricing fairness. "
    "Do not include information about food, service, or atmosphere. "
    "Write concisely, strictly in English and limit the overall response to around 400 characters.\n\n"
    "Reviews:\n{reviews}\n\n"
    "Summary:"
)

PRICE_COMBINE_PROMPT = (
    "You are an expert summarizer specializing in customer opinions about pricing in restaurants. Combine the following two summaries into a single cohesive summary in English. "
    "The combined summary should focus exclusively on customer perceptions of the price, including aspects like value for money, affordability, and pricing fairness. "
    "Do not include information about food, service, or atmosphere. "
    "Limit the overall response to around 400 characters:\n\n"
    "Summary 1:\n{summary1}\n\n"
    "Summary 2:\n{summary2}\n\n"
    "Combined Summary:"
)

def _chat_complete(user_prompt):
    """Send one user prompt to the chat model and return the reply text.

    Errors raised by the client are deliberately NOT caught here, so callers can
    distinguish a context-length failure from any other failure.

    Args:
        user_prompt (str): Fully formatted prompt text.

    Returns:
        str | None: The stripped reply text, or None if the reply carries no text.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=200,
        temperature=0.7,
    )
    content = response.choices[0].message.content
    return content.strip() if content is not None else None


def _is_context_length_exceeded(error):
    """Return True if `error` reports that the prompt exceeded the model's context window."""
    # The marker is looked up both as an attribute and in the message text;
    # presumably the client exposes it as `error.code` - verify against the client docs.
    return (
        getattr(error, "code", None) == "context_length_exceeded"
        or "context_length_exceeded" in str(error)
    )


# Function to summarize a single chunk of reviews using modular prompts
def summarize_chunk(prompt, reviews_chunk):
    """Summarize one block of review text with the given prompt template.

    Args:
        prompt (str): Template containing a `{reviews}` placeholder.
        reviews_chunk (str): Review text to insert into the template.

    Returns:
        str | None: The summary, or None if anything failed (the error is printed).
    """
    try:
        return _chat_complete(prompt.format(reviews=reviews_chunk))
    except Exception as e:
        print(f"Error summarizing chunk: {e}")
        return None


# Function to combine summaries using modular prompts
def combine_summaries(combine_prompt, summary1, summary2):
    """Merge two summaries into one with the given prompt template.

    Args:
        combine_prompt (str): Template containing `{summary1}` and `{summary2}` placeholders.
        summary1 (str): First partial summary.
        summary2 (str): Second partial summary.

    Returns:
        str: The combined summary; if the request fails, both summaries joined by a
        blank line (the error is printed).
    """
    try:
        return _chat_complete(combine_prompt.format(summary1=summary1, summary2=summary2))
    except Exception as e:
        print(f"Error combining summaries: {e}")
        return f"{summary1}\n\n{summary2}"


def _summarize_review_list(reviews, summary_prompt, combine_prompt, restaurant_id, category_column_name):
    """Summarize a non-empty list of review texts, halving the list on context overflow.

    Returns the summary text, or None if nothing could be summarized.
    """
    try:
        return _chat_complete(summary_prompt.format(reviews="\n".join(reviews)))
    except Exception as e:
        # Only a context overflow on a splittable list is recoverable.
        if not _is_context_length_exceeded(e) or len(reviews) < 2:
            print(f"Error summarizing reviews for restaurant '{restaurant_id}' in column '{category_column_name}': {e}")
            return None

    print(f"Context length exceeded for restaurant '{restaurant_id}' in column '{category_column_name}'. Splitting reviews...")

    # Split reviews into halves and summarize each (recursively, in case a half is still too long)
    mid_point = len(reviews) // 2
    first_summary = _summarize_review_list(
        reviews[:mid_point], summary_prompt, combine_prompt, restaurant_id, category_column_name
    )
    second_summary = _summarize_review_list(
        reviews[mid_point:], summary_prompt, combine_prompt, restaurant_id, category_column_name
    )

    if first_summary and second_summary:
        # combine_summaries handles its own failures (falls back to joining both texts).
        return combine_summaries(combine_prompt, first_summary, second_summary)

    # Return whichever half succeeded instead of embedding placeholder error text.
    return first_summary or second_summary or None


# Function to summarize the reviews of one restaurant for one aspect, splitting on context overflow
def summarize_reviews(restaurant_id, reviews_df, category_column_name, summary_prompt, combine_prompt):
    """
    Generalized function to summarize reviews for a specific aspect (overall, service, atmosphere, etc.).

    If the full text exceeds the model's context length, the reviews are split in
    half, each half is summarized, and the two summaries are combined.

    Args:
        restaurant_id: Value matched against the 'restaurant_id' column.
        reviews_df (DataFrame): DataFrame containing review data.
        category_column_name (str): Column in the DataFrame containing the reviews for this aspect.
        summary_prompt (str): Prompt template for summarizing reviews (`{reviews}` placeholder).
        combine_prompt (str): Prompt template for combining summaries (`{summary1}`, `{summary2}`).

    Returns:
        str | None: Final summarized review, or None when the restaurant has no text
        in this column or the request failed.
    """
    # Filter the reviews for the specified restaurant and column
    restaurant_rows = reviews_df[reviews_df['restaurant_id'] == restaurant_id]
    reviews = restaurant_rows[category_column_name].dropna().astype(str).tolist()

    # Nothing to summarize: skip the API call rather than prompting with empty text.
    if not reviews:
        return None

    return _summarize_review_list(reviews, summary_prompt, combine_prompt, restaurant_id, category_column_name)

In [67]:
# keep only the first 20 restaurants for testing
# (this overwrites `restaurant_basics`, so any cell run afterwards sees only this subset)
restaurant_basics = restaurant_basics.head(20)

In [68]:
# Build `reviews_df`: full review text side by side with the per-category sentences.
reviews_df = (
    # left join keeps every review from reviews_general, categorized or not
    pd.merge(reviews_general, reviews_subcategories, on='review_id', how='left')
    # keep only the necessary columns ('restaurant_id_x' is the copy coming from reviews_general)
    .loc[:, ['restaurant_id_x', 'review_id', 'review_text', 'food_sentences', 'service_sentences', 'atmosphere_sentences', 'price_sentences']]
    .rename(columns={'restaurant_id_x': 'restaurant_id'})
    # drop reviews without any text
    .dropna(subset=['review_text'])
    # collapse runs of whitespace (spaces, newlines, tabs) into one space and trim the ends
    .assign(review_text=lambda frame: frame['review_text'].str.replace(r'\s+', ' ', regex=True).str.strip())
)

In [69]:
### Generate summaries for each restaurant and category

# One entry per summary to produce:
# (progress message, output key, review column, summary prompt, combine prompt).
# The order here fixes both the processing order and the column order of the result.
summary_plan = [
    ("Start overall summary", "overall_summary", 'review_text', OVERALL_SUMMARY_PROMPT, OVERALL_SUMMARY_COMBINE_PROMPT),
    ("Start food summary", "food_summary", 'food_sentences', FOOD_SUMMARY_PROMPT, FOOD_COMBINE_PROMPT),
    ("Start service summary", "service_summary", 'service_sentences', SERVICE_SUMMARY_PROMPT, SERVICE_COMBINE_PROMPT),
    ("Start atmosphere summary", "atmosphere_summary", 'atmosphere_sentences', ATMOSPHERE_SUMMARY_PROMPT, ATMOSPHERE_COMBINE_PROMPT),
    ("Start price summary", "price_summary", 'price_sentences', PRICE_SUMMARY_PROMPT, PRICE_COMBINE_PROMPT),
]

# Collect one record (dict) per restaurant
summaries = []

for restaurant_id in restaurant_basics['restaurant_id']:
    print(f"Processing restaurant ID: {restaurant_id}")

    record = {"restaurant_id": restaurant_id}
    for progress_message, output_key, review_column, summary_prompt, combine_prompt in summary_plan:
        print(progress_message)
        record[output_key] = summarize_reviews(restaurant_id, reviews_df, review_column, summary_prompt, combine_prompt)

    summaries.append(record)

# Convert the collected records into a DataFrame and show the first rows
summaries_df = pd.DataFrame(summaries)
print(summaries_df.head())



Processing restaurant ID: ChIJ_VWb4xn6mUcRH4NujtHMKJI
Start overall summary
Start food summary
Start service summary
Start atmosphere summary
Start price summary
Processing restaurant ID: ChIJo5EYOK_4mUcRi4shjNiEDUc
Start overall summary
Start food summary
Start service summary
Start atmosphere summary
Start price summary
Processing restaurant ID: ChIJ_VfiMxj6mUcRRK_QBdxww7g
Start overall summary
Start food summary
Start service summary
Start atmosphere summary
Start price summary
Processing restaurant ID: ChIJT4FlA7zwmUcRcrmR1JIlwm8
Start overall summary
Start food summary
Start service summary
Start atmosphere summary
Start price summary
Processing restaurant ID: ChIJ5dhz-EXxmUcRuZTn4wsBpQs
Start overall summary
Start food summary
Start service summary
Start atmosphere summary
Start price summary
Processing restaurant ID: ChIJ3xCcNgDxmUcRtLLUkaG5wY8
Start overall summary
Start food summary
Start service summary
Start atmosphere summary
Start price summary
Processing restaurant ID: Ch

In [63]:
# Debug inspection: print the current contents of `restaurant_basics`.
# NOTE(review): this cell's execution count (In [63]) is lower than that of the cells above it,
# so the output shown below may reflect an earlier state of `restaurant_basics` - re-run to confirm.
print(restaurant_basics)

                  restaurant_id  city_id  \
11  ChIJy9BCYNr9mUcRKQBU3lnANgo        0   
12  ChIJeSaF9XD8mUcRDm3he7lkuMA        0   
13  ChIJk0bQgJr7mUcR-CKFCw2h2KY        0   
14  ChIJN7o495b7mUcRgQbaxkTIkXo        0   
15  ChIJBctfifbwmUcRrXoViGlewME        0   

                                     name        primary_type  \
11                      Café im Dreikönig                cafe   
12              Sportgaststätte Kiebingen          restaurant   
13  Sportheim Bühl Inh. Gerardo Carbonaro          restaurant   
14   Schützenhaus Bühl Restaurant Taverne    greek_restaurant   
15    Sportgaststätte Reinenberg La Corte  italian_restaurant   

                                                types business_status  \
11  ['cafe', 'restaurant', 'food', 'point_of_inter...     OPERATIONAL   
12  ['restaurant', 'point_of_interest', 'food', 'e...     OPERATIONAL   
13  ['restaurant', 'pizza_restaurant', 'italian_re...     OPERATIONAL   
14  ['greek_restaurant', 'restaurant', 'food', 'po..