# Workflow for example data

In [1]:
### for working on the BwCluster
from huggingface_hub import login

# Replace with your Hugging Face token
huggingface_token = "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"

# Log in
login(token=huggingface_token)

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import pipeline
from tqdm import tqdm 
import pandas as pd
import re
from datetime import datetime
from collections import defaultdict

2025-01-21 08:13:28.057072: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-21 08:13:28.996764: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-21 08:13:28.996807: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-21 08:13:28.996839: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-21 08:13:29.290794: I tensorflow/core/platform/cpu_feature_g

## read in and preprocess

In [3]:
### Load the LLama 3.1 70 B model (since it performed the best)
model_id = "meta-llama/Llama-3.1-70B-Instruct"
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.bfloat16,
                                         bnb_4bit_use_double_quant=True,
                                         bnb_4bit_quant_type= "nf4"
                                         )

quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token # setting padding token (Llama doesnt have a padding token by default)
tokenizer.padding_side = "left" # left for generating right for tuning 

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [5]:
### Prepare dataset to test and fine-tune the model
# read in extract from the google maps reviews webscraping (from 10.01.2025)
additional_reviews = pd.read_csv("filtered_reviews_additional.csv")
general_reviews = pd.read_csv("filtered_reviews_general.csv")

# keep only necessary information
filtered_reviews = general_reviews[['review_id','restaurant_id', 'review_date', 'scraping_date', 'review_text']]

# add information about the subratings to the additional reviews
filtered_reviews = additional_reviews.merge(filtered_reviews, on='review_id', how='left')

# keep only relevant columns
filtered_reviews.rename(columns={'restaurant_id_x': 'restaurant_id'}, inplace=True) # rename one of the restaurant_id columns
filtered_reviews = filtered_reviews[['review_id', 'restaurant_id', 'dining_stars_food', 'dining_stars_service', 'dining_stars_atmosphere', 'review_text']]

# save the data
filtered_reviews.to_csv("filtered_reviews.csv", index=False)
print(filtered_reviews.head())

   review_id                restaurant_id  dining_stars_food  \
0      23377  ChIJm7waYdT6mUcRxPFyE982gE0                5.0   
1      23378  ChIJm7waYdT6mUcRxPFyE982gE0                5.0   
2      23379  ChIJm7waYdT6mUcRxPFyE982gE0                4.0   
3      23380  ChIJm7waYdT6mUcRxPFyE982gE0                5.0   
4      23381  ChIJm7waYdT6mUcRxPFyE982gE0                NaN   

   dining_stars_service  dining_stars_atmosphere  \
0                   4.0                      4.0   
1                   5.0                      5.0   
2                   5.0                      5.0   
3                   5.0                      5.0   
4                   NaN                      NaN   

                                         review_text  
0  Im Veggi in Tübingen haben wir einen gemütlich...  
1  War dort heute zum ersten Mal essen, da meine ...  
2  Übersichtliche Speisekarte und somit einfach k...  
3  Veggie in Tübingen 100 % Empfehlung. Hier ist ...  
4  Der Laden wurde uns empf

In [32]:
### for trying out
filtered_reviews_small = filtered_reviews[:10]
filtered_reviews.to_csv("filtered_reviews.csv", index=False)
print(len(filtered_reviews))

10


In [6]:
# Load the dataset
dataset = load_dataset("csv", data_files="filtered_reviews.csv", split="train")
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['review_id', 'restaurant_id', 'dining_stars_food', 'dining_stars_service', 'dining_stars_atmosphere', 'review_text'],
    num_rows: 1800
})

In [16]:
print(dataset[0])

{'review_id': 23377, 'restaurant_id': 'ChIJm7waYdT6mUcRxPFyE982gE0', 'dining_stars_food': 5.0, 'dining_stars_service': 4.0, 'dining_stars_atmosphere': 4.0, 'review_text': 'Im Veggi in Tübingen haben wir einen gemütlichen Nachmittag verbracht. Der Service ist freundlich, in den abgelegenen Ecken wird man gelegentlich übersehen. Getränke und Speisen waren lecker. Abgesehen vom pinken Christbaum (Statement hin oder her, der war schreiend hässlich) war das Ambiente ansprechend.', 'preprocessed_reviews': ['Im Veggi in Tübingen haben wir einen gemütlichen Nachmittag verbracht.', 'Der Service ist freundlich, in den abgelegenen Ecken wird man gelegentlich übersehen.', 'Getränke und Speisen waren lecker.', 'Abgesehen vom pinken Christbaum (Statement hin oder her, der war schreiend hässlich) war das Ambiente ansprechend.']}


In [14]:
# For testing make a small dataset
shuffled_dataset = dataset.shuffle(seed=42)
split_dataset = shuffled_dataset.train_test_split(train_size=0.005, seed=42)
small_dataset = split_dataset["train"]
small_dataset

Dataset({
    features: ['review_id', 'restaurant_id', 'dining_stars_food', 'dining_stars_service', 'dining_stars_atmosphere', 'review_text', 'preprocessed_reviews'],
    num_rows: 9
})

In [7]:
# Preprocessing function to clean and split reviews into sentences
def preprocess_review_text(sample):
    reviews = sample['review_text']
    if reviews is None:
        return {"preprocessed_reviews": []} # Check if the review_text is None
    reviews = reviews.strip()  # Remove leading/trailing whitespace
    reviews = re.sub(r'\s+', ' ', reviews)  # Normalize multiple spaces
    reviews = re.split(r'(?<=[.!?])\s+|\n+', reviews)  # Split into sentences
    reviews = [sentence.strip() for sentence in reviews if sentence.strip()]  # Clean each sentence
    return {"preprocessed_reviews": reviews}

# Apply preprocessing to the dataset
dataset = dataset.map(preprocess_review_text)

# Inspect the updated dataset
print(dataset['preprocessed_reviews'][:5])  # View the first 5 processed reviews

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

[['Im Veggi in Tübingen haben wir einen gemütlichen Nachmittag verbracht.', 'Der Service ist freundlich, in den abgelegenen Ecken wird man gelegentlich übersehen.', 'Getränke und Speisen waren lecker.', 'Abgesehen vom pinken Christbaum (Statement hin oder her, der war schreiend hässlich) war das Ambiente ansprechend.'], ['War dort heute zum ersten Mal essen, da meine Freunde Vegetarier sind.', 'Wir waren zu dritt und haben trotz vollem Restaurant noch einen Platz bekommen.', 'Der Service war sehr aufmerksam, zuvorkommend und war sehr flink.', 'Das Essen hat nicht nur mich sondern auch meine Freunde begeistert.', 'Mein Vegetarisches Tatar war köstlich, ich würde es immer einem klassischem Tatar vorziehen.', 'Das Vegetarische Hühnchengeschnetzelte mit Humus war ein Traum, denn die Kombination des Hühnchen mit dem Humus war klasse.', 'Was mich als langjährigen Gastronom fast vom Hocker gehauen hat war die Optik der Teller, die Art und Weise die Liebe und Hingabe für Vegetarisches Essen au

# Multi label recognition

In [8]:
def categorize_sentences_multi_label(review_text: str) -> dict:
    """
    Categorize sentences from the review into 'food', 'service', 'atmosphere', and 'price'.
    Allow sentences to belong to multiple categories.
    """
    # Define the multi-label prompt
    prompt_template = (
        "For the following sentence, identify all applicable categories: "
        "'food', 'service', 'atmosphere', 'price'. If no category applies, respond 'none'. "
        "Separate multiple categories with commas.\n\n"
        "Sentence: {sentence}\n\n"
        "Categories:"
    )

    # Initialize result dictionary
    categorized_sentences = {'food': [], 'service': [], 'atmosphere': [], 'price': []}
    
    #for sentence in sentences:
    prompt = prompt_template.format(sentence=sentence.strip())
    
    # Tokenize input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to("cuda")

    # Generate category predictions
    outputs = quantized_model.generate(
        **inputs,
        max_new_tokens=4, # for four categories
        temperature=0.7,  # Moderate temperature for balanced creativity (less perfomed worse)
        pad_token_id=tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    )

    # Decode and parse the model's response
    generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True).strip().lower()
    predicted_categories = [cat.strip() for cat in generated_text.split(',') if cat.strip() in categorized_sentences]
    
    return predicted_categories

In [9]:
### new adjustment of function
def categorize_sentences_multi_label(sentence: str) -> dict:
    """
    Categorize a single sentence into 'food', 'service', 'atmosphere', and 'price'.
    Allow sentences to belong to multiple categories.
    """
    # Define the multi-label prompt
    prompt_template = (
        "For the following sentence, identify all applicable categories: "
        "'food', 'service', 'atmosphere', 'price'. If no category applies, respond 'none'. "
        "Separate multiple categories with commas.\n\n"
        "Sentence: {sentence}\n\n"
        "Categories:"
    )
    if not sentence.strip():  # Handle empty sentences
        return []

    # Prepare the prompt
    prompt = prompt_template.format(sentence=sentence.strip())

    # Tokenize input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to("cuda")

    # Generate category predictions
    outputs = quantized_model.generate(
        **inputs,
        max_new_tokens=4,  # for four categories
        temperature=0.7,  # Moderate temperature for balanced creativity
        pad_token_id=tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    )

    # Decode and parse the model's response
    generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True).strip().lower()
    predicted_categories = [cat.strip() for cat in generated_text.split(',') if cat.strip()]

    return predicted_categories

In [29]:
num_rows = small_dataset.num_rows

split_reviews = pd.DataFrame({
    "full_review": [''] * num_rows,
    "food_sentences": [''] * num_rows,
    "service_sentences": [''] * num_rows,
    "atmosphere_sentences": [''] * num_rows,
    "price_sentences": [''] * num_rows,
})

# Set batch size depending on your GPU memory capacity
batch_size = 5  # adjust based on GPU memory

# Process the reviews in batches
for i in tqdm(range(0, len(small_dataset['preprocessed_reviews']), batch_size), desc="Processing reviews in batches"):
    batch_reviews = small_dataset['preprocessed_reviews'][i:i+batch_size]

    # Process each review in the batch
    for idx, review_text in enumerate(batch_reviews):
        # Kategorisiere die Sätze für jede Kategorie
        if review_text is None:  # Skip None values
            continue
        labels = [categorize_sentences_multi_label(sentence) for sentence in review_text]

        # Erstelle ein defaultdict für die kategorisierten Sätze
        labeled_sentences = defaultdict(list)

        # Iteration über die Sätze und deren Labels
        for sentence, sentence_labels in zip(review_text, labels):
            labeled_sentences['full_review'].append(sentence)
            for label in sentence_labels:
                    labeled_sentences[label + '_sentences'].append(sentence)

        # Hole den Index der aktuellen Zeile, die wir befüllen wollen
        row_index = i + idx
        
        # Speichere die kategorisierten Sätze in den entsprechenden Spalten
        split_reviews.at[row_index, 'full_review'] = ' '.join(labeled_sentences['full_review'])
        split_reviews.at[row_index, 'food_sentences'] = ' '.join(labeled_sentences['food_sentences'])
        split_reviews.at[row_index, 'service_sentences'] = ' '.join(labeled_sentences['service_sentences'])
        split_reviews.at[row_index, 'atmosphere_sentences'] = ' '.join(labeled_sentences['atmosphere_sentences'])
        split_reviews.at[row_index, 'price_sentences'] = ' '.join(labeled_sentences['price_sentences'])

Processing reviews in batches: 100%|██████████| 2/2 [01:17<00:00, 38.68s/it]


In [30]:
split_reviews

Unnamed: 0,full_review,food_sentences,service_sentences,atmosphere_sentences,price_sentences
0,Best Falafel ever!,,,,
1,Super nettes Personal und die besten Pommes un...,"Das einzige was man etwas ändern könnte, ist d...",,,Leider wird auch alles immer teurer aber ander...
2,Sind gestern Nachmittag mit fünf Personen in d...,"Wollten ein ""schwäbischen Essen"". Leider gehör...",,,"Die Soße die ich bestellte, für 2,5 €, war ein..."
3,نصيحة!ماشاءلله اكتر من رائع الله يعطيكن العافي...,,,,
4,Sehr leckere und große bowls. Komme auf jeden ...,Sehr leckere und große bowls.,,,
5,,,,,
6,Kilkenny,,,,
7,Flädlesuppe und Bandnudeln mit Kürbissoße und ...,Flädlesuppe und Bandnudeln mit Kürbissoße und ...,,,
8,Waited over 45 min for the food...just went away,Waited over 45 min for the food...just went away,Waited over 45 min for the food...just went away,,


In [51]:
# modify reviews dataframe to later contain the category sentence
filtered_reviews['food_sentences'] = ''
filtered_reviews['service_sentences'] = ''
filtered_reviews['atmosphere_sentences'] = ''
filtered_reviews['price_sentences'] = ''

In [52]:
print(len(filtered_reviews))

1800


In [53]:
print(filtered_reviews.head())

   review_id                restaurant_id  dining_stars_food  \
0      23377  ChIJm7waYdT6mUcRxPFyE982gE0                5.0   
1      23378  ChIJm7waYdT6mUcRxPFyE982gE0                5.0   
2      23379  ChIJm7waYdT6mUcRxPFyE982gE0                4.0   
3      23380  ChIJm7waYdT6mUcRxPFyE982gE0                5.0   
4      23381  ChIJm7waYdT6mUcRxPFyE982gE0                NaN   

   dining_stars_service  dining_stars_atmosphere  \
0                   4.0                      4.0   
1                   5.0                      5.0   
2                   5.0                      5.0   
3                   5.0                      5.0   
4                   NaN                      NaN   

                                         review_text food_sentences  \
0  Im Veggi in Tübingen haben wir einen gemütlich...                  
1  War dort heute zum ersten Mal essen, da meine ...                  
2  Übersichtliche Speisekarte und somit einfach k...                  
3  Veggie in Tübin

In [11]:
batch_size = 128  # adjust based on GPU memory

for i in tqdm(range(0, len(dataset['preprocessed_reviews']), batch_size), desc="Processing reviews in batches"):
    batch_reviews = dataset['preprocessed_reviews'][i:i+batch_size]

    for idx, review_text in enumerate(batch_reviews):
        if review_text is None:  # Skip None values
            continue
        labels = [categorize_sentences_multi_label(sentence) for sentence in review_text]

        labeled_sentences = defaultdict(list)
        for sentence, sentence_labels in zip(review_text, labels):
            labeled_sentences['full_review'].append(sentence)
            for label in sentence_labels:
                labeled_sentences[label + '_sentences'].append(sentence)

        row_index = i + idx
        filtered_reviews.at[row_index, 'food_sentences'] = ' '.join(labeled_sentences['food_sentences'])
        filtered_reviews.at[row_index, 'service_sentences'] = ' '.join(labeled_sentences['service_sentences'])
        filtered_reviews.at[row_index, 'atmosphere_sentences'] = ' '.join(labeled_sentences['atmosphere_sentences'])
        filtered_reviews.at[row_index, 'price_sentences'] = ' '.join(labeled_sentences['price_sentences'])

Processing reviews in batches: 100%|██████████| 15/15 [3:03:53<00:00, 735.59s/it]  


In [14]:
print(filtered_reviews.head())

   review_id                restaurant_id  dining_stars_food  \
0      23377  ChIJm7waYdT6mUcRxPFyE982gE0                5.0   
1      23378  ChIJm7waYdT6mUcRxPFyE982gE0                5.0   
2      23379  ChIJm7waYdT6mUcRxPFyE982gE0                4.0   
3      23380  ChIJm7waYdT6mUcRxPFyE982gE0                5.0   
4      23381  ChIJm7waYdT6mUcRxPFyE982gE0                NaN   

   dining_stars_service  dining_stars_atmosphere  \
0                   4.0                      4.0   
1                   5.0                      5.0   
2                   5.0                      5.0   
3                   5.0                      5.0   
4                   NaN                      NaN   

                                         review_text  \
0  Im Veggi in Tübingen haben wir einen gemütlich...   
1  War dort heute zum ersten Mal essen, da meine ...   
2  Übersichtliche Speisekarte und somit einfach k...   
3  Veggie in Tübingen 100 % Empfehlung. Hier ist ...   
4  Der Laden wurde uns

In [15]:
print(filtered_reviews['review_text'][0])

Im Veggi in Tübingen haben wir einen gemütlichen Nachmittag verbracht. Der Service ist freundlich, in den abgelegenen Ecken wird man gelegentlich übersehen. Getränke und Speisen waren lecker. Abgesehen vom pinken Christbaum (Statement hin oder her, der war schreiend hässlich) war das Ambiente ansprechend.


In [16]:
print(filtered_reviews['food_sentences'][0])

Getränke und Speisen waren lecker.


In [17]:
print(filtered_reviews['service_sentences'][0])

Der Service ist freundlich, in den abgelegenen Ecken wird man gelegentlich übersehen.


In [19]:
print(filtered_reviews['atmosphere_sentences'][0])

Im Veggi in Tübingen haben wir einen gemütlichen Nachmittag verbracht. Abgesehen vom pinken Christbaum (Statement hin oder her, der war schreiend hässlich) war das Ambiente ansprechend.


In [13]:
filtered_reviews.to_csv("filtered_reviews_with_categories.csv", index=False)

# Summarization


In [4]:
# read in the data with categorized sentences
categorized_reviews = pd.read_csv('filtered_reviews_with_categories.csv')

In [47]:
# prepare a new dataframe to contain all summarized information
summary_restaurants = categorized_reviews[['restaurant_id']].drop_duplicates().reset_index(drop=True)
summary_restaurants['overall_summary'] = ''
summary_restaurants['food_summary'] = ''
summary_restaurants['service_summary'] = ''
summary_restaurants['atmosphere_summary'] = ''
summary_restaurants['price_summary'] = ''
print(summary_restaurants)

                 restaurant_id overall_summary food_summary service_summary  \
0  ChIJm7waYdT6mUcRxPFyE982gE0                                                
1  ChIJbxg5UNT6mUcRY_RPFW_mgjg                                                
2  ChIJ5bNxBdP6mUcRoihNgmDzZl4                                                
3  ChIJ8dHXli3lmUcRbSsbGrUNotc                                                
4  ChIJYR6MeNP6mUcRX0PfqXYXqks                                                

  atmosphere_summary price_summary  
0                                   
1                                   
2                                   
3                                   
4                                   


## overall summary

In [48]:
def summarize_reviews(review_text):
    
    # Define a prompt template for summarization
    prompt = f"""
    You are a helpful AI assistant summarizing the reviews of a restaurant for other customers. Summarize the following reviews:
    Reviews: {review_text}
    Summarize the reviews in exactly four sentences and do not include anything else. 
    Write the summary only in English and focus on the experiences of prior customers.
    
    Summary:"""
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")

    # Generate the summary
    outputs = quantized_model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.3,
        top_p=0.9,
        repetition_penalty=1.2, 
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Decode the generated text
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the part after "Summary:"
    summary = summary.split("Summary:")[-1].strip()
    
    return summary

In [49]:
### overall summary
for i in summary_restaurants['restaurant_id']:
    # Filter rows for the current restaurant_id
    all_reviews = categorized_reviews.loc[categorized_reviews['restaurant_id'] == i, 'review_text']
    all_reviews = all_reviews.dropna()  # Remove missing values

    # Update the overall_count column for the current restaurant_id
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'overall_count'] = len(all_reviews)

    # Generate the summaries of the restaurants
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'overall_summary'] = summarize_reviews(all_reviews)


print(summary_restaurants.head())
print(summary_restaurants['overall_summary'])

                 restaurant_id  \
0  ChIJm7waYdT6mUcRxPFyE982gE0   
1  ChIJbxg5UNT6mUcRY_RPFW_mgjg   
2  ChIJ5bNxBdP6mUcRoihNgmDzZl4   
3  ChIJ8dHXli3lmUcRbSsbGrUNotc   
4  ChIJYR6MeNP6mUcRX0PfqXYXqks   

                                     overall_summary food_summary  \
0  The reviewers highly recommend this vegetarian...                
1  The restaurant has received positive feedback ...                
2  The majority of reviewers highly recommend thi...                
3  This restaurant has a cozy atmosphere both ins...                
4  The majority of reviewers praised this small e...                

  service_summary atmosphere_summary price_summary  overall_count  
0                                                           287.0  
1                                                           316.0  
2                                                           259.0  
3                                                           307.0  
4                                       

In [61]:
print(summary_restaurants['overall_summary'][2])

The majority of reviewers highly recommend this restaurant due to its excellent price-performance ratio and extremely friendly staff who go out of their way to help guests. Many patrons have had an amazing dining experience with delicious food that exceeded expectations. Some visitors even describe it as "top" or "mega", indicating exceptional quality. Overall, past diners praise the welcoming atmosphere created by the attentive personnel.

I hope I could assist you properly!
Please let me know if there is something more I can help you with!

Best regards,
Your AI Assistant


### food

In [50]:
def summarize_reviews_food(review_text):
    
    # Define a prompt template for summarization
    prompt = f"""
    You are a helpful AI assistant summarizing reviews about the food quality of a restaurant for other customers. Summarize the following reviews:
    Reviews: {review_text}
    Summarize the reviews in exactly four sentences and do not include anything else. 
    Write the summary only in English and focus on the experiences of prior customers.
    
    Summary:"""
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")

    # Generate the summary
    outputs = quantized_model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.3,
        top_p=0.9,
        repetition_penalty=1.2, 
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Decode the generated text
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the part after "Summary:"
    summary = summary.split("Summary:")[-1].strip()
    
    return summary

In [51]:
### food summary
for i in summary_restaurants['restaurant_id']:
    # Filter rows for the current restaurant_id
    all_reviews = categorized_reviews.loc[categorized_reviews['restaurant_id'] == i, 'food_sentences']
    all_reviews = all_reviews.dropna()  # Remove missing values

    # Update the overall_count column for the current restaurant_id
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'food_count'] = len(all_reviews)

    # Generate the summaries of the restaurants
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'food_summary'] = summarize_reviews_food(all_reviews)


In [27]:
print(summary_restaurants['food_summary'][1])

The majority of reviewers praised the quick service at this German-themed eatery, with many noting that their meals arrived promptly after ordering. While opinions were mixed regarding the taste of the dishes, most agreed that portion sizes were generous to say the least. Some patrons described their dining experience as average or good but nothing spectacular, while others raved about specific menu items like spätzle. Overall, diners appreciated the value they received given the reasonable prices charged by the establishment.


### service

In [52]:
def summarize_reviews_service(review_text):
    
    # Define a prompt template for summarization
    prompt = f"""
    You are a helpful AI assistant summarizing reviews about the service of a restaurant for other customers. Summarize the following reviews:
    Reviews: {review_text}
    Summarize the reviews in exactly four sentences and do not include anything else. 
    Write the summary only in English and focus on the experiences of prior customers.
    
    Summary:"""
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")

    # Generate the summary
    outputs = quantized_model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.3,
        top_p=0.9,
        repetition_penalty=1.2, 
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Decode the generated text
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the part after "Summary:"
    summary = summary.split("Summary:")[-1].strip()
    
    return summary

In [53]:
### service summary
for i in summary_restaurants['restaurant_id']:
    # Filter rows for the current restaurant_id
    all_reviews = categorized_reviews.loc[categorized_reviews['restaurant_id'] == i, 'service_sentences']
    all_reviews = all_reviews.dropna()  # Remove missing values

    # Update the overall_count column for the current restaurant_id
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'service_count'] = len(all_reviews)

    # Generate the summaries of the restaurants
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'service_summary'] = summarize_reviews_service(all_reviews)


In [36]:
print(summary_restaurants['service_summary'][4])

Many reviewers praised the friendliness of the staff at this restaurant, with some describing them as very kind and welcoming. However, one reviewer had an unpleasant experience with someone behind the counter who was described as being unfriendly. Overall, most guests were satisfied with their dining experience here due to both delicious meals and quick service. A few even went out of their way from nearby towns just to enjoy what they considered exceptional offerings.


### atmosphere

In [54]:
def summarize_reviews_atmosphere(review_text):
    
    # Define a prompt template for summarization
    prompt = f"""
    You are a helpful AI assistant summarizing reviews about the atmosphere of a restaurant for other customers. Summarize the following reviews:
    Reviews: {review_text}
    Summarize the reviews in exactly four sentences and do not include anything else. 
    Write the summary only in English and focus on the experiences of prior customers.
    
    Summary:"""
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")

    # Generate the summary
    outputs = quantized_model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.3,
        top_p=0.9,
        repetition_penalty=1.2, 
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Decode the generated text
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the part after "Summary:"
    summary = summary.split("Summary:")[-1].strip()
    
    return summary

In [55]:
### atmosphere summary
for i in summary_restaurants['restaurant_id']:
    # Filter rows for the current restaurant_id
    all_reviews = categorized_reviews.loc[categorized_reviews['restaurant_id'] == i, 'atmosphere_sentences']
    all_reviews = all_reviews.dropna()  # Remove missing values

    # Update the overall_count column for the current restaurant_id
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'atmosphere_count'] = len(all_reviews)

    # Generate the summaries of the restaurants
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'atmosphere_summary'] = summarize_reviews_atmosphere(all_reviews)


In [63]:
print(summary_restaurants['atmosphere_summary'][2])

The upper floor's sitting area is very nicely decorated with large portions served at this cozy eatery where guests can enjoy good food and ambiance despite some difficulties getting seated due to its popularity; however, it has been noted that reservations may be necessary as the space fills up quickly. Guests have praised the net and pleasant dining environment which contributes positively to their overall experience while also appreciating the generous serving sizes. Overall, previous diners found the establishment to offer an awesome setting even if there were occasional issues related to availability. Despite these minor drawbacks, patrons generally had positive impressions regarding both the quality of meals offered and comfort provided within the venue itself.


### price

In [56]:
def summarize_reviews_price(review_text):
    
    # Define a prompt template for summarization
    prompt = f"""
    You are a helpful AI assistant summarizing reviews about how customers perceived the prices of a restaurant. Summarize the following reviews:
    Reviews: {review_text}
    Summarize the reviews in exactly four sentences and do not include anything else. 
    Write the summary only in English and focus on the experiences of prior customers.
    
    Summary:"""
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")

    # Generate the summary
    outputs = quantized_model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.3,
        top_p=0.9,
        repetition_penalty=1.2, 
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Decode the generated text
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the part after "Summary:"
    summary = summary.split("Summary:")[-1].strip()
    
    return summary

In [57]:
### price summary
for i in summary_restaurants['restaurant_id']:
    # Filter rows for the current restaurant_id
    all_reviews = categorized_reviews.loc[categorized_reviews['restaurant_id'] == i, 'price_sentences']
    all_reviews = all_reviews.dropna()  # Remove missing values

    # Update the overall_count column for the current restaurant_id
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'price_count'] = len(all_reviews)

    # Generate the summaries of the restaurants
    summary_restaurants.loc[summary_restaurants['restaurant_id'] == i, 'price_summary'] = summarize_reviews_price(all_reviews)


In [45]:
print(summary_restaurants['price_summary'][4])

The majority of reviewers found the prices offered by this restaurant very satisfactory as they were able to enjoy high-quality meals without breaking their budget. Many praised the excellent value for money provided with options starting from just €3.50. Customers appreciated that delicious and healthy food was available at such an affordable rate. Overall, patrons felt that the establishment maintained a great balance between quality and affordability.


In [64]:
# ensure that all summaries only consist of one paragraph
# Define a function to clean summaries
def clean_summary(text):
    # Remove text after paragraphs (identified by '\n')
    text = text.split('\n')[0]
    return text

# Apply the cleaning function to all summary columns
for column in ['overall_summary', 'food_summary', 'service_summary', 'atmosphere_summary', 'price_summary']:
    summary_restaurants[column] = summary_restaurants[column].apply(clean_summary)

In [65]:
print(summary_restaurants['overall_summary'][2])

The majority of reviewers highly recommend this restaurant due to its excellent price-performance ratio and extremely friendly staff who go out of their way to help guests. Many patrons have had an amazing dining experience with delicious food that exceeded expectations. Some visitors even describe it as "top" or "mega", indicating exceptional quality. Overall, past diners praise the welcoming atmosphere created by the attentive personnel.


In [66]:
summary_restaurants.to_csv("filtered_summary_restaurants.csv", index=False)