In [2]:
import pandas as pd
from dateutil.relativedelta import relativedelta
from datetime import datetime
import matplotlib.pyplot as plt

In [3]:
# loading the data
# All input CSVs are read from one directory. Keeping that directory in a
# single constant means the notebook can be pointed at another machine's copy
# by editing one line instead of five.
DATA_DIR = "C:/Users/Theresa/Downloads"

# review tables (one row per review; joined on review_id further below)
reviews_general_selected = pd.read_csv(f"{DATA_DIR}/filtered_reviews_general.csv")
reviews_additional_selected = pd.read_csv(f"{DATA_DIR}/filtered_reviews_additional.csv")

# restaurant-level API exports
api_general = pd.read_csv(f"{DATA_DIR}/api_general.csv")
api_additional = pd.read_csv(f"{DATA_DIR}/api_additional.csv")
api_basics = pd.read_csv(f"{DATA_DIR}/api_basics.csv")

In [4]:
# merging the data
# Step 1: combine both review tables on review_id. The outer join keeps
# reviews that appear in only one of the two tables.
reviews = reviews_general_selected.merge(
    reviews_additional_selected,
    how="outer",
    on="review_id",
)

# Step 2: attach the restaurant basics to every review. The left join keeps
# all review rows even when no matching restaurant id exists.
# NOTE(review): "restaurant_id_x" is presumably the left table's restaurant_id
# after pandas added its default "_x" suffix in step 1 - confirm.
reviews_full = reviews.merge(
    api_basics,
    how="left",
    left_on="restaurant_id_x",
    right_on="id",
)

In [5]:
# function to calculate the actual date
def calculate_actual_date(row):
    """Turn a relative German review age into an absolute date.

    Reads ``row["review_date"]`` (for example ``"vor 3 Monaten"`` or
    ``"vor einer Woche"``) and subtracts the stated span from
    ``row["scraping_date"]``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"review_date"`` and ``"scraping_date"``.
        ``scraping_date`` must support subtraction of a ``relativedelta``
        (months/years) or a pandas ``Timedelta`` (weeks/days).

    Returns
    -------
    datetime-like or None
        ``scraping_date`` minus the parsed span. ``None`` is returned when
        the review text is not a string, the scraping date is missing, no
        known unit (Monat, Jahr, Woche, Tag) occurs in the text, or no count
        can be read from the text.
    """
    review_text = row["review_date"]
    scraping_date = row["scraping_date"]

    # Nothing can be computed without a textual age and a reference date.
    if not isinstance(review_text, str) or pd.isna(scraping_date):
        return None

    # (unit substring, article that means "one", offset factory).
    # The order is the precedence in which units are checked.
    units = (
        ("Monat", "einem", lambda count: relativedelta(months=count)),
        ("Jahr", "einem", lambda count: relativedelta(years=count)),
        ("Woche", "einer", lambda count: pd.to_timedelta(count * 7, unit="days")),
        ("Tag", "einem", lambda count: pd.to_timedelta(count, unit="days")),
    )

    for unit_marker, singular_article, make_offset in units:
        if unit_marker not in review_text:
            continue

        if singular_article in review_text:
            # "vor einem Monat" / "vor einer Woche" -> exactly one unit
            amount = 1
        else:
            # Take the first purely numeric token instead of a fixed word
            # position, so prefixed or otherwise unexpected texts do not
            # raise and abort a whole DataFrame.apply run.
            amount = next(
                (int(token) for token in review_text.split() if token.isdecimal()),
                None,
            )
            if amount is None:
                return None

        return scraping_date - make_offset(amount)

    # default if no known unit is found
    return None

In [6]:
# convert scraping_date to datetime
reviews_full["scraping_date"] = pd.to_datetime(reviews_full["scraping_date"])

# apply the function to the dataframe (row-wise, one date or None per review).
# The result is passed through pd.to_datetime so the column always has a
# datetime dtype: if every row yielded None the raw result would be an
# object column and the .dt accessor used below would not be available.
reviews_full["actual_review_date"] = pd.to_datetime(
    reviews_full.apply(calculate_actual_date, axis=1)
)

In [7]:
# add column month and year
# Both labels are formatted strings derived from the computed review date:
# review_month looks like "03/2024", review_year like "2024".
actual_date_accessor = reviews_full["actual_review_date"].dt
reviews_full["review_month"] = actual_date_accessor.strftime("%m/%Y")
reviews_full["review_year"] = actual_date_accessor.strftime("%Y")

In [8]:
# group by month and restaurant
# For each rating column, compute the mean and the number of non-null values
# per (review_month, restaurant) pair. Output columns are named
# "<rating>_mean" and "<rating>_count", in the order listed below.
reviews_grouped_month = (
    reviews_full
    .groupby(["review_month", "restaurant_id_x"])
    .agg(**{
        f"{rating}_{statistic}": (rating, statistic)
        for rating in (
            "stars",
            "dining_stars_service",
            "dining_stars_food",
            "dining_stars_atmosphere",
        )
        for statistic in ("mean", "count")
    })
    .reset_index()
)

In [9]:
# group by year and restaurant
# Same statistics as the monthly table, keyed by (review_year, restaurant):
# mean and non-null count of every rating column.
reviews_grouped_year = (
    reviews_full
    .groupby(["review_year", "restaurant_id_x"])
    .agg(**{
        f"{rating_column}_{aggregation}": pd.NamedAgg(
            column=rating_column, aggfunc=aggregation
        )
        for rating_column in (
            "stars",
            "dining_stars_service",
            "dining_stars_food",
            "dining_stars_atmosphere",
        )
        for aggregation in ("mean", "count")
    })
    .reset_index()
)

In [10]:
# group by price range and restaurant
# Counts the non-null dining_price_range entries for every
# (dining_price_range, restaurant) pair.
dining_price_range_group = (
    reviews_full
    .groupby(["dining_price_range", "restaurant_id_x"])
    .agg(
        dining_price_range_count=pd.NamedAgg(
            column="dining_price_range", aggfunc="count"
        )
    )
    .reset_index()
)

In [14]:
# saving the dataframes
# Each aggregated table is written to the current working directory without
# the index column.
for output_name, output_frame in (
    ("reviews_grouped_year.csv", reviews_grouped_year),
    ("reviews_grouped_month.csv", reviews_grouped_month),
    ("dining_price_range_group.csv", dining_price_range_group),
):
    output_frame.to_csv(output_name, index=False)

In [11]:
# # add column month and year
# reviews_full["review_month"] = reviews_full["actual_review_date"].dt.strftime("%m/%Y")

# # Group by review_month and restaurant_id, and calculate the average stars
# avg_stars_month = reviews_full.groupby(["review_month", "restaurant_id_x"])["stars"].mean().reset_index()

# # Pivot the DataFrame for visualization
# avg_stars_month_pivot = avg_stars_month.pivot(index="review_month", columns="restaurant_id_x", values="stars")

# avg_stars_month_pivot = avg_stars_month_pivot["ChIJU3_pY4T7mUcREub8_JVTsJc"]


# plt.figure(figsize=(12, 6))
# avg_stars_month_pivot.plot(marker="o", title="Average Stars Per Month and Restaurant ID")
# plt.xlabel("Month")
# plt.ylabel("Average Stars")
# plt.legend(title="Restaurant ID")
# plt.grid(True)
# plt.show()

In [12]:
# # Add column for review_week in "YYYY-KW<week>" format (ISO year and ISO calendar week)
# reviews_full["review_week"] = reviews_full["actual_review_date"].dt.isocalendar().year.astype(str) + "-KW" + reviews_full["actual_review_date"].dt.isocalendar().week.astype(str)

# # Group by review_week and restaurant_id, and calculate the average stars
# avg_stars_week = reviews_full.groupby(["review_week", "restaurant_id_x"])["stars"].mean().reset_index()

# # Pivot the DataFrame for visualization
# avg_stars_week_pivot = avg_stars_week.pivot(index="review_week", columns="restaurant_id_x", values="stars")

# # Filter for a specific restaurant (optional, change to desired ID)
# avg_stars_week_pivot = avg_stars_week_pivot["ChIJU3_pY4T7mUcREub8_JVTsJc"]

# # Plot the data
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 6))
# avg_stars_week_pivot.plot(marker="o", title="Average Stars Per Kalenderwoche and Restaurant ID")
# plt.xlabel("Kalenderwoche")
# plt.ylabel("Average Stars")
# plt.legend(title="Restaurant ID")
# plt.grid(True)
# plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
# plt.show()



In [13]:
# # Group by city and restaurant, and compute the average star rating
# avg_stars_per_restaurant_per_city = reviews_full.groupby(["city", "restaurant_id_x"])["stars"].mean().reset_index(name="avg_stars")

# # Select the top 5 restaurants per city by average star rating
# top_5_restaurants_by_stars_per_city = avg_stars_per_restaurant_per_city.groupby("city").apply(lambda x: x.nlargest(5, "avg_stars")).reset_index(drop=True)

# # Show the top 5 restaurants per city by average star rating
# print(top_5_restaurants_by_stars_per_city)

# # Optional: visualize the top 5 restaurants per city (for one specific city only)
# # Example: select the top 5 restaurants of a single city (e.g. "Tübingen")
# city_to_plot = "Tübingen"
# top_5_in_city = top_5_restaurants_by_stars_per_city[top_5_restaurants_by_stars_per_city["city"] == city_to_plot]

# # Show the top 5 restaurants per city by average star rating
# print(top_5_restaurants_by_stars_per_city)
