In [22]:
import pandas as pd
import json
import requests
import numpy as np

In [None]:
class RestaurantDataProcessor:
    def __init__(self, url, country_code_file, output_file_path):
        self.url = url
        self.country_code_file = country_code_file
        self.output_file_path = output_file_path

    def fetch_json_data(self):
        try:
            response = requests.get(self.url)
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching restaurant data: {e}")
            return None

    def read_country_code_data(self):
        try:
            return pd.read_csv(self.country_code_file)
        except FileNotFoundError as e:
            print(f"Error reading country code file: {e}")
            return None

    def merge_data(self, parsed_data, country_code_df):
        main_restaurant_df = pd.json_normalize(parsed_data, "restaurants")
        merged_df = pd.merge(
            main_restaurant_df, country_code_df, 
            how='left', left_on='restaurant.location.country_id', 
            right_on='Country Code'
        )
        return merged_df

    def process_merged_data(self, merged_df):
        extract_restaurant_df = merged_df[[
            'restaurant.R.res_id', 'restaurant.name', 'Country', 
            'restaurant.location.city', 'restaurant.user_rating.votes', 
            'restaurant.user_rating.aggregate_rating', 'restaurant.cuisines'
        ]]
        renamed_columns = {
            'restaurant.R.res_id': 'Restaurant Id',
            'restaurant.name': 'Restaurant Name',
            'restaurant.location.city': 'City',
            'restaurant.user_rating.votes': 'User Rating Votes',
            'restaurant.user_rating.aggregate_rating': 'User Aggregate Rating',
            'restaurant.cuisines': 'Cuisines'
        }
        renamed_restaurant_df = extract_restaurant_df.rename(columns=renamed_columns)
        renamed_restaurant_df['User Aggregate Rating'] = renamed_restaurant_df['User Aggregate Rating'].astype('float64')
        final_restaurant_df = renamed_restaurant_df.dropna()
        return final_restaurant_df

    def export_df_to_csv(self, df):
        try:
            df.to_csv(self.output_file_path, index=False)
            print(f"Data exported successfully to {self.output_file_path}")
        except Exception as e:
            print(f"Error exporting data: {e}")

    def run(self):
        json_data = self.fetch_json_data()
        if json_data:
            country_code_df = self.read_country_code_data()
            merged_df = self.merge_data(json_data, country_code_df)
            final_df = self.process_merged_data(merged_df)
            self.export_df_to_csv(final_df)
        else:
            print("Failed to fetch or process data.")

In [None]:
if __name__ == "__main__":
    url = "https://raw.githubusercontent.com/Papagoat/brain-assessment/main/restaurant_data.json"
    country_code_file = "Country-Code.csv"
    output_file_path = "restaurants.csv"
    processor = RestaurantDataProcessor(url, country_code_file, output_file_path)
    processor.run()

### Part 2

In [10]:
"""
Expanding the restaurant.zomato_events column 
"""

# Unpack the lists in the events column vertically
expended_events_df = main_restaurant_df.explode("restaurant.zomato_events")

# Unpack the dictionaries within each element of the column
unpacked_events_df = pd.json_normalize(expended_events_df["restaurant.zomato_events"])

# Merge the unpacked DataFrame with the original DataFrame
events_df = pd.concat([main_restaurant_df, unpacked_events_df], axis=1)

In [11]:
"""
Filtering past event in the month of April 2019 
"""

# Change event start and end date to a date data type
events_df['event.start_date'] = pd.to_datetime(events_df["event.start_date"])
events_df['event.end_date'] = pd.to_datetime(events_df["event.end_date"])


# Define the condition for events starting in or before April 2019
start_date_condition = (
    (events_df['event.start_date'].dt.year == 2019) & 
    (events_df['event.start_date'].dt.month <= 4)
)

# Define the condition for events ending in or after April 2019
end_date_condition = (
    (events_df['event.end_date'].dt.year == 2019) & 
    (events_df['event.end_date'].dt.month >= 4)
)

# Apply the conditions to filter the DataFrame
April2019_events_df = events_df[(start_date_condition | end_date_condition)]

In [12]:
"""
Extraction and renaming of required columns
"""

# Extract required Columns
April2019_events_df = April2019_events_df[[
    "event.event_id", "restaurant.id",
    "restaurant.name", "restaurant.photos_url",
    "event.title", "event.start_date",
    "event.end_date"
    ]]

# Rename required columns
columns_to_rename = {
    "event.event_id":"Event Id", 
    "restaurant.id":"Restaurant Id",
    "restaurant.name":"Restaurant Name", 
    "restaurant.photos_url":"Photo URL",
    "event.title":"Event Title", 
    "event.start_date":"Event Start Date",
    "event.end_date":"Event End Date"
}

final_events_df = April2019_events_df.rename(columns=columns_to_rename)

In [14]:
"""
To check duplicate values
"""

final_events_df.duplicated().sum()

0

In [15]:
"""
Export final_events_df to CSV
"""

final_events_df.to_csv(input(), index=False)

# My input was C:\\Users\\junke\\Desktop\\Important Documents (JunKeat)\\GovTech-Application\\restaurant_events.csv

### Part 3

In [16]:
"""
Filter for the specified rating texts only
"""

specified_texts = ['Excellent', 'Very Good', 'Good', 'Average', 'Poor']
filtered_rating_df = main_restaurant_df[main_restaurant_df['restaurant.user_rating.rating_text'].isin(specified_texts)]

In [17]:
"""
To check for null values in the aggregate rating column"""

filtered_rating_df['restaurant.user_rating.aggregate_rating'].isna().sum()

0

In [18]:
"""
Changing the aggregate rating column to float data type
"""

filtered_rating_df['restaurant.user_rating.aggregate_rating'] = filtered_rating_df['restaurant.user_rating.aggregate_rating'].astype("float64")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rating_df['restaurant.user_rating.aggregate_rating'] = filtered_rating_df['restaurant.user_rating.aggregate_rating'].astype("float64")


In [19]:
"""
Analyze the distribution of aggregate ratings for each rating text
"""

aggregate = [ 'min', 'max']

rating_statistics = filtered_rating_df.groupby('restaurant.user_rating.rating_text')['restaurant.user_rating.aggregate_rating'].agg(aggregate)


In [20]:
"""
Export rating_statistics to json
"""

rating_statistics.to_json(input(), index=False)

# My input was C:\\Users\\junke\\Desktop\\Important Documents (JunKeat)\\GovTech-Application\\restaurant_data.json

In [21]:
pd.set_option('display.max_columns', 500)