In [21]:
# Always include these two lines.
# Setting ast_node_interactivity to "all" makes the notebook display the value
# of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Importing required libraries
import requests  # To make HTTP requests and fetch data from APIs
import pandas as pd  # To store, manipulate, and clean tabular data
import sqlite3  # To interact with an SQLite database for data storage
import json  # To read and write JSON data returned by APIs
import matplotlib.pyplot as plt  # Optional, for data visualization
import os  # To work with paths, the working directory, and other OS functions


# Display confirmation that every import above succeeded
print("Libraries imported successfully!")

Libraries imported successfully!


In [22]:
# Show the working directory before changing it
print(os.getcwd())

# Path to the data folder where the downloaded data will be stored.
# The default below is specific to the original author's machine. To run the
# notebook elsewhere without editing this cell, set the DAEN328_DATA_DIR
# environment variable to your own data folder before starting Jupyter.
# If you prefer to edit the default instead, on Windows you can either use
# a) directory= "C:\\Users\\achir\\OneDrive\\Documents\\Data Engineering\\Projects\\Project 2\\Data"
#    Note the use of double backslashes.
# b) directory=r"C:\Users\achir\OneDrive\Documents\Data Engineering\Projects\Project 2"
#    Note that the 'r' prefix makes it a raw string, so backslashes (\) are not treated as escape characters.

directory = os.environ.get(
    "DAEN328_DATA_DIR",
    '/Users/munishshah/Documents/GitHub/daen328_project/term_project/data',
)
# The folder must already exist; os.chdir raises FileNotFoundError otherwise.
os.chdir(directory)
# Verify the change
print(os.getcwd())

/Users/munishshah/Documents/GitHub/daen328_project/term_project/data
/Users/munishshah/Documents/GitHub/daen328_project/term_project/data


In [23]:
def fetch_api_data(api_url, output_file, batch_size=290342, num_records=None):
    """
    Fetch records from an API page by page using the $limit and $offset query
    parameters, saving the accumulated records to a JSON file after every page.

    If output_file already exists and contains a JSON list, the download resumes
    from the number of records already saved instead of starting over.

    Parameters:
    - api_url (str): The base URL of the API. It may already contain a query
      string (for example "...json?$order=:id"); the paging parameters are then
      appended with "&" instead of "?".
    - output_file (str): Path to the JSON file to save data incrementally.
    - batch_size (int): Number of records to request per call (default: 290342).
    - num_records (int or None): Maximum total number of records to hold
      (including records already present in output_file). If None, fetch all
      records. Records already saved are never discarded, so the result can
      exceed num_records only when the existing file already did.

    Returns:
    - list: All records (previously saved plus newly fetched).

    Notes:
    - Request/HTTP/JSON-decoding errors are printed and end the download early;
      whatever was fetched so far is still returned and has been saved.
    - NOTE(review): the function does not add an ordering clause itself. Offset
      paging is only reliable if the API returns rows in a stable order; if the
      API needs an explicit order parameter, include it in api_url.
    """
    # Load previously saved records so an interrupted download can resume.
    all_data = []
    if os.path.exists(output_file):
        with open(output_file, "r") as f:
            try:
                loaded = json.load(f)
            except json.JSONDecodeError:
                loaded = None
        # Only a JSON list can be resumed from; anything else is treated as corrupt.
        if isinstance(loaded, list):
            all_data = loaded
            print(f"Resuming from {len(all_data)} records in {output_file}.")
        else:
            print(f"{output_file} is corrupted or empty. Starting fresh.")

    # The starting offset is the number of records we already have.
    offset = len(all_data)
    print(f"Starting from offset {offset}...")

    # Support base URLs that already carry query parameters.
    separator = "&" if "?" in api_url else "?"

    while True:
        # Stop before issuing a request once the requested cap is satisfied.
        if num_records is not None and len(all_data) >= num_records:
            print(f"Reached the specified number of records: {num_records}.")
            break

        # Never ask for more than what is still missing, so the cap is exact.
        limit = batch_size
        if num_records is not None:
            limit = min(batch_size, num_records - len(all_data))

        paginated_url = f"{api_url}{separator}$limit={limit}&$offset={offset}"
        print(f"Fetching records starting at offset {offset}...")

        # Fetch data from the API
        try:
            # (connect, read) timeouts so a stalled connection cannot hang the notebook forever.
            response = requests.get(paginated_url, timeout=(10, 300))
            response.raise_for_status()
            batch_data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers non-JSON bodies on requests versions whose
            # JSON error is not a RequestException subclass.
            print(f"Error fetching data: {e}")
            break

        # Stop if no more data is returned
        if not batch_data:
            print("No more data to fetch.")
            break

        # Append the batch to the combined data list
        all_data.extend(batch_data)

        # Save the updated data to the output file incrementally
        with open(output_file, "w") as f:
            json.dump(all_data, f, indent=2)
        print(f"Appended {len(batch_data)} records. Total records saved: {len(all_data)}")

        # Advance by what was actually received so the offset always equals len(all_data).
        offset += len(batch_data)

        # A short page means the dataset has been exhausted.
        if len(batch_data) < limit:
            print("Reached the end of the dataset.")
            break

    print(f"Fetched a total of {len(all_data)} records. Data saved to {output_file}.")
    return all_data




In [24]:
# API URL for the City of Chicago Data Portal inspections dataset (resource 4ijn-s7e5).
# Each record describes one inspection (inspection_id, dba_name, risk, results, ...).
api_url = "https://data.cityofchicago.org/resource/4ijn-s7e5.json"

# Path of the JSON file the downloaded records are saved to, inside the data
# folder held in `directory`. Adjust `directory` for your own system.
json_file_path = os.path.join(directory, "api_data.json")

# Fetch the data. If json_file_path already exists, fetch_api_data resumes from it.
# NOTE(review): batch_size appears to be sized to the dataset's row count at the
# time of the last run so everything arrives in one request — confirm.
api_data = fetch_api_data(api_url=api_url, output_file=json_file_path, batch_size=290342, num_records=None)

# Verify the total number of records fetched
print(f"Total records fetched: {len(api_data)}")

# Display a sample of the data to inspect
if api_data:
    print("Sample data (first 5 records):")
    print(json.dumps(api_data[:5], indent=2))

Starting from offset 0...
Fetching records starting at offset 0...
Appended 290342 records. Total records saved: 290342
Fetching records starting at offset 290342...
No more data to fetch.
Fetched a total of 290342 records. Data saved to /Users/munishshah/Documents/GitHub/daen328_project/term_project/data/api_data.json.
Total records fetched: 290342
Sample data (first 5 records):
[
  {
    "inspection_id": "2616252",
    "dba_name": "ROTI MODERN MEDITERRANEAN",
    "aka_name": "ROTI MODERN MEDITERRANEAN",
    "license_": "2594670",
    "facility_type": "Restaurant",
    "risk": "Risk 1 (High)",
    "address": "1012 W RANDOLPH ST",
    "city": "CHICAGO",
    "state": "IL",
    "zip": "60607",
    "inspection_date": "2025-04-23T00:00:00.000",
    "inspection_type": "Canvass",
    "results": "Pass",
    "latitude": "41.88442576290219",
    "longitude": "-87.65268988514183",
    "location": {
      "latitude": "41.88442576290219",
      "longitude": "-87.65268988514183",
      "human_addre

In [25]:

# Read the downloaded JSON file into a DataFrame
df = pd.read_json(json_file_path)

# Display the first few rows to inspect the structure
print("Sample DataFrame (first 5 rows):")
print(df.head())

# Display information about the DataFrame's structure and data types.
# df.info() prints its report itself and returns None, so it is called directly
# rather than wrapped in print() (which would add a stray "None" line).
print("\nDataFrame Info:")
df.info()

# Display summary statistics for numeric columns
print("\nSummary Statistics for Numeric Columns:")
print(df.describe())


# Additional explanation:
# 1. pd.read_json(json_file_path): loads the JSON file into a pandas DataFrame.
# 2. df.head(): Displays the first 5 rows of the DataFrame for a quick overview.
# 3. df.info(): Shows column names, data types, and non-null counts.
# 4. df.describe(): Provides basic statistics (e.g., mean, min, max) for numeric columns.


   inspection_id                   dba_name                   aka_name  \
0        2616252  ROTI MODERN MEDITERRANEAN  ROTI MODERN MEDITERRANEAN   
1        2616253     MAGGIE GYROS & CHICKEN     MAGGIE GYROS & CHICKEN   
2        2616226    FORNO ROSSO ON RANDOLPH    FORNO ROSSO ON RANDOLPH   
3        2616137   HOOK POINT FISHERIES LLC   HOOK POINT FISHERIES LLC   
4        2616171          TO KOREAN CUISINE          TO KOREAN CUISINE   

    license_ facility_type           risk             address     city state  \
0  2594670.0    Restaurant  Risk 1 (High)  1012 W RANDOLPH ST  CHICAGO    IL   
1  1869774.0    Restaurant  Risk 1 (High)       349 E 47TH ST  CHICAGO    IL   
2  2358717.0    Restaurant  Risk 1 (High)  1048 W RANDOLPH ST  CHICAGO    IL   
3  3020919.0           NaN            NaN   4755 S TALMAN AVE  CHICAGO    IL   
4  2997283.0    Restaurant  Risk 1 (High)     3108 N BROADWAY  CHICAGO    IL   

       zip  ...    results   latitude  longitude  \
0  60607.0  ...       

In [26]:
# Print the column list with each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290342 entries, 0 to 290341
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   inspection_id                290342 non-null  int64  
 1   dba_name                     290342 non-null  object 
 2   aka_name                     287919 non-null  object 
 3   license_                     290324 non-null  float64
 4   facility_type                285136 non-null  object 
 5   risk                         290260 non-null  object 
 6   address                      290342 non-null  object 
 7   city                         290182 non-null  object 
 8   state                        290284 non-null  object 
 9   zip                          290302 non-null  float64
 10  inspection_date              290342 non-null  object 
 11  inspection_type              290341 non-null  object 
 12  results                      290342 non-null  object 
 13 