# Minneapolis Food Inspections - ETL

In [None]:
# Dependencies

import pandas as pd
# import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from config import (user, password, host, port, database)

## Extraction Phase

In [None]:
# Extract: load the inspections CSV (path is relative to the working directory)

source_csv = "Food_Inspections.csv"
food_insp_df = pd.read_csv(source_csv)

## Transformation Phase

In [None]:
# Remove the columns listed below; none of them is referenced by any
# later step of this notebook

columns_to_drop = [
    'X', 'Y', 'HealthFacilityIDNumber', 'AddrNum', 'AddrNumSuffix',
    'StreetName', 'StreetType', 'PostDir', 'Unit', 'City', 'State',
    'County', 'ZipCode', 'APN', 'Ward', 'X_WebMercator', 'Y_WebMercator',
]

food_insp_df_clean = food_insp_df.drop(labels=columns_to_drop, axis="columns")

In [None]:
# Keep only fully populated rows: a row is discarded as soon as any one
# of its remaining columns holds a missing value

food_insp_df_clean = food_insp_df_clean.dropna(axis="index", how="any")

In [None]:
# Keep only the rows whose FacilityCategory is exactly "RESTAURANT"

is_restaurant = food_insp_df_clean["FacilityCategory"].eq("RESTAURANT")
restaurants_df = food_insp_df_clean[is_restaurant]
restaurants_df.head()

In [None]:
# Remove the inspection/violation detail columns, which are not needed
# from here on, together with FacilityCategory

inspection_details = [
    'FoodCodeText', 'FacilityCategory', 'RiskLevel', 'InspectorComments',
    'InspectionResult', 'ViolationPriority', 'FoodCodeItem', 'YearOfInspection',
    'ViolationStatus', 'ViolationPoints', 'ViolationIDNumber',
]

restaurants_df_clean = restaurants_df.drop(labels=inspection_details, axis="columns")
restaurants_df_clean.head(n=10)

In [None]:
# Collapse to one row per inspection: rows that share an InspectionIDNumber
# are treated as duplicates and only the first occurrence is kept

restaurants_df_unique = restaurants_df_clean.drop_duplicates(
    subset=["InspectionIDNumber"], keep="first"
)
restaurants_df_unique.head()

In [None]:
# Before renaming and rearranging columns, check that OBJECTID can act as
# the primary key column
## Its number of distinct values should equal the number of rows

frame_dimensions = restaurants_df_unique.shape  # (rows, columns)
distinct_object_ids = restaurants_df_unique["OBJECTID"].nunique()
print(frame_dimensions)
print(distinct_object_ids)

In [None]:
## The number of unique OBJECTID values equals the number of rows, so the column can be used as the primary key
## "OBJECTID" becomes "id" and every other column is renamed to snake case, to comply with PostgreSQL standards
## The Inspection ID column is dropped as well: no later step of this notebook uses it

new_column_names = {
    "OBJECTID": "id",
    "BusinessName": "business_name",
    "FullAddress": "address",
    "InspectionType": "inspection_type",
    "DateOfInspection": "inspection_date",
    "InspectionScore": "inspection_score",
    "Neighborhood": "neighborhood",
    "Latitude": "latitude",
    "Longitude": "longitude",
}

# The dropped column is not part of the rename mapping, so dropping it
# before renaming gives the same result as renaming first
without_inspection_id = restaurants_df_unique.drop(columns="InspectionIDNumber")
restaurants_df_lower = without_inspection_id.rename(columns=new_column_names)
restaurants_df_lower.head()

In [None]:
# Keep only the date component of inspection_date: the text before the
# first space (whatever follows it, i.e. the time, is discarded)
# The split uses pandas' vectorized string methods instead of a Python-level
# loop; a missing value passes through as NaN rather than raising

inspection_dates = restaurants_df_lower["inspection_date"]
# n=1: only the first piece is kept, so one split per value is enough
new_insp_dates = inspection_dates.str.split(" ", n=1).str[0]
restaurants_df_with_dates = restaurants_df_lower.assign(inspection_date=new_insp_dates)
restaurants_df_with_dates

## Loading Phase

In [None]:
# Create Postgres engine
# The user name and password are percent-encoded before being placed in the
# connection URL. Without this, reserved characters inside them (for example
# "@", ":", "/", "%" or "#") are read as URL delimiters and the URL is parsed
# incorrectly.
# NOTE(review): this assumes config.py stores the raw credentials; values
# that are already percent-encoded there would now be encoded twice.
from urllib.parse import quote

quoted_user = quote(str(user), safe="")
quoted_password = quote(str(password), safe="")
engine = create_engine(
    f'postgresql://{quoted_user}:{quoted_password}@{host}:{port}/{database}'
)

In [None]:
# Confirm connection was established
# Reflection has to query the database, so this cell is where an unreachable
# server or bad credentials would be expected to surface

## Declare base using `automap_base()`
Base = automap_base()

## Use the Base class to reflect the database tables
## NOTE(review): `reflect = True` is the legacy calling style; newer SQLAlchemy
## releases document `autoload_with` instead -- check the installed version
## before changing this call
Base.prepare(engine, reflect = True)

## Show the names of all classes mapped to the Base (this is the last
## expression of the cell, not an explicit print)
## NOTE(review): automap reportedly maps only tables that have a primary key,
## so a table without one may be missing from this list -- confirm in the docs
Base.classes.keys()

In [None]:
# Load: write the transformed rows to the "food_inspections" table with
# pandas' to_sql, appending if the table already exists; the DataFrame
# index is not written as a column
restaurants_df_with_dates.to_sql(
    "food_inspections",
    engine,
    index=False,
    if_exists="append",
)