# ETL Project - Yelp API and NYC Health Inspection Grades

In [121]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine
from flask import Flask, jsonify
import json
import requests
import os
import csv
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint



### Store NYC Health Inspection CSV into DataFrame

In [122]:
csv_file = "../Resources/DOHMH_New_York_City_Restaurant_Inspection_Results.csv"
inspection_data_df = pd.read_csv(csv_file)
inspection_data_df.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,...,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,50059672,GOOD FRIENDS 1,Brooklyn,1376,NOSTRAND AVE,11226.0,7182872345,Chinese,09/06/2018,Establishment Closed by DOHMH. Violations wer...,...,10/11/2019,Cycle Inspection / Initial Inspection,40.653158,-73.949837,317.0,40.0,82000.0,3116688.0,3050850000.0,BK60
1,50034192,K'OOK,Manhattan,324,E 6TH ST,10003.0,2122540300,Korean,08/14/2017,Violations were cited in the following area(s).,...,10/11/2019,Cycle Inspection / Initial Inspection,40.727066,-73.98778,103.0,2.0,3800.0,1006234.0,1004470000.0,MN22
2,50033885,A&H DELI,Manhattan,431,7TH AVE,10001.0,2125636200,American,06/06/2016,Violations were cited in the following area(s).,...,10/11/2019,Cycle Inspection / Re-inspection,40.75071,-73.990811,105.0,3.0,10100.0,1015218.0,1008090000.0,MN17
3,41519373,BUNGALO,Queens,3203,BROADWAY,11106.0,7182047010,Armenian,01/21/2017,No violations were recorded at the time of thi...,...,10/11/2019,Inter-Agency Task Force / Initial Inspection,40.761538,-73.92445,401.0,22.0,6100.0,4008406.0,4006140000.0,QN70
4,50016112,ANTOJITOS ECUATORIANOS,Brooklyn,3398,FULTON ST,11208.0,7182770970,"Latin (Cuban, Dominican, Puerto Rican, South &...",07/11/2018,Violations were cited in the following area(s).,...,10/11/2019,Cycle Inspection / Re-inspection,40.684208,-73.870173,305.0,37.0,118400.0,3092908.0,3041490000.0,BK83


### Create new NYC Health Inspection DataFrame with select columns

In [123]:
# Select relevant columns: DBA (name), Building, Street, Zipcode, Boro, Grade
ny_inspect_df = inspection_data_df[['DBA', 'BUILDING', 'STREET', 'ZIPCODE', 'BORO', 'GRADE']].copy()
ny_inspect_df = ny_inspect_df.dropna()
ny_inspect_final = ny_inspect_df.drop_duplicates(subset='DBA', keep='first', inplace=False)
ny_inspect_final.count()

DBA         20157
BUILDING    20157
STREET      20157
ZIPCODE     20157
BORO        20157
GRADE       20157
dtype: int64

### Yelp API Response

In [124]:
# Set up parameters for the API Key
api_key='KMXu7o4jj9H_5fBmmoxcQUXUcjaIiDMpnabg34SZhyJUQPt-H6y8sfBIq8jI65xOovUH7cKhDpTUnvK2UIFOf1r5864boyx0PCcIwR4QQ1OeR8IWr5RO7UxW3HJoXXYx'
headers = {'Authorization': 'Bearer %s' % api_key}

# Define the Business Search end point url
url='https://api.yelp.com/v3/businesses/search'

In [125]:
# Pulling data for 5,000 restaurants in NYC
categories = ['Italian', 'American', 'Mexican', 'Chinese', 'Cuban']
restaurants = []

for category in categories:
    for offset in range(0,1000,50):
        params = {'term':'restaurants','categories': category,'location':'New York','limit':50, 'offset':offset}
        response = requests.get(url, params=params, headers=headers).json()
        restaurants.append(dict(response))

In [140]:
# pprint(restaurants[1])

In [127]:
# Define empty dictionary to save only required elements
final = {}

# Define Empty list to append all the data 
data = []

for restaurant in restaurants:
    try:
        for item in restaurant['businesses']:
            final['Restaurant Name'] = item['name']
            final['City'] = item['location']['city']
            final['Rating'] = item['rating']
            data.append(dict(final))
    except:   
        restaurant['businesses'] = float('nan')
        
        # Send api data to json file
        # data.to_json("../Resources/YelpData.json")

In [128]:
# Create DataFrame to save restaurant name, neighborhood, rating
restaurants_df = pd.DataFrame(data)
restaurants_df.head()

#json_file = "../Resources/YelpData.json"
#restaurants_df = pd.read_json(json_file)
#restaurants_df.head()

Unnamed: 0,City,Rating,Restaurant Name
0,New York,4.5,Upstate
1,New York,4.5,Amélie
2,New York,4.0,Trattoria Trecolori
3,New York,4.5,Thai Villa
4,New York,4.5,nonono


In [129]:
# Drop nan values and duplicates
restaurants_df.dropna(axis='index',how='all',inplace=True)
restaurants_df.reset_index(drop=True)

restaurants_final = restaurants_df.drop_duplicates(subset='Restaurant Name', keep='first', inplace=False)
restaurants_final.count()

City               968
Rating             968
Restaurant Name    968
dtype: int64

In [130]:
# Convert restaurant names to uppercase to match DBA column in NYC Health Inspection DataFrame
restaurants_final['Restaurant Name'] = restaurants_final['Restaurant Name'].str.upper() 
restaurants_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,City,Rating,Restaurant Name
0,New York,4.5,UPSTATE
1,New York,4.5,AMÉLIE
2,New York,4.0,TRATTORIA TRECOLORI
3,New York,4.5,THAI VILLA
4,New York,4.5,NONONO


### Transform Yelp_Data DataFrame

In [131]:
# Create filtered DataFrame from specific columns
restaurants_df_cols = ["Restaurant Name", "City", "Rating"]
restaurant_transformed= restaurants_final[restaurants_df_cols].copy()

# Rename the column headers
restaurant_transformed = restaurant_transformed.rename(columns={"Restaurant Name": "Restaurant_Name",
                                                          "City": "City",
                                                          "Rating": "Rating"})

# Clean the data by dropping duplicates and setting index
restaurant_transformed.dropna(subset=['Restaurant_Name'],inplace=False)
restaurant_transformed.set_index("Restaurant_Name", inplace=False)

restaurant_transformed.head()

Unnamed: 0,Restaurant_Name,City,Rating
0,UPSTATE,New York,4.5
1,AMÉLIE,New York,4.5
2,TRATTORIA TRECOLORI,New York,4.0
3,THAI VILLA,New York,4.5
4,NONONO,New York,4.5


### Transform Ny_inspection Dataframe

In [132]:
# Create  filtered dataframe from specific columns
ny_inspect_cols = ["DBA", "BUILDING", "STREET", "ZIPCODE", "BORO", "GRADE"]
inspect_transformed= ny_inspect_final[ny_inspect_cols].copy()

# Rename the column headers
inspect_transformed = inspect_transformed.rename(columns={"DBA": "Restaurant_Name",
                                                          "BUILDING": "Building",
                                                          "STREET": "Street",
                                                          "ZIPCODE": "Zip_Code",
                                                          "BORO": "Boro",
                                                          "GRADE": "Grade"})

# Clean the data by dropping duplicates and setting the index
inspect_transformed.dropna(subset=['Restaurant_Name'], inplace=False)
inspect_transformed.set_index("Restaurant_Name", inplace=False)
inspect_transformed = inspect_transformed.reset_index(drop=True)

inspect_transformed.head()

Unnamed: 0,Restaurant_Name,Building,Street,Zip_Code,Boro,Grade
0,A&H DELI,431,7TH AVE,10001.0,Manhattan,A
1,ANTOJITOS ECUATORIANOS,3398,FULTON ST,11208.0,Brooklyn,A
2,I LAND FISH & GRILL,7911,FLATLANDS AVE,11236.0,Brooklyn,B
3,CAFE LAFAYETTE,80,LAFAYETTE STREET,10013.0,Manhattan,A
4,FLY BAR,4224,COLLEGE POINT BLVD,11355.0,Queens,A


### Connect to local database

In [133]:
rds_connection_string = "postgres:rutgers@localhost:5432/yelpvsnyc"
engine = create_engine(f'postgresql://{rds_connection_string}')

### Check for tables

In [134]:
engine.table_names()

['Ny_Inspection', 'Yelp_Data']

### Load CSV converted DataFrame into database

In [135]:
inspect_transformed.to_sql(name='Ny_Inspection', con=engine, if_exists='append', index=False)

### Load JSON converted DataFrame into database

In [136]:
restaurant_transformed.to_sql(name='Yelp_Data', con=engine, if_exists='append', index=False)

### Confirm data has been added

In [137]:
pd.read_sql_query('select * from "Ny_Inspection";', con=engine).head()

Unnamed: 0,Restaurant_Name,Building,Street,Zip_Code,Boro,Grade
0,A&H DELI,431,7TH AVE,10001,Manhattan,A
1,ANTOJITOS ECUATORIANOS,3398,FULTON ST,11208,Brooklyn,A
2,I LAND FISH & GRILL,7911,FLATLANDS AVE,11236,Brooklyn,B
3,CAFE LAFAYETTE,80,LAFAYETTE STREET,10013,Manhattan,A
4,FLY BAR,4224,COLLEGE POINT BLVD,11355,Queens,A


In [138]:
pd.read_sql_query('select * from "Yelp_Data";', con=engine).head()

Unnamed: 0,Restaurant_Name,City,Rating
0,UPSTATE,New York,5
1,AMÉLIE,New York,5
2,TRATTORIA TRECOLORI,New York,4
3,THAI VILLA,New York,5
4,NONONO,New York,5


### Confirm Ny_Inspection and Yelp_Data tables have been joined

In [142]:
pd.read_sql_query('select * from "Final_Table";', con=engine).head()

Unnamed: 0,Restaurant_Name,Grade,Boro,Rating
0,LOCAL 92,A,Manhattan,4
1,VANESSA'S DUMPLING HOUSE,A,Brooklyn,4
2,BENITO ONE,A,Manhattan,4
3,THE HALAL GUYS,A,Manhattan,4
4,LIDO,A,Manhattan,4
