In [16]:
# import dependencies
import pandas as pd
import json
import os

# Import SQL Alchemy
from sqlalchemy import create_engine

# Import and establish Base for which classes will be constructed 
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
Base = declarative_base()

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float, Boolean, DateTime, Date, func, ForeignKey
from sqlalchemy.orm import Session, relationship, session, column_property
from sqlalchemy.exc import SQLAlchemyError

from datetime import datetime

import psycopg2

# Value written to the source_id column of the category and restaurant rows
# inserted by this notebook.
DATA_SRC_RESTAURANTS = 1 

# Hide warning messages in notebook
#import warnings
#warnings.filterwarnings('ignore')

### Functions

In [9]:
# Database connection
def get_dbconnection(user=None, password=None, host=None, port=None, database=None):
    """Open a new psycopg2 connection to the project's PostgreSQL database.

    Every setting is resolved in this order: the explicit argument, then the
    matching ``ETL_DB_*`` environment variable, then the value that used to be
    hard-coded here. Calling the function with no arguments and no environment
    overrides therefore connects exactly as before, while credentials can now
    be supplied without editing the notebook.

    Args:
        user: Database role name (env ``ETL_DB_USER``).
        password: Password for that role (env ``ETL_DB_PASSWORD``).
        host: Server host name (env ``ETL_DB_HOST``).
        port: Server port (env ``ETL_DB_PORT``).
        database: Database name (env ``ETL_DB_NAME``).

    Returns:
        An open psycopg2 connection. The caller is responsible for closing it.

    Raises:
        psycopg2.Error: If the connection cannot be established.
    """
    def _setting(value, env_name, fallback):
        # Explicit argument wins; otherwise environment, then legacy default.
        if value is not None:
            return value
        return os.environ.get(env_name, fallback)

    return psycopg2.connect(
        user=_setting(user, "ETL_DB_USER", "postgres"),
        password=_setting(password, "ETL_DB_PASSWORD", "postgres"),
        host=_setting(host, "ETL_DB_HOST", "localhost"),
        port=_setting(port, "ETL_DB_PORT", "5432"),
        database=_setting(database, "ETL_DB_NAME", "ETLproject"),
    )

# Read restaurant data from json file
def read_restaurants(file, states_id):
    """Load business records from a JSON file into ORM objects.

    Args:
        file: Path to a JSON document whose top-level ``"businesses"`` key
            holds the list of business records.
        states_id: Mapping from a record's ``location["state"]`` value to the
            database id of that state. Records whose state is not a key of
            this mapping are skipped entirely (no restaurant, no categories).

    Returns:
        A tuple ``(rest_list, rest_cat, categories)`` where
        ``rest_list`` is a list of ``RestaurantCls`` objects (one per kept
        record), ``rest_cat`` is a list of ``(business_id, CategoryCls)``
        pairs, and ``categories`` is a dict mapping each normalised category
        alias to its title.

    Side effects:
        Prints the number of restaurants that were kept.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, "r", encoding="utf-8") as read_file:
        businesses = json.load(read_file)["businesses"]

    # alias -> title for every category seen on a kept restaurant
    categories = {}

    # RestaurantCls objects for the kept records
    rest_list = []

    # (business_id, CategoryCls) pairs linking restaurants to categories
    rest_cat = []

    for restaurant in businesses:
        location = restaurant["location"]
        state_code = location["state"]
        if state_code not in states_id:
            continue

        business_id = restaurant["id"]

        # Merge the address lines into one street string. Lines that are
        # missing, None or blank are dropped, so a record without address1
        # no longer raises TypeError when a later line is present.
        address_lines = (
            location.get("address1"),
            location.get("address2"),
            location.get("address3"),
        )
        street = " ".join(
            line.strip() for line in address_lines if line and not line.isspace()
        )

        for c in restaurant["categories"]:
            # The normalised alias is the category key everywhere, so the
            # dict and the link objects always agree on it.
            alias = c["alias"].strip().lower()
            title = c["title"]
            rest_cat.append((business_id, CategoryCls(alias=alias, title=title)))
            categories[alias] = title

        # Price is delivered as a run of '$' signs; some records have none.
        price_text = restaurant.get("price")
        price = price_text.count("$") if price_text else None

        rest_list.append(
            RestaurantCls(
                name=restaurant["name"],
                business_id=business_id,
                state_id=states_id[state_code],
                city=location["city"],
                street=street,
                zip_code=location["zip_code"],
                price_range=price,
                rating=float(restaurant["rating"]),
                is_closed=bool(restaurant["is_closed"]),
            )
        )

    print(len(rest_list))
    return rest_list, rest_cat, categories

# Get state_id from database
def get_states_id():
    """Build a lookup from state code to state primary key.

    Reflects the existing database schema with SQLAlchemy automap, reads the
    ``state`` table and maps each row's ``name_a2`` value to its ``id``.

    Returns:
        dict: ``{name_a2: id}`` for every row of the ``state`` table.

    Raises:
        sqlalchemy.exc.SQLAlchemyError: If the database cannot be reached or
            reflected.
    """
    engine = create_engine('postgresql+psycopg2://postgres:postgres@localhost/ETLproject')
    try:
        # Local name on purpose: the module-level declarative Base must not be
        # confused with this reflected one.
        reflected_base = automap_base()

        # NOTE(review): `reflect=True` is the legacy spelling; newer SQLAlchemy
        # releases expect `autoload_with=engine`. Kept as-is to match the
        # SQLAlchemy version this notebook was written against.
        reflected_base.prepare(engine, reflect=True)
        state_cls = reflected_base.classes.state

        state_session = Session(engine)
        try:
            # Only the two columns that are actually used are selected.
            rows = state_session.query(state_cls.name_a2, state_cls.id).all()
        finally:
            # Always hand the connection back, even if the query fails.
            state_session.close()
    finally:
        engine.dispose()

    return {code: state_pk for code, state_pk in rows}


### Classes

In [10]:
class RestaurantCls(Base):
    """ORM model for one restaurant row.

    Instances are built by ``read_restaurants`` from the JSON business
    records; the notebook then writes their attributes to the database.
    """
    # Lower-case to match the `restaurant` table targeted by the raw INSERT
    # statement and the naming of the other tables (`category`, `state`).
    __tablename__ = 'restaurant'
    id = Column(Integer, primary_key=True)
    name = Column(String(255), nullable=False)
    business_id = Column(String(255), unique=True, nullable=False)
    # location
    state_id = Column(Integer, ForeignKey('state.id'), nullable=False)
    city = Column(String(100), nullable=False)
    street = Column(String(250), nullable=False)
    zip_code = Column(String(10), nullable=False)
    # Operational info
    # Number of '$' signs in the source record; the loader stores None when
    # the record has no price, so the column has to accept NULL.
    price_range = Column(Integer, nullable=True)
    rating = Column(Float, nullable=False)
    is_closed = Column(Boolean, nullable=False)
    source_id = Column(Integer, nullable=False)
    # func.now() is rendered as SQL and evaluated per statement. The previous
    # onupdate=datetime.now() was evaluated once, when the class was defined,
    # so every update would have received the same stale timestamp.
    modified_date = Column(Date, nullable=False, default=func.now(), onupdate=func.now())

class CategoryCls(Base):
    """ORM model mapped to the ``category`` table.

    ``read_restaurants`` creates instances with only ``alias`` and ``title``
    set, taken from the ``alias``/``title`` fields of a business category.
    """
    __tablename__ = 'category'
    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Category alias and display title as delivered in the source data.
    alias = Column(String(255), nullable=False)
    title = Column(String(255), nullable=False)
    # Identifier of the data source the row came from
    # (the notebook writes DATA_SRC_RESTAURANTS here).
    source_id = Column(Integer, nullable=False)
    # Column default is the SQL now() function; unlike the other columns this
    # one is not declared nullable=False.
    modified_date = Column(Date, default=func.now())

In [11]:
# read data from files created by calling Yelp API
#data_file=pd.read_json("data.txt", lines=False)
#data_file
#data = json.load(open('data.txt'))
#df = pd.DataFrame(data["businesses"])
#df

In [12]:
#data = json.load(open('YelpData.txt'))
#df = pd.DataFrame(data["businesses"])
#df

### Load data from file into objects for further processing
* Read the values directly with the JSON reader instead of going through a pandas DataFrame

In [22]:
# Resolve the input path first, then fetch the state lookup and parse the file.
restaurant_file = os.path.join("..", "Data", "YelpData.txt")
states_id = get_states_id()
(
    restaurants,
    restaurant_categories,
    categories,
) = read_restaurants(restaurant_file, states_id)

# Displayed as the cell result: number of restaurants that were loaded.
len(restaurants)

50


50

In [26]:
# connect to database with psycopg2 w/o SQLAlchemy
#conn_string = "host='localhost' dbname='ETLproject' user='postgres' password='postgres'"
#conn = psycopg2.connect(conn_string)
# Sanity check: show the state id that was resolved for every restaurant.
for restaurant_obj in restaurants:
    resolved_state_id = restaurant_obj.state_id
    print(resolved_state_id)

In [67]:
# Add entries into category table using SQLAlchemy ORM
# The issue with this approach is no support for upsert
# https://www.pythonsheets.com/notes/python-sqlalchemy.html
#session = Session(engine)

# Note that adding to the session does not update the table. It queues up those queries.
# https://stackoverflow.com/questions/9911467/sqlalchemy-update-if-unique-key-exists 
# https://stackoverflow.com/questions/6611563/sqlalchemy-on-duplicate-key-update
#try:
#   for alias, title in categories.items():
#        print(f"{alias} - {title}")
#        categoryObj = CategoryCls(id=None, alias = alias, title = title, source_id = DATA_SRC_RESTAURANTS)
#        session.add(categoryObj)
#    # commit() flushes whatever remaining changes remain to the database, and commits the transaction.
#    session.commit()
#except SQLAlchemyError as e:
#    print(e)
#finally:
#    session.close()        

In [48]:
# Insert categories
connection = get_dbconnection()
try:
    cursor = connection.cursor()
    try:
        # One parameter tuple per category; ON CONFLICT DO NOTHING makes the
        # database skip rows that would violate an existing constraint.
        cursor.executemany(
            "INSERT INTO category (alias, title, source_id) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
            [(alias, title, DATA_SRC_RESTAURANTS) for alias, title in categories.items()],
        )
        # commit the changes to the database
        connection.commit()
        print("Categories import finished.")
    except psycopg2.DatabaseError as e:
        # Roll back only when the batch failed, and report the failure
        # instead of announcing success.
        connection.rollback()
        print(f"Categories import failed: {e}")
    finally:
        cursor.close()
finally:
    # Release the connection whether or not the import succeeded.
    connection.close()

Categories import finished.


In [27]:
# Insert restaurants into the database; rows that conflict with an existing
# constraint are skipped (ON CONFLICT DO NOTHING), they are not updated.
connection = get_dbconnection()
try:
    cursor = connection.cursor()
    try:
        insert_restaurant_sql = "INSERT INTO restaurant (name, business_id, state_id, city, street, zip_code, price_range, rating, is_closed, source_id) \
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING"
        # One parameter tuple per restaurant, in the column order of the SQL.
        restaurant_rows = [
            (r.name, r.business_id, r.state_id, r.city, r.street, r.zip_code,
             r.price_range, r.rating, r.is_closed, DATA_SRC_RESTAURANTS)
            for r in restaurants
        ]
        cursor.executemany(insert_restaurant_sql, restaurant_rows)
        # commit the changes to the database
        connection.commit()
        print("Restaurants import finished.")
    except psycopg2.DatabaseError as e:
        # Roll back only when the batch failed, and report the failure
        # instead of announcing success.
        connection.rollback()
        print(f"Restaurants import failed: {e}")
    finally:
        cursor.close()
finally:
    # Release the connection whether or not the import succeeded.
    connection.close()


Categories import finished.


AttributeError: 'list' object has no attribute 'items'

In [None]:
# load restaurant categories into database (this cell currently only prints them)
# Each entry pairs a business id with the category object built for it.
for business_id, category in restaurant_categories:
    print(f"{business_id}; {category.alias}; {category.title}")