# Creates a Yelp POSTGRESSQL database
 - Uses the open source Yelp dataset and inserts json files into SQL tables

## Table of contents 
- importing json into python using pandas tools
- importing json to postgresql database

In [1]:
import json 
import pandas as pd 
from re import sub

from sqlalchemy import create_engine,text
from sqlalchemy.engine.url import URL 
import psycopg2 #postgressql driver

In [45]:
#format the columns/fields that will be jsonb
def stringify_json_records(series):
    '''
    converts a series of dicts into a stringified 
    set of json records (which is necessary format for json storage)
    for example single quotes are replaced by double quotes
    and the dict is converted into a single quote string.
    '''
    if series.apply(lambda x: x is None or type(x) is dict).all():
       json_string_records = (
           series
           .apply(json.dumps)
           .apply(_restringify_nested_json_records) 
       )
    else:
        print("the series {} are not all dict records".format(series.name))
        json_string_records = series
    return json_string_records

def _restringify_nested_json_records(stringified_record):
    '''
    yelp open dataset has a nested (depth 2) json that is a string (invalid) and has single quotes
    rather than double quotes and should not have . need to change to be considered valid 
    json and inserted into postgressql table.
    '''
    #TODO: clean up with re.sub or replace booleans and nulls with proper json syntax (e.g., False is false and None is null)
    corrected_record = (
        stringified_record
        .replace('\"{','{') #nested json object should not have any quotes
        .replace('}\"','}')
        .replace('\'','\"') #change single to double quotes
        .replace('False,','\"False\",') #stringify booleans if not already
        .replace('True,','\"True\",')
        .replace('False}','\"False\"}') #end of dict boolean
        .replace('True}','\"True\"}')
        .replace('\"u\"','\"') #some records had double double quotes (ie "u"no"")
        .replace('\"\"','\"')
        .replace('None,','"None",') #stringify nones
        .replace('None}','"None"}')
    )
    return corrected_record


#functions to format for SQL-ready strings
def convert_to_sql_str(obj):
    '''
    if string : escape single quotes and/or make SQL quotes ($$), if not string : convert
    to string
    (see https://www.postgresqltutorial.com/dollar-quoted-string-constants/#:~:text=In%20PostgreSQL%2C%20you%20use%20single,doubling%20up%20the%20single%20quote.)
    '''
    if type(obj) is str:
        obj_str = "$$" + obj.replace("$","$ ") + "$$" #if $ at end of string, will have have invalid triple $$$
    else:
        obj_str = str(obj)
    return obj_str

def make_sql_str_values(sql_str_list):
    '''
    make valid values (ie record) for INSERT statement

    sql_str_list : iterable of strings
    '''
    #convert every element to valid sql statements
    #join fields with ',' and wrap in '()' to 
    sql_str_values = "({})".format(",".join(sql_str_list))
    return sql_str_values 

In [46]:
yelp_data_path = '/Users/michaelkranz/Documents/restaurant-app/data/yelp_dataset/'

yelp_json_filenames = {"tips":'yelp_academic_dataset_tip.json',"reviews":'yelp_academic_dataset_review.json',
"business_info":'yelp_academic_dataset_business.json',"user":'yelp_academic_dataset_user.json'}

In [47]:
#%%timeit
with open(yelp_data_path+yelp_json_filenames['business_info']) as json_file:
    json_list = json_file.readlines()
json_data = [json.loads(json_line) for json_line in json_list] 
json_df = pd.DataFrame(json_data)

In [48]:
#upload json to postgresSQL table
json_df.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'..."


In [49]:
json_df['attributes'] = stringify_json_records(json_df['attributes'])
json_df['hours'] = stringify_json_records(json_df['hours'])

In [None]:
json_str_list = (
    json_df
    .fillna('null') #null is syntax for SQL NAs
    .applymap(convert_to_sql_str)
    .apply(make_sql_str_values,axis=1)
    .values
)

### Storing JSON in PostgreSQL
- upon further research, it looks like we can store JSON directly in PostgreSQL without the traditional field format
    - [Storing JSON in PostgreSQL: A must-know feature] (https://www.blendo.co/blog/storing-json-in-postgresql/)
    - [Replacing EAV with JSONB in PostgreSQL*](https://coussej.github.io/2016/01/14/Replacing-EAV-with-JSONB-in-PostgreSQL/)

    *EAV = Entity,Attribute,Value (ie three tables connected with joins to get fields for the entity)


- find connection information 

>michaelkranz$ `psql`

>michaelkranz=# `CREATE DATABASE restaurants`

>michaelkranz=# `\c restaurants`

>restaurants=# `\conninfo`

> You are now connected to database "restaurants" as user "michaelkranz".
restaurants=# \conninfo
You are connected to database "restaurants" as user "michaelkranz" via socket in "/tmp" at port "5432".

- [CHAR and VARCHAR : no performance differences and character limits used to check and will return error if longer](https://www.postgresqltutorial.com/postgresql-char-varchar-text/)

- https://www.postgresql.org/docs/12/datatype-json.html

In [64]:
postgres_db_params = {'drivername': 'postgres',
                'database':'restaurants',
               'username': 'michaelkranz',
               'password': 'helloworld',
               'host': 'localhost',
               'port': 5432}

postgres_db_url = URL(**postgres_db_params)

In [65]:
engine = create_engine(postgres_db_url)

In [66]:
# store first level as a column but hours and attributes as jsonb format
business_info_create_table_str = '''
CREATE TABLE business_info (
    business_id VARCHAR PRIMARY KEY 
    ,name VARCHAR
    ,address VARCHAR
    ,city VARCHAR
    ,state VARCHAR
    ,postal_code VARCHAR  
    ,latitude FLOAT
    ,longitude FLOAT
    ,stars FLOAT(1)
    ,review_count INTEGER  
    ,is_open SMALLINT
    ,attributes JSONB
    ,categories VARCHAR
    ,hours JSONB
);
'''

In [110]:
with engine.connect() as conn:
    conn.execute("DROP TABLE IF EXISTS business_info;")
    conn.execute(business_info_create_table_str)
    print((pd.read_sql(con=conn,sql='''
    SELECT table_name
        ,column_name
        ,data_type
    FROM information_schema.columns
    WHERE
        table_name LIKE 'b%'
    ''')
    .sort_values('table_name',ascending=True)
    ))

table_name   column_name          data_type
0   business_info   business_id  character varying
1   business_info       address  character varying
2   business_info          name  character varying
3   business_info          city  character varying
4   business_info         state  character varying
5   business_info   postal_code  character varying
6   business_info      latitude   double precision
7   business_info     longitude   double precision
8   business_info         stars               real
9   business_info  review_count            integer
10  business_info       is_open           smallint
11  business_info    attributes              jsonb
12  business_info    categories  character varying
13  business_info         hours              jsonb


In [111]:
with engine.connect() as conn:
    for i,json_record in enumerate(json_str_list):
        #print(i)
        sql_record = text( #escapes special chars for you
            '''
            INSERT INTO business_info
            VALUES {};
            '''
            .format(json_record)
        )
        conn.execute(sql_record)

- TODO : write sample queries extracting from json to ensure it works
- TODO: refactor code 
- TODO: import every Yelp JSON



### Sample queries
- [Querying Json fields](https://www.postgresqltutorial.com/postgresql-json/)

In [127]:
#return BikeParking attribute
(
pd.read_sql(con=engine.connect(),sql='''
SELECT attributes -> 'BikeParking' as BikeParking
FROM business_info
'''
)
.head()
.values
)

array([['True'],
       [None],
       [None],
       ['True'],
       [None]], dtype=object)

In [132]:
#return BikeParking attribute
(
pd.read_sql(con=engine.connect(),sql='''
SELECT attributes ->> 'BikeParking' as BikeParking
FROM business_info
'''
)
.head()
)

Unnamed: 0,bikeparking
0,True
1,
2,
3,True
4,


In [134]:
pd.read_sql(con=engine.connect(),sql='''
SELECT
    name
    ,address
    ,attributes ->> 'BikeParking' AS BikeParking
FROM business_info
WHERE attributes ->> 'BikeParking' = 'True' 
'''
)

Unnamed: 0,name,address,bikeparking
0,10913 Bailey Rd,The Range At Lake Norman,True
1,1015 Sharp Cir,Nevada House of Hose,True
2,"6870 S Rainbow Blvd, Ste 117",Green World Cleaners,True
3,2831 Parmenter St,Chocolate Shoppe Ice Cream,True
4,3131 Las Vegas Blvd,Manolo Blahnik,True
...,...,...,...
69804,829 E Washington Ave,Julep,True
69805,81 Front Street E,Starbucks,True
69806,"2487 S Gilbert Rd, Ste 102",Yao Fine Chinese Cuisine,True
69807,199 College St,Steak & Cheese & Quick Pita Restaurant,True
