# Creates a Yelp PostgreSQL database
 - Uses the open-source Yelp dataset and inserts its JSON files into SQL tables

## Table of contents 
- importing json into python using pandas tools
- importing json to postgresql database

## TODO
- write sample queries extracting from json to ensure it works
- import every Yelp JSON
- refactor code 

In [1]:
import json 
import pandas as pd 
from re import sub

from sqlalchemy import create_engine,text
from sqlalchemy.engine.url import URL 
import psycopg2 #postgressql driver

In [87]:
def get_json_records(json_file_object,to_df=True,x_lines_to_read=-1):
    '''
    read all lines (ie records) or a specified number of lines from a
    json-lines file (one json document per line)

    json_file_object : open text file object; it is used as a context
        manager here, so it is closed when this function returns
    to_df : if True (default) parse each line with json.loads and return a
        dataframe with one row per record; if False return the raw lines
        as a list of strings
    x_lines_to_read : number of lines to read from the current position;
        -1 (default) reads every remaining line. If the file holds fewer
        lines than requested, only the available lines are returned.
    '''
    from itertools import islice

    with json_file_object:
        # compare by value: `is -1` tests object identity, which only works
        # by accident of CPython's small-int cache
        if x_lines_to_read == -1:
            json_list = json_file_object.readlines()
        else:
            # islice stops at end-of-file instead of padding the result with
            # the empty strings readline() returns once the file is exhausted
            json_list = list(islice(json_file_object, max(x_lines_to_read, 0)))

        if not to_df:
            return json_list

        # blank lines are not json documents, so skip them rather than crash
        json_data = [
            json.loads(json_line) for json_line in json_list if json_line.strip()
        ]
        return pd.DataFrame(json_data)

def get_json_record(json_file_object,to_series=True):
    '''
    read a single json line (ie record) from an open json-lines file

    json_file_object : open text file object; it is left open, so repeated
        calls return successive records
    to_series : if True (default) parse the line and return it as a pandas
        series; if False return the raw json string exactly as read
    '''
    json_str = json_file_object.readline()
    if to_series:
        # json.loads instead of eval: eval executes whatever the file
        # contains and cannot parse the json literals true/false/null
        return pd.Series(json.loads(json_str))
    return json_str

#format the columns/fields that will be jsonb
def stringify_json_records(series):
    '''
    turn a series of dicts into a series of json strings, the format
    needed to store them as json in the database.

    each record is dumped with json.dumps and then passed through
    _restringify_nested_json_records, which repairs the nested string
    values (for example single quotes become double quotes).

    if any element is something other than a dict or None, a message is
    printed and the series is returned untouched.

    Note: intended for use after the json file has been converted to a df,
    on the columns that hold nested json
    '''
    def _is_dict_or_missing(value):
        return value is None or type(value) is dict

    if not series.apply(_is_dict_or_missing).all():
        print("the series {} are not all dict records".format(series.name))
        return series

    dumped_records = series.apply(json.dumps)
    return dumped_records.apply(_restringify_nested_json_records)

def _restringify_nested_json_records(stringified_record):
    '''
    yelp open dataset has a nested (depth 2) json that is a string (invalid) and has single quotes
    rather than double quotes and should not have . need to change to be considered valid 
    json and inserted into postgressql table.
    '''
    #TODO: clean up with re.sub or replace booleans and nulls with proper json syntax (e.g., False is false and None is null)
    corrected_record = (
        stringified_record
        .replace('\"{','{') #nested json object should not have any quotes
        .replace('}\"','}')
        .replace('\'','\"') #change single to double quotes
        .replace('False,','\"False\",') #stringify booleans if not already
        .replace('True,','\"True\",')
        .replace('False}','\"False\"}') #end of dict boolean
        .replace('True}','\"True\"}')
        .replace('\"u\"','\"') #some records had double double quotes (ie "u"no"")
        .replace('\"\"','\"')
        .replace('None,','"None",') #stringify nones
        .replace('None}','"None"}')
    )
    return corrected_record


#functions to format for SQL-ready strings

def convert_to_sql_str(obj):
    '''
    convert a value to the text used for it inside a SQL statement.

    if string : wrap it in postgresql dollar quotes so that single quotes
    inside it need no escaping; if not string : convert with str()
    (see https://www.postgresqltutorial.com/dollar-quoted-string-constants/#:~:text=In%20PostgreSQL%2C%20you%20use%20single,doubling%20up%20the%20single%20quote.)

    strings without a dollar sign are quoted as $$...$$. A string that
    contains a dollar sign is quoted with a tagged delimiter ($q$...$q$,
    then $q0$, $q1$, ...) so that the stored text is left unmodified
    rather than having a space inserted after every dollar sign.
    '''
    if not isinstance(obj, str):
        return str(obj)

    if "$" not in obj:
        return "$$" + obj + "$$"

    # the delimiter is safe when its first occurrence in body+delimiter is
    # the closing one; this also rules out a body ending in e.g. "$q" that
    # would otherwise combine with the closing "$q$" and end the string early
    delimiter = "$q$"
    attempt = 0
    while (obj + delimiter).find(delimiter) != len(obj):
        delimiter = "$q{}$".format(attempt)
        attempt += 1
    return delimiter + obj + delimiter

def combine_sql_str_values(sql_str_list):
    '''
    build the values (ie one record) part of an INSERT statement

    sql_str_list : iterable of strings, each already formatted for SQL

    returns the strings joined by ',' and wrapped in '()'
    '''
    joined_fields = ",".join(sql_str_list)
    return "(" + joined_fields + ")"

## functions to create a sql table

def _make_sql_field_str(field_dict,sep=' '):
    '''turns a field name (key) and datatypes (value)
    into SQL statement. Note- the default space 
    is for CREATE TABLE fxn.
    '''
    formatted_list = [key + sep + val for key,val in field_dict.items()]
    formatted_str = '\n,'.join(formatted_list)
    return formatted_str

def create_sql_table(table_name,field_dict):
    '''
    (re)creates a table and reports the columns the database registered
    for it. Any existing table with the same name is dropped first.

    table_name : str
    field_dict : dict of field names (key) and datatypes(value)

    returns a tuple of (the CREATE TABLE statement that was executed,
    dataframe listing table_name, column_name and data_type read from
    information_schema.columns)
    '''
    column_definitions = _make_sql_field_str(field_dict)

    #sql statements, in the order they are run
    drop_stmt = f"DROP TABLE IF EXISTS {table_name};"
    create_stmt = f'''
    CREATE TABLE {table_name} (
            {column_definitions}
        );
        '''
    describe_query = f'''
        SELECT 
            table_name
            ,column_name
            ,data_type
        FROM 
            information_schema.columns
        WHERE
            table_name='{table_name}'
            '''

    #uses the module-level engine
    with engine.connect() as conn:
        for statement in (drop_stmt, create_stmt):
            conn.execute(statement)
        described_columns = pd.read_sql(con=conn,sql=describe_query)

    return (create_stmt,described_columns)

#inserting sql into table
def insert_sql_record(sql_str_value,table_name):
    '''
    insert a SQL string-formatted record (ie one SQL string)
    or an iterable of SQL records (list, array, ... of SQL strings)
    into a SQL table, one INSERT statement per record

    sql_str_value : str such as "($$a$$,1)", or an iterable of such strings
    table_name : str, name of the target table

    returns a message stating how many records were inserted

    NOTE(review): the statement is assembled with str.format, so both
    arguments must come from trusted code. text() does not escape values;
    it presumably treats ":name" tokens as bind parameters - confirm that
    record text containing such tokens inserts correctly.
    '''
    insert_str = '''
    INSERT INTO {}
    VALUES {};
    '''
    # a lone string is a single record; anything else is iterated, so an
    # array or tuple of records works as well as a list
    if isinstance(sql_str_value, str):
        sql_records = [sql_str_value]
    else:
        sql_records = list(sql_str_value)

    with engine.connect() as conn:
        for i_sql in sql_records:
            conn.execute(text(insert_str.format(table_name,i_sql)))

    # report the real count: the enumerate index used before was one short
    # for lists and undefined for an empty list
    return f"{len(sql_records)} SQL records inserted into {table_name}"

In [3]:
# directory that holds the Yelp dataset json files (machine-specific path)
yelp_data_path = '/Users/michaelkranz/Documents/restaurant-app/data/yelp_dataset/'

# dataset name -> json file name inside yelp_data_path
yelp_json_filenames = {
    "tips": 'yelp_academic_dataset_tip.json',
    "reviews": 'yelp_academic_dataset_review.json',
    "business_info": 'yelp_academic_dataset_business.json',
    "user": 'yelp_academic_dataset_user.json',
}

In [34]:
# NOTE(review): this handle is not closed in any cell shown here; get_json_record
# reads one line from it per call, so close it once the records have been read
test_json_file = open(yelp_data_path+yelp_json_filenames['reviews'])

In [64]:
# despite the name this holds a pandas series: get_json_record defaults to to_series=True
test_json_str = get_json_record(test_json_file)

"($$LG2ZaYiOgpr2DK_90pYjNw$$,$$V34qejxNsCbcgD8C0HVk-Q$$,$$HQl28KMwrEKHqhFrrDqVNQ$$,5.0,1,0,0,$$I love Deagan's. I do. I really do. The atmosphere is cozy and festive. The shrimp tacos and house fries are my standbys. The fries are sometimes good and sometimes great, and the spicy dipping sauce they come with is to die for. The beer list is amazing and the cocktails are great. The prices are mid-level, so it's not a cheap dive you can go to every week, but rather a treat when you do. Try it out. You won't be disappointed!$$,$$2015-12-05 03:18:11$$)"

In [91]:
# column name -> postgresql datatype for the reviews table;
# the order here is the column order of the created table
review_fields = dict(
    review_id="VARCHAR PRIMARY KEY",
    user_id="VARCHAR",
    business_id="VARCHAR",
    stars="DECIMAL(2)",
    useful="SMALLINT",
    funny="SMALLINT",
    cool="SMALLINT",
    text="VARCHAR",
    date="TIMESTAMP",
)

In [92]:
# drops any existing "reviews" table and recreates it with the review_fields columns
create_sql_table("reviews",review_fields)

('\n    CREATE TABLE reviews (\n            review_id VARCHAR PRIMARY KEY\n,user_id VARCHAR\n,business_id VARCHAR\n,stars DECIMAL(2)\n,useful SMALLINT\n,funny SMALLINT\n,cool SMALLINT\n,text VARCHAR\n,date TIMESTAMP\n        );\n        ',
   table_name  column_name                    data_type
 0    reviews    review_id            character varying
 1    reviews      user_id            character varying
 2    reviews  business_id            character varying
 3    reviews        stars                      numeric
 4    reviews       useful                     smallint
 5    reviews        funny                     smallint
 6    reviews         cool                     smallint
 7    reviews         text            character varying
 8    reviews         date  timestamp without time zone)

In [94]:
# convert each field of the record to its SQL text form, join the fields
# into one "(...)" values string, then insert that single record
sql_str_series = test_json_str.apply(convert_to_sql_str)
sql_str = combine_sql_str_values(sql_str_series)
insert_sql_record(sql_str,"reviews")

'1 SQL records inserted into reviews'

In [None]:
#%%timeit
# read every business record (one json document per line) into a dataframe;
# the with-statement must be live code: its body is indented and uses json_file
with open(yelp_data_path+yelp_json_filenames['business_info']) as json_file:
    json_list = json_file.readlines()
json_data = [json.loads(json_line) for json_line in json_list] 
json_df = pd.DataFrame(json_data)

In [None]:
#preview the first record of the dataframe that will be uploaded to the PostgreSQL table
json_df.head(1)

In [49]:
# the nested dict columns are converted to json strings
# (business_info declares attributes and hours as JSONB)
json_df['attributes'] = stringify_json_records(json_df['attributes'])
json_df['hours'] = stringify_json_records(json_df['hours'])

In [None]:
# one "(...)" values string per dataframe row, ready for INSERT
# NOTE(review): fillna('null') yields the string 'null', which convert_to_sql_str
# then dollar-quotes; confirm that is the intended representation of missing values
json_str_list = (
    json_df
    .fillna('null') #null is syntax for SQL NAs
    .applymap(convert_to_sql_str)
    .apply(combine_sql_str_values,axis=1) #the helper defined above; make_sql_str_values is not defined in this notebook
    .values
)

### Storing JSON in PostgreSQL
- upon further research, it looks like we can store JSON directly in PostgreSQL without the traditional field format
    - [Storing JSON in PostgreSQL: A must-know feature](https://www.blendo.co/blog/storing-json-in-postgresql/)
    - [Replacing EAV with JSONB in PostgreSQL*](https://coussej.github.io/2016/01/14/Replacing-EAV-with-JSONB-in-PostgreSQL/)

    *EAV = Entity,Attribute,Value (ie three tables connected with joins to get fields for the entity)


- find connection information 

>michaelkranz$ `psql`

>michaelkranz=# `CREATE DATABASE restaurants`

>michaelkranz=# `\c restaurants`

>restaurants=# `\conninfo`

> You are now connected to database "restaurants" as user "michaelkranz".
restaurants=# \conninfo
You are connected to database "restaurants" as user "michaelkranz" via socket in "/tmp" at port "5432".

- [CHAR and VARCHAR: no performance difference between them; a declared character limit is enforced, and inserting a longer value returns an error](https://www.postgresqltutorial.com/postgresql-char-varchar-text/)

- https://www.postgresql.org/docs/12/datatype-json.html

In [80]:
# connection settings for the local "restaurants" database
# 'postgresql' is the dialect name; the older 'postgres' alias is deprecated
# NOTE(review): the password is hard-coded here - consider reading it from the
# environment before sharing this notebook
postgres_db_params = {'drivername': 'postgresql',
                'database':'restaurants',
               'username': 'michaelkranz',
               'password': 'helloworld',
               'host': 'localhost',
               'port': 5432}

postgres_db_url = URL(**postgres_db_params)

In [81]:
# module-level engine: create_sql_table and insert_sql_record read this global,
# so this cell has to run before either of them is called
engine = create_engine(postgres_db_url)

In [66]:
# CREATE TABLE statement for business_info: every top-level field gets its own
# column, while the nested hours and attributes objects are stored as jsonb
# NOTE(review): no cell shown here executes this string - confirm how the table is created
business_info_create_table_str = '''
CREATE TABLE business_info (
    business_id VARCHAR PRIMARY KEY 
    ,name VARCHAR
    ,address VARCHAR
    ,city VARCHAR
    ,state VARCHAR
    ,postal_code VARCHAR  
    ,latitude FLOAT
    ,longitude FLOAT
    ,stars FLOAT(1)
    ,review_count INTEGER  
    ,is_open SMALLINT
    ,attributes JSONB
    ,categories VARCHAR
    ,hours JSONB
);
'''

In [18]:
# alternative layout for reviews: an id column plus the whole review as one jsonb document
# NOTE(review): review_gid is declared without a data type, which CREATE TABLE
# presumably rejects - confirm the intended type before running this statement
reviews_create_table_str = '''
CREATE TABLE reviews (
    review_gid PRIMARY KEY
    ,review_json JSONB
);
'''

In [26]:
# scratch dict of field name -> datatype used to try out the field-string join
test = {'review_gid':'PRIMARY KEY',
    'review':'test'}

In [29]:
# NOTE: '/n' is a literal slash followed by n, not the newline escape '\n',
# so the joined string contains "/n," between the fields
'/n,'.join([key + ' ' + val for key,val in test.items()])

'review_gid PRIMARY KEY/n,review test'

In [None]:
def _make_sql_field_str(field_dict,sep=' '):
    '''turns a field name (key) and datatypes (value)
    into SQL statement. Note- the default space 
    is for CREATE TABLE fxn.
    '''
    formatted_list = key + sep + val for key,val in field_dict.items()
    formatted_str = '/n,'.join(field_list)
    return formatted_str

def create_sql_table(table_name,field_dict):
    '''
    (re)creates a table and reports the columns the database registered
    for it. Any existing table with the same name is dropped first.

    table_name : str
    field_dict : dict of field names (key) and datatypes(value)

    returns a dataframe listing table_name, column_name and data_type
    read from information_schema.columns
    '''
    column_definitions = _make_sql_field_str(field_dict)

    #sql statements, in the order they are run
    drop_stmt = f"DROP TABLE IF EXISTS {table_name};"
    create_stmt = f'''
    CREATE TABLE {table_name} (
            {column_definitions}
        );
        '''
    describe_query = f'''
        SELECT 
            table_name
            ,column_name
            ,data_type
        FROM 
            information_schema.columns
        WHERE
            table_name='{table_name}'
            '''

    #uses the module-level engine
    with engine.connect() as conn:
        for statement in (drop_stmt, create_stmt):
            conn.execute(statement)
        described_columns = pd.read_sql(con=conn,sql=describe_query)

    return described_columns

def insert_sql_record(sql_str_value,table_name):
    '''
    insert a SQL string-formatted record (ie one SQL string)
    or an iterable of SQL records (list, array, ... of SQL strings)
    into a SQL table, one INSERT statement per record

    sql_str_value : str such as "($$a$$,1)", or an iterable of such strings
    table_name : str, name of the target table

    returns a message stating how many records were inserted

    NOTE(review): the statement is assembled with str.format, so both
    arguments must come from trusted code. text() does not escape values;
    it presumably treats ":name" tokens as bind parameters - confirm that
    record text containing such tokens inserts correctly.
    '''
    insert_str = '''
    INSERT INTO {}
    VALUES {};
    '''
    # a lone string is a single record; anything else is iterated, so an
    # array or tuple of records works as well as a list
    if isinstance(sql_str_value, str):
        sql_records = [sql_str_value]
    else:
        sql_records = list(sql_str_value)

    with engine.connect() as conn:
        for i_sql in sql_records:
            conn.execute(text(insert_str.format(table_name,i_sql)))

    # report the real count: the enumerate index used before was one short
    # for lists and undefined for an empty list
    return f"{len(sql_records)} SQL records inserted into {table_name}"

In [111]:
# insert every pre-formatted business record, one INSERT statement per record,
# over a single connection
with engine.connect() as conn:
    for i,json_record in enumerate(json_str_list):
        #print(i)
        # NOTE(review): the record is placed into the statement with str.format;
        # text() wraps the finished statement and presumably does not escape the
        # values for you - confirm against the SQLAlchemy text() documentation
        sql_record = text(
            '''
            INSERT INTO business_info
            VALUES {};
            '''
            .format(json_record)
        )
        conn.execute(sql_record)

### Sample queries
- [Querying Json fields](https://www.postgresqltutorial.com/postgresql-json/)

In [127]:
#return the BikeParking attribute of the first five rows as an array
#(-> keeps the value as json, per PostgreSQL's JSON operators)
#NOTE(review): the connection opened by engine.connect() is not closed here
(
pd.read_sql(con=engine.connect(),sql='''
SELECT attributes -> 'BikeParking' as BikeParking
FROM business_info
'''
)
.head()
.values
)

array([['True'],
       [None],
       [None],
       ['True'],
       [None]], dtype=object)

In [132]:
#return the BikeParking attribute of the first five rows
#(->> returns the value as text, per PostgreSQL's JSON operators)
#NOTE(review): the connection opened by engine.connect() is not closed here
(
pd.read_sql(con=engine.connect(),sql='''
SELECT attributes ->> 'BikeParking' as BikeParking
FROM business_info
'''
)
.head()
)

Unnamed: 0,bikeparking
0,True
1,
2,
3,True
4,


In [134]:
#name and address of every business whose BikeParking attribute is the text 'True'
#NOTE(review): the connection opened by engine.connect() is not closed here
pd.read_sql(con=engine.connect(),sql='''
SELECT
    name
    ,address
    ,attributes ->> 'BikeParking' AS BikeParking
FROM business_info
WHERE attributes ->> 'BikeParking' = 'True' 
'''
)

Unnamed: 0,name,address,bikeparking
0,10913 Bailey Rd,The Range At Lake Norman,True
1,1015 Sharp Cir,Nevada House of Hose,True
2,"6870 S Rainbow Blvd, Ste 117",Green World Cleaners,True
3,2831 Parmenter St,Chocolate Shoppe Ice Cream,True
4,3131 Las Vegas Blvd,Manolo Blahnik,True
...,...,...,...
69804,829 E Washington Ave,Julep,True
69805,81 Front Street E,Starbucks,True
69806,"2487 S Gilbert Rd, Ste 102",Yao Fine Chinese Cuisine,True
69807,199 College St,Steak & Cheese & Quick Pita Restaurant,True
