# Creates a Yelp POSTGRESSQL database
 - Uses the open source Yelp dataset and inserts json files into SQL tables

## Table of contents 
- importing json into python using pandas tools
- importing json to postgresql database

In [1]:
import json 
import pandas as pd 
from pandas.io.json._normalize import json_normalize,nested_to_record #flattening json in pandas df

from re import sub

In [2]:
def stringify_json_records(series):
    '''
    converts a series of dicts into a stringified 
    set of json records (which is necessary format for json storage)
    for example single quotes are replaced by double quotes
    and the dict is converted into a single quote string.
    '''
    if series.apply(lambda x: x is None or type(x) is dict).all():
       json_string_records = (
           series
           .apply(json.dumps)
           .apply(_restringify_nested_json_records) 
       )
    else:
        print("the series {} are not all dict records".format(series.name))
        json_string_records = series
    return json_string_records

def _restringify_nested_json_records(stringified_record):
    '''
    yelp open dataset has a nested (depth 2) json that is a string (invalid) and has single quotes
    rather than double quotes and should not have . need to change to be considered valid 
    json and inserted into postgressql table.
    '''
    corrected_record = (
        stringified_record
        .replace('\"{','{') #nested json object should not have any quotes
        .replace('}\"','}')
        .replace('\'','\"') #change single to double quotes
        .replace('False,','\"False\",') #stringify booleans if not already
        .replace('True,','\"True\",')
        .replace('False}','\"False\"}') #end of dict boolean
        .replace('True}','\"True\"}')
    )
    return corrected_record

In [3]:
yelp_data_path = '/Users/michaelkranz/Documents/restaurant-app/data/yelp_dataset/'

yelp_json_filenames = {"tips":'yelp_academic_dataset_tip.json',"reviews":'yelp_academic_dataset_review.json',
"business_info":'yelp_academic_dataset_business.json',"user":'yelp_academic_dataset_user.json'}

In [4]:
#%%timeit
with open(yelp_data_path+yelp_json_filenames['business_info']) as json_file:
    json_list = json_file.readlines()
json_data = [json.loads(json_line) for json_line in json_list] 
json_df = pd.DataFrame(json_data)

In [5]:
#upload json to postgresSQL table
json_df.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'..."


In [6]:
json_df['attributes'] = stringify_json_records(json_df['attributes'])
json_df['hours'] = stringify_json_records(json_df['hours'])

In [21]:
#2 nested json objects : attributes and hours
#df_flattened_data = json_normalize(json_data,sep="_")
json_flattened_data = nested_to_record(json_data,sep="_",) #dont need to convert to df as destination is SQL database

In [23]:
#for flattening in future, may want to expand to convert nested json string to json
##see attributes_Business
json_flattened_data[0] 

{'business_id': 'f9NumwFMBDn751xgFiRbNA',
 'name': 'The Range At Lake Norman',
 'address': '10913 Bailey Rd',
 'city': 'Cornelius',
 'state': 'NC',
 'postal_code': '28031',
 'latitude': 35.4627242,
 'longitude': -80.8526119,
 'stars': 3.5,
 'review_count': 36,
 'is_open': 1,
 'categories': 'Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping',
 'attributes_BusinessAcceptsCreditCards': 'True',
 'attributes_BikeParking': 'True',
 'attributes_GoodForKids': 'False',
 'attributes_BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
 'attributes_ByAppointmentOnly': 'False',
 'attributes_RestaurantsPriceRange2': '3',
 'hours_Monday': '10:0-18:0',
 'hours_Tuesday': '11:0-20:0',
 'hours_Wednesday': '10:0-18:0',
 'hours_Thursday': '11:0-20:0',
 'hours_Friday': '11:0-20:0',
 'hours_Saturday': '11:0-20:0',
 'hours_Sunday': '13:0-18:0'}

### Storing JSON in PostgreSQL
- upon further research, it looks like we can store JSON directly in PostgreSQL without the traditional field format
    - [Storing JSON in PostgreSQL: A must-know feature] (https://www.blendo.co/blog/storing-json-in-postgresql/)
    - [Replacing EAV with JSONB in PostgreSQL*](https://coussej.github.io/2016/01/14/Replacing-EAV-with-JSONB-in-PostgreSQL/)

    *EAV = Entity,Attribute,Value (ie three tables connected with joins to get fields for the entity)


- find connection information 

>michaelkranz$ `psql`

>michaelkranz=# `CREATE DATABASE restaurants`

>michaelkranz=# `\c restaurants`

>restaurants=# `\conninfo`

> You are now connected to database "restaurants" as user "michaelkranz".
restaurants=# \conninfo
You are connected to database "restaurants" as user "michaelkranz" via socket in "/tmp" at port "5432".

- [CHAR and VARCHAR : no performance differences and character limits used to check and will return error if longer](https://www.postgresqltutorial.com/postgresql-char-varchar-text/)

- https://www.postgresql.org/docs/12/datatype-json.html

In [7]:
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL 
import psycopg2 #postgressql driver

In [8]:
postgres_db_params = {'drivername': 'postgres',
                'database':'restaurants',
               'username': 'michaelkranz',
               'password': 'helloworld',
               'host': 'localhost',
               'port': 5432}

postgres_db_url = URL(**postgres_db_params)

In [9]:
engine = create_engine(postgres_db_url)

In [10]:
# store first level as a column but hours and attributes as jsonb format
business_info_create_table_str = '''
CREATE TABLE business_info (
    business_id VARCHAR PRIMARY KEY 
    ,address VARCHAR
    ,name VARCHAR
    ,city VARCHAR
    ,state VARCHAR
    ,postal_code VARCHAR  
    ,latitude FLOAT
    ,longitude FLOAT
    ,stars FLOAT(1)
    ,review_count INTEGER  
    ,is_open SMALLINT
    ,attributes JSONB
    ,categories VARCHAR
    ,hours JSONB
);
'''

In [12]:
with engine.connect() as conn:
    conn.execute("DROP TABLE IF EXISTS business_info;")
    conn.execute(business_info_create_table_str)
    print((pd.read_sql(con=conn,sql='''
    SELECT table_name
        ,column_name
        ,data_type
    FROM information_schema.columns
    WHERE
        table_name LIKE 'b%'
    ''')
    .sort_values('table_name',ascending=True)
    ))

table_name   column_name          data_type
0   business_info   business_id  character varying
1   business_info       address  character varying
2   business_info          name  character varying
3   business_info          city  character varying
4   business_info         state  character varying
5   business_info   postal_code  character varying
6   business_info      latitude   double precision
7   business_info     longitude   double precision
8   business_info         stars               real
9   business_info  review_count            integer
10  business_info       is_open           smallint
11  business_info    attributes              jsonb
12  business_info    categories  character varying
13  business_info         hours              jsonb


In [13]:

json_str_list = [str(json_record) for json_record in json_df.to_records(index=False)]

In [14]:
json_str_list[0]

'(\'f9NumwFMBDn751xgFiRbNA\', \'The Range At Lake Norman\', \'10913 Bailey Rd\', \'Cornelius\', \'NC\', \'28031\', 35.4627242, -80.8526119, 3.5, 36, 1, \'{"BusinessAcceptsCreditCards": "True", "BikeParking": "True", "GoodForKids": "False", "BusinessParking": {"garage": "False", "street": "False", "validated": "False", "lot": "True", "valet": "False"}, "ByAppointmentOnly": "False", "RestaurantsPriceRange2": "3"}\', \'Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping\', \'{"Monday": "10:0-18:0", "Tuesday": "11:0-20:0", "Wednesday": "10:0-18:0", "Thursday": "11:0-20:0", "Friday": "11:0-20:0", "Saturday": "11:0-20:0", "Sunday": "13:0-18:0"}\')'

In [15]:
engine.connect().execute(
'''
INSERT INTO business_info
VALUES 
    {}
;
'''
.format(json_str_list[0])
)

<sqlalchemy.engine.result.ResultProxy at 0x189162e90>

In [282]:
engine.connect().execute(
'''
INSERT INTO business_info
VALUES {};
'''
.format(test)
)

ProgrammingError: (psycopg2.errors.SyntaxError) INSERT has more expressions than target columns
LINE 3: ...e Life, Gun/Rifle Ranges, Guns & Ammo, Shopping', '{"Monday"...
                                                             ^

[SQL: 
INSERT INTO business_info
VALUES ('f9NumwFMBDn751xgFiRbNA', 'The Range At Lake Norman', '10913 Bailey Rd', 'Cornelius', 'NC', '28031', 35.4627242, -80.8526119, 3.5, 36, 1, '{"BusinessAcceptsCreditCards": "True", "BikeParking": "True", "GoodForKids": "False", "BusinessParking": {"garage": False, "street": False, "validated": False, "lot": True, "valet": False}, "ByAppointmentOnly": "False", "RestaurantsPriceRange2": "3"}', 'Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping', '{"Monday": "10:0-18:0", "Tuesday": "11:0-20:0", "Wednesday": "10:0-18:0", "Thursday": "11:0-20:0", "Friday": "11:0-20:0", "Saturday": "11:0-20:0", "Sunday": "13:0-18:0"}');
]
(Background on this error at: http://sqlalche.me/e/13/f405)

In [16]:
pd.read_sql(con=engine.connect(),sql="SELECT * FROM business_info")

Unnamed: 0,business_id,address,name,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BikeParking': 'True', 'GoodForKids': 'False'...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Friday': '11:0-20:0', 'Monday': '10:0-18:0',..."
