# Creates a Yelp POSTGRESSQL database
 - Uses the open source Yelp dataset and inserts json files into SQL tables

## Table of contents 
- importing json into python using pandas tools
- importing json to postgresql database

In [1]:
import json 
import pandas as pd 
from pandas.io.json._normalize import json_normalize,nested_to_record #flattening json in pandas df

from re import sub

In [45]:
def stringify_json_records(series):
    '''
    converts a series of dicts into a stringified 
    set of json records (which is necessary format for json storage)
    for example single quotes are replaced by double quotes
    and the dict is converted into a single quote string.
    '''
    if series.apply(lambda x: x is None or type(x) is dict).all():
       json_string_records = (
           series
           .apply(json.dumps)
           .apply(_restringify_nested_json_records) 
       )
    else:
        print("the series {} are not all dict records".format(series.name))
        json_string_records = series
    return json_string_records

def _restringify_nested_json_records(stringified_record):
    '''
    yelp open dataset has a nested (depth 2) json that is a string (invalid) and has single quotes
    rather than double quotes and should not have . need to change to be considered valid 
    json and inserted into postgressql table.
    '''
    corrected_record = (
        stringified_record
        .replace('\"{','{') #nested json object should not have any quotes
        .replace('}\"','}')
        .replace('\'','\"') #change single to double quotes
        .replace('False,','\"False\",') #stringify booleans if not already
        .replace('True,','\"True\",')
        .replace('False}','\"False\"}') #end of dict boolean
        .replace('True}','\"True\"}')
        .replace('\"u\"','\"') #some records had double double quotes (ie "u"no"")
        .replace('\"\"','\"')
        .replace('None,','"None",') #stringify nones
        .replace('None}','"None"}')
    )
    return corrected_record

In [46]:
yelp_data_path = '/Users/michaelkranz/Documents/restaurant-app/data/yelp_dataset/'

yelp_json_filenames = {"tips":'yelp_academic_dataset_tip.json',"reviews":'yelp_academic_dataset_review.json',
"business_info":'yelp_academic_dataset_business.json',"user":'yelp_academic_dataset_user.json'}

In [47]:
#%%timeit
with open(yelp_data_path+yelp_json_filenames['business_info']) as json_file:
    json_list = json_file.readlines()
json_data = [json.loads(json_line) for json_line in json_list] 
json_df = pd.DataFrame(json_data)

In [48]:
#upload json to postgresSQL table
json_df.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'..."


In [49]:
json_df['attributes'] = stringify_json_records(json_df['attributes'])
json_df['hours'] = stringify_json_records(json_df['hours'])

In [21]:
#2 nested json objects : attributes and hours
#df_flattened_data = json_normalize(json_data,sep="_")
json_flattened_data = nested_to_record(json_data,sep="_",) #dont need to convert to df as destination is SQL database

In [23]:
#for flattening in future, may want to expand to convert nested json string to json
##see attributes_Business
json_flattened_data[0] 

{'business_id': 'f9NumwFMBDn751xgFiRbNA',
 'name': 'The Range At Lake Norman',
 'address': '10913 Bailey Rd',
 'city': 'Cornelius',
 'state': 'NC',
 'postal_code': '28031',
 'latitude': 35.4627242,
 'longitude': -80.8526119,
 'stars': 3.5,
 'review_count': 36,
 'is_open': 1,
 'categories': 'Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping',
 'attributes_BusinessAcceptsCreditCards': 'True',
 'attributes_BikeParking': 'True',
 'attributes_GoodForKids': 'False',
 'attributes_BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
 'attributes_ByAppointmentOnly': 'False',
 'attributes_RestaurantsPriceRange2': '3',
 'hours_Monday': '10:0-18:0',
 'hours_Tuesday': '11:0-20:0',
 'hours_Wednesday': '10:0-18:0',
 'hours_Thursday': '11:0-20:0',
 'hours_Friday': '11:0-20:0',
 'hours_Saturday': '11:0-20:0',
 'hours_Sunday': '13:0-18:0'}

### Storing JSON in PostgreSQL
- upon further research, it looks like we can store JSON directly in PostgreSQL without the traditional field format
    - [Storing JSON in PostgreSQL: A must-know feature] (https://www.blendo.co/blog/storing-json-in-postgresql/)
    - [Replacing EAV with JSONB in PostgreSQL*](https://coussej.github.io/2016/01/14/Replacing-EAV-with-JSONB-in-PostgreSQL/)

    *EAV = Entity,Attribute,Value (ie three tables connected with joins to get fields for the entity)


- find connection information 

>michaelkranz$ `psql`

>michaelkranz=# `CREATE DATABASE restaurants`

>michaelkranz=# `\c restaurants`

>restaurants=# `\conninfo`

> You are now connected to database "restaurants" as user "michaelkranz".
restaurants=# \conninfo
You are connected to database "restaurants" as user "michaelkranz" via socket in "/tmp" at port "5432".

- [CHAR and VARCHAR : no performance differences and character limits used to check and will return error if longer](https://www.postgresqltutorial.com/postgresql-char-varchar-text/)

- https://www.postgresql.org/docs/12/datatype-json.html

In [50]:
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL 
import psycopg2 #postgressql driver

In [64]:
postgres_db_params = {'drivername': 'postgres',
                'database':'restaurants',
               'username': 'michaelkranz',
               'password': 'helloworld',
               'host': 'localhost',
               'port': 5432}

postgres_db_url = URL(**postgres_db_params)

In [65]:
engine = create_engine(postgres_db_url)

In [66]:
# store first level as a column but hours and attributes as jsonb format
business_info_create_table_str = '''
CREATE TABLE business_info (
    business_id VARCHAR PRIMARY KEY 
    ,address VARCHAR
    ,name VARCHAR
    ,city VARCHAR
    ,state VARCHAR
    ,postal_code VARCHAR  
    ,latitude FLOAT
    ,longitude FLOAT
    ,stars FLOAT(1)
    ,review_count INTEGER  
    ,is_open SMALLINT
    ,attributes JSONB
    ,categories VARCHAR
    ,hours JSONB
);
'''

In [110]:
with engine.connect() as conn:
    conn.execute("DROP TABLE IF EXISTS business_info;")
    conn.execute(business_info_create_table_str)
    print((pd.read_sql(con=conn,sql='''
    SELECT table_name
        ,column_name
        ,data_type
    FROM information_schema.columns
    WHERE
        table_name LIKE 'b%'
    ''')
    .sort_values('table_name',ascending=True)
    ))

table_name   column_name          data_type
0   business_info   business_id  character varying
1   business_info       address  character varying
2   business_info          name  character varying
3   business_info          city  character varying
4   business_info         state  character varying
5   business_info   postal_code  character varying
6   business_info      latitude   double precision
7   business_info     longitude   double precision
8   business_info         stars               real
9   business_info  review_count            integer
10  business_info       is_open           smallint
11  business_info    attributes              jsonb
12  business_info    categories  character varying
13  business_info         hours              jsonb


In [76]:
nonjson_fields = ['business_id', 'name', 'address', 'city', 'state'
'postal_code','latitude', 'longitude', 'stars', 'review_count', 'is_open','categories']
json_fields= ['hours','attributes']

In [74]:
def convert_to_sql_str(obj):
    '''
    if string : escape single quotes and/or make SQL quotes ($$), if not string : convert
    to string
    (see https://www.postgresqltutorial.com/dollar-quoted-string-constants/#:~:text=In%20PostgreSQL%2C%20you%20use%20single,doubling%20up%20the%20single%20quote.)
    '''
    if type(obj) is str:
        obj_str = "$$" + obj.replace("$","$ ") + "$$" #if $ at end of string, will have have invalid triple $$$
    else:
        obj_str = str(obj)
    return obj_str

def make_sql_str_values(sql_str_list):
    '''
    make valid values (ie record) for INSERT statement

    sql_str_list : iterable of strings
    '''
    #convert every element to valid sql statements
    #join fields with ',' and wrap in '()' to 
    sql_str_values = "({})".format(",".join(sql_str_list))
    return sql_str_values 

In [75]:
json_str_list = (
    json_df
    .fillna('null') #null is syntax for SQL NAs
    .applymap(convert_to_sql_str)
    .apply(make_sql_str_values,axis=1)
    .values
)

In [73]:
print(json_df[json_df['business_id']=='Bbgc7isi58uqRlOxPF9kdA']
.replace('','null')
.applymap(convert_to_sql_str).values)

[['$$Bbgc7isi58uqRlOxPF9kdA$$' '$$Dryer Vent Cleaning Vegas 89$ $$'
  '$$null$$' '$$Las Vegas$$' '$$NV$$' '$$89183$$' '35.9987358'
  '-115.1592718' '5.0' '4' '1'
  '$${"BusinessAcceptsCreditCards": "True", "BusinessAcceptsBitcoin": "False"}$$'
  '$$Appliances & Repair, Home Services, Local Services, Heating & Air Conditioning/HVAC, Air Duct Cleaning$$'
  '$${"Monday": "8:0-20:0", "Tuesday": "8:0-20:0", "Wednesday": "8:0-20:0", "Thursday": "8:0-20:0", "Friday": "8:0-20:0", "Saturday": "8:0-20:0", "Sunday": "8:0-20:0"}$$']]


In [63]:
json_df[json_df['business_id']=='Bbgc7isi58uqRlOxPF9kdA'].fillna('null').values

array([['Bbgc7isi58uqRlOxPF9kdA', 'Dryer Vent Cleaning Vegas 89$', '',
        'Las Vegas', 'NV', '89183', 35.9987358, -115.1592718, 5.0, 4, 1,
        '{"BusinessAcceptsCreditCards": "True", "BusinessAcceptsBitcoin": "False"}',
        'Appliances & Repair, Home Services, Local Services, Heating & Air Conditioning/HVAC, Air Duct Cleaning',
        '{"Monday": "8:0-20:0", "Tuesday": "8:0-20:0", "Wednesday": "8:0-20:0", "Thursday": "8:0-20:0", "Friday": "8:0-20:0", "Saturday": "8:0-20:0", "Sunday": "8:0-20:0"}']],
      dtype=object)

In [82]:
i

60367

In [111]:
with engine.connect() as conn:
    for i,json_record in enumerate(json_str_list):
        #print(i)
        sql_record = text( #escapes special chars for you
            '''
            INSERT INTO business_info
            VALUES {};
            '''
            .format(json_record)
        )
        conn.execute(sql_record)

In [88]:
json_df.iloc[[60366]]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
60366,oFqSBajk338oJ1KNG4vCjA,Toronto Kick Boxing & Muay Thai Academy,1992 Yonge Street,Toronto,ON,M4S 1Z7,43.700104,-79.397458,3.5,7,1,"{""WheelchairAccessible"": ""False"", ""BikeParking...","Martial Arts, Fitness & Instruction, Active Li...","{""Monday"": ""20:0-21:30"", ""Tuesday"": ""20:0-21:3..."


In [89]:
json_df.iloc[[60367]]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
60367,k1gDqlaVXilVRE2MxFR3HA,100% Natural Mexican Grill,7455 Eastern Ave,Las Vegas,NV,89123,36.054065,-115.118261,3.5,10,0,"{""GoodForKids"": ""True"", ""RestaurantsPriceRange...","Mexican, Restaurants","{""Monday"": ""10:0-22:0"", ""Tuesday"": ""10:0-22:0""..."


In [99]:
from sqlalchemy import text

In [101]:
test =text('$$100% Natural Mexican Grill$$')

In [104]:
test

<sqlalchemy.sql.compiler.StrSQLCompiler at 0x1bbeec490>

In [109]:
engine.connect().execute(
    text('''INSERT INTO business_info (business_id,name)
    VALUES ('1',$$100% Natural Mexican Grill$$)
    ''')
)

<sqlalchemy.engine.result.ResultProxy at 0x1bbf10ed0>

In [95]:
engine.connect().execute(
    '''INSERT INTO business_info 
    VALUES {}
    '''.format(json_str_list[60367])
)

TypeError: 'dict' object does not support indexing

In [97]:
print(json_str_list[60367])

($$k1gDqlaVXilVRE2MxFR3HA$$,$$100% Natural Mexican Grill$$,$$7455 Eastern Ave$$,$$Las Vegas$$,$$NV$$,$$89123$$,36.0540651,-115.1182612,3.5,10,0,$${"GoodForKids": "True", "RestaurantsPriceRange2": "1", "RestaurantsGoodForGroups": "True", "OutdoorSeating": "False", "RestaurantsReservations": "False", "BusinessAcceptsCreditCards": "True", "RestaurantsAttire": "casual", "RestaurantsTakeOut": "True", "BusinessParking": {"garage": "False", "street": "False", "validated": "False", "lot": "False", "valet": "False"}}$$,$$Mexican, Restaurants$$,$${"Monday": "10:0-22:0", "Tuesday": "10:0-22:0", "Wednesday": "10:0-22:0", "Thursday": "10:0-22:0", "Friday": "10:0-23:0", "Saturday": "10:0-23:0", "Sunday": "10:0-22:0"}$$)


In [84]:
print(json_df.iloc[60367].values)

['k1gDqlaVXilVRE2MxFR3HA' '100% Natural Mexican Grill' '7455 Eastern Ave'
 'Las Vegas' 'NV' '89123' 36.0540651 -115.1182612 3.5 10 0
 '{"GoodForKids": "True", "RestaurantsPriceRange2": "1", "RestaurantsGoodForGroups": "True", "OutdoorSeating": "False", "RestaurantsReservations": "False", "BusinessAcceptsCreditCards": "True", "RestaurantsAttire": "casual", "RestaurantsTakeOut": "True", "BusinessParking": {"garage": "False", "street": "False", "validated": "False", "lot": "False", "valet": "False"}}'
 'Mexican, Restaurants'
 '{"Monday": "10:0-22:0", "Tuesday": "10:0-22:0", "Wednesday": "10:0-22:0", "Thursday": "10:0-22:0", "Friday": "10:0-23:0", "Saturday": "10:0-23:0", "Sunday": "10:0-22:0"}']


In [78]:
test_sql = pd.read_sql(con=engine.connect(),sql="SELECT * FROM business_info")

In [79]:
test_sql

Unnamed: 0,business_id,address,name,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BikeParking': 'True', 'GoodForKids': 'False'...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Friday': '11:0-20:0', 'Monday': '10:0-18:0',..."
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,1,"{'GoodForKids': 'True', 'ByAppointmentOnly': '...","Health & Medical, Fitness & Instruction, Yoga,...",
2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.580070,5.0,5,1,,"Pets, Pet Services, Pet Groomers",
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,89030,36.219728,-115.127725,2.5,3,0,"{'BikeParking': 'True', 'DogsAllowed': 'True',...","Hardware Stores, Home Services, Building Suppl...","{'Friday': '7:0-16:0', 'Monday': '7:0-16:0', '..."
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'ByAppointmentOnly': 'True', 'BusinessAccepts...","Home Services, Plumbing, Electricians, Handyma...","{'Friday': '9:0-16:0', 'Monday': '0:0-0:0', 'T..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60362,zDNJXhNLFGdJk0eejd3J-w,JBA Motors,245 S Mulberry,Mesa,AZ,85202,33.410085,-111.860041,4.0,42,1,"{'WiFi': 'no', 'BusinessAcceptsCreditCards': '...","Automotive, Car Dealers","{'Friday': '9:30-18:0', 'Monday': '9:30-18:0',..."
60363,H1B5LebeXBo9ad930ddC-A,"Richard Parsanko, DDS",,Scottsdale,AZ,85260,33.616029,-111.892630,5.0,3,1,,"Health & Medical, Oral Surgeons, Dentists, Gen...",
60364,mKgCTo438WoENDcLQunhTw,Heliocol West,2305 W Huntington Dr,Tempe,AZ,85282,33.393754,-111.972650,3.0,20,1,"{'ByAppointmentOnly': 'True', 'BusinessAccepts...","Home Services, Local Services, Solar Installation","{'Friday': '9:0-17:0', 'Monday': '9:0-17:0', '..."
60365,IBSOhov5GLSZsGzz3ZsQjg,Boston Market,98 Clairton Blvd,Pittsburgh,PA,15236,40.338788,-79.961031,3.5,3,0,"{'HasTV': 'False', 'GoodForKids': 'True', 'Bus...","Event Planning & Services, Restaurants, Catere...","{'Friday': '10:30-22:0', 'Monday': '10:30-22:0..."
