In [1]:
# Install psycopg2 to connect to PostgreSQL, create a table, and insert data
# pip install psycopg2-binary

In [2]:
import json
import random
from random import randrange

import numpy as np
import pandas as pd
import psycopg2

In [3]:
# Connect to the default "postgres" database so that the "yelp" database
# can be created from it if it is missing.
conn = psycopg2.connect("dbname=postgres user=kaiting host=localhost")
try:
    # CREATE DATABASE cannot run inside a transaction block, so every
    # statement on this connection is committed immediately.
    conn.autocommit = True

    # The cursor context manager closes the cursor even if a query fails.
    with conn.cursor() as cur:
        # Create database if it doesn't exist
        cur.execute("SELECT 1 FROM pg_database WHERE datname = 'yelp'")
        if cur.fetchone() is None:
            cur.execute("CREATE DATABASE yelp")
            print("Database 'yelp' created.")
        else:
            print("Database 'yelp' already exists.")
finally:
    # Always release the connection, including when a statement above raises.
    conn.close()

Database 'yelp' already exists.


In [4]:
# Connect to the "yelp" PostgreSQL database.
# Make sure the server is already running on your computer.
# (This is done by default on JupyterHub)
conn = psycopg2.connect("postgresql://localhost/yelp")

try:
    # `with conn` commits the transaction when the block finishes normally and
    # rolls it back if anything inside raises; `conn.cursor()` used as a
    # context manager closes the cursor on exit.
    with conn, conn.cursor() as cur:
        # Create a table
        cur.execute("""
          CREATE TABLE IF NOT EXISTS users (
            id SERIAL PRIMARY KEY,
            name TEXT,
            age INT
          )
        """)

        # Insert some data into the table using literal values.
        # NOTE: this cell appends rows every time it is executed.
        cur.execute("""
          INSERT INTO users (name, age) VALUES
          ('Alice', 30),
          ('Bob', 25),
          ('Charlie', 35)
        """)

        # Parameterized INSERT shared by the examples below. The driver binds
        # the values to the %s placeholders, which escapes quotes safely.
        # (This is a reusable query string, not a server-side PREPARE.)
        prepared_statement = "INSERT INTO users (name, age) VALUES (%s, %s)"

        # Insert a single row using safe query interpolation
        cur.execute(prepared_statement, ('David', 28))

        # Reuse the same parameterized query with different values
        cur.execute(prepared_statement, ('Eve', 22))

        # Insert data from a list
        user_list = [
          ('Frank', 40),
          ('Grace', 29),
          ('Hannah', 33)
        ]

        # Use executemany to insert multiple rows with one call
        cur.executemany(prepared_statement, user_list)
finally:
    # The connection context manager only ends the transaction; the
    # connection itself still has to be closed explicitly.
    conn.close()

## Sampling

The Yelp dataset is relatively large, so processing every record is impractical. Hence, we use reservoir sampling in this project. We selected 15,000 of the 150,346 records from `businesses` and then collected the corresponding `reviews` and `users`, which together satisfy the minimum requirements:
- Total uncompressed size ≥ 1GB of data
- Minimum of 1 million records (across all entities)

In [5]:
def reservoir_sample(data, n, k, rng=None):
  """Draw a uniform random sample of up to ``k`` items from ``data[0:n]``.

  The reservoir starts as the first ``k`` items. Every later item
  ``data[i]`` then picks a uniform index ``j`` in ``[0, i]`` and overwrites
  reservoir slot ``j`` when ``j < k``, so it is kept with probability
  ``k / (i + 1)``.

  Args:
    data: Indexable collection; only ``data[0]`` .. ``data[n - 1]`` are read.
    n: Number of leading items of ``data`` to sample from.
    k: Requested sample size. When ``k`` is larger than ``n`` the sample is
      limited to the ``n`` available items.
    rng: Optional object providing ``randrange`` (for example
      ``random.Random(seed)``) for reproducible sampling. When omitted, the
      module-level ``randrange`` is used.

  Returns:
    A new list containing ``min(k, n)`` items taken from ``data``.

  Raises:
    ValueError: If ``k`` is negative.
  """
  if k < 0:
    raise ValueError("k must be non-negative")

  pick = randrange if rng is None else rng.randrange

  # Never try to fill more slots than there are items to read; without this
  # clamp, k > n indexes past the sampled range.
  k = min(k, max(n, 0))

  # fill the reservoir array
  r = [data[i] for i in range(k)]

  # replace elements w/gradually decreasing prob.
  for i in range(k, n):
    # generates a uniform integer between 0 and i (inclusive)
    j = pick(i + 1)
    if j < k:
      r[j] = data[i]

  return r

### Load Datasets

In [6]:
# Read the newline-delimited business JSON file into a DataFrame, then keep a
# list-of-dicts copy (one dict per row) for the sampling function to index.
df_business = pd.read_json(
    path_or_buf="yelp_academic_dataset_business.json",
    lines=True,
)
df_business_dict = df_business.to_dict("records")

In [7]:
# Report how many rows the full business dataset has, then preview the first five.
print(f"yelp_academic_dataset_business dataset total rows: {df_business.shape[0]}")
df_business.head(n=5)

yelp_academic_dataset_business dataset total rows: 150346


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


### Reservoir Sampling

In [8]:
# Sample business dataset with a size of 15000.
# `randrange` draws from the `random` module's shared generator, so seeding
# that generator first makes a Restart & Run All pick the same businesses
# (and therefore the same reviews and users) every time.
SAMPLE_SEED = 42
random.seed(SAMPLE_SEED)

n = len(df_business)
k = 15000

# Keep the raw sampled records under their own name instead of reusing the
# DataFrame's name for a plain list.
business_sample_records = reservoir_sample(df_business_dict, n, k)
df_business_sample = pd.DataFrame(business_sample_records)

# Sanity check: the sample has the requested size (or every row when the
# dataset is smaller than k).
assert len(df_business_sample) == min(k, n)

In [9]:
# Preview the first five sampled businesses.
df_business_sample.head(n=5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Hk66opSXJGu7m-dpGHDxVQ,Kidsplay Child Care & Preschool,459 Lafayette Ctr,Manchester,MO,63011,38.595409,-90.519688,3.0,6,1,{'BusinessAcceptsCreditCards': 'True'},"Preschools, Child Care & Day Care, Local Servi...","{'Monday': '6:30-22:0', 'Tuesday': '6:30-22:0'..."
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,_94pflFSusXDrv5sk8jQoA,Peter Piper Pizza,7621 N Oracle Rd,Tucson,AZ,85704,32.345452,-110.976341,3.5,11,0,"{'GoodForKids': 'True', 'RestaurantsDelivery':...","Italian, Restaurants, Pizza",
3,ds1ZbH019urJzYHYSGS48A,Icon East,4110 Gallatin Pike,Nashville,TN,37216,36.221945,-86.725887,4.5,6,0,"{'ByAppointmentOnly': 'False', 'BikeParking': ...","Shopping, Piercing, Tattoo, Jewelry, Beauty & ...","{'Monday': '13:0-21:0', 'Tuesday': '13:0-21:0'..."
4,8TFpa9C5mAts3Euoysll-A,Heatwave Heating & Cooling,"10161 49th St N, Ste A",Pinellas Park,FL,33782,27.864215,-82.699703,2.5,15,1,{'BusinessAcceptsCreditCards': 'True'},"Contractors, Home Services, Local Services, He...","{'Monday': '8:0-20:0', 'Tuesday': '8:0-20:0', ..."


In [10]:
# Extract business_id from df_business_sample into a set for fast lookups
business_ids = set(df_business_sample["business_id"])

review_sample = []

# Stream the review file one JSON record per line so the whole file never has
# to be held in memory; keep only reviews of a sampled business.
# The encoding is passed explicitly because JSON text is UTF-8, while open()
# would otherwise fall back to the platform's locale encoding.
with open('yelp_academic_dataset_review.json', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline), which are not valid JSON.
        if not line.strip():
            continue
        review = json.loads(line)
        if review['business_id'] in business_ids:
            review_sample.append(review)

df_review_sample = pd.DataFrame(review_sample)

In [11]:
# Preview the first five sampled reviews.
df_review_sample.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,XW_LfMv0fV21l9c6xQd_lw,9OAtfnWag-ajVxRbUTGIyg,lj-E32x9_FA7GmUrBGBEWg,4.0,0,0,0,Love going here for happy hour or dinner! Gre...,2014-06-27 22:44:01
2,xdQe2E8jR6pLA07p1aYtKA,yFuE8SPF-d1GXJUWYgKtzg,X8lfGPagfLzfOsx0k08NRQ,5.0,0,0,0,Took my vehicle here for some work a few years...,2017-05-13 14:15:29
3,L0jv8c2FbpWSlfNC6bbUEA,bFPdtzu11Oi0f92EAcjqmg,IDtLPgUrqorrpqSLdfMhZQ,5.0,0,0,0,What a great addition to the Funk Zone! Grab ...,2016-10-13 22:50:47
4,RB8UpF_kT2xoOC51OzXEeA,EZjT2qJN0mOXypMAqZdSrQ,A2q7d-CBM2-81tVkmS4JMw,2.0,1,1,0,"Straight to the point, it's cheap, it tastes a...",2017-07-08 18:58:42


In [12]:
# Extract user_id from df_review_sample into a set for fast lookups
user_ids = set(df_review_sample["user_id"])

user_sample = []

# Stream the user file one JSON record per line so the whole file never has
# to be held in memory; keep only users who wrote a sampled review.
# The encoding is passed explicitly because JSON text is UTF-8, while open()
# would otherwise fall back to the platform's locale encoding.
with open('yelp_academic_dataset_user.json', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline), which are not valid JSON.
        if not line.strip():
            continue
        user = json.loads(line)
        if user['user_id'] in user_ids:
            user_sample.append(user)

df_user_sample = pd.DataFrame(user_sample)

In [13]:
# Preview the first five sampled users.
df_user_sample.head(n=5)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0


In [14]:
# Report the row count of each sampled table, followed by the combined total.
print(f'Sampled businessed total rows: {df_business_sample.shape[0]}')
print(f'Sampled reviews total rows: {df_review_sample.shape[0]}')
print(f'Sampled users total rows: {df_user_sample.shape[0]}')
print(f'Total rows: {sum(len(frame) for frame in (df_business_sample, df_review_sample, df_user_sample))}')

Sampled businessed total rows: 15000
Sampled reviews total rows: 692267
Sampled users total rows: 414596
Total rows: 1121863


In [15]:
# Persist each sample as newline-delimited JSON (one record per line).
df_business_sample.to_json(path_or_buf="business_sample.json", lines=True, orient="records")
df_review_sample.to_json(path_or_buf="review_sample.json", lines=True, orient="records")
df_user_sample.to_json(path_or_buf="user_sample.json", lines=True, orient="records")
print('Sampling datasets completed')

Sampling datasets completed


## Create Schema