In [1]:
# Install psycopg2 to connect to PostgreSQL, create tables, and insert data
# pip install psycopg2-binary

In [2]:
import json
import random
from random import randrange

import numpy as np
import pandas as pd
import psycopg2

## Create PostgreSQL Database

In [3]:
# Connect to the default `postgres` database; `yelp` may not exist yet,
# so it cannot be the connection target for this step.
conn = psycopg2.connect("dbname=postgres user=kaiting host=localhost")
try:
    # CREATE DATABASE cannot run inside a transaction block, so use autocommit
    conn.autocommit = True
    cur = conn.cursor()
    try:
        # Create `yelp` database if it doesn't exist
        cur.execute("SELECT 1 FROM pg_database WHERE datname = 'yelp'")
        if not cur.fetchone():
            cur.execute("CREATE DATABASE yelp")
            print("Database 'yelp' created.")
        else:
            print("Database 'yelp' already exists.")
    finally:
        cur.close()
finally:
    # Always release the connection, even if one of the statements above fails
    conn.close()

Database 'yelp' already exists.


Below is example code showing how we can use `psycopg2` to create tables and insert data.

In [4]:
# # Connect to your PostgreSQL database
# # Make sure it is already running on your computer.
# # (This is done by default on JupyterHub)
# # Assuming the database yelp exists...
# conn = psycopg2.connect("postgresql://localhost/yelp")

# # Create a cursor object
# cur = conn.cursor()

# # Create a table
# cur.execute("""
#   CREATE TABLE IF NOT EXISTS users (
#     id SERIAL PRIMARY KEY,
#     name TEXT,
#     age INT
#   )
# """)

# # Insert some data into the table
# cur.execute("""
#   INSERT INTO users (name, age) VALUES
#   ('Alice', 30),
#   ('Bob', 25),
#   ('Charlie', 35)
# """)

# # Insert data using safe query interpolation
# cur.execute(
#   "INSERT INTO users (name, age) VALUES (%s, %s)",
#   ('David', 28)
# )

# # Insert data using a prepared statement
# # See also: https://www.postgresql.org/docs/8.1/sql-syntax.html#AEN1368
# prepared_statement = "INSERT INTO users (name, age) VALUES (%s, %s)"
# cur.execute(prepared_statement, ('Eve', 22))

# # Insert data from a list
# user_list = [
#   ('Frank', 40),
#   ('Grace', 29),
#   ('Hannah', 33)
# ]

# # Use executemany to insert multiple rows
# # This properly escapes quotes, etc.
# cur.executemany(
#   "INSERT INTO users (name, age) VALUES (%s, %s)",
#   user_list
# )

# # Commit the transaction
# conn.commit()

# # Close the cursor and connection
# cur.close()
# conn.close()

## Sampling

The Yelp dataset is relatively large, and it is impractical to process every record. Hence, we use reservoir sampling in this project. We selected 15,000 of the 150,346 records from `businesses` and then found the corresponding `reviews` and `users`, which satisfies the minimum requirements:
- Total uncompressed size ≥ 1GB of data
- Minimum of 1 million records (across all entities)

In [5]:
def reservoir_sample(data, n, k):
  """Draw a uniform random sample of up to `k` items from the first `n` items of `data`.

  Uses reservoir sampling: the first `k` items fill the reservoir, then each
  later item at index `i` replaces a random reservoir slot with probability
  `k / (i + 1)`.

  Args:
    data: Indexable sequence; `data[0]` .. `data[n-1]` must be valid.
    n: Number of leading items of `data` to sample from.
    k: Desired sample size. If `k` exceeds `n`, all `n` items are returned.

  Returns:
    A new list with `min(k, n)` sampled items.

  Raises:
    ValueError: If `k` is negative.
  """
  if k < 0:
    raise ValueError("k must be non-negative")

  # Never read past the n items the caller declared (k > n used to raise IndexError)
  size = min(k, n)

  # fill the reservoir array
  reservoir = [data[i] for i in range(size)]

  # replace elements w/gradually decreasing prob.
  for i in range(size, n):
    # generates a uniform integer between 0 and i (inclusive)
    j = randrange(i + 1)
    if j < size:
      reservoir[j] = data[i]

  return reservoir

### Load Datasets

In [6]:
# Read the line-delimited business JSON file into a DataFrame
df_business = pd.read_json(
    "yelp_academic_dataset_business.json",
    lines=True,
)
# Same data as a list of per-row dicts, the input format used for sampling
df_business_dict = df_business.to_dict("records")

In [7]:
# Report the size of the full business dataset, then display its first rows
print(f"yelp_academic_dataset_business dataset total rows: {df_business.shape[0]}")
df_business.head()

yelp_academic_dataset_business dataset total rows: 150346


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


### Reservoir Sampling

In [8]:
# Sample business dataset with a size of 15000
# Seed the generator so the sample (and the reviews/users derived from it)
# is identical on every Restart & Run All.
SEED = 42
random.seed(SEED)

n = len(df_business)
k = 15000
# Keep the sampled records under their own name so `df_business_sample`
# only ever refers to a DataFrame.
business_sample_records = reservoir_sample(df_business_dict, n, k)
df_business_sample = pd.DataFrame(business_sample_records)

In [9]:
# Display the first rows of the sampled businesses
df_business_sample.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,8Zq240AZghoF8zMENXjBEQ,Reno iPhone & iPod Repair,5000 Meadowood Mall Cir,Reno,NV,89502,39.472969,-119.783179,4.0,86,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Local Services, Mobile Phone Repair, Professio...","{'Monday': '10:0-21:0', 'Tuesday': '10:0-21:0'..."
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,J4M1_ZSSFGmasxOWpRKdnA,Bradburn's Parent Teacher Store,734 N New Ballas Rd,Creve Coeur,MO,63141,38.669124,-90.443246,3.5,5,0,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Art Supplies, Arts & Crafts, Shopping","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ..."
3,b1JoF8rYrmopH1xqKNSKJg,U-Haul Moving & Storage of Garden City,8151 W Chinden Blvd,Boise,ID,83714,43.650369,-116.283593,1.5,11,1,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Automotive, Truck Rental, Home Services, Mover...","{'Monday': '7:0-19:0', 'Tuesday': '7:0-19:0', ..."
4,5OyJRLrxjfZXNw1aH-I07g,Tampa Fine Arts Academy,2148 Ashley Oaks Cir,Wesley Chapel,FL,33544,28.188135,-82.362306,3.0,7,1,"{'BusinessParking': '{'garage': False, 'street...","Musical Instruments & Teachers, Shopping, Educ...","{'Monday': '9:0-21:0', 'Tuesday': '9:0-21:0', ..."


In [10]:
# Extract business_id from df_business_sample
# A set gives constant-time membership checks while scanning the review file
business_ids = set(df_business_sample["business_id"])

review_sample = []

# Stream the file one JSON record per line instead of loading it all at once.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open('yelp_academic_dataset_review.json', 'r', encoding='utf-8') as f:
    for line in f:
        review = json.loads(line)
        # Keep only reviews written about one of the sampled businesses
        if review['business_id'] in business_ids:
            review_sample.append(review)

df_review_sample = pd.DataFrame(review_sample)

In [11]:
# Display the first rows of the sampled reviews
df_review_sample.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
1,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,3.0,0,0,0,Had a party of 6 here for hibachi. Our waitres...,2016-07-25 07:31:06
2,qGQvUHmC02PAZW3H1WTIfw,RRTQpg8hutdimzAYuP_Hbw,eaJCpC6IhYphj7bwCDHTwQ,5.0,0,0,0,Stopped by after a Sunday morning walk in the ...,2015-08-30 13:41:47
3,QDY_xE91MY9O-nzn56yHxQ,enaHB1e956thdnafcHVAig,TyOe_EcbyAWMmPgg_ILwHQ,3.0,1,0,0,I've been to this location many times when I l...,2012-05-29 12:16:46
4,DWbmJF84jRrGaJRmlSSnYQ,aWlojpSpzEICTza3RgGJgg,SIoCIxjn4jLt2O-4DajWJw,4.0,0,0,0,My go-to spot on a Saturday night! I was real ...,2015-11-18 17:47:29


In [12]:
# Extract user_id from df_review_sample
# A set gives constant-time membership checks while scanning the user file
user_ids = set(df_review_sample["user_id"])

user_sample = []

# Stream the file one JSON record per line instead of loading it all at once.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open('yelp_academic_dataset_user.json', 'r', encoding='utf-8') as f:
    for line in f:
        user = json.loads(line)
        # Keep only users who wrote at least one of the sampled reviews
        if user['user_id'] in user_ids:
            user_sample.append(user)

df_user_sample = pd.DataFrame(user_sample)

In [13]:
# Display the first rows of the sampled users
df_user_sample.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0


In [14]:
# Row counts per sampled entity, followed by the combined total
n_business_rows = len(df_business_sample)
n_review_rows = len(df_review_sample)
n_user_rows = len(df_user_sample)

print(f'Sampled businessed total rows: {n_business_rows}')
print(f'Sampled reviews total rows: {n_review_rows}')
print(f'Sampled users total rows: {n_user_rows}')
print(f'Total rows: {n_business_rows + n_review_rows + n_user_rows}')

Sampled businessed total rows: 15000
Sampled reviews total rows: 692075
Sampled users total rows: 416820
Total rows: 1123895


In [15]:
# Write each sampled frame to its own line-delimited JSON file
for sample_frame, out_path in (
    (df_business_sample, "business_sample.json"),
    (df_review_sample, "review_sample.json"),
    (df_user_sample, "user_sample.json"),
):
    sample_frame.to_json(out_path, orient="records", lines=True)

print('Sampling datasets completed')

Sampling datasets completed


## Create Schema and Insert Table

After doing the sampling, we will push the data to Postgres for task analysis.

### `Businesses` Dataset

In [16]:
# Check `df_business_sample` data types
# (reference for the column types used in the CREATE TABLE statement below)
df_business_sample.dtypes

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object

In [17]:
# Connect to Yelp database
conn = psycopg2.connect("postgresql://localhost/yelp")

# Create a cursor object
cur = conn.cursor()

# Create `businesses` table (no-op if it already exists)
cur.execute("""
CREATE TABLE IF NOT EXISTS businesses (
    business_id TEXT PRIMARY KEY
    , name TEXT
    , address TEXT
    , city TEXT
    , state TEXT
    , postal_code TEXT
    , latitude FLOAT
    , longitude FLOAT
    , stars FLOAT
    , review_count INT
    , is_open INT
    , attributes JSONB
    , categories TEXT
    , hours JSONB
)
""")

# Build one parameter tuple per business, in the column order of the INSERT below
business_rows = []
for _, row in df_business_sample.iterrows():
    business_rows.append((
        row["business_id"],
        row["name"],
        row["address"],
        row["city"],
        row["state"],
        row["postal_code"],
        row["latitude"],
        row["longitude"],
        row["stars"],
        row["review_count"],
        row["is_open"],
        # JSONB columns are sent as JSON text; missing values become NULL
        json.dumps(row["attributes"]) if pd.notnull(row["attributes"]) else None,
        row["categories"],
        json.dumps(row["hours"]) if pd.notnull(row["hours"]) else None
    ))

# Insert data from a list; rows whose business_id already exists are skipped
cur.executemany("""
    INSERT INTO businesses (
        business_id, name, address, city, state, postal_code,
        latitude, longitude, stars, review_count, is_open,
        attributes, categories, hours
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (business_id) DO NOTHING
""", business_rows)

# Commit the transaction so the table and its rows are persisted by this cell
# itself, rather than only when a later cell commits on the same connection.
conn.commit()

In [18]:
# Fetch five rows from `businesses` over the open connection and print them
business_preview = pd.read_sql(sql="SELECT * FROM businesses LIMIT 5;", con=conn)
print(business_preview)

              business_id                                    name  \
0  8Zq240AZghoF8zMENXjBEQ               Reno iPhone & iPod Repair   
1  mpf3x-BjTdTEA3yCZrAYPw                           The UPS Store   
2  J4M1_ZSSFGmasxOWpRKdnA         Bradburn's Parent Teacher Store   
3  b1JoF8rYrmopH1xqKNSKJg  U-Haul Moving & Storage of Garden City   
4  5OyJRLrxjfZXNw1aH-I07g                 Tampa Fine Arts Academy   

                           address           city state postal_code  \
0          5000 Meadowood Mall Cir           Reno    NV       89502   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2              734 N New Ballas Rd    Creve Coeur    MO       63141   
3              8151 W Chinden Blvd          Boise    ID       83714   
4             2148 Ashley Oaks Cir  Wesley Chapel    FL       33544   

    latitude   longitude  stars  review_count  is_open  \
0  39.472969 -119.783179    4.0            86        1   
1  38.551126  -90.335695    3.0           

### `Reviews` Dataset

In [19]:
# Check `df_review_sample` data types
# (reference for the column types used in the CREATE TABLE statement below)
df_review_sample.dtypes

review_id       object
user_id         object
business_id     object
stars          float64
useful           int64
funny            int64
cool             int64
text            object
date            object
dtype: object

In [20]:
# This cell reuses the `conn` and `cur` objects created in the `businesses`
# cell, so no new connection is opened here.

# Define the `reviews` table (no-op if it already exists)
cur.execute("""
CREATE TABLE IF NOT EXISTS reviews (
    review_id TEXT PRIMARY KEY
    , user_id TEXT
    , business_id TEXT
    , stars FLOAT
    , useful INT
    , funny INT
    , cool INT
    , text TEXT
    , date TIMESTAMP
)
""")

# Build one parameter tuple per review, in the column order of the INSERT below.
# `date` is parsed with pandas, or sent as NULL when it is missing.
review_rows = [
    (
        rec["review_id"],
        rec["user_id"],
        rec["business_id"],
        rec["stars"],
        rec["useful"],
        rec["funny"],
        rec["cool"],
        rec["text"],
        None if pd.isnull(rec["date"]) else pd.to_datetime(rec["date"]),
    )
    for _, rec in df_review_sample.iterrows()
]

# Bulk insert; rows whose review_id already exists are skipped
cur.executemany("""
    INSERT INTO reviews (
        review_id, user_id, business_id, stars,
        useful, funny, cool, text, date
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (review_id) DO NOTHING
""", review_rows)

# Persist everything written on this connection so far.
# The cursor and connection are deliberately left open here.
conn.commit()

In [21]:
# Fetch five rows from `reviews` over the open connection and print them
review_preview = pd.read_sql(sql="SELECT * FROM reviews LIMIT 5;", con=conn)
print(review_preview)

                review_id                 user_id             business_id  \
0  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
1  pUycOfUwM8vqX7KjRRhUEA  59MxRhNVhU9MYndMkz0wtw  gebiRewfieSdtt17PTW6Zg   
2  qGQvUHmC02PAZW3H1WTIfw  RRTQpg8hutdimzAYuP_Hbw  eaJCpC6IhYphj7bwCDHTwQ   
3  QDY_xE91MY9O-nzn56yHxQ  enaHB1e956thdnafcHVAig  TyOe_EcbyAWMmPgg_ILwHQ   
4  DWbmJF84jRrGaJRmlSSnYQ  aWlojpSpzEICTza3RgGJgg  SIoCIxjn4jLt2O-4DajWJw   

   stars  useful  funny  cool  \
0    3.0       0      0     0   
1    3.0       0      0     0   
2    5.0       0      0     0   
3    3.0       1      0     0   
4    4.0       0      0     0   

                                                text                date  
0  Family diner. Had the buffet. Eclectic assortm... 2014-02-05 20:30:30  
1  Had a party of 6 here for hibachi. Our waitres... 2016-07-25 07:31:06  
2  Stopped by after a Sunday morning walk in the ... 2015-08-30 13:41:47  
3  I've been to this location many tim

### `Users` Dataset

In [22]:
# Check `df_user_sample` data types
# (reference for the column types used in the CREATE TABLE statement below)
df_user_sample.dtypes

user_id                object
name                   object
review_count            int64
yelping_since          object
useful                  int64
funny                   int64
cool                    int64
elite                  object
friends                object
fans                    int64
average_stars         float64
compliment_hot          int64
compliment_more         int64
compliment_profile      int64
compliment_cute         int64
compliment_list         int64
compliment_note         int64
compliment_plain        int64
compliment_cool         int64
compliment_funny        int64
compliment_writer       int64
compliment_photos       int64
dtype: object

In [23]:
# This cell reuses the `conn` and `cur` objects created in the `businesses`
# cell, so no new connection is opened here.

# Remove any existing `users` table so the definition below always applies.
# NOTE(review): presumably this clears a `users` table with a different schema
# left over from the psycopg2 example cell; confirm before relying on it.
cur.execute("DROP TABLE IF EXISTS users")

# Define the `users` table
cur.execute("""
CREATE TABLE IF NOT EXISTS users (
    user_id TEXT PRIMARY KEY
    , name TEXT
    , review_count INT
    , yelping_since TIMESTAMP
    , useful INT
    , funny INT
    , cool INT
    , elite TEXT
    , friends TEXT
    , fans INT
    , average_stars FLOAT
    , compliment_hot INT
    , compliment_more INT
    , compliment_profile INT
    , compliment_cute INT
    , compliment_list INT
    , compliment_note INT
    , compliment_plain INT
    , compliment_cool INT
    , compliment_funny INT
    , compliment_writer INT
    , compliment_photos INT
)
""")

# Build one parameter tuple per row of `df_user_sample`, in the column order of
# the INSERT below. `yelping_since` is parsed with pandas, or sent as NULL when
# it is missing.
user_rows = [
    (
        rec["user_id"],
        rec["name"],
        rec["review_count"],
        None if pd.isnull(rec["yelping_since"]) else pd.to_datetime(rec["yelping_since"]),
        rec["useful"],
        rec["funny"],
        rec["cool"],
        rec["elite"],
        rec["friends"],
        rec["fans"],
        rec["average_stars"],
        rec["compliment_hot"],
        rec["compliment_more"],
        rec["compliment_profile"],
        rec["compliment_cute"],
        rec["compliment_list"],
        rec["compliment_note"],
        rec["compliment_plain"],
        rec["compliment_cool"],
        rec["compliment_funny"],
        rec["compliment_writer"],
        rec["compliment_photos"],
    )
    for _, rec in df_user_sample.iterrows()
]

# Bulk insert; rows whose user_id already exists are skipped
cur.executemany("""
    INSERT INTO users (
        user_id, name, review_count, yelping_since,
        useful, funny, cool, elite, friends, fans,
        average_stars, compliment_hot, compliment_more,
        compliment_profile, compliment_cute, compliment_list,
        compliment_note, compliment_plain, compliment_cool,
        compliment_funny, compliment_writer, compliment_photos
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (user_id) DO NOTHING
""", user_rows)

# Persist the drop, create and insert performed above
conn.commit()

In [24]:
# Fetch five rows from `users` over the open connection and print them
user_preview = pd.read_sql(sql="SELECT * FROM users LIMIT 5;", con=conn)
print(user_preview)

                  user_id    name  review_count       yelping_since  useful  \
0  qVc8ODYU5SZjKXVBgXdI7w  Walker           585 2007-01-25 16:47:26    7217   
1  j14WgRoU_-2ZE1aw1dXrJg  Daniel          4333 2009-01-25 04:35:42   43091   
2  2WnXYQFK0hXEoTxPtV2zvg   Steph           665 2008-07-25 10:41:00    2086   
3  SZDeASXq7o05mMNLshsdIA    Gwen           224 2005-11-29 04:38:33     512   
4  hA5lMy-EnncsH4JoR-hFGQ   Karen            79 2007-01-05 19:40:59      29   

   funny   cool                                              elite  \
0   1259   5994                                               2007   
1  13066  27281  2009,2010,2011,2012,2013,2014,2015,2016,2017,2...   
2   1010   1003                           2009,2010,2011,2012,2013   
3    330    299                                     2009,2010,2011   
4     15      7                                                      

                                             friends  fans  ...  \
0  NSCy54eWehBJyZdG2iE84w, pe42u7DcCH

In [25]:
# Close the cursor and connection: the inserts above were already committed,
# so nothing is lost, and the database session is released.
cur.close()
conn.close()