In [1]:
import pandas as pd
import numpy as np
import csv

# Cleaning

In [2]:
# read in data
df_list = []
for chunk in pd.read_json('yelp_academic_dataset_business.json', chunksize=200, lines=True):
    df_list.append(chunk)
df = pd.concat(df_list)

In [3]:
# clean table for faster loading
df.dropna(subset=['categories','hours','state'], inplace=True)
df = df[df['categories'].str.contains("Restaurants")]
df = df[df['is_open']==1]
df = df[~df['state'].isin(['ON','AB','QC','MB','BC'])]
df = df[~df['postal_code'].str.contains(" ")]
df = df.drop(columns=['attributes'])
df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,hours
8,pQeaRpvuhoEqudo3uymHIQ,The Empanadas House,404 E Green St,Champaign,IL,61820,40.110446,-88.233073,4.5,5,1,"Ethnic Food, Food Trucks, Specialty Food, Impo...","{'Monday': '11:30-14:30', 'Tuesday': '11:30-14..."
33,vjTVxnsQEZ34XjYNS-XUpA,Wetzel's Pretzels,"4550 East Cactus Rd, #KSFC-4",Phoenix,AZ,85032,33.602822,-111.983533,4.0,10,1,"Food, Pretzels, Bakeries, Fast Food, Restaurants","{'Monday': '10:0-21:0', 'Tuesday': '10:0-21:0'..."
41,98hyK2QEUeI8v2y0AghfZA,Pho Lee's Vietnamese Restaurant,"1541 E 38th St, Ste 101",Cleveland,OH,44114,41.512155,-81.663332,4.5,23,1,"Restaurants, Vietnamese, Soup","{'Monday': '11:0-20:0', 'Wednesday': '11:0-20:..."
49,tLpkSwdtqqoXwU0JAGnApw,Wendy's,4602 Northfield Road,Cleveland,OH,44128,41.434614,-81.527026,3.5,7,1,"Restaurants, Fast Food, Burgers","{'Monday': '10:0-3:0', 'Tuesday': '10:0-3:0', ..."
54,lK-wuiq8b1TuU7bfbQZgsg,Hingetown,,Cleveland,OH,44113,41.489343,-81.711029,3.0,4,1,"Shopping Centers, Food, Coffee & Tea, Cafes, M...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."


In [5]:
# # export to csv: for python script
# df.to_csv('yelp_business.csv',index=False)

# Cassandra

In [5]:
# # import from csv: for python script
# df = pd.read_csv('yelp_business.csv')
# df.head()

In [4]:
# start cassandra cluster
from cassandra.cluster import Cluster
cluster = Cluster(['cassandra'])
session = cluster.connect()

In [12]:
# make keyspace
session.execute(
    "CREATE KEYSPACE IF NOT EXISTS ks "
    "WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 3 };"
)

session.set_keyspace('ks')

In [53]:
session.execute("""
DROP TABLE business;
""")

OperationTimedOut: errors={'10.37.0.58:9042': 'Client request timeout. See Session.execute[_async](timeout)'}, last_host=10.37.0.58:9042



In [43]:
# make table
session.execute("""
CREATE TABLE business(
   id text,
   name text,
   address text,
   city text,
   state text,
   stars text,
   review_count int,
   is_open int,
   categories text,
   hours text,
   PRIMARY KEY (id, name)
);
""")

<cassandra.cluster.ResultSet at 0x7fdca9eb2d90>

In [51]:
# insert dataframe into table
query = "INSERT INTO business (id, name, address, city, state, stars, review_count, is_open, categories, hours)  VALUES (?,?,?,?,?,?,?,?,?,?)"
prepared = session.prepare(query)

for index, item in df.iterrows():
    session.execute(prepared, (item[0], item[1], str(item[2]), item[3], item[4], item[8], item[9], item[10], item[11], item[12]))
    

KeyboardInterrupt: 

In [23]:
# make queries on state
query = """
SELECT id, name, address, city, state, stars, review_count, is_open, categories, hours
FROM business
WHERE state = %s and is_open = 1
ALLOW FILTERING;
"""

rows = session.execute(query, ('AZ', )) # input example AZ for state

# export query results to csv

header = ['id','name', 'address', 'city', 'state', 'stars', 'review_count', 'categories', 'hours']
with open('test.csv', 'w') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerow(header)
    for row in rows:
        writer.writerow([row.id, row.name, row.address, row.city, row.state, row.stars, row.review_count, row.categories, row.hours])

In [54]:
# drop table 
session.execute("""
DROP TABLE business;
""")

<cassandra.cluster.ResultSet at 0x7fdca99a3b90>

# Redis

In [52]:
# start redis session
import redis
r = redis.Redis(host='my-redis', port=6379, db=0,decode_responses=True)

In [31]:
# load query result from cassandra
cdf = pd.read_csv('test.csv')
#cdf

Unnamed: 0,id,name,address,city,state,stars,review_count,categories,hours
0,T4hXOBXNU5hq694fjpz1Ng,Mother's Tamales,330 S Gilbert Rd,Mesa,AZ,4.5,21,"Breakfast & Brunch, Restaurants, Mexican","{'Friday': '9:0-21:0', 'Monday': '9:0-21:0', '..."
1,rMCKv9puuu2NkYISHnwyWg,Rubio's,"4747 E Bell Rd, Ste 17",Phoenix,AZ,3.5,100,"Restaurants, Seafood, Event Planning & Service...","{'Friday': '10:30-21:30', 'Monday': '10:30-21:..."
2,64E4jIkHN20RVJoaM2fyGw,O'Kelley's,"2120 W Guadalupe Rd, Ste 17",Mesa,AZ,3.5,72,"Arts & Entertainment, Music Venues, Nightlife,...","{'Friday': '6:0-2:0', 'Monday': '6:0-2:0', 'Sa..."
3,b1m_PK-ggGR3CmBjdIyqkQ,Wrights at The Arizona Biltmore,2400 E Missouri Ave,Phoenix,AZ,4.0,139,"Restaurants, American (New), Breakfast & Brunch","{'Friday': '17:30-21:30', 'Monday': '17:30-21:..."
4,2Umo6re4oAAd_GkP5RFWwQ,Whataburger,14251 W Indian School Byp,Goodyear,AZ,2.5,57,"Restaurants, Fast Food, Burgers, American (Tra...","{'Friday': '0:0-0:0', 'Monday': '0:0-0:0', 'Sa..."
...,...,...,...,...,...,...,...,...,...
508,G_1M9_yX5ThAROn1zKKpQg,Panda Express,11555 W Van Buren St,Avondale,AZ,2.5,41,"Fast Food, Restaurants, Chinese","{'Friday': '10:0-21:30', 'Monday': '10:0-21:30..."
509,fl34c5BlCWmSL_BQk-cnaQ,Which Wich,"2795 S Market St, Ste 110",Gilbert,AZ,3.5,107,"Seafood, Sandwiches, Restaurants, Italian","{'Friday': '9:0-21:0', 'Monday': '9:0-21:0', '..."
510,zJGtD3y-pAIGNId4codEEg,Otro Cafe,6035 N 7th St,Phoenix,AZ,4.0,808,"Restaurants, Mexican, Breakfast & Brunch, Cafes","{'Friday': '8:0-22:0', 'Monday': '0:0-0:0', 'S..."
511,AcjO26DmOhqWlfo2s5DUow,Stackers Restaurant,"2855 W Cactus Rd, Ste 24",Phoenix,AZ,3.5,83,"American (Traditional), Nightlife, Pubs, Bars,...","{'Friday': '11:0-2:0', 'Monday': '11:0-1:0', '..."


In [53]:
# populate redis with query result
pipe = r.pipeline()
for i in range(len(cdf)):
    id_ = i
    pipe.hmset(id_, {'business_id': cdf['business_id'][i],
                     'name': cdf['name'][i],
                     'address': cdf['address'][i],
                     'city': cdf['city'][i], 
                     'state': cdf['state'][i],
                     'stars': float(cdf['stars'][i]),
                     'review_count': int(cdf['review_count'][i]),
                     'categories': cdf['categories'][i],
                     'hours': cdf['hours'][i]})
    pipe.zadd(cdf['city'][i], {id_:i})
pipe.execute()

  # This is added back by InteractiveShellApp.init_path()


[True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,
 True,
 0,

In [6]:
# for x in r.zrangebyscore('Phoenix', 0, len(cdf)):
#     pipe.hgetall(str(x))
#     #json.dump(pipe.hgetall(str(x.decode())),f)
#     #r.hgetall(str(x.decode()))
# pipe.execute()

In [69]:
# query on city; print out output to a JSON file
with open('test.json', 'w') as f:
    for x in r.zrangebyscore('Phoenix', 0, len(cdf)):
        pipe.hgetall(x)
    json.dump(pipe.execute(),f)

# Check Output

In [7]:
# check if output is formatted correctly
test_list = []
test = pd.read_json('test.json')
test

Unnamed: 0,name,state,categories,hours,city,stars,address,review_count
0,Rubio's,AZ,"Restaurants, Seafood, Event Planning & Service...","{'Friday': '10:30-21:30', 'Monday': '10:30-21:...",Phoenix,3.5,"4747 E Bell Rd, Ste 17",100
1,Wrights at The Arizona Biltmore,AZ,"Restaurants, American (New), Breakfast & Brunch","{'Friday': '17:30-21:30', 'Monday': '17:30-21:...",Phoenix,4.0,2400 E Missouri Ave,139
2,Famous Ray's Pizza,AZ,"Pizza, Restaurants","{'Friday': '10:0-22:30', 'Monday': '10:0-21:30...",Phoenix,3.0,2501 W Happy Valley Rd,94
3,Cafe Rio,AZ,"Food, Mexican, Restaurants","{'Friday': '10:30-23:0', 'Monday': '10:30-22:0...",Phoenix,3.0,12005 N Tatum Blvd,89
4,Crazy Jim's,AZ,"Mediterranean, Pizza, Greek, Breakfast & Brunc...","{'Friday': '6:30-14:30', 'Monday': '0:0-0:0', ...",Phoenix,3.5,"305 W Washington St, Ste 104",97
...,...,...,...,...,...,...,...,...
166,Hong Kong Asian Cuisine,AZ,"Japanese, Chinese, Asian Fusion, Restaurants","{'Friday': '11:0-22:0', 'Monday': '11:0-21:30'...",Phoenix,2.5,"3170 W Carefree Hwy, Ste 1",82
167,Chambers on First,AZ,"American (New), Bars, Restaurants, Pubs, Night...","{'Friday': '11:0-2:0', 'Monday': '11:0-2:0', '...",Phoenix,4.0,705 N 1st St,96
168,Otro Cafe,AZ,"Restaurants, Mexican, Breakfast & Brunch, Cafes","{'Friday': '8:0-22:0', 'Monday': '0:0-0:0', 'S...",Phoenix,4.0,6035 N 7th St,808
169,Stackers Restaurant,AZ,"American (Traditional), Nightlife, Pubs, Bars,...","{'Friday': '11:0-2:0', 'Monday': '11:0-1:0', '...",Phoenix,3.5,"2855 W Cactus Rd, Ste 24",83
