In [1]:
import os
import sys
# Put the parent directory (relative to the kernel's working directory) on the
# module search path -- presumably so the project's `credible` package,
# imported in the next cell, can be resolved; confirm the repository layout.
sys.path.extend([os.pardir])

### Imports

In [2]:
import json
import pathlib
import pandas as pd
from sqlalchemy.orm import sessionmaker
from credible import connectors
from credible.etl import Etl
from credible.objects import Base, Photo, Tip, Checkin, Business, User, Review

### Parameters

In [3]:
# Database handle obtained from the project's SQLite connector. It is reused
# below to bind the session factory, to build the Etl helper and as the
# connection for the pandas read-backs.
# NOTE(review): presumably a SQLAlchemy engine -- confirm in credible.connectors.
engine = connectors.connect_to_sqlite()

In [4]:
# Collect every `*_x.json` file from the sibling `data` folder and order them
# from largest to smallest on disk.
data_folderpath = os.path.join(os.pardir, 'data')
sorted_list = list(pathlib.Path(data_folderpath).glob('*_x.json'))
sorted_list.sort(key=os.path.getsize, reverse=True)

# Files are told apart purely by their size rank; exactly six matches are
# required, otherwise the unpacking raises ValueError.
# NOTE(review): this assumes review > user > checkin > tip > business > photo
# by file size -- verify whenever the data files are regenerated.
(
    file_review,
    file_user,
    file_checkin,
    file_tip,
    file_business,
    file_photo,
) = sorted_list

### Session

In [5]:
# Session factory bound to the engine created in the Parameters section.
Session = sessionmaker(bind=engine)
# Project ETL helper; it receives Base, the session factory and the engine, and
# is used below for table creation and bulk inserts.
# NOTE(review): Base is presumably the SQLAlchemy declarative base shared by
# the model classes -- confirm in credible.objects.
etl = Etl(Base, Session, engine)

In [6]:
# Create the database tables before any data is imported.
# NOTE(review): drop_all=True presumably drops existing tables first, which
# would wipe previously imported data each time this cell is re-run --
# confirm against Etl.create_all.
etl.create_all(drop_all=True)

### Import data to Photos

In [7]:
# Load the photo records and bulk-insert them as they are (no transformation).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (e.g. cp1252 on Windows), which could fail on or garble
# non-ASCII text.
with open(file_photo, 'r', encoding='utf-8') as json_data:
    data = json.load(json_data)
etl.bulk_insert(Photo, data)

In [8]:
# Spot-check the import: read the whole `photos` table back through pandas and
# display one randomly chosen row (unseeded, so the row differs between runs).
pd.read_sql_table('photos', con=engine).sample(n=1)

Unnamed: 0,_id,photo_id,business_id,caption,label
163975,163976,FGnY4lZagiTpf4PisVlPTQ,vx4YAA02Qz6khRD1fZ1MFA,Lobster,food


### Import data to Tips

In [9]:
# Load the tip records. The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's locale default, which could fail on or garble
# non-ASCII tip text.
with open(file_tip, 'r', encoding='utf-8') as json_data:
    data = json.load(json_data)

# requires data transformation: each raw record is passed to the Tip
# constructor before insertion (what the constructor changes is defined in
# credible.objects, not here).
tips_list = [Tip(d) for d in data]

etl.bulk_insert(Tip, tips_list)

In [10]:
# Spot-check the import: read the whole `tips` table back through pandas and
# display one randomly chosen row (unseeded, so the row differs between runs).
pd.read_sql_table('tips', con=engine).sample(n=1)

Unnamed: 0,_id,user_id,business_id,text,date,compliment_count
2383342,2383343,M63hGNC6_nYdILexBPZziQ,PDIh1lVtKlUqPJROXYJQvQ,It was a fairly decent,2016-01-25 18:12:50,0


### Import data to Checkins

In [11]:
# Load the check-in records and bulk-insert them as they are (no
# transformation). The encoding is pinned to UTF-8 so decoding does not depend
# on the platform's locale default.
with open(file_checkin, 'r', encoding='utf-8') as json_data:
    data = json.load(json_data)
etl.bulk_insert(Checkin, data)

In [12]:
# Spot-check the import: read the whole `checkins` table back through pandas
# and display one randomly chosen row (unseeded, so it differs between runs).
pd.read_sql_table('checkins', con=engine).sample(n=1)

Unnamed: 0,_id,business_id,date
27870,27871,A2STUTJ3OBkXe4kOYka3tw,2016-12-08 01:10:59


### Import data to Businesses

In [13]:
# Load the business records. The encoding is pinned to UTF-8 so decoding does
# not depend on the platform's locale default, which could fail on or garble
# non-ASCII names and addresses.
with open(file_business, 'r', encoding='utf-8') as json_data:
    data = json.load(json_data)

# requires data transformation: each raw record is passed to the Business
# constructor before insertion.
# NOTE(review): judging by the sample output, nested fields such as
# `attributes` and `hours` end up stored as JSON text -- presumably done by the
# constructor; confirm in credible.objects.
business_list = [Business(d) for d in data]

etl.bulk_insert(Business, business_list)

In [14]:
# Spot-check the import: read the whole `businesses` table back through pandas
# and display one randomly chosen row (unseeded, so it differs between runs).
pd.read_sql_table('businesses', con=engine).sample(n=1)

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
131233,131234,NzJnxsdKEUHUK7P14OOT1w,The UPS Store,"8640 University City Blvd, Ste A-3",Charlotte,NC,28213,35.296695,-80.737813,3.5,8,1,"{""BusinessAcceptsCreditCards"": ""True""}","Mailbox Centers, Printing Services, Local Serv...","{""Monday"": ""0:0-0:0"", ""Tuesday"": ""8:30-19:0"", ..."


### Import data to Users

In [15]:
# Load the user records. The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's locale default, which could fail on or garble
# non-ASCII user names.
with open(file_user, 'r', encoding='utf-8') as json_data:
    data = json.load(json_data)

# requires data transformation: each raw record is passed to the User
# constructor before insertion (what the constructor changes is defined in
# credible.objects, not here).
users_list = [User(d) for d in data]

etl.bulk_insert(User, users_list)

In [16]:
# Spot-check the import: read the whole `users` table back through pandas and
# display one randomly chosen row (unseeded, so the row differs between runs).
pd.read_sql_table('users', con=engine).sample(n=1)

Unnamed: 0,_id,user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
1190019,1190020,jqI9Aaggpaii1F5UoCv7Hg,Herman,4,2016-07-01 00:09:50,"amn58JuFTPWrqDkd30sjiQ, i92IM7JvW4RO7tLjB-HL2w...",0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Import data to Reviews

In [17]:
# Load the review records and bulk-insert them as they are (no
# transformation). The encoding is pinned to UTF-8 so decoding does not depend
# on the platform's locale default, which could fail on or garble non-ASCII
# review text.
# NOTE(review): json.load reads the entire file into memory, and this is the
# largest of the input files by the size ordering used above.
with open(file_review, 'r', encoding='utf-8') as json_data:
    data = json.load(json_data)
etl.bulk_insert(Review, data)

In [18]:
# Spot-check the import: read the whole `reviews` table back through pandas
# and display one randomly chosen row (unseeded, so it differs between runs).
pd.read_sql_table('reviews', con=engine).sample(n=1)

Unnamed: 0,_id,review_id,business_id,user_id,stars,date,text,useful,funny,cool
1398911,1398912,SBQ9bEP3Qy5MdJrX4EpwtQ,NvKNe9DnQavC9GstglcBJQ,HxEAx05UPw_HWng1yhRCew,4,2015-04-03 17:19:57,Came in here because all the buffets were too ...,0,0,0
