In [10]:
import json
import pandas as pd

from tqdm import tqdm
from datetime import datetime, timedelta

In [11]:
def get_ratings_data(min_user_reviews=None):
    """Load ``data/review.json`` (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    min_user_reviews : int, optional
        When given, keep only the rows whose ``user_id`` occurs in at least
        this many rows of the file. The default (``None``) applies no filter
        and returns every review, which is what this function returned
        before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``business_id``, ``rating`` (taken from each
        record's ``stars`` key) and ``date``; one row per line of the file.
        When filtering is requested the surviving rows keep their original
        index labels.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a line is not valid JSON.
    KeyError
        If a record lacks one of the four keys read here.
    """
    path = "data/review.json"

    # Count lines by streaming inside a context manager: the previous
    # ``len(open(path).readlines())`` never closed the handle and pulled the
    # whole file into memory only to obtain the progress-bar total.
    # JSON text is UTF-8, so the encoding is stated explicitly instead of
    # relying on the platform default.
    with open(path, encoding="utf-8") as f:
        line_count = sum(1 for _ in f)

    user_ids, business_ids, stars, dates = [], [], [], []
    with open(path, encoding="utf-8") as f:
        for line in tqdm(f, total=line_count):
            blob = json.loads(line)
            user_ids.append(blob["user_id"])
            business_ids.append(blob["business_id"])
            stars.append(blob["stars"])
            dates.append(blob["date"])

    ratings = pd.DataFrame({"user_id": user_ids,
                            "business_id": business_ids,
                            "rating": stars,
                            "date": dates})

    # The per-user counts used to be computed unconditionally and then
    # discarded; they are now only computed when a filter is requested.
    if min_user_reviews is not None:
        user_counts = ratings["user_id"].value_counts()
        active_users = user_counts[user_counts >= min_user_reviews].index
        ratings = ratings[ratings["user_id"].isin(active_users)]

    return ratings

In [12]:
def get_business_data():
    """Load ``data/business.json`` (one JSON object per line) into a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``business_id``, ``stars``, ``review_count``, ``categories``
        and ``hours``; one row per line of the file. ``categories`` holds the
        record's comma-separated category string split on ``', '`` into a
        list, or the literal string ``'None'`` when the record's value is
        ``None``. ``hours`` is stored exactly as it appears in the record.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a line is not valid JSON.
    KeyError
        If a record lacks one of the keys read here.
    """
    path = "data/business.json"

    # Stream the line count inside a context manager so the handle is closed
    # and the file is not held in memory just to size the progress bar.
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        line_count = sum(1 for _ in f)

    # NOTE: ``city`` and ``state`` used to be accumulated here as well but
    # were never placed in the returned frame, so they are no longer read.
    business_ids, stars, review_count, categories, hours = [], [], [], [], []

    with open(path, encoding="utf-8") as f:
        for line in tqdm(f, total=line_count):
            blob = json.loads(line)
            business_ids.append(blob["business_id"])
            stars.append(blob["stars"])
            review_count.append(blob["review_count"])

            raw_categories = blob["categories"]
            if raw_categories is None:
                # Deliberately the *string* 'None' (not a list): this is the
                # sentinel the function has always produced, so it is kept
                # for callers that may compare against it.
                categories.append('None')
            else:
                categories.append(raw_categories.split(', '))

            hours.append(blob["hours"])

    business = pd.DataFrame({"business_id": business_ids,
                             "stars": stars,
                             "review_count": review_count,
                             "categories": categories,
                             "hours": hours})

    return business

In [13]:
def get_checkin_data():
    """Load ``data/checkin.json`` (one JSON object per line) into a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``business_id`` and ``date``, copied unchanged from each
        record; one row per line of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a line is not valid JSON.
    KeyError
        If a record lacks ``business_id`` or ``date``.
    """
    path = "data/checkin.json"

    # Stream the line count inside a context manager so the handle is closed
    # and the file is not held in memory just to size the progress bar.
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        line_count = sum(1 for _ in f)

    business_ids, dates = [], []
    with open(path, encoding="utf-8") as f:
        for line in tqdm(f, total=line_count):
            blob = json.loads(line)
            business_ids.append(blob["business_id"])
            dates.append(blob["date"])

    checkin = pd.DataFrame({"business_id": business_ids,
                            "date": dates})

    return checkin

In [14]:
def get_user_data(min_review_count=5):
    """Load ``data/user.json`` (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    min_review_count : int, optional
        Keep only users whose ``review_count`` value is at least this number.
        Defaults to 5, the threshold that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per retained user. Columns, in order: ``user_id``,
        ``review_count``, ``yelping_since``, ``friends``, ``fans``,
        ``elite``, ``average_stars`` and the eleven ``compliment_*`` keys
        listed in ``fields`` below. Values are copied unchanged from the
        records. Rows keep the index labels they had before filtering (the
        index is not reset).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a line is not valid JSON.
    KeyError
        If a record lacks any of the keys in ``fields``.
    """
    path = "data/user.json"

    # Every output column has the same name as the record key it is read
    # from, so one ordered tuple replaces eighteen parallel lists. The order
    # here is the column order of the returned frame.
    fields = (
        "user_id", "review_count", "yelping_since", "friends", "fans",
        "elite", "average_stars",
        "compliment_hot", "compliment_more", "compliment_profile",
        "compliment_cute", "compliment_list", "compliment_note",
        "compliment_plain", "compliment_cool", "compliment_funny",
        "compliment_writer", "compliment_photos",
    )

    # Stream the line count inside a context manager so the handle is closed
    # and the file is not held in memory just to size the progress bar.
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        line_count = sum(1 for _ in f)

    values_by_field = {field: [] for field in fields}
    with open(path, encoding="utf-8") as f:
        for line in tqdm(f, total=line_count):
            blob = json.loads(line)
            for field in fields:
                values_by_field[field].append(blob[field])

    user = pd.DataFrame(values_by_field)

    # Subset to active users: at least ``min_review_count`` reviews.
    user = user[user["review_count"] >= min_review_count]

    return user

In [15]:
def get_tips_data():
    """Load ``data/tip.json`` (one JSON object per line) into a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``compliment_count``, copied unchanged from
        each record; one row per line of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a line is not valid JSON.
    KeyError
        If a record lacks ``user_id`` or ``compliment_count``.
    """
    path = "data/tip.json"

    # Stream the line count inside a context manager so the handle is closed
    # and the file is not held in memory just to size the progress bar.
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        line_count = sum(1 for _ in f)

    user_ids, compliment_counts = [], []
    with open(path, encoding="utf-8") as f:
        for line in tqdm(f, total=line_count):
            blob = json.loads(line)
            user_ids.append(blob["user_id"])
            compliment_counts.append(blob["compliment_count"])

    tips = pd.DataFrame({"user_id": user_ids,
                         "compliment_count": compliment_counts})

    return tips