In [0]:
import collections
import os
import random
import string
import sys
import warnings

import pandas as pd
import requests

# Suppress every warning for the rest of the session; note that this also hides
# deprecation notices emitted by pandas and the other imported libraries.
warnings.filterwarnings('ignore')

In [0]:
# Output directory, relative to the working directory, that the CSV helpers
# (write_to_csv / read_csv) prefix onto the file names they are given.
DATA_FOLDER  = "data/"

In [0]:
def get_yelp_api_key():
    """Read the Yelp API key from the first line of ``key.txt``.

    Returns:
        The key with surrounding whitespace removed, or ``None`` (after
        printing an error message) when ``key.txt`` cannot be opened or read.
    """
    try:
        # The context manager closes the file even if the read itself fails.
        with open("key.txt", "r") as key_file:
            # readline() keeps the trailing newline; sent inside the
            # Authorization header it would produce an invalid header value.
            return key_file.readline().strip()
    except OSError:
        print('[ERROR]:Please provide your Yelp API key in key.txt')
        return None

In [0]:
def write_to_csv(df, file, index=True):
    """Write ``df`` as a CSV file inside ``DATA_FOLDER``.

    Args:
        df: DataFrame to persist.
        file: File name (not a full path); it is placed under ``DATA_FOLDER``.
        index: Forwarded to ``DataFrame.to_csv``; pass ``False`` to omit the
            row index. Defaults to ``True``, matching the previous behaviour
            of the two-argument form.
    """
    # Create the output folder on demand; an already existing folder is fine.
    os.makedirs(DATA_FOLDER, exist_ok=True)

    file_path = os.path.join(DATA_FOLDER, file)
    df.to_csv(file_path, index=index)
    print("### Done writing to file : ", file)


In [0]:
def read_csv(file, encoding = ''):
    """Load a CSV stored under ``DATA_FOLDER`` into a DataFrame.

    Args:
        file: File name relative to ``DATA_FOLDER``.
        encoding: Text encoding handed to ``pd.read_csv``; when empty (the
            default) no encoding argument is passed at all.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # Only forward the encoding when the caller actually supplied one.
    reader_options = {"encoding": encoding} if len(encoding) else {}
    return pd.read_csv(DATA_FOLDER + file, **reader_options)

In [0]:
def add_yelp_attrib(productID, fullResponse = False, attrib = "alias", timeout = 10):
    """Query the Yelp business-details endpoint for one business.

    Args:
        productID: Yelp business id; it is appended to the endpoint URL.
        fullResponse: When true, return the whole decoded JSON payload.
        attrib: Key to extract from the payload when ``fullResponse`` is false.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        The decoded JSON payload, or ``payload[attrib]``.

    Raises:
        KeyError: if ``attrib`` is absent from the payload and
            ``fullResponse`` is false.
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    data_url = 'https://api.yelp.com/v3/businesses/' + str(productID)
    headers = {
        'Authorization': 'Bearer %s' % get_yelp_api_key(),
    }
    response = requests.get(data_url, headers=headers, timeout=timeout).json()

    if fullResponse:
        return response
    return response[attrib]


In [0]:
def get_chicago_data(restaurants = True):
    """Load the space-separated Chicago review metadata file.

    Args:
        restaurants: Read the restaurant file when true, the hotel file
            otherwise. Both are looked up in the working directory.

    Returns:
        DataFrame with the columns "Date", "review ID", "reviewer ID",
        "product ID", "label" and "Star Rating"; the three placeholder
        columns ("xx", "yy", "zz") are dropped.
    """
    source_file = (
        'output_meta_yelpResData_NRYRcleaned.txt'
        if restaurants
        else 'output_meta_yelpHotelData_NRYRcleaned.txt'
    )
    all_columns = ["Date", "review ID", "reviewer ID", "product ID", "label", "xx", "yy", "zz", "Star Rating"]
    kept_columns = [name for name in all_columns if name not in ("xx", "yy", "zz")]

    # The raw file has no header row, so the names are assigned after loading.
    frame = pd.read_csv(source_file, sep=" ", header = None)
    frame.columns = all_columns
    return frame[kept_columns]

In [0]:
def _lookup(record, default, *path):
    """Follow ``path`` through nested containers; return ``default`` if any step is missing."""
    current = record
    for key in path:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            return default
    return current


def get_data_attrib(item_set):
    """Fetch Yelp metadata for every business id in ``item_set``.

    Each id is looked up through ``add_yelp_attrib(..., fullResponse=True)``.
    Fields missing from the response are replaced by sentinels: ``"Oops!"``
    for zip code, alias and name, ``-1`` for latitude, longitude, rating and
    review count. Latitude and longitude are taken together: if either one is
    missing, both become ``-1``.

    Args:
        item_set: Iterable of Yelp business ids.

    Returns:
        List of tuples ``(id, name, alias, latitude, longitude, zip_code,
        rating, review_count)``, one per id, in iteration order.
    """
    result = []
    for hotel_id in item_set:
        attribs = add_yelp_attrib(hotel_id, fullResponse = True)
        if not attribs:
            print("couldn't get data for id : ", hotel_id)

        zip_code = _lookup(attribs, "Oops!", 'location', 'zip_code')
        alias = _lookup(attribs, "Oops!", 'alias')
        name = _lookup(attribs, "Oops!", 'name')

        coord = _lookup(attribs, None, 'coordinates')
        try:
            latitude, longitude = coord['latitude'], coord['longitude']
        except (KeyError, IndexError, TypeError):
            latitude = longitude = -1

        rating = _lookup(attribs, -1, 'rating')
        # Previously this read from the undefined name `attrib`; the resulting
        # NameError was swallowed, so review_count was always -1.
        review_count = _lookup(attribs, -1, 'review_count')

        result.append((hotel_id, name, alias, latitude, longitude, zip_code, rating, review_count))

    return result

In [0]:
# Restaurants: look up Yelp metadata once per distinct business id found in
# the review file, then persist the result as a CSV under DATA_FOLDER.
data = get_chicago_data(restaurants = True)
items = data.loc[:, 'product ID']
item_set = {*items.tolist()}
result = get_data_attrib(item_set)

df = pd.DataFrame(result)
df.columns = [
    'rest_id', 'name', 'alias',
    'latitude', 'longitude', 'zip_code',
    'rating', 'review_count',
]
write_to_csv(df, 'chicago_restaurants.csv')

Total items 129
result 129
df len 129

### Done writing to file :  chicago_restaurants.csv


In [0]:
# Hotels: same pipeline as for restaurants. The id column keeps the name
# 'rest_id' so both metadata files share one schema.
data = get_chicago_data(False)
items = data.loc[:, 'product ID']
item_set = {*items.tolist()}
result = get_data_attrib(item_set)

df = pd.DataFrame(result)
df.columns = [
    'rest_id', 'name', 'alias',
    'latitude', 'longitude', 'zip_code',
    'rating', 'review_count',
]
write_to_csv(df, 'chicago_hotels.csv')

Total items 72

### Done writing to file :  chicago_hotels.csv


In [0]:
# Chicago file names.
# *_data     : review metadata files (get_chicago_data hard-codes these same names)
# *_review   : raw review text files
# *_metadata : Yelp business metadata CSVs written earlier in this notebook
# NOTE: chi_hotel_data is rebound to a DataFrame by the hotel flattening cell below.
chi_hotel_data = 'output_meta_yelpHotelData_NRYRcleaned.txt'
chi_hotel_review = 'output_review_yelpHotelData_NRYRcleaned.txt'
chi_hotel_metadata = 'chicago_hotels.csv'

chi_restaurants_data = 'output_meta_yelpResData_NRYRcleaned.txt'
chi_restaurants_review = 'output_review_yelpResData_NRYRcleaned.txt'
chi_restaurants_metadata = 'chicago_restaurants.csv'

In [0]:
# Build one flat table for hotels: review metadata + review text + Yelp
# business metadata.
chi_hotel_data = get_chicago_data(restaurants=False)

# Read the review text and release the file handle right away.
# Assumes the file holds one review per line, in the same order as the
# metadata rows — TODO confirm.
with open(chi_hotel_review) as review_file:
    chi_hotel_reviews = pd.DataFrame(review_file.read().splitlines())

# Attach the text column positionally and give it a readable name.
chi_hotel_data = pd.concat([chi_hotel_data, chi_hotel_reviews], axis = 1)
chi_hotel_data = chi_hotel_data.rename(columns = {0: "review"})

# The metadata CSV was written into DATA_FOLDER by write_to_csv, so it has to
# be loaded from there (read_csv prefixes DATA_FOLDER), not from the working
# directory.
chi_hotel_df = read_csv(chi_hotel_metadata)

# Left join keeps every review even when no business metadata was fetched.
chicago_flat_hotel = pd.merge(chi_hotel_data, chi_hotel_df, how='left', left_on='product ID', right_on='rest_id')

write_to_csv(chicago_flat_hotel, 'chicago_flat_hotels.csv', False)

In [0]:
# Build one flat table for restaurants: review metadata + review text + Yelp
# business metadata.
chi_rest_data = get_chicago_data(restaurants=True)

# Read the review text and release the file handle right away.
# Assumes the file holds one review per line, in the same order as the
# metadata rows — TODO confirm.
with open(chi_restaurants_review) as review_file:
    chi_rest_reviews = pd.DataFrame(review_file.read().splitlines())

# Attach the text column positionally and give it a readable name.
chi_rest_data = pd.concat([chi_rest_data, chi_rest_reviews], axis = 1)
chi_rest_data = chi_rest_data.rename(columns = {0: "review"})

# The metadata CSV was written into DATA_FOLDER by write_to_csv, so it has to
# be loaded from there (read_csv prefixes DATA_FOLDER), not from the working
# directory.
chi_rest_df = read_csv(chi_restaurants_metadata)

# Left join keeps every review even when no business metadata was fetched.
chicago_flat_rest = pd.merge(chi_rest_data, chi_rest_df, how='left', left_on='product ID', right_on='rest_id')

write_to_csv(chicago_flat_rest, 'chicago_flat_restaurants.csv', False)

In [0]:
# Review-based features, computed on a renamed copy of the flat restaurant
# table (rename returns a new frame, so chicago_flat_rest is left untouched).
data = chicago_flat_rest.rename(columns={"reviewer ID": "user_id", "Date": "date", "product ID": "prod_id"})
data['review_len'] = data['review'].str.split().str.len()   # whitespace-separated tokens
data['review_char'] = data['review'].str.len()              # characters


def punct_counter(l1, l2):
    """Count the items of ``l1`` that also occur in ``l2``."""
    return sum(1 for x in l1 if x in l2)


# `string` comes from the notebook's import cell.
data['punct_count'] = data['review'].apply(lambda s: punct_counter(s, string.punctuation))

data['uppercase_count'] = data['review'].str.count(r'[A-Z]')


In [0]:
# Reviewer-based aggregates, broadcast back onto every review row.
# NOTE(review): 'rating' is the business-level Yelp rating merged in from the
# metadata CSV, not the per-review 'Star Rating' column — confirm that this is
# the intended input for the per-user average.
data['review_per_user'] = data.groupby('user_id')['user_id'].transform('size')
data['avg_rating_per_user'] = (
    data.groupby('user_id')['rating'].transform('sum').div(data['review_per_user'])
)
data['avg_words_per_user'] = (
    data.groupby('user_id')['review_len'].transform('sum').div(data['review_per_user'])
)
data['num_review_per_user_each_day'] = (
    data.groupby(['user_id', 'date'])['user_id'].transform('size')
)


In [0]:
# Product-based aggregates, broadcast back onto every review row.
# NOTE(review): 'rating' is the business-level Yelp rating merged in from the
# metadata CSV, not the per-review 'Star Rating' column — confirm that this is
# the intended input for the per-product average.
data['review_per_prod'] = data.groupby('prod_id')['prod_id'].transform('size')
data['avg_rating_per_prod'] = (
    data.groupby('prod_id')['rating'].transform('sum').div(data['review_per_prod'])
)
data['avg_words_per_prod'] = (
    data.groupby('prod_id')['review_len'].transform('sum').div(data['review_per_prod'])
)
data['num_review_per_prod_each_day'] = (
    data.groupby(['prod_id', 'date'])['prod_id'].transform('size')
)

In [0]:
# Persist the restaurant reviews together with the engineered feature columns.
# NOTE(review): the positional False is presumably meant to turn off writing
# the DataFrame index — confirm that write_to_csv accepts a third argument.
write_to_csv(data, 'chicago_features_restaurants.csv', False)