In [12]:
import pandas as pd
import json

In [194]:
# Load every Yelp business, then keep only open Tennessee restaurants.
business_df = pd.read_json('./Data/yelp_academic_dataset_business.json', lines=True)
keep_mask = (
    (business_df['state'] == 'TN')
    & (business_df['is_open'] == 1)
    # missing categories are treated as '' so they simply fail the match
    & business_df['categories'].fillna('').str.contains('Restaurants')
)
# .copy() yields an independent frame, so later column assignments do not
# raise SettingWithCopyWarning or act on a temporary slice.
business_df = business_df.loc[keep_mask].copy()

In [14]:
# Load all check-ins, then keep those whose business_id appears in business_df.
checkin_df = pd.read_json('./Data/yelp_academic_dataset_checkin.json', lines=True)
checkin_in_scope = checkin_df['business_id'].isin(business_df['business_id'])
checkin_df = checkin_df.loc[checkin_in_scope]

In [15]:
# Read the review file in 100000-row chunks and keep, from each chunk,
# only the rows whose business_id appears in business_df.
review_reader = pd.read_json('./Data/yelp_academic_dataset_review.json', lines=True, chunksize=100000)
wanted_business_ids = business_df['business_id']

useful_chunks = [
    piece.loc[piece['business_id'].isin(wanted_business_ids)]
    for piece in review_reader
]

# Stitch the filtered chunks back into a single frame.
reviews_df = pd.concat(useful_chunks)

In [16]:
# Load all tips, then keep those whose business_id appears in business_df.
tip_df = pd.read_json('./Data/yelp_academic_dataset_tip.json', lines=True)
tip_in_scope = tip_df['business_id'].isin(business_df['business_id'])
tip_df = tip_df.loc[tip_in_scope]

In [183]:
# display(business_df.shape)
# display(checkin_df.shape)
# display(reviews_df.shape)
# display(tip_df.shape)

In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [195]:
# Distinct category labels ordered by how many businesses carry them
# (value_counts sorts most frequent first). Expects 'categories' to still be
# the raw comma-separated string at this point.
all_restaurant_types = (
    business_df['categories']
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
    .index
)

# Labels excluded from the feature vocabulary.
types_to_remove = [
    'Restaurants', 'Event Planning & Services', 'Caterers', 'Music Venues',
    'Food Delivery Services', 'Venues & Event Spaces', 'Hotels & Travel',
    'Convenience Stores', 'International Grocery', 'Performing Arts',
    'Florists', 'Active Life', 'Food', 'Nightlife', 'Arcades',
    'Flowers & Gifts', 'Butcher', 'Jazz & Blues', 'Party & Event Planning',
    'Dance Clubs',
]

# Take the 128 most frequent labels and drop the excluded ones. Filtering
# (rather than list.remove) does not raise when an excluded label is absent
# from the top 128, and avoids leaving a loop variable named `type` bound in
# the kernel, which would shadow the builtin for every later cell.
excluded_types = set(types_to_remove)
valid_types = [
    category
    for category in all_restaurant_types[:128]
    if category not in excluded_types
]

In [196]:
# Rows without an attributes value cannot supply the features below.
# Rebinding to a fresh copy (instead of inplace=True) keeps the following
# column assignments from acting on a slice of another frame.
business_df = business_df.dropna(subset=['attributes']).copy()

# Every column below is written back with an explicit assignment.
# The chained form `business_df[col].fillna(..., inplace=True)` operates on a
# temporary Series: pandas deprecates it, and under Copy-on-Write it leaves
# the frame unchanged.
attributes_col = business_df['attributes']

# Outdoor seating: a missing key and the string 'None' both count as False.
business_df['OutdoorSeating'] = (
    attributes_col.apply(lambda attrs: attrs.get('OutdoorSeating', None))
    .fillna(False)
    .replace({'False': False, 'True': True, 'None': False})
)

# Alcohol: a missing key becomes 'none'; the quoted / u-prefixed spellings
# are collapsed onto the plain labels.
business_df['Alcohol'] = (
    attributes_col.apply(lambda attrs: attrs.get('Alcohol', None))
    .fillna('none')
    .replace({
        "u'none'": 'none',
        "u'full_bar'": 'full_bar',
        "u'beer_and_wine'": 'beer_and_wine',
        "'none'": 'none',
        "'full_bar'": 'full_bar',
        "'beer_and_wine'": 'beer_and_wine',
    })
)

# Price range: missing values default to 2. to_numeric(errors='coerce') turns
# non-numeric entries (e.g. the string 'None') into NaN so they receive the
# same default instead of making astype(int) raise.
price_range = attributes_col.apply(lambda attrs: attrs.get('RestaurantsPriceRange2', None))
business_df['RestaurantsPriceRange2'] = (
    pd.to_numeric(price_range, errors='coerce').fillna(2).astype(int)
)

# Hours: missing values get a generic all-days placeholder.
# NOTE(review): the placeholder is a *string* that looks like a dict, while
# the non-missing values are presumably real dicts from the JSON - confirm
# that downstream code handles both.
business_df['hours'] = business_df['hours'].fillna("{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}")

In [197]:
def encode_top_categories(row, valid_types):
    """Return a 0/1 indicator list marking which of ``valid_types`` the row has.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose ``'categories'``
            entry is an iterable of category names, a single comma-separated
            string of names, or ``None`` (treated as having no categories).
        valid_types: Ordered iterable of category names to encode.

    Returns:
        A list with one int per entry of ``valid_types``, in the same order:
        1 if that category is present on the row, otherwise 0.
    """
    raw_categories = row['categories']
    if raw_categories is None:
        row_categories = set()
    elif isinstance(raw_categories, str):
        # set('Pizza, Bars') would produce single characters, so a raw
        # comma-separated string is split into trimmed names first.
        row_categories = {
            name.strip() for name in raw_categories.split(',') if name.strip()
        }
    else:
        row_categories = set(raw_categories)
    return [1 if cat in row_categories else 0 for cat in valid_types]


# Turn each comma-separated category string into a list of
# whitespace-trimmed category names.
business_df['categories'] = (
    business_df['categories']
    .str.split(',')
    .apply(lambda parts: [str(part).strip() for part in parts])
)

In [198]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode each business's category list; the column vocabulary is
# fixed to valid_types via the `classes` argument.
mlb = MultiLabelBinarizer(classes=valid_types)
encoded_array = mlb.fit_transform(business_df['categories'])

# Wrap the indicator matrix in a frame that shares business_df's index so the
# rows line up when the two are placed side by side.
encoded_df = pd.DataFrame(
    data=encoded_array,
    index=business_df.index,
    columns=mlb.classes_,
)

# Append the indicator columns to the right of the existing columns.
business_df = pd.concat([business_df, encoded_df], axis=1)



In [199]:
# One-hot encode the alcohol, outdoor-seating and price-range columns with
# pandas get_dummies, emitting integer 0/1 indicator columns.
one_hot_columns = ['Alcohol', 'OutdoorSeating', 'RestaurantsPriceRange2']
business_df = pd.get_dummies(business_df, columns=one_hot_columns, dtype=int)


In [201]:
#scale the review data
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the star ratings and store the rescaled values in a
# new column; `scaler` stays available for reuse.
stars_frame = business_df[['stars']]
scaler = MinMaxScaler()
scaler.fit(stars_frame)
business_df['stars_scaled'] = scaler.transform(stars_frame)