# Yelp Dataset Exploratory Data Analysis

## Business Dataset
Looking at the business dataset, I begin with the following questions:
- [How many restaurants are there in Las Vegas?](#numRestaurants)
- [Opening/Closing times](#openClose)
- [Basic stats about the stars](#starData)
 - STD
 - Mean
 - Mode
 - Median
- [Attributes](#attributes)
- [Closed Businesses](#closed)
- Is there a correlation between stars and price-range


In [None]:
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
import json
import re
from scipy import stats

In [None]:
def make_pie(labels, count, explode_label='',t=''):
    """Draw and show a pie chart.

    Args:
        labels: Slice labels; any iterable (list, tuple, dict keys view, ...).
        count: Slice sizes, one per label and in the same order.
        explode_label: Label of the slice to offset from the centre; the
            default '' leaves every slice in place.
        t: Chart title.

    Raises:
        ValueError: If explode_label is non-empty and is not one of labels.
    """
    # Materialise the inputs so that views and other non-list iterables
    # support len() and index() below.
    labels = list(labels)
    count = list(count)

    explode = np.zeros(len(labels))
    if explode_label != '':
        explode[labels.index(explode_label)] = 0.1

    # Plot
    plt.pie(count, explode=explode, labels=labels,
            autopct='%1.1f%%', shadow=True, startangle=140)
    plt.title(t)
    plt.axis('equal')
    plt.show()
    
    
def dict_to_pies(d_true, d_false):
    """Draw one True/False pie chart per count found in both dictionaries.

    Iterates over d_true. A key that is missing from d_false is skipped,
    because a chart needs both the true and the false count. A value that is
    itself a dictionary is handled recursively; any other value is charted
    with make_pie, using the key as the title.
    """
    for name, true_value in d_true.items():
        if name not in d_false:
            continue

        false_value = d_false[name]
        if isinstance(true_value, dict):
            dict_to_pies(true_value, false_value)
            continue

        make_pie(['True', 'False'], [true_value, false_value], t=name)
        
    
def make_bar(labels, count, t="", yl="", xl=""):
    """Draw and show a bar chart with one bar per label.

    Args:
        labels: Tick labels, one per bar.
        count: Bar heights, in the same order as labels.
        t: Chart title.
        yl: Y-axis label.
        xl: X-axis label.
    """
    # One integer position per bar; reused to place the tick labels.
    positions = np.arange(len(labels))

    plt.bar(positions, count, align='center', alpha=0.5)
    plt.xticks(positions, labels)

    plt.title(t)
    plt.xlabel(xl)
    plt.ylabel(yl)
    plt.show()

<a id="numRestaurants"></a>
## Number of Restaurants in Las Vegas

In [None]:
# Load the Las Vegas business file and keep only the restaurant rows.
import os

# Build the path from its components so that it resolves on any operating
# system; a literal with "\\" separators is only valid on Windows.
business_file = os.path.join('yelp_dataset', 'csv', 'cities', 'business_las_vegas.csv')
business_data = pd.read_csv(business_file)

# Drop the rows that have no value in the 'categories' column.
business_data_clean_categories = business_data.dropna(subset=['categories'])

# Keep the rows whose 'categories' text contains "Restaurants".
is_restaurant = business_data_clean_categories['categories'].str.contains('Restaurants')
restaurant_df = business_data_clean_categories[is_restaurant]

nRestaurants = restaurant_df.shape[0]
print("There are %d restaurants in the file." % nRestaurants)

<a id="openClose"></a>
## Opening and Closing Times

In [None]:
# Count, for every weekday, how many restaurants open and close at each hour,
# then draw one opening-time and one closing-time bar chart per weekday.
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

hours = restaurant_df['hours']
hours = hours.dropna()
hours_dict_open = {day: {} for day in weekdays}
hours_dict_close = {day: {} for day in weekdays}

for row in hours:
    # Swap single quotes for double quotes so the cell text parses as JSON.
    json_hours = row.replace("'", "\"")
    d = json.loads(json_hours)

    for key in d:
        # The value is split on '-' into an opening and a closing time; the
        # text before ':' in each part is used as the hour.
        times = d[key].split('-')
        _open = times[0].split(':')[0]
        _close = times[1].split(':')[0]

        hours_dict_open[key][_open] = hours_dict_open[key].get(_open, 0) + 1
        hours_dict_close[key][_close] = hours_dict_close[key].get(_close, 0) + 1

for key in hours_dict_close:
    # Sort the hour labels numerically and look each count up by its label.
    # Sorting (label, count) pairs directly would order the string labels
    # lexicographically ('10' before '9') and pair bars with the wrong labels.
    keys_sorted_open = sorted(hours_dict_open[key], key=int)
    values_sorted_open = [hours_dict_open[key][hour] for hour in keys_sorted_open]
    make_bar(keys_sorted_open,values_sorted_open,t="Opening Time on "+key,xl="Hour of Day",yl="# of Restaurants")

    keys_sorted_close = sorted(hours_dict_close[key], key=int)
    values_sorted_close = [hours_dict_close[key][hour] for hour in keys_sorted_close]
    make_bar(keys_sorted_close,values_sorted_close,t="Closing Time on "+key,xl="Hour of Day",yl="# of Restaurants")

<a id="starData"></a>
## Star Data

In [None]:
# Summary statistics and a histogram of the restaurant star ratings.
stars = restaurant_df['stars'].values
print("Average:            %f" % stars.mean())
print("Standard Deviation: %f"% stars.std())
# Depending on the SciPy version, stats.mode returns the mode as an array or
# as a scalar; np.ravel turns either into a 1-D array that can be indexed.
print("Mode:               %f" % np.ravel(stats.mode(stars)[0])[0])
print("Median:             %f" % np.median(stars))

# Weight every rating by 100/N so that the bar heights are percentages of
# restaurants, as the y-axis label states. density=1 would instead plot a
# probability density whose heights depend on the bin width.
percent_weights = np.full(len(stars), 100.0 / len(stars))
plt.hist(stars, bins=9, weights=percent_weights, alpha=0.75)
plt.ylabel('% of Restaurants')
plt.xlabel("Number of Stars")
plt.title("Stars for Restaurants")
plt.grid(True)
plt.show()

<a id="attributes"></a>
## Attributes
This section compiles the attributes into two dictionaries. `att_dict` contains all the "True" counts aggregated on the attribute types. `att_dict_compliment` holds all the false counts. `att_dict_compliment` is needed because sometimes restaurants don't have 'True' or 'False' listed for a specific attribute, and assuming it to be 'False' would be incorrect.

In [None]:
# Tally the restaurant attributes. att_dict holds the "True" counts (or, for
# non-boolean attributes, a count per distinct value); att_dict_compliment
# holds the matching "False" counts.
attributes = restaurant_df['attributes']
attributes = attributes.dropna()
att_dict = {}
att_dict_compliment = {}

for att in attributes:
    try:
        json_attributes = att.replace("'", "\"")
    except AttributeError as e:
        # The cell is not a string: show it before propagating the error.
        print(att)
        raise e
    # Rewrite the text into valid JSON: quote the bare booleans and remove
    # the quotes wrapped around nested dictionaries.
    json_attributes = json_attributes.replace(': True',': "True"')
    json_attributes = json_attributes.replace(': False',': \"False\"')
    json_attributes = json_attributes.replace('"{','{')
    json_attributes = json_attributes.replace('}"','}')

    d = json.loads(json_attributes)
    for key in d:
        value = d[key]
        if isinstance(value, dict):
            # Nested group of boolean sub-attributes.
            if key not in att_dict:
                att_dict[key] = {}
                att_dict_compliment[key] = {}
            for subkey in value:
                if value[subkey] == 'True':
                    add_num, add_num_compliment = 1, 0
                elif value[subkey] == 'False':
                    add_num, add_num_compliment = 0, 1
                else:
                    raise KeyError("unexpected value %r for attribute %s/%s"
                                   % (value[subkey], key, subkey))
                att_dict[key][subkey] = att_dict[key].get(subkey, 0) + add_num
                att_dict_compliment[key][subkey] = (
                    att_dict_compliment[key].get(subkey, 0) + add_num_compliment)
        elif value == 'True' or value == 'False':
            # Flat boolean attribute: keep running true and false counts.
            if value == 'True':
                add_num, add_num_compliment = 1, 0
            else:
                add_num, add_num_compliment = 0, 1
            if key not in att_dict:
                att_dict[key] = add_num
                att_dict_compliment[key] = add_num_compliment
            else:
                att_dict[key] += add_num
                att_dict_compliment[key] += add_num_compliment
        else:
            # Any other value (text or number) is a category: count how often
            # each distinct value occurs. Such attributes get no entry in
            # att_dict_compliment, so nothing stale is ever added to it and
            # numeric values are not summed as if they were counts.
            category_counts = att_dict.setdefault(key, {})
            category_counts[value] = category_counts.get(value, 0) + 1

# ======== COMMENT OUT TO HIDE THE ACTUAL NUMBERS ==============================
pprint(att_dict)
pprint(att_dict_compliment)

# ======== COMMENT OUT TO SKIP THE BINARY PIE CHARTS ===========================
dict_to_pies(att_dict, att_dict_compliment)


# ambience_labels = att_dict['Ambience'].keys()
# ambience_counts_true = att_dict['Ambience'].values()    
# ambience_counts_false = att_dict_compliment['Ambience'].values()


# make_pie(ambience_labels, ambience_counts_true,t='Ambience True Counts')
# make_pie(ambience_labels, ambience_counts_false,t='Ambience False Counts')

<a id="closed"></a>
## Businesses Open vs. Closed
Finding the percentage of restaurants that have closed, and comparing the average stars and review counts of open and closed restaurants.

In [None]:
# Collect the open flag, star rating and review count as array columns 0, 1, 2.
is_open = restaurant_df[['is_open', 'stars', 'review_count']].values

# A zero in the open-flag column marks a closed restaurant.
open_flags = is_open[:, 0]
num = len(open_flags)
closed = num - np.count_nonzero(open_flags)
print("percent closed: %f" % (closed / num * 100))

In [None]:
# Compare the mean star rating (column 1) and the mean review count (column 2)
# of closed restaurants (flag 0) with those of open restaurants (flag 1).
closed_mask = is_open[:, 0] == 0
open_mask = is_open[:, 0] == 1

average_closed = is_open[:, 1][closed_mask].mean()
average_review_ct_closed = is_open[:, 2][closed_mask].mean()

print("Average stars for closed restaurants: %f" % average_closed)
print("Average review count for closed restaurants: %f" % average_review_ct_closed)

average_open = is_open[:, 1][open_mask].mean()
average_review_ct_open = is_open[:, 2][open_mask].mean()

print("\nAverage stars for open restaurants: %f" % average_open)
print("Average review count for open restaurants: %f" % average_review_ct_open)


In [None]:
# Show the filtered restaurant DataFrame as this cell's output.
restaurant_df