In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model, metrics
import time

# Load Business Data and Basic Cleaning

In [None]:
# The business dump is read as line-delimited JSON (lines=True).
biz_df = pd.read_json(
    '../../data/yelp_academic_dataset_business.json',
    lines=True,
)

In [None]:
# Remove every space from the raw category string, then split on commas so each
# business carries a list of space-free category tokens.
biz_df.loc[:, 'cat_str_array'] = (
    biz_df.loc[:, 'categories']
    .str.replace(' ', '')
    .str.split(',')
)

# Remove bad lat/lons

In [None]:
# True for rows whose longitude is outside (-180, 180) or whose latitude is
# outside (-90, 90). Comparisons against NaN are False, so rows with missing
# coordinates are flagged as bad too.
bad_coords_indices = ~(
    biz_df['longitude'].lt(180) & biz_df['longitude'].gt(-180)
    & biz_df['latitude'].lt(90) & biz_df['latitude'].gt(-90)
).values

In [None]:
# Keep only the rows with valid coordinates and renumber them 0..n-1.
biz_df = biz_df.iloc[~bad_coords_indices].reset_index(drop=True)

# Las Vegas only

In [None]:
# Spellings found in the 'city' column that should all be treated as Las Vegas.
las_vegas_misspells = np.array([
    'Las Vegas', 'Lake Las Vegas', 'Las  Vegas',
    'Las vegas', 'Las Vegass', 'La Vegas', 'Las Vegas,',
    'Las Vegas Nv', 'Las Vegas, NV', 'Las Vegas Nevada',
    'Las Vegas East', 'LasVegas', 'Las Vegas & Henderson',
    'las vegas', 'las Vegas',
])
# Collapse every listed variant onto the canonical name, leaving other cities as-is ...
biz_df['city'] = biz_df['city'].where(~biz_df['city'].isin(las_vegas_misspells), 'Las Vegas')
# ... then keep only the Las Vegas rows.
biz_df = biz_df.loc[biz_df['city'].eq('Las Vegas')]

# Sparse vector for categories

In [None]:
from itertools import chain
from collections import Counter

In [None]:
# Find all possible categories for a restaurant.
# A business counts as a restaurant when its raw category string contains
# 'Restaurants'; missing category strings count as non-restaurants (na=False).
restr_indices = biz_df['categories'].str.contains('Restaurants', na=False)
biz_df.loc[:, 'is_restaurant'] = restr_indices.astype(int)
restr_cats_all = biz_df.loc[restr_indices, 'cat_str_array']
# Count code from: https://stackoverflow.com/questions/51813266/get-unique-values-from-pandas-series-of-lists
unique_cats = pd.DataFrame.from_dict(
    Counter(chain.from_iterable(restr_cats_all)), orient='index'
).sort_values(0, ascending=False)
# Only include categories that occur more than once, and remove 'Restaurants'
# itself. It is dropped by name rather than by position so the result does not
# depend on 'Restaurants' being the most frequent category after sorting.
unique_cats = unique_cats.loc[unique_cats[0] > 1].index.drop('Restaurants', errors='ignore')

In [None]:
# Binary category-membership vector per restaurant: position j is 1 when the
# j-th entry of unique_cats appears in the business's category list.
# Non-restaurants keep None.
biz_df.loc[:, 'cat_vector'] = None
biz_df.loc[restr_indices, 'cat_vector'] = (
    biz_df.loc[restr_indices, 'cat_str_array']
    .apply(lambda cats: np.isin(unique_cats, cats) * 1)
)

# Reproject lat/lon to NA Albers Equal Area

In [None]:
import pyproj
# Source projection: EPSG:4326 (lon/lat). Target: the Albers Equal Area proj
# string below, with units in metres (+units=m).
# NOTE(review): Proj(init=...) and pyproj.transform are the older pyproj call
# style - confirm the installed pyproj version still supports them.
wgs84_proj = pyproj.Proj(init='epsg:4326')
aea_proj = pyproj.Proj(
    '+proj=aea +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs'
)
x_coords, y_coords = pyproj.transform(
    wgs84_proj,
    aea_proj,
    biz_df['longitude'].values,
    biz_df['latitude'].values,
)
biz_df.loc[:, 'x_coord'] = x_coords
biz_df.loc[:, 'y_coord'] = y_coords

# Functions to find the k closest businesses, or all businesses within a radius, using a KD-tree on the projected x/y coordinates

In [None]:
from sklearn import neighbors
# KD-tree over the projected (x, y) coordinates, one point per row of biz_df in
# row order, so tree positions line up with biz_df.iloc positions.
dist_tree = neighbors.KDTree(biz_df[['x_coord', 'y_coord']].values, leaf_size=2)

In [None]:
def closest_bizs(biz_index, df, k, kdtree):
    """Return the rows of ``df`` for the k nearest neighbours of one business.

    Args:
        biz_index: index label of the query business in ``df``.
        df: business table with ``x_coord`` and ``y_coord`` columns.
        k: number of neighbours to return.
        kdtree: object exposing ``query(points, k)``; the second element of its
            result is read as an array of row positions into ``df``.

    Returns:
        ``df.iloc[...]`` for hits 2..k+1 of the query. The first hit is skipped,
        presumably because it is the query business itself.
    """
    query_point = np.array([df.loc[biz_index, ['x_coord', 'y_coord']].values])
    query_result = kdtree.query(query_point, k + 1)
    neighbor_positions = query_result[1]
    # Single query row; drop its first hit and keep the remaining k.
    return df.iloc[neighbor_positions[0, 1:]]

def radius_bizs(biz_index, df, dist, kdtree):
    """Retrieve all businesses within distance 'dist' (meters) of one business.

    Args:
        biz_index: index label of the query business in ``df`` (labels are
            expected to be unique).
        df: business table with ``x_coord`` and ``y_coord`` columns, in the same
            row order as the points the tree was built from.
        dist: search radius, in the units of ``x_coord``/``y_coord``.
        kdtree: object exposing ``query_radius(points, r=...)`` that returns
            row positions into ``df``.

    Returns:
        The neighbouring rows of ``df`` with the query business itself removed,
        or an empty ``pd.DataFrame()`` when there are no other businesses in range.
    """
    query_point = df.loc[biz_index, ['x_coord', 'y_coord']].values.reshape(1, -1)
    indices = np.asarray(kdtree.query_radius(query_point, r=dist)[0])
    # query_radius does not sort its hits, so the query business is not
    # guaranteed to be the first one. Remove it by its row position instead of
    # slicing off element 0, which could drop a real neighbour and keep the
    # query business in the result.
    self_position = df.index.get_loc(biz_index)
    indices = indices[indices != self_position]
    if len(indices) > 0:
        return df.iloc[indices]
    else:
        return pd.DataFrame()

# Calc attributes from nearby businesses

In [None]:
def calc_review_count_feats(review_counts):
    """Summarise review counts as a ``(mean, max, min)`` tuple."""
    average = np.mean(review_counts)
    highest = np.max(review_counts)
    lowest = np.min(review_counts)
    return average, highest, lowest

def calc_star_feats(stars):
    """Summarise star ratings as a ``(mean, max, min)`` tuple."""
    return tuple(stat(stars) for stat in (np.mean, np.max, np.min))

def nearby_simp(df, kdtree, dist=1000):
    """Compute neighbourhood features for every restaurant in ``df``.

    For each row with ``is_restaurant == 1`` the businesses returned by
    ``radius_bizs(i, df, dist, kdtree)`` are summarised into the ``nn_*``
    features listed in ``new_att_list``.

    Args:
        df: business table. Columns read here: ``is_restaurant``, ``is_open``,
            ``stars``, ``review_count``, ``categories`` and ``cat_vector``.
            It is modified in place: every ``nn_*`` column is created or reset to -1.
        kdtree: spatial index, passed through to ``radius_bizs``.
        dist: search radius, passed through to ``radius_bizs``.

    Returns:
        A DataFrame with one row per restaurant that has at least one neighbour:
        the original row with its ``nn_*`` values filled in. A feature that could
        not be computed keeps the -1 sentinel. That covers the restaurant-specific
        features when no restaurant is nearby, and the weighted features when the
        restaurant shares no category with any neighbouring restaurant.
    """
    new_att_list = [
        'nn_count', 'nn_percent_rest',
        'nn_all_percent_open', 'nn_rest_percent_open',
        'nn_mean_cat_sim', 'nn_max_cat_sim', 'nn_min_cat_sim',
        'nn_avg_stars', 'nn_max_stars', 'nn_min_stars',
        'nn_avg_review_count', 'nn_max_review_count', 'nn_min_review_count',
        'nn_rest_avg_stars', 'nn_rest_max_stars', 'nn_rest_min_stars',
        'nn_rest_avg_review_count', 'nn_rest_max_review_count', 'nn_rest_min_review_count',
        'nn_weighted_avg_stars','nn_weighted_avg_review_count',
        'nn_weighted_sum_stars', 'nn_weighted_sum_review_count']
    # -1 is the "not computed" sentinel; each output row starts from these defaults.
    for att in new_att_list:
        df.loc[:,att] = -1

    # Use the frame that was passed in, not the notebook-level global.
    rest_indices_list = df.loc[df['is_restaurant'] == 1].index
    output_list = []
    tcheck = time.time()
    for tracker, i in enumerate(rest_indices_list, start=1):
        # Progress report: elapsed time for the last 100 restaurants. The timer
        # is reset only here so the reported duration covers the whole batch.
        if tracker % 100 == 0:
            print('100 biz {}'.format(time.time() - tcheck))
            tcheck = time.time()
            print(tracker)

        # Get nearest businesses; restaurants with no neighbours are skipped.
        nn_df = radius_bizs(i, df, dist, kdtree)
        tot_count = nn_df.shape[0]
        if tot_count == 0:
            continue

        # Work on a copy of the row so the assignments below never write into df.
        cur_biz = df.loc[i].copy()
        out_dict = {}

        # Total business counts and open percentage
        out_dict['nn_count'] = tot_count
        out_dict['nn_all_percent_open'] = int(100*nn_df['is_open'].sum()/tot_count)

        # Ratings
        out_dict['nn_avg_stars'], out_dict['nn_max_stars'], out_dict['nn_min_stars'] =\
            calc_star_feats(nn_df['stars'].values)

        # Review counts
        out_dict['nn_avg_review_count'], out_dict['nn_max_review_count'], out_dict['nn_min_review_count'] =\
            calc_review_count_feats(nn_df['review_count'].values)

        # Restaurant percentage
        nn_rest_df = nn_df.loc[nn_df['categories'].str.contains('Restaurants', na=False)]
        rest_count = nn_rest_df.shape[0]
        out_dict['nn_percent_rest'] = int(100*rest_count/tot_count)

        # Restaurant specific features, only work if there's a restaurant nearby
        if rest_count > 0:
            out_dict['nn_rest_percent_open'] = int(100*nn_rest_df['is_open'].sum()/rest_count)

            # Category similarity = dot product of the binary category vectors,
            # i.e. the number of categories shared with each nearby restaurant.
            dot_products = np.array(
                [np.dot(cur_biz['cat_vector'], cvec) for cvec in nn_rest_df['cat_vector'].values])
            out_dict['nn_mean_cat_sim'], out_dict['nn_max_cat_sim'], out_dict['nn_min_cat_sim'] =\
                np.mean(dot_products), np.max(dot_products), np.min(dot_products)

            # Stars and review counts for nearby restaurants
            out_dict['nn_rest_avg_stars'], out_dict['nn_rest_max_stars'], out_dict['nn_rest_min_stars'] =\
                calc_star_feats(nn_rest_df['stars'].values)
            out_dict['nn_rest_avg_review_count'], out_dict['nn_rest_max_review_count'], out_dict['nn_rest_min_review_count'] =\
                calc_review_count_feats(nn_rest_df['review_count'].values)

            # Similarity-weighted stars and review counts, normalised by the
            # total similarity. The 'sum' variants are sum(sim * value) / sum(sim);
            # the 'avg' variants are mean(sim * value) / sum(sim).
            # With zero total similarity the ratio is undefined, so the -1
            # sentinel is left in place instead of dividing by zero.
            dot_prod_total = np.sum(dot_products)
            if dot_prod_total > 0:
                weighted_stars = dot_products*nn_rest_df['stars'].values
                weighted_review_counts = dot_products*nn_rest_df['review_count'].values
                out_dict['nn_weighted_sum_stars'] = np.sum(weighted_stars)/dot_prod_total
                out_dict['nn_weighted_sum_review_count'] = np.sum(weighted_review_counts)/dot_prod_total
                out_dict['nn_weighted_avg_stars'] = np.mean(weighted_stars)/dot_prod_total
                out_dict['nn_weighted_avg_review_count'] = np.mean(weighted_review_counts)/dot_prod_total

        for att, value in out_dict.items():
            cur_biz[att] = value
        output_list.append(cur_biz)

    return pd.DataFrame(output_list)


In [None]:
# Build the neighbourhood feature table. The feature function defined in this
# notebook is `nearby_simp`; the previous call used the name `nearby_atts_simp`,
# which is not defined anywhere in the visible notebook.
biz_att_df = nearby_simp(biz_df, dist_tree)

In [None]:
# Keep a handle on the full feature table, then narrow the working frame to
# rows whose neighbourhood restaurant percentage is above zero.
biz_att_df_all = biz_att_df
biz_att_df = biz_att_df_all.loc[biz_att_df_all['nn_percent_rest'] > 0]

In [None]:
# Display the column labels of the filtered feature table.
biz_att_df.columns

# Plots of nearby biz atts

In [None]:
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
### Histogram of neighbor counts (kde=False, so only the bars are drawn)
sns.set_style(style='white')

plt.figure(figsize=(20,10), facecolor='white')
plt.tick_params(axis='both', which='major', labelsize=18)

# Restaurant neighbours are recovered from the total count and the integer
# percentage, so these values are approximate.
nn_rest_counts = biz_att_df['nn_count']*biz_att_df['nn_percent_rest']/100
bin_width = 20

# Draw the histograms. The bin edges run one bin width past the maximum so the
# largest values fall inside the last bin instead of being silently dropped
# (and so there are always at least two edges).
snsfig = sns.distplot(biz_att_df['nn_count'], hist = True, norm_hist=False, kde=False,
                      kde_kws = {'linewidth': 3},
                      label = 'Total Nearby Businesses',
                      bins = range(0, int(np.max(biz_att_df['nn_count'])) + bin_width, bin_width))
snsfig = sns.distplot(nn_rest_counts, hist = True,norm_hist=False, kde=False,
                      kde_kws = {'linewidth': 3},
                      label = 'Nearby Restaurants Only',
                      bins = range(0, int(np.max(nn_rest_counts)) + bin_width, bin_width))


# Plot formatting
# NOTE(review): the legend title 'User Label' does not obviously describe these
# two series - confirm the intended title.
leg = plt.legend(prop={'size': 20}, title = 'User Label')
leg.get_title().set_fontsize(25)
plt.xlim([0,1000])
plt.title('# of Businesses within 1km', size=30)
plt.xlabel('# of Nearby Business/Restaurants', size = 20)
plt.ylabel('Count', size = 20)
plt.show()

In [None]:
def scatter_bfit(x, y, title, xlabel, ylabel, xlims=None, ylims=None):
    """Scatter ``y`` against ``x`` and overlay the degree-1 least-squares fit.

    Args:
        x, y: equal-length numeric sequences to plot and fit.
        title, xlabel, ylabel: text for the figure title and axis labels.
        xlims: optional ``(low, high)`` x-axis limits; also the span over which
            the fitted line is drawn. Defaults to the data range of ``x``.
        ylims: optional ``(low, high)`` y-axis limits.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(8,8), facecolor='white')
    plt.scatter(x, y, alpha=0.25)
    plt.tick_params(axis='both', which='major', labelsize=12)
    m, b = np.polyfit(x, y, 1)
    # `is not None` rather than `!= None`, so array-like limits do not trigger
    # an element-wise comparison.
    if xlims is not None:
        plt.xlim(xlims[0], xlims[1])
    else:
        xlims = (np.min(x), np.max(x))
    if ylims is not None:
        plt.ylim(ylims[0], ylims[1])

    # A straight line only needs its two end points. This also avoids a
    # zero-step range when every x value is identical.
    plt_x = np.linspace(xlims[0], xlims[1], 2)
    plt.plot(plt_x, m*plt_x + b, '--', linewidth=2.5, color='orange')

    # Labels
    plt.title(title, size=20)
    plt.xlabel(xlabel, size=16)
    plt.ylabel(ylabel, size=16)
    plt.show()

In [None]:
# Restaurant rating against the mean rating of its neighbours.
scatter_bfit(
    x=biz_att_df['nn_avg_stars'],
    y=biz_att_df['stars'],
    title='Neighbors Average Rating vs. Restaurant Rating',
    xlabel='Neighbors\' Avg Rating',
    ylabel='Rating',
    ylims=[0.8,5.2],
)

In [None]:
# Restaurant rating against the mean review count of its neighbours.
scatter_bfit(
    x=biz_att_df['nn_avg_review_count'],
    y=biz_att_df['stars'],
    title='Neighbor Avg Review Count vs Rating',
    xlabel='Neighbor Avg Review Count',
    ylabel='Rating',
    ylims=[0.8,5.2],
)

In [None]:
# Restaurant review count against the mean review count of its neighbours.
scatter_bfit(
    x=biz_att_df['nn_avg_review_count'],
    y=biz_att_df['review_count'],
    title='Neighbor Avg Review Count vs Review Count',
    xlabel='Neighbor Avg Review Count',
    ylabel='Restaurant Review Count',
)

In [None]:
# Restaurant rating against the similarity-weighted neighbour ratings.
scatter_bfit(
    x=biz_att_df['nn_weighted_sum_stars'],
    y=biz_att_df['stars'],
    title='Similarity-Weighted Sum of Neighbor Ratings',
    xlabel='Weighted Neighbor Stars',
    ylabel='Rating',
    ylims=[0.8,5.2],
)

In [None]:
# Restaurant rating against the similarity-weighted neighbour review counts.
scatter_bfit(
    x=biz_att_df['nn_weighted_sum_review_count'],
    y=biz_att_df['stars'],
    title='Similarity-Weighted Sum of Neighbor Review Counts',
    xlabel='Weighted Neighbor Review Counts',
    ylabel='Rating',
    ylims=[0.8,5.2],
)

In [None]:
# Restaurant rating against its mean category similarity to nearby restaurants.
scatter_bfit(
    x=biz_att_df['nn_mean_cat_sim'],
    y=biz_att_df['stars'],
    title='Mean Neighbor Restaurant Category Similarity',
    xlabel='Mean Neighbor Similarity',
    ylabel='Rating',
    xlims=[-0.2,5],
    ylims=[0.8,5.2],
)

# Census layers

In [None]:
from osgeo import ogr, osr
import geopandas as gpd
from shapely.geometry import Point
# County polygons, used below to find the county each business point falls in.
counties = gpd.read_file(
    '../../data/census/census_shapefiles/tl_2017_us_county.shp'
)

In [None]:
# One shapely Point per business, built as (longitude, latitude).
geometry = list(map(Point, zip(biz_att_df['longitude'], biz_att_df['latitude'])))
# The points are declared as EPSG:4326 lon/lat.
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(biz_att_df, crs=crs, geometry=geometry)

In [None]:
# For every county polygon (in row order), a boolean Series marking which
# business points intersect it.
county_intersection = list(map(geo_df.geometry.intersects, counties.geometry))

In [None]:
# Row positions of the counties that intersect at least one business. In a
# notebook only the last expression of a cell is displayed, so the positions are
# kept in a variable and used for the lookup instead of a hard-coded row label.
matched_county_rows = np.unique(np.where(np.array(county_intersection))[0])
# Original finding: every business falls in the same county, and it is
# Clark County (the right one).
counties.iloc[matched_county_rows]

# Traffic Data