### Global variables that I use with different methods

In [81]:
import datetime
print(datetime.datetime.now())

2020-07-05 19:40:59.431243


In [82]:
# Global Variables that we will use throughout the notebook

TEST_SIZE              = 0.20 # train_size = 1.00 - test_size
TRAIN_SIZE             = 1.00 - TEST_SIZE
RANDOM_STATE           = 20200427
RESTORE_PICKLE         = True

In [83]:
import numpy as np
import pandas as pd
import scipy
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

from matplotlib.colors import ListedColormap

from scipy.spatial.distance import cdist
import seaborn as sn
from sklearn import cluster, metrics
from sklearn import datasets, ensemble, metrics, linear_model
from sklearn.cluster import AffinityPropagation, KMeans, MeanShift, estimate_bandwidth, SpectralClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, FeatureHasher
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, pairwise_distances, mean_squared_error
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV,cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize, LabelEncoder
from sklearn.utils import shuffle

import json
import spacy
import statsmodels.api as sm
import unicodedata

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
# conda install -c anaconda nltk
# nltk.download('gutenberg')
# nltk.download('punkt')
# nltk.download("wordnet")

from collections import Counter

# import umap

from ast import literal_eval
import chardet, codecs, datetime, os, platform, random, re, string, sys, time, unicodedata
from IPython.display import Markdown, display
from zipfile import ZipFile

In [84]:
#Global settings
pd.set_option('display.max_rows', 1000)
pd.set_option("max_rows", 1000)
pd.options.display.width=400
pd.options.display.max_colwidth=180
pd.set_option('display.max_columns', 200)
pd.set_option("max_columns", 200)

In [85]:
# We will uses this with TF-IDF

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [86]:
def run_tfidf_vectorizer(df, parameters={}):
    
    vectorizer = TfidfVectorizer(**parameters)
    vectors = vectorizer.fit_transform(df.values.astype('U'))
    feature_names = vectorizer.get_feature_names()
    tfidf_df_dense = vectors.toarray()
    df = pd.DataFrame(tfidf_df_dense, columns=feature_names)

    return df

In [87]:
# This function allows us to read a csv from a Zip file, specifying the columns that we want to use
# It returns a data frame

def file_stuff(path, filename, filetype, zipfilename=None, dtypes=None, usecols=None):
    fullfilename = "{}".format(path+'/'+filename)
    
    print("fullfilename = {}".format(fullfilename))
        
    if zipfilename:
        zip_file = ZipFile(zipfilename)
        
    if filetype == 'csv':
        if zipfilename:
            df = pd.read_csv(zip_file.open(fullfilename), dtype=dtypes, usecols=usecols)
        else:
            df = pd.read_csv(fullfilename, dtype=dtypes, usecols=usecols)
    
    if filetype == 'json':
        if zipfilename:
            df = pd.read_json(zip_file.open(fullfilename))
        else:
            
            df = pd.read_json(fullfilename)
       
    print("There are {} rows in this file.".format(df.shape[0]))
    
    return df

In [88]:
def pickle_it(mode, df, file_name):
    
    if mode == 'to_pickle':
        print_timestamp('Now pickling file {}.'.format(file_name))
        df.to_pickle(file_name)  # where to save it, usually as a .pkl
        print_timestamp('File pickled successfully {}.'.format(file_name))
    
    else: 
        # Then you can load it back using:
        print_timestamp('Now restoring pickled file {}.'.format(file_name))
        df = pd.read_pickle(file_name)
        print_timestamp('Restored pickled file {}.'.format(file_name))
        return df

In [89]:
def print_timestamp(displaytext):    
    datetime_now = str(datetime.datetime.now())
    printFormatted("{:19.19}: In: {} {} ".format(datetime_now, sys._getframe(1).f_code.co_name, displaytext))

In [90]:
def printFormatted(string):
    newline = '\n'
    display(Markdown(string))
    write_to_logfile(string+newline)

In [91]:
def write_to_logfile(message, mdformat=''):
    bufsize = 0
    with open('TestResults.md', 'a+') as the_file:
        the_file.write('{} {}'.format(mdformat, message))

In [92]:
def read_json_to_pandas(filename):
    with open(filename) as json_file:      
        data = json_file.readlines()
        # this line below may take at least 8-10 minutes of processing for 4-5 million rows. It converts all strings in list to actual json objects. 
        data = list(map(json.loads, data)) 

    df = pd.DataFrame(data)
    
    print("There are {} rows in this file.".format(df.shape[0]))
    #json_lines... 

    return df

In [93]:
path = '/Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive'
biz_file = 'yelp_academic_dataset_business'
user_file = 'yelp_academic_dataset_user'
review_file = 'yelp_academic_dataset_review'

# It took 10m to load this entire dataset via JSON files.
if RESTORE_PICKLE:
    print("here we go")
    df_yelp_business = pickle_it('from_pickle', None, "{}/{}.{}".format(path, biz_file,'pkl'))
    df_yelp_user = pickle_it('from_pickle', None, "{}/{}.{}".format(path, user_file,'pkl'))
    df_yelp_review = pickle_it('from_pickle', None, "{}/{}.{}".format(path, review_file,'pkl'))
    
else:
    print_timestamp('Starting json_to_pandas on yelp_business')
    df_yelp_business = read_json_to_pandas("{}/{}.{}".format(path, biz_file,'json'))
    print_timestamp('Starting json_to_pandas on yelp_user')
    df_yelp_user = read_json_to_pandas("{}/{}.{}".format(path, user_file,'json'))
    print_timestamp('Starting json_to_pandas on yelp_review')
    df_yelp_review = read_json_to_pandas("{}/{}.{}".format(path, review_file,'json'))
    print_timestamp('Finished with json_to_pandas on yelp_review')

    pickle_it('to_pickle', df_yelp_business, "{}/{}.{}".format(path, biz_file,'pkl'))
    pickle_it('to_pickle', df_yelp_user, "{}/{}.{}".format(path, user_file,'pkl'))
    pickle_it('to_pickle', df_yelp_review, "{}/{}.{}".format(path, review_file,'pkl'))

print("that's all for now")

# 10m to load from JSON
# 46ss to load from Pickle

here we go


2020-07-05 19:40:59: In: pickle_it Now restoring pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_business.pkl. 

2020-07-05 19:41:01: In: pickle_it Restored pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_business.pkl. 

2020-07-05 19:41:01: In: pickle_it Now restoring pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_user.pkl. 

2020-07-05 19:41:12: In: pickle_it Restored pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_user.pkl. 

2020-07-05 19:41:22: In: pickle_it Now restoring pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_review.pkl. 

2020-07-05 19:41:55: In: pickle_it Restored pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_review.pkl. 

that's all for now


In [94]:
# df_yelp_business.sample(5)
df_yelp_business.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True', 'GoodForKids': 'False', 'BusinessParking': '{'garage': False, 'street': False, 'validated': False, 'lot': True, 'v...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0', 'Wednesday': '10:0-18:0', 'Thursday': '11:0-20:0', 'Friday': '11:0-20:0', 'Saturday': '11:0-20:0', 'Sunday': '13:0-18:0'}"
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,1,"{'GoodForKids': 'True', 'ByAppointmentOnly': 'True'}","Health & Medical, Fitness & Instruction, Yoga, Active Life, Pilates",
2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,,"Pets, Pet Services, Pet Groomers",
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,89030,36.219728,-115.127725,2.5,3,0,"{'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'False', 'DogsAllowed': 'True', 'BikeParking': 'True', 'BusinessParking': '{'garage': False, 'street': False, 'valid...","Hardware Stores, Home Services, Building Supplies, Home & Garden, Shopping","{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', 'Wednesday': '7:0-16:0', 'Thursday': '7:0-16:0', 'Friday': '7:0-16:0'}"
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'True'}","Home Services, Plumbing, Electricians, Handyman, Contractors","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', 'Wednesday': '9:0-16:0', 'Thursday': '9:0-16:0', 'Friday': '9:0-16:0'}"


In [95]:
df_yelp_business[['name','city','state','latitude','longitude','attributes','categories']].head(4)

Unnamed: 0,name,city,state,latitude,longitude,attributes,categories
0,The Range At Lake Norman,Cornelius,NC,35.462724,-80.852612,"{'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True', 'GoodForKids': 'False', 'BusinessParking': '{'garage': False, 'street': False, 'validated': False, 'lot': True, 'v...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping"
1,"Carlos Santo, NMD",Scottsdale,AZ,33.569404,-111.890264,"{'GoodForKids': 'True', 'ByAppointmentOnly': 'True'}","Health & Medical, Fitness & Instruction, Yoga, Active Life, Pilates"
2,Felinus,Montreal,QC,45.479984,-73.58007,,"Pets, Pet Services, Pet Groomers"
3,Nevada House of Hose,North Las Vegas,NV,36.219728,-115.127725,"{'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'False', 'DogsAllowed': 'True', 'BikeParking': 'True', 'BusinessParking': '{'garage': False, 'street': False, 'valid...","Hardware Stores, Home Services, Building Supplies, Home & Garden, Shopping"


In [96]:
# print(df_yelp_business.shape)
print(df_yelp_business.city.nunique())

1251


In [97]:
df_yelp_user[['name','review_count','yelping_since','useful','funny','cool', 'elite','friends','fans']].head(4)

Unnamed: 0,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans
0,Rafael,553,2007-07-06 03:27:11,628,225,227,,"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX...",14
1,Michelle,564,2008-04-28 01:29:25,790,316,400,200820092010201120122013,"ly7EnE8leJmyqyePVYFlug, pRlR63iDytsnnniPb3AOug, kc-rnN-ndnFTdHG4TfIgeQ, GYndf-h6dAwpGP0lDBz2Wg, FPo3SwQuAK53QVZm_eIyBg, 9fF_T3pQu3ay1oA7h_VYNA, G5T3bd6dUs5zkQ2VMZtRUw, tufuEc5f...",27
2,Martin,60,2008-08-28 23:40:05,151,125,103,2010,"Uwlk0txjQBPw_JhHsQnyeg, Ybxr1tSCkv3lYA0I1qmnPQ, DNmeLov3wXNxlxjN5feBoQ, x7n69vEsYFh9xnW3D5lPPQ, -AaBjWJYiQxXkCMDlXfPGw, COXnA2hnzFDai3ywx_iM8A, dUFoyswTt5ZQbleF3_4TCg, uj2AWSvs...",5
3,John,206,2008-09-20 00:08:14,233,160,84,2009,"iog3Nyg1i4jeumiTVG_BSA, M92xWY2Vr9w0xoH8bPplfQ, So46aZ3y7zRl2VmFK35vCQ, vrZmtsiaIZBr42KwAve5qA, SaNDaz5rBQs-5gyhOkO1MA, xTcuKbp7ocDcZDD_bcK9hw, PpzliPkE_fzsI6r15UMZFA, Ygr_c6So...",6


In [98]:
df_yelp_review.sample(4)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
470844,3pa2xjQ9BiBVu_MDHpeyRA,IYpocDjXsAtL9B6XYOpewg,MOB5oCEKCw3S76SsiuQjqA,5.0,0,0,0,"I came based on all the yelp reviews. Really cute, open and nice seating area. Gets busy at lunch but it's a nice a place for lunch or brunch. The portions are reasonable and t...",2018-10-25 17:03:01
1882384,Bccl8U4c3MybA6Swz92ESg,WrnUsnDK_RiTH7cPw3sl5Q,iJBnqweAPDTCfyMcRrG90w,5.0,0,0,0,This place is great. The owner Doug is the best very friendly people. and the food is AAAA+++++,2009-07-06 01:28:11
452040,xNjZcWIyOy8t8TZeSv_rTg,sDA3lmmU1E_EbptNw8DMag,259_yfBSoAMrhIb8PgBGtA,5.0,1,0,0,"This is a lovely park that is a permanent home to many folks. The ""park models"" are very well kept, the grounds are neat, clean and well groomed and the RV sites are nicely le...",2018-05-02 21:06:10
3740084,J1HKKILgku2zDxAykbZhQA,xIjQs4DT-7ZgsWJNGgxg5w,GPVHfoDjSOivqgIDjIOA-Q,4.0,0,0,0,"Great service, The server Monze martinez is amazing! always on top of everything. I highly recommend her!",2018-08-08 19:04:08


In [99]:
df_yelp_review[['text','date']].head(4)

Unnamed: 0,text,date
0,"As someone who has worked with many museums, I was eager to visit this gallery on my most recent trip to Las Vegas. When I saw they would be showing infamous eggs of the House ...",2015-04-15 05:21:16
1,I am actually horrified this place is still in business. My 3 year old son needed a haircut this past summer and the lure of the $7 kids cut signs got me in the door. We had to...,2013-12-07 03:16:52
2,"I love Deagan's. I do. I really do. The atmosphere is cozy and festive. The shrimp tacos and house fries are my standbys. The fries are sometimes good and sometimes great, and ...",2015-12-05 03:18:11
3,"Dismal, lukewarm, defrosted-tasting ""TexMex"" glop;\n\nMumbly, unengaged waiter;\n\nClueless manager, who seeing us with barely nibbled entrees\non plates shoved forward for pic...",2011-05-27 05:30:52


In [100]:
df_yelp_business.describe()

Unnamed: 0,latitude,longitude,stars,review_count,is_open
count,209393.0,209393.0,209393.0,209393.0,209393.0
mean,38.579934,-97.390217,3.538055,36.937505,0.806632
std,4.940448,16.718535,1.023543,123.343597,0.39494
min,21.497258,-158.025525,1.0,3.0,0.0
25%,33.638658,-112.269476,3.0,4.0,1.0
50%,36.147408,-111.743531,3.5,9.0,1.0
75%,43.611693,-79.972679,4.5,27.0,1.0
max,51.299943,-72.80655,5.0,10129.0,1.0


In [101]:
df_yelp_user.describe()

Unnamed: 0,review_count,useful,funny,cool,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
count,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0
mean,22.1693,39.82709,17.03435,21.70796,1.458824,3.648087,1.980362,0.3004191,0.1873203,0.1614454,0.07019241,1.372517,2.849952,2.819167,2.819167,1.058364,1.114744
std,76.74226,513.3536,355.0568,445.7187,16.67521,1.172525,72.29082,12.75309,15.07502,11.62099,9.967903,58.82533,96.94462,86.51499,86.51499,31.54894,92.26612
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,3.0,0.0,0.0,0.0,3.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,15.0,13.0,3.0,3.0,0.0,4.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,14455.0,197130.0,165861.0,191359.0,11568.0,5.0,25452.0,13501.0,14179.0,13654.0,12669.0,58480.0,72370.0,40508.0,40508.0,15445.0,82622.0


In [102]:
df_yelp_review.describe()

Unnamed: 0,stars,useful,funny,cool
count,8021122.0,8021122.0,8021122.0,8021122.0
mean,3.703575,1.322882,0.4596423,0.574562
std,1.490486,3.550831,2.188143,2.476906
min,1.0,-1.0,0.0,-1.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0
75%,5.0,1.0,0.0,0.0
max,5.0,1122.0,976.0,502.0


In [103]:
# Pandas Yelp Queries
# df_yelp_user[(df_yelp_user.review_count > 100)].count()
df_yelp_review['text'].sample(20)

3527270    Located right in the centre of old montreal, this restaurant was one of the only ones open on Christmas. It was suppose to have live jazz music but we didn't see a live band.\n...
2967402    I'm a bit overdue for this review, my car broke down awhile ago and I needed something quick. Started looking around for a used car and everything was just super expensive, unt...
3013367    Amazing, professional and caring staff and doctors .\nVery thankful to work so close to this location. I may walk-in with discomfort, but I walk out feeling better every time I...
7633246    Jazzman saved the day today!!!\n\nI needed a belt to complete a suit for a wedding I'm attending this weekend and one of the employees in the store helped me by asking about my...
4163932    Aww come on! What's up with these bad reviews? The service sucks but it works! The food is average, I liked it...but that's not the appeal! It just makes the appeal that much b...
1394054    So after running around the city d

In [104]:
# cleanup functions

def cleanup_text(text):
    text = re.sub(r"\n","", text)                              # remove newlines
    text = re.sub('[^\w\s]','', text)                          # remove punctuation
    text = text.lower()                                        # lower case all letters
    text_list = text.split()                                   # split it for the list comprehension
    text_list = [x for x in text_list if len(x) > 1            # we only want words longer than 1 character
                 & x.isdigit() == False ]                      # no numerics
    text = " ".join(text_list)                                 # put it back into a string    
    return text

def cleanup_attributes(column_value):
    column_value = re.sub(r"'True'",'1', column_value)
    column_value = re.sub(r"'False'",'0', column_value)
    column_value = re.sub(r"'None'",'0', column_value)

    column_value = re.sub(r"''","'", column_value)
    column_value = re.sub(r"''","'", column_value)
    return column_value

In [105]:
# total_rows['ColumnID'] = total_rows['ColumnID'].astype(str)
# df_yelp_business[df_yelp_business['categories'].str.contains('Restaurant')].count()
# df_yelp_business[df_yelp_business['categories'].str.contains('Restaurant')].sample(20)
# df_yelp_business_restaurants = df_yelp_business[df_yelp_business['categories'].str.contains('Restaurant')]
# Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
#        'latitude', 'longitude', 'stars', 'review_count', 'is_open',
#        'attributes', 'categories', 'hours']

# Cleanup the features: categories, attributes, city, and name

# Cleanup df_yelp_business
df_yelp_business.dropna(subset=['categories', 'attributes'], inplace=True)
df_yelp_business['attributes'] = df_yelp_business['attributes'].astype(str) # df['COLUMN1'].str.decode("utf-8") .str.decode("utf-8") 
df_yelp_business['city'] = df_yelp_business['city'].str.lower() # move to lowercase - title tries to be too smart
df_yelp_business['name'] = df_yelp_business['name'].str.lower()
df_yelp_business['categories'] = df_yelp_business['categories'].str.lower()
# df_yelp_business['attributes'] = df_yelp_business['attributes'].str.lower()
df_yelp_business['categories'] = df_yelp_business['categories'].astype(str)
# df_yelp_business['attributes'] = df_yelp_business['attributes'].astype(str) # could this be what is messing up attributes???
# df_yelp_business['attributes'] = df_yelp_business['attributes'].apply(lambda col: cleanup_attributes(col)) # get the primarygenre
df_yelp_business_restaurants = df_yelp_business[df_yelp_business['categories'].str.contains('restaurant')][['business_id','name','city', 
                                                                                                            'postal_code', 'state',
                                                                                                            'latitude','longitude',
                                                                                                            'review_count', 'attributes',
                                                                                                             'categories']]
df_yelp_biz_LV_Rest = df_yelp_business[
                                       (df_yelp_business['categories'].str.contains('restaurant') == True) &
                                       (df_yelp_business['city'].str.contains('vegas') == True) &
                                       (df_yelp_business.review_count > 40)
                                      ]

# We should only use df_yelp_biz_LV_Rest from here on...
df_yelp_biz_LV_Rest.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
130,Q_dh08clYUPj13GmCRzIVA,kulinarya express filipino kitchen,"7960 S Rainbow Blvd, Ste 8000A",las vegas,NV,89139,36.043663,-115.241881,4.0,82,0,"{'BusinessParking': ""{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}"", 'RestaurantsDelivery': 'True', 'HasTV': 'False', 'OutdoorSeating': 'F...","filipino, restaurants, breakfast & brunch, food, street vendors","{'Monday': '0:0-0:0', 'Tuesday': '10:0-20:0', 'Wednesday': '10:0-20:0', 'Thursday': '10:0-20:0', 'Friday': '10:0-20:0', 'Saturday': '10:0-20:0', 'Sunday': '10:0-20:0'}"
157,Yr_w9lakJrKMyEG_hI6zbA,fat moe's pizza & wings,"6125 W Tropicana Ave, Ste F",las vegas,NV,89103,36.099361,-115.226636,4.0,141,1,"{'RestaurantsAttire': ""u'casual'"", 'RestaurantsDelivery': 'True', 'NoiseLevel': ""'quiet'"", 'HasTV': 'True', 'RestaurantsPriceRange2': '1', 'Ambience': ""{'romantic': False, 'int...","pizza, salad, burgers, restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0', 'Wednesday': '11:0-22:0', 'Thursday': '11:0-22:0', 'Friday': '11:0-22:0', 'Saturday': '11:0-22:0', 'Sunday': '11:0-22:0'}"
238,AN0bWhisCf6LN9eHZ7DQ3w,los olivos ristorante,3759 E Desert Inn Rd,las vegas,NV,89121,36.129178,-115.092483,5.0,222,1,"{'WiFi': ""u'free'"", 'RestaurantsPriceRange2': '2', 'ByAppointmentOnly': 'False', 'BikeParking': 'True', 'RestaurantsGoodForGroups': 'True', 'RestaurantsDelivery': 'False', 'Has...","restaurants, italian","{'Monday': '0:0-0:0', 'Tuesday': '16:0-21:0', 'Wednesday': '16:0-21:0', 'Thursday': '16:0-21:0', 'Friday': '16:0-21:0', 'Saturday': '16:0-21:0', 'Sunday': '16:0-21:0'}"
246,AtD6B83S4Mbmq0t7iDnUVA,veggie house,"5115 Spring Mountain Rd, Ste 203",las vegas,NV,89146,36.125569,-115.210911,4.5,1142,1,"{'RestaurantsPriceRange2': '2', 'BikeParking': 'True', 'OutdoorSeating': 'False', 'RestaurantsGoodForGroups': 'True', 'Caters': 'True', 'Alcohol': ""u'none'"", 'GoodForKids': 'Tr...","restaurants, specialty food, japanese, sushi bars, dim sum, vegetarian, food, ramen, chinese, vegan","{'Monday': '11:30-21:30', 'Tuesday': '11:30-21:30', 'Wednesday': '11:30-21:30', 'Thursday': '11:30-21:30', 'Friday': '11:30-21:30', 'Saturday': '11:30-21:30', 'Sunday': '11:30-..."
308,oUX2bYbqjqST-urKbOHG6w,loftti cafe,"7729 S Rainbow Blvd, Ste 9B",las vegas,NV,89139,36.047942,-115.244167,4.5,284,1,"{'OutdoorSeating': 'True', 'BusinessParking': ""{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}"", 'HasTV': 'False', 'BusinessAcceptsCreditCa...","sandwiches, shaved ice, coffee & tea, desserts, cafes, themed cafes, shaved snow, food, bubble tea, juice bars & smoothies, restaurants","{'Monday': '0:0-0:0', 'Tuesday': '8:0-3:0', 'Wednesday': '8:0-3:0', 'Thursday': '8:0-3:0', 'Friday': '8:0-3:0', 'Saturday': '8:0-3:0', 'Sunday': '11:30-20:0'}"


In [106]:
# df_yelp_biz_LV_Rest.head(20) # there are 4,284 restaurants in Las Vegas, with 40 or more reviews! 24-Jun-2020 
# df_yelp_reviews_LV_Rest = 
# https://stackoverflow.com/questions/34055584/python-pandas-string-contains-and-doesnt-contain
# df_yelp_biz2_LV[(df_yelp_biz2_LV.review_count > 50)].sample(20)
# df_yelp_biz2.sample(20)
# df_yelp_biz_LV_Rest.count()
# df_yelp_business.count()
# df_yelp_business_restaurants.count()
# df_yelp_biz2_socal.head(30)
# df_yelp_biz_LV_Rest['attributes'].head(20)
df_yelp_business['attributes'].sample(30)

24312                                                                                 {'BusinessAcceptsCreditCards': 'False', 'BusinessAcceptsBitcoin': 'False', 'ByAppointmentOnly': 'True'}
139753    {'BusinessAcceptsCreditCards': 'True', 'NoiseLevel': "u'average'", 'RestaurantsTakeOut': 'True', 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot...
12461                                       {'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}", 'BusinessAcceptsCreditCards': 'True'}
118776                                                                                                                                                 {'BusinessAcceptsCreditCards': 'True'}
62037     {'Ambience': "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}",...
74406     {'OutdoorSeating': 'True', 'RestaurantsP

In [107]:
# df = pd.merge(df,df2[['Key_Column','Target_Column']],on='Key_Column', how='left')
df_yelp_review_LV_Rest = pd.merge(df_yelp_review.sample(100000),df_yelp_biz_LV_Rest['business_id'],on='business_id') # this worked! count=1,484,887!
# Cleanup df_yelp_review
df_yelp_review_LV_Rest['text'] = df_yelp_review_LV_Rest['text'].apply(lambda col: cleanup_text(col)) # Cleanup the yelp_review_LV_Rest only

In [108]:
# df_yelp_review_LV_Rest.head(5)
# df_yelp_biz_LV_Rest.sample(20)
df_yelp_business.sample(4)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
2043,E0OKctOhZpUQgwUUBc_RwA,the habit burger grill,"4830 Blue Diamond Rd, Ste 130",las vegas,NV,89139,36.033398,-115.207118,4.0,322,1,"{'BikeParking': 'True', 'BusinessAcceptsCreditCards': 'True', 'Caters': 'True', 'RestaurantsAttire': ""'casual'"", 'NoiseLevel': ""'average'"", 'DriveThru': 'True', 'WiFi': ""u'no'""...","sandwiches, salad, burgers, restaurants, food, food trucks","{'Monday': '0:0-0:0', 'Tuesday': '10:30-22:0', 'Wednesday': '10:30-22:0', 'Thursday': '10:30-22:0', 'Friday': '10:30-23:0', 'Saturday': '10:30-23:0', 'Sunday': '10:30-22:0'}"
185675,aOvEr1tluvkmwZjpzfxBAg,streat mobile bistro,801 W Superior Ave,cleveland,OH,44113,41.496183,-81.697264,4.5,20,1,"{'Ambience': ""{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': False}"",...","street vendors, food stands, food, american (new), restaurants, cafes, food trucks","{'Monday': '9:0-23:30', 'Tuesday': '9:0-23:30', 'Wednesday': '9:0-23:30', 'Thursday': '9:0-23:30', 'Friday': '9:0-23:30', 'Saturday': '9:0-23:30', 'Sunday': '9:0-23:30'}"
129369,L_X5eAd4moE40AsX6Havhg,1900 mexican grill,5110-1E Park Rd,charlotte,NC,28209,35.161133,-80.849282,3.5,112,1,"{'Alcohol': ""u'full_bar'"", 'BikeParking': 'True', 'RestaurantsReservations': 'False', 'RestaurantsAttire': ""u'casual'"", 'OutdoorSeating': 'True', 'RestaurantsDelivery': 'False'...","restaurants, mexican","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0', 'Wednesday': '11:0-22:0', 'Thursday': '11:0-22:0', 'Friday': '11:0-23:0', 'Saturday': '11:0-23:0', 'Sunday': '12:0-21:0'}"
21771,wpStZsQ3yKudY92DRq-mOA,pizza hut,4970 S Gilbert Rd,chandler,AZ,85249,33.234237,-111.790912,2.0,19,1,"{'Ambience': ""{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}"",...","chicken wings, restaurants, italian, pizza","{'Monday': '10:30-23:0', 'Tuesday': '10:30-23:0', 'Wednesday': '10:30-23:0', 'Thursday': '10:30-23:0', 'Friday': '10:30-0:0', 'Saturday': '10:30-22:0', 'Sunday': '10:30-23:0'}"


In [109]:
parameters = {
              'lowercase': True,
              'max_df': 0.65, 
              'min_df': 10,  
              'max_features': 100,
              'norm': u'l2',
              'smooth_idf' :True,
              'stop_words': 'english',
              'tokenizer': LemmaTokenizer(),
              'use_idf': True
}
df_yelp_review_LV_Rest_tfidf = run_tfidf_vectorizer(df_yelp_review_LV_Rest['text'],parameters=parameters)

  'stop_words.' % sorted(inconsistent))


In [110]:
# df_yelp_review_LV_Rest.count()
# df_yelp_review.count()
df_yelp_business.count()
# df_yelp_review_LV_Rest[['text']].sample(20)
# df_yelp_review_LV_Rest_tfidf[['ha','wa','u']].sample(10)
# df_yelp_review_LV_Rest_tfidf[['ha','wa','u']].describe()

business_id     180347
name            180347
address         180347
city            180347
state           180347
postal_code     180347
latitude        180347
longitude       180347
stars           180347
review_count    180347
is_open         180347
attributes      180347
categories      180347
hours           146796
dtype: int64

In [111]:
# sorted_df.tail(20)
df_yelp_biz_LV_Rest['categories'].sample(10)
df_yelp_biz_LV_Rest['attributes'].sample(10)

136542    {'RestaurantsAttire': "u'casual'", 'Caters': 'True', 'Alcohol': "u'none'", 'Ambience': "{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': Fal...
83689     {'BusinessParking': "{'garage': True, 'street': False, 'validated': False, 'lot': False, 'valet': True}", 'NoiseLevel': "u'average'", 'BikeParking': 'False', 'BusinessAcceptsCr...
204135    {'Caters': 'False', 'RestaurantsPriceRange2': '2', 'BikeParking': 'False', 'BusinessParking': "{'garage': True, 'street': False, 'validated': False, 'lot': False, 'valet': Fals...
23898     {'HasTV': 'False', 'WiFi': "u'free'", 'RestaurantsAttire': "u'casual'", 'BusinessParking': "{'garage': True, 'street': False, 'validated': False, 'lot': False, 'valet': False}"...
45175     {'RestaurantsGoodForGroups': 'True', 'RestaurantsTakeOut': 'True', 'WiFi': "u'no'", 'RestaurantsDelivery': 'False', 'RestaurantsPriceRange2': '2', 'RestaurantsReservations': 'T...
129002    {'RestaurantsReservations': 'True', 'Res

In [112]:
# type(eval(df_yelp_business.categories[0]))

In [113]:
# def get_first_name_in_list(r, column_number, colname):
#     return eval(r.values.tolist()[column_number])[0]['{}'.format(colname)]

# df_movies_dataset['genre'] = df_movies_dataset.apply(lambda row: get_first_name_in_list(row, 1,'name'), axis=1) # get the primarygenre
# df_yelp_business['attributes'].head(5)
# print(type(df_yelp_business['attributes'][0]))
# df_yelp_biz_LV_Rest.count()
df_yelp_biz_LV_Rest.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
130,Q_dh08clYUPj13GmCRzIVA,kulinarya express filipino kitchen,"7960 S Rainbow Blvd, Ste 8000A",las vegas,NV,89139,36.043663,-115.241881,4.0,82,0,"{'BusinessParking': ""{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}"", 'RestaurantsDelivery': 'True', 'HasTV': 'False', 'OutdoorSeating': 'F...","filipino, restaurants, breakfast & brunch, food, street vendors","{'Monday': '0:0-0:0', 'Tuesday': '10:0-20:0', 'Wednesday': '10:0-20:0', 'Thursday': '10:0-20:0', 'Friday': '10:0-20:0', 'Saturday': '10:0-20:0', 'Sunday': '10:0-20:0'}"
157,Yr_w9lakJrKMyEG_hI6zbA,fat moe's pizza & wings,"6125 W Tropicana Ave, Ste F",las vegas,NV,89103,36.099361,-115.226636,4.0,141,1,"{'RestaurantsAttire': ""u'casual'"", 'RestaurantsDelivery': 'True', 'NoiseLevel': ""'quiet'"", 'HasTV': 'True', 'RestaurantsPriceRange2': '1', 'Ambience': ""{'romantic': False, 'int...","pizza, salad, burgers, restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0', 'Wednesday': '11:0-22:0', 'Thursday': '11:0-22:0', 'Friday': '11:0-22:0', 'Saturday': '11:0-22:0', 'Sunday': '11:0-22:0'}"
238,AN0bWhisCf6LN9eHZ7DQ3w,los olivos ristorante,3759 E Desert Inn Rd,las vegas,NV,89121,36.129178,-115.092483,5.0,222,1,"{'WiFi': ""u'free'"", 'RestaurantsPriceRange2': '2', 'ByAppointmentOnly': 'False', 'BikeParking': 'True', 'RestaurantsGoodForGroups': 'True', 'RestaurantsDelivery': 'False', 'Has...","restaurants, italian","{'Monday': '0:0-0:0', 'Tuesday': '16:0-21:0', 'Wednesday': '16:0-21:0', 'Thursday': '16:0-21:0', 'Friday': '16:0-21:0', 'Saturday': '16:0-21:0', 'Sunday': '16:0-21:0'}"
246,AtD6B83S4Mbmq0t7iDnUVA,veggie house,"5115 Spring Mountain Rd, Ste 203",las vegas,NV,89146,36.125569,-115.210911,4.5,1142,1,"{'RestaurantsPriceRange2': '2', 'BikeParking': 'True', 'OutdoorSeating': 'False', 'RestaurantsGoodForGroups': 'True', 'Caters': 'True', 'Alcohol': ""u'none'"", 'GoodForKids': 'Tr...","restaurants, specialty food, japanese, sushi bars, dim sum, vegetarian, food, ramen, chinese, vegan","{'Monday': '11:30-21:30', 'Tuesday': '11:30-21:30', 'Wednesday': '11:30-21:30', 'Thursday': '11:30-21:30', 'Friday': '11:30-21:30', 'Saturday': '11:30-21:30', 'Sunday': '11:30-..."
308,oUX2bYbqjqST-urKbOHG6w,loftti cafe,"7729 S Rainbow Blvd, Ste 9B",las vegas,NV,89139,36.047942,-115.244167,4.5,284,1,"{'OutdoorSeating': 'True', 'BusinessParking': ""{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}"", 'HasTV': 'False', 'BusinessAcceptsCreditCa...","sandwiches, shaved ice, coffee & tea, desserts, cafes, themed cafes, shaved snow, food, bubble tea, juice bars & smoothies, restaurants","{'Monday': '0:0-0:0', 'Tuesday': '8:0-3:0', 'Wednesday': '8:0-3:0', 'Thursday': '8:0-3:0', 'Friday': '8:0-3:0', 'Saturday': '8:0-3:0', 'Sunday': '11:30-20:0'}"


In [114]:
print("here comes the datatype for the attributes column")
type(df_yelp_biz_LV_Rest['attributes'])

here comes the datatype for the attributes column


pandas.core.series.Series

In [115]:
def remove_quotes_from_string(attr1):
    if type(attr1) == str and attr1.startswith("u'"):
        attr2 = attr1[2:-1]
    elif type(attr1) == str and attr1[0] == "'" and attr1[-1] == "'":
        attr2 = attr1[1:-1]
    else:
        attr2 = attr1
    return attr2

def make_list_of_true_valued_keys(column_value_string):
#  
    debug = False
    non_bool1 = []
    import ast
    
    column_value_string = cleanup_attributes(column_value_string)
    column_value = eval(column_value_string) # do this after you do all data cleanup
    
    
    # checkout AST abstract syntax tree... ast.literal_eval(dict_as_string)
#     column_value = ast.literal_eval(column_value_string)
    if debug:  print("column_value_string={}, column_value={}".format(column_value_string, column_value))
    temp_dict = {}
    # these are embedded dictionaries, that I am removing now, but may add back in later
    if 'BusinessParking' in column_value:
        del column_value['BusinessParking']
    if 'Ambience' in column_value:
        del column_value['Ambience']
    if 'Music' in column_value:
        del column_value['Music']
#         Business_parking = {}
#         Business_parking = column_value['BusinessParking']
#         column_value.pop('BusinessParking')
#         temp_dict = {**column_value, **Business_parking}

    return column_value
# print("df_yelp_biz_LV_Rest['attributes'] is a {} object".format(type(df_yelp_biz_LV_Rest['attributes'][0])))
# df_yelp_biz_LV_Rest['attributes_text'] = df_yelp_biz_LV_Rest['attributes']
df_yelp_biz_LV_Rest['attributes_text'] = df_yelp_biz_LV_Rest['attributes'].apply(lambda col: make_list_of_true_valued_keys(col)) # get the primarygenre
text1234 = df_yelp_biz_LV_Rest['attributes_text']
df2 = pd.DataFrame.from_dict(text1234, orient='columns') 

df_yelp_business_attributes = pd.DataFrame(pd.json_normalize(df2.loc[:,'attributes_text'])) # this is throwing this error:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [116]:
# df_yelp_business_attributes.sample(10)
# df_yelp_business_attributes.loc[1271,'RestaurantsAttire'] == "casual"
df_yelp_business_attributes['RestaurantsPriceRange2'].unique()

array(['2', '1', '4', '3', nan], dtype=object)

In [118]:
df_yelp_business_attributes['BikeParking'] = df_yelp_business_attributes['BikeParking'].fillna(0)
df_yelp_business_attributes['BusinessAcceptsCreditCards'] = df_yelp_business_attributes['BikeParking'].fillna(0)
df_yelp_business_attributes['GoodForKids'] = df_yelp_business_attributes['BikeParking'].fillna(0)
df_yelp_business_attributes['ByAppointmentOnly'] = df_yelp_business_attributes['BikeParking'].fillna(0)
df_yelp_business_attributes['RestaurantsAttire'] = df_yelp_business_attributes['RestaurantsAttire'].apply(lambda col: remove_quotes_from_string(col))
df_yelp_business_attributes['RestaurantsPriceRange2'] = df_yelp_business_attributes['RestaurantsPriceRange2'].apply(lambda col: remove_quotes_from_string(col))
df_yelp_business_attributes['RestaurantsAttire'] = df_yelp_business_attributes['RestaurantsAttire'].apply(lambda col: remove_quotes_from_string(col))



columns_to_drop = ['BestNights', 'GoodForMeal','DietaryRestrictions']
df_yelp_business_attributes.drop(columns_to_drop, axis=1, inplace=True)

for biz_attrib in ['BusinessAcceptsCreditCards', 
          'BikeParking', 
          'GoodForKids', 
          'ByAppointmentOnly', 
          'RestaurantsPriceRange2', 
          'DogsAllowed', 
          'WiFi', 
          'RestaurantsAttire', 
          'RestaurantsTakeOut', 
          'NoiseLevel', 
          'RestaurantsReservations', 
          'RestaurantsGoodForGroups', 
          'HasTV', 
          'Alcohol', 
          'RestaurantsDelivery', 
          'OutdoorSeating', 
          'Caters', 
          'WheelchairAccessible', 
          'AcceptsInsurance', 
          'RestaurantsTableService',
#           'GoodForMeal', 
          'HappyHour', 
          'BusinessAcceptsBitcoin', 
          'BYOB', 
          'Corkage', 
          'GoodForDancing', 
          'CoatCheck', 
#           'BestNights': 0, 
          'Smoking', 
#           'DietaryRestrictions', 
          'DriveThru', 
#           'HairSpecializesIn', 
          'BYOBCorkage', 
          'AgesAllowed', 
          'RestaurantsCounterService', 
          'Open24Hours']:
    df_yelp_business_attributes[biz_attrib] = df_yelp_business_attributes[biz_attrib].apply(lambda col: remove_quotes_from_string(col))
#     df_yelp_business_attributes[biz_attrib] = pd.to_numeric(df_yelp_business_attributes[biz_attrib])
    
    

# values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
# >>> df.fillna(value=values)
values2 = {
          'BusinessAcceptsCreditCards' : 0, 
          'BikeParking': 0, 
          'GoodForKids': 0, 
          'ByAppointmentOnly': 0, 
          'RestaurantsPriceRange2': 0, 
          'DogsAllowed': 0, 
          'WiFi': 0, 
          'RestaurantsAttire' : '0', 
          'RestaurantsTakeOut': 0, 
          'NoiseLevel' : 'unknown', 
          'RestaurantsReservations': 0, 
          'RestaurantsGoodForGroups': 0, 
          'HasTV': 0, 
          'Alcohol' : '0', 
          'RestaurantsDelivery': 0, 
          'OutdoorSeating': 0, 
          'Caters': 0, 
          'WheelchairAccessible': 0, 
          'AcceptsInsurance': 0, 
          'RestaurantsTableService': 0,
#           'GoodForMeal', 
          'HappyHour': 0, 
          'BusinessAcceptsBitcoin': 0, 
          'BYOB': 0, 
          'Corkage': 0, 
          'GoodForDancing': 0, 
          'CoatCheck': 0, 
#           'BestNights': 0, 
          'Smoking': 0, 
#           'DietaryRestrictions': 0, 
          'DriveThru': 0, 
          'HairSpecializesIn': 0, 
          'BYOBCorkage': 0, 
          'AgesAllowed': 0, 
          'RestaurantsCounterService': 0, 
          'Open24Hours': 0
         } 
df_yelp_business_attributes.fillna(value=values2, inplace=True)
# df_yelp_business_attributes.sample(50)
# array([nan, "u'quiet'", "u'average'", "u'loud'", "u'very_loud'",
#        "'average'", "'quiet'", "'loud'", "'very_loud'", 'None'],
#       dtype=object)
# array([nan, "'none'", "u'none'", "u'full_bar'", "u'beer_and_wine'",
#        "'full_bar'", "'beer_and_wine'", 'None'], dtype=object)

# df['new column name'] = df['column name'].apply(lambda x: 'value if condition is met' if x condition else 'value if condition is not met')
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].str.contains('none')] = '0'
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].str.contains('none')] = 1
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].str.contains('none')] = 2
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].str.contains('none')] = 3

# df_yelp_business_attributes.loc[(df_yelp_business_attributes['RestaurantsAttire'] == 'none') | (df_yelp_business_attributes['RestaurantsAttire'] == 'None')] = 0
# df_yelp_business_attributes.loc[(df_yelp_business_attributes['RestaurantsAttire'] == 'casual') | (df_yelp_business_attributes['RestaurantsAttire'] == u"casual")] = 1
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'] == 'dressy'] = 2
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'] == 'formal'] = 3
# S.replace('(-d)','',regex=True, inplace = True)
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].replace('casual','1', regex=False, inplace=True)]
# value = re.sub(r"[^0-9]+", "", value)
# df_yelp_business_attributes['RestaurantsAttire'] = df_yelp_business_attributes['RestaurantsAttire'].str.replace(u'casual','').astype('int')
# df['P'] = df['P'].str.replace(r'\D+', '').astype('int')
# df_yelp_business_attributes[df_yelp_business_attributes['RestaurantsAttire'].str.contains('*casual*') == True] = 1
# df[df['Country (region)'].str.contains('^I.*')==True]

# S.replace('(-d)','',regex=True, inplace = True)

KeyError: "['BestNights' 'GoodForMeal' 'DietaryRestrictions'] not found in axis"

In [130]:
df_yelp_business_attributes['RestaurantsAttire'].replace('None','0', inplace=True)
df_yelp_business_attributes['RestaurantsAttire'].replace('casual','1', inplace=True)
df_yelp_business_attributes['RestaurantsAttire'].replace('dressy','2', inplace=True)
df_yelp_business_attributes['RestaurantsAttire'].replace('formal','3', inplace=True)
df_yelp_business_attributes['RestaurantsDelivery'].replace('None','0', inplace=True)
df_yelp_business_attributes['OutdoorSeating'].replace('None','0', inplace=True)
df_yelp_business_attributes['WiFi'].replace('no','0', inplace=True)
df_yelp_business_attributes['WiFi'].replace('free','1', inplace=True)
df_yelp_business_attributes['WiFi'].replace('paid','2', inplace=True)
df_yelp_business_attributes['Alcohol'].replace('None','0', inplace=True)
df_yelp_business_attributes['Alcohol'].replace('none','0', inplace=True)
df_yelp_business_attributes['Alcohol'].replace('beer_and_wine','1', inplace=True)
df_yelp_business_attributes['Alcohol'].replace('full_bar','2', inplace=True)
df_yelp_business_attributes['NoiseLevel'].replace('None','0', inplace=True)
df_yelp_business_attributes['NoiseLevel'].replace('unknown','0', inplace=True)
df_yelp_business_attributes['NoiseLevel'].replace('quiet','1', inplace=True)
df_yelp_business_attributes['NoiseLevel'].replace('average','2', inplace=True)
df_yelp_business_attributes['NoiseLevel'].replace('loud','3', inplace=True)
df_yelp_business_attributes['NoiseLevel'].replace('very_loud','3', inplace=True)
df_yelp_business_attributes['Smoking'].replace('None','0', inplace=True)
df_yelp_business_attributes['Smoking'].replace('no','0', inplace=True)
df_yelp_business_attributes['Smoking'].replace('outdoor','1', inplace=True)
df_yelp_business_attributes['Smoking'].replace('yes','2', inplace=True)
df_yelp_business_attributes['BYOBCorkage'].replace('None','0', inplace=True)
df_yelp_business_attributes['BYOBCorkage'].replace('no','0', inplace=True)
df_yelp_business_attributes['BYOBCorkage'].replace('yes_free','1', inplace=True)
df_yelp_business_attributes['BYOBCorkage'].replace('yes_corkage','2', inplace=True)
df_yelp_business_attributes['AgesAllowed'].replace('allages','0', inplace=True)
df_yelp_business_attributes['AgesAllowed'].replace('21plus','1', inplace=True)

df_yelp_business_attributes.replace({'RestaurantsDelivery' : 'None', 'OutdoorSeating': 'None', 'Caters': 'None' ,'WheelchairAccessible': 'None',
                                     'RestaurantsReservations': 'None', 'RestaurantsTakeOut': 'None', 'BusinessAcceptsCreditCards': 'None',
                                     'GoodForKids': 'None', 'BikeParking': 'None', 'ByAppointmentOnly': 'None'}, '0')


# df["a"] = pd.to_numeric(df["a"])
columns_to_numeric = ['RestaurantsAttire','RestaurantsPriceRange2','RestaurantsDelivery','HasTV','Caters','WheelchairAccessible',
                      'RestaurantsReservations','RestaurantsTakeOut','WiFi','BusinessAcceptsCreditCards','Alcohol','GoodForKids',
                      'BikeParking', 'RestaurantsTableService', 'HappyHour', 'NoiseLevel','DriveThru', 'ByAppointmentOnly',
                      'DogsAllowed', 'BusinessAcceptsBitcoin', 'GoodForDancing', 'Corkage', 'BYOB', 'CoatCheck', 'Smoking',
                      'BYOBCorkage', 'RestaurantsCounterService', 'AgesAllowed', 'AcceptsInsurance', 'Open24Hours', 'OutdoorSeating', 'RestaurantsGoodForGroups']
for col in columns_to_numeric:
    df_yelp_business_attributes[col] = df_yelp_business_attributes[col].astype(int)

In [131]:
df_yelp_business_attributes.sample(20)

Unnamed: 0,RestaurantsDelivery,HasTV,OutdoorSeating,Caters,RestaurantsAttire,WheelchairAccessible,RestaurantsPriceRange2,RestaurantsGoodForGroups,RestaurantsReservations,RestaurantsTakeOut,WiFi,BusinessAcceptsCreditCards,Alcohol,GoodForKids,BikeParking,RestaurantsTableService,HappyHour,NoiseLevel,DriveThru,ByAppointmentOnly,DogsAllowed,BusinessAcceptsBitcoin,GoodForDancing,Corkage,BYOB,CoatCheck,Smoking,BYOBCorkage,RestaurantsCounterService,AgesAllowed,AcceptsInsurance,Open24Hours
3447,1,1,1,0,0,0,2,1,0,1,1,1,2,1,1,0,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3562,0,0,0,0,0,0,3,1,1,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2856,0,1,1,0,0,0,1,1,0,1,0,1,0,1,1,0,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2898,1,1,1,1,0,1,2,1,1,1,1,1,2,1,1,1,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1425,0,0,0,0,0,0,2,1,1,0,1,0,2,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3232,0,1,1,0,0,0,1,1,0,1,1,0,2,0,0,0,1,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1254,0,1,1,0,0,0,1,1,0,1,0,1,2,1,1,1,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3866,0,1,0,0,0,0,2,1,1,1,0,0,2,0,0,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,1,1,0,0,0,4,1,1,1,0,0,2,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2572,1,0,0,1,0,1,2,1,0,1,0,1,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
# df_yelp_business['attributes'].tail(4)
# df_yelp_business_attributes['RestaurantsAttire'].sample(100)
# df_yelp_business_attributes['WiFi'].unique()
# df_yelp_business_attributes.sample(20)
for attr1 in df_yelp_business_attributes.columns:
    print("Here are the distinct values for feature {}:".format(attr1))
    print(df_yelp_business_attributes[attr1].unique())
# text123.head(20)
# type(text123[0])
# df_yelp_business_attributes.columns
# df_yelp_business_attributes.RestaurantsAttire.unique()
# df_yelp_business_attributes['RestaurantsAttire'].dtype


Here are the distinct values for feature RestaurantsDelivery:
[1. 0.]
Here are the distinct values for feature HasTV:
[0. 1.]
Here are the distinct values for feature OutdoorSeating:
[0. 1.]
Here are the distinct values for feature Caters:
[1. 0.]
Here are the distinct values for feature RestaurantsAttire:
[1 0 2 3]
Here are the distinct values for feature WheelchairAccessible:
[1. 0.]
Here are the distinct values for feature RestaurantsPriceRange2:
['2' '1' '4' '3' 0]
Here are the distinct values for feature RestaurantsGoodForGroups:
[1. 0.]
Here are the distinct values for feature RestaurantsReservations:
[0. 1.]
Here are the distinct values for feature RestaurantsTakeOut:
[1. 0.]
Here are the distinct values for feature WiFi:
['no' 'free' 0 'paid']
Here are the distinct values for feature BusinessAcceptsCreditCards:
[1. 0.]
Here are the distinct values for feature Alcohol:
['none' '0' 'full_bar' 'beer_and_wine' 0]
Here are the distinct values for feature GoodForKids:
[1. 0.]
Here ar

In [39]:
df_yelp_biz_LV_Rest['attributes_text'].tail(40)

207192    {'WiFi': ''free'', 'RestaurantsPriceRange2': '2', 'OutdoorSeating': 0, 'RestaurantsGoodForGroups': 1, 'HasTV': 0, 'Alcohol': ''none'', 'NoiseLevel': 'u'average'', 'Caters': 0, ...
207244    {'RestaurantsAttire': ''casual'', 'BikeParking': 1, 'GoodForMeal': '{'dessert': False, 'latenight': False, 'lunch': True, 'dinner': False, 'brunch': False, 'breakfast': False}'...
207292    {'OutdoorSeating': 0, 'GoodForKids': 1, 'WiFi': 'u'no'', 'HasTV': 1, 'RestaurantsReservations': 0, 'Caters': 1, 'BikeParking': 1, 'BusinessAcceptsCreditCards': 1, 'NoiseLevel':...
207310    {'Caters': 1, 'BusinessAcceptsCreditCards': 1, 'NoiseLevel': 'u'average'', 'RestaurantsDelivery': 0, 'RestaurantsAttire': 'u'casual'', 'RestaurantsReservations': 1, 'Restaurant...
207345    {'RestaurantsGoodForGroups': 1, 'GoodForKids': 1, 'WiFi': 'u'free'', 'BikeParking': 1, 'Alcohol': 'u'none'', 'BusinessAcceptsCreditCards': 1, 'RestaurantsTakeOut': 1, 'OutdoorS...
207457    {'OutdoorSeating': 1, 'RestaurantsPriceR

In [40]:
df_yelp_business['categories'].sample(20)

168526                                                                                          seafood markets, food, specialty food
57896                                                                  desserts, shaved ice, food, juice bars & smoothies, acai bowls
152907                         limos, airport shuttles, party bus rentals, event planning & services, transportation, hotels & travel
102668                                                                                  dentists, health & medical, general dentistry
69054                                                                            desserts, coffeeshops, coffee & tea, nightlife, food
155980                                                                                     sports medicine, health & medical, doctors
28135                                                                                                              automotive, towing
29458                                                         

In [41]:
import datetime
print(datetime.datetime.now())

2020-07-05 19:21:33.093296
