### Global variables that I use with different methods

In [51]:
import datetime
print(datetime.datetime.now())

2020-06-30 20:45:24.448235


In [52]:
# Global Variables that we will use throughout the notebook

TEST_SIZE              = 0.20 # train_size = 1.00 - test_size
TRAIN_SIZE             = 1.00 - TEST_SIZE
RANDOM_STATE           = 20200427
RESTORE_PICKLE         = True

In [53]:
import numpy as np
import pandas as pd
import scipy
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

from matplotlib.colors import ListedColormap

from scipy.spatial.distance import cdist
import seaborn as sn
from sklearn import cluster, metrics
from sklearn import datasets, ensemble, metrics, linear_model
from sklearn.cluster import AffinityPropagation, KMeans, MeanShift, estimate_bandwidth, SpectralClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, FeatureHasher
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, pairwise_distances, mean_squared_error
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV,cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize, LabelEncoder
from sklearn.utils import shuffle

import json
import spacy
import statsmodels.api as sm
import unicodedata

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
# conda install -c anaconda nltk
# nltk.download('gutenberg')
# nltk.download('punkt')
# nltk.download("wordnet")

from collections import Counter

# import umap

from ast import literal_eval
import chardet, codecs, datetime, os, platform, random, re, string, sys, time, unicodedata
from IPython.display import Markdown, display
from zipfile import ZipFile

In [54]:
#Global settings
pd.set_option('display.max_rows', 1000)
pd.set_option("max_rows", 1000)
pd.options.display.width=400
pd.options.display.max_colwidth=180
pd.set_option('display.max_columns', 200)
pd.set_option("max_columns", 200)

In [55]:
# We will uses this with TF-IDF

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [56]:
def run_tfidf_vectorizer(df, parameters={}):
    
    vectorizer = TfidfVectorizer(**parameters)
    vectors = vectorizer.fit_transform(df.values.astype('U'))
    feature_names = vectorizer.get_feature_names()
    tfidf_df_dense = vectors.toarray()
    df = pd.DataFrame(tfidf_df_dense, columns=feature_names)

    return df

In [57]:
# This function allows us to read a csv from a Zip file, specifying the columns that we want to use
# It returns a data frame

def file_stuff(path, filename, filetype, zipfilename=None, dtypes=None, usecols=None):
    fullfilename = "{}".format(path+'/'+filename)
    
    print("fullfilename = {}".format(fullfilename))
        
    if zipfilename:
        zip_file = ZipFile(zipfilename)
        
    if filetype == 'csv':
        if zipfilename:
            df = pd.read_csv(zip_file.open(fullfilename), dtype=dtypes, usecols=usecols)
        else:
            df = pd.read_csv(fullfilename, dtype=dtypes, usecols=usecols)
    
    if filetype == 'json':
        if zipfilename:
            df = pd.read_json(zip_file.open(fullfilename))
        else:
            
            df = pd.read_json(fullfilename)
       
    print("There are {} rows in this file.".format(df.shape[0]))
    
    return df

In [58]:
def pickle_it(mode, df, file_name):
    
    if mode == 'to_pickle':
        print_timestamp('Now pickling file {}.'.format(file_name))
        df.to_pickle(file_name)  # where to save it, usually as a .pkl
        print_timestamp('File pickled successfully {}.'.format(file_name))
    
    else: 
        # Then you can load it back using:
        print_timestamp('Now restoring pickled file {}.'.format(file_name))
        df = pd.read_pickle(file_name)
        print_timestamp('Restored pickled file {}.'.format(file_name))
        return df

In [59]:
def print_timestamp(displaytext):    
    datetime_now = str(datetime.datetime.now())
    printFormatted("{:19.19}: In: {} {} ".format(datetime_now, sys._getframe(1).f_code.co_name, displaytext))

In [60]:
def printFormatted(string):
    newline = '\n'
    display(Markdown(string))
    write_to_logfile(string+newline)

In [61]:
def write_to_logfile(message, mdformat=''):
    bufsize = 0
    with open('TestResults.md', 'a+') as the_file:
        the_file.write('{} {}'.format(mdformat, message))

In [62]:
def read_json_to_pandas(filename):
    with open(filename) as json_file:      
        data = json_file.readlines()
        # this line below may take at least 8-10 minutes of processing for 4-5 million rows. It converts all strings in list to actual json objects. 
        data = list(map(json.loads, data)) 

    df = pd.DataFrame(data)
    
    print("There are {} rows in this file.".format(df.shape[0]))
    #json_lines... 

    return df

In [63]:
path = '/Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive'
biz_file = 'yelp_academic_dataset_business'
user_file = 'yelp_academic_dataset_user'
review_file = 'yelp_academic_dataset_review'

# It took 10m to load this entire dataset via JSON files.
if RESTORE_PICKLE:
    print("here we go")
    df_yelp_business = pickle_it('from_pickle', None, "{}/{}.{}".format(path, biz_file,'pkl'))
    df_yelp_user = pickle_it('from_pickle', None, "{}/{}.{}".format(path, user_file,'pkl'))
    df_yelp_review = pickle_it('from_pickle', None, "{}/{}.{}".format(path, review_file,'pkl'))
    
else:
    print_timestamp('Starting json_to_pandas on yelp_business')
    df_yelp_business = read_json_to_pandas("{}/{}.{}".format(path, biz_file,'json'))
    print_timestamp('Starting json_to_pandas on yelp_user')
    df_yelp_user = read_json_to_pandas("{}/{}.{}".format(path, user_file,'json'))
    print_timestamp('Starting json_to_pandas on yelp_review')
    df_yelp_review = read_json_to_pandas("{}/{}.{}".format(path, review_file,'json'))
    print_timestamp('Finished with json_to_pandas on yelp_review')

    pickle_it('to_pickle', df_yelp_business, "{}/{}.{}".format(path, biz_file,'pkl'))
    pickle_it('to_pickle', df_yelp_user, "{}/{}.{}".format(path, user_file,'pkl'))
    pickle_it('to_pickle', df_yelp_review, "{}/{}.{}".format(path, review_file,'pkl'))

print("that's all for now")

# 10m to load from JSON
# 46ss to load from Pickle

here we go


2020-06-30 20:45:24: In: pickle_it Now restoring pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_business.pkl. 

2020-06-30 20:45:26: In: pickle_it Restored pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_business.pkl. 

2020-06-30 20:45:26: In: pickle_it Now restoring pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_user.pkl. 

2020-06-30 20:45:33: In: pickle_it Restored pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_user.pkl. 

2020-06-30 20:45:47: In: pickle_it Now restoring pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_review.pkl. 

2020-06-30 20:46:28: In: pickle_it Restored pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_review.pkl. 

that's all for now


In [64]:
# df_yelp_business.sample(5)
df_yelp_business.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True', 'GoodForKids': 'False', 'BusinessParking': '{'garage': False, 'street': False, 'validated': False, 'lot': True, 'v...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0', 'Wednesday': '10:0-18:0', 'Thursday': '11:0-20:0', 'Friday': '11:0-20:0', 'Saturday': '11:0-20:0', 'Sunday': '13:0-18:0'}"
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,1,"{'GoodForKids': 'True', 'ByAppointmentOnly': 'True'}","Health & Medical, Fitness & Instruction, Yoga, Active Life, Pilates",
2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,,"Pets, Pet Services, Pet Groomers",
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,89030,36.219728,-115.127725,2.5,3,0,"{'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'False', 'DogsAllowed': 'True', 'BikeParking': 'True', 'BusinessParking': '{'garage': False, 'street': False, 'valid...","Hardware Stores, Home Services, Building Supplies, Home & Garden, Shopping","{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', 'Wednesday': '7:0-16:0', 'Thursday': '7:0-16:0', 'Friday': '7:0-16:0'}"
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'True'}","Home Services, Plumbing, Electricians, Handyman, Contractors","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', 'Wednesday': '9:0-16:0', 'Thursday': '9:0-16:0', 'Friday': '9:0-16:0'}"


In [65]:
df_yelp_business[['name','city','state','latitude','longitude','attributes','categories']].head(4)

Unnamed: 0,name,city,state,latitude,longitude,attributes,categories
0,The Range At Lake Norman,Cornelius,NC,35.462724,-80.852612,"{'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True', 'GoodForKids': 'False', 'BusinessParking': '{'garage': False, 'street': False, 'validated': False, 'lot': True, 'v...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping"
1,"Carlos Santo, NMD",Scottsdale,AZ,33.569404,-111.890264,"{'GoodForKids': 'True', 'ByAppointmentOnly': 'True'}","Health & Medical, Fitness & Instruction, Yoga, Active Life, Pilates"
2,Felinus,Montreal,QC,45.479984,-73.58007,,"Pets, Pet Services, Pet Groomers"
3,Nevada House of Hose,North Las Vegas,NV,36.219728,-115.127725,"{'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'False', 'DogsAllowed': 'True', 'BikeParking': 'True', 'BusinessParking': '{'garage': False, 'street': False, 'valid...","Hardware Stores, Home Services, Building Supplies, Home & Garden, Shopping"


In [66]:
# print(df_yelp_business.shape)
print(df_yelp_business.city.nunique())

1251


In [67]:
df_yelp_user[['name','review_count','yelping_since','useful','funny','cool', 'elite','friends','fans']].head(4)

Unnamed: 0,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans
0,Rafael,553,2007-07-06 03:27:11,628,225,227,,"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX...",14
1,Michelle,564,2008-04-28 01:29:25,790,316,400,200820092010201120122013,"ly7EnE8leJmyqyePVYFlug, pRlR63iDytsnnniPb3AOug, kc-rnN-ndnFTdHG4TfIgeQ, GYndf-h6dAwpGP0lDBz2Wg, FPo3SwQuAK53QVZm_eIyBg, 9fF_T3pQu3ay1oA7h_VYNA, G5T3bd6dUs5zkQ2VMZtRUw, tufuEc5f...",27
2,Martin,60,2008-08-28 23:40:05,151,125,103,2010,"Uwlk0txjQBPw_JhHsQnyeg, Ybxr1tSCkv3lYA0I1qmnPQ, DNmeLov3wXNxlxjN5feBoQ, x7n69vEsYFh9xnW3D5lPPQ, -AaBjWJYiQxXkCMDlXfPGw, COXnA2hnzFDai3ywx_iM8A, dUFoyswTt5ZQbleF3_4TCg, uj2AWSvs...",5
3,John,206,2008-09-20 00:08:14,233,160,84,2009,"iog3Nyg1i4jeumiTVG_BSA, M92xWY2Vr9w0xoH8bPplfQ, So46aZ3y7zRl2VmFK35vCQ, vrZmtsiaIZBr42KwAve5qA, SaNDaz5rBQs-5gyhOkO1MA, xTcuKbp7ocDcZDD_bcK9hw, PpzliPkE_fzsI6r15UMZFA, Ygr_c6So...",6


In [68]:
df_yelp_review.sample(4)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
2631246,zfni-Ze2pLR3pRslu1dBlA,5NBRoECEwypyr2N7uoaqoQ,bRntmjztJ4dFDA_LYiEECw,4.0,0,0,0,"Cool place to chill, their late hours are useful for people who are night creatures. I was a tad disappointed they stop taking orders for food after a certain time, most places...",2015-07-08 07:08:20
445500,jEtwaX9I6NeCGi8EzpcJVA,wUX3QMaYSGeYNnVrsMYuzw,usFTOvVzaY92IqYERg4UBw,5.0,0,0,0,"The staff here is very nice and welcoming. The atmosphere is restful and the services are exceptional. I have come in twice with a headache, after an hour, the headaches were s...",2016-01-16 18:10:49
501241,EBr26rQnsk9fI84ySCbxvw,1soJrazO_8OFGQ0e4XOCSQ,QCCVxVRt1amqv0AaEWSKkg,5.0,1,0,1,I have been in a few times and loved every meal. Most of the visits have been business related so it was nice to have our anniversary dinner at Esther's since I knew the food w...,2018-11-29 19:29:47
6351464,T6RV_lWxj4JorUZsVlVGpQ,we_ONmXR0wP5-Ejx9AbIAA,_HiVw--LpiP0zWTPvRCfFg,5.0,2,0,1,Fantastic experience! I contacted them on super short notice and I was in desperate need of a hair cut. Lyndsie was able to get me in that week! They are very responsive on any...,2017-03-02 20:19:32


In [69]:
df_yelp_review[['text','date']].head(4)

Unnamed: 0,text,date
0,"As someone who has worked with many museums, I was eager to visit this gallery on my most recent trip to Las Vegas. When I saw they would be showing infamous eggs of the House ...",2015-04-15 05:21:16
1,I am actually horrified this place is still in business. My 3 year old son needed a haircut this past summer and the lure of the $7 kids cut signs got me in the door. We had to...,2013-12-07 03:16:52
2,"I love Deagan's. I do. I really do. The atmosphere is cozy and festive. The shrimp tacos and house fries are my standbys. The fries are sometimes good and sometimes great, and ...",2015-12-05 03:18:11
3,"Dismal, lukewarm, defrosted-tasting ""TexMex"" glop;\n\nMumbly, unengaged waiter;\n\nClueless manager, who seeing us with barely nibbled entrees\non plates shoved forward for pic...",2011-05-27 05:30:52


In [70]:
df_yelp_business.describe()

Unnamed: 0,latitude,longitude,stars,review_count,is_open
count,209393.0,209393.0,209393.0,209393.0,209393.0
mean,38.579934,-97.390217,3.538055,36.937505,0.806632
std,4.940448,16.718535,1.023543,123.343597,0.39494
min,21.497258,-158.025525,1.0,3.0,0.0
25%,33.638658,-112.269476,3.0,4.0,1.0
50%,36.147408,-111.743531,3.5,9.0,1.0
75%,43.611693,-79.972679,4.5,27.0,1.0
max,51.299943,-72.80655,5.0,10129.0,1.0


In [71]:
df_yelp_user.describe()

Unnamed: 0,review_count,useful,funny,cool,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
count,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0
mean,22.1693,39.82709,17.03435,21.70796,1.458824,3.648087,1.980362,0.3004191,0.1873203,0.1614454,0.07019241,1.372517,2.849952,2.819167,2.819167,1.058364,1.114744
std,76.74226,513.3536,355.0568,445.7187,16.67521,1.172525,72.29082,12.75309,15.07502,11.62099,9.967903,58.82533,96.94462,86.51499,86.51499,31.54894,92.26612
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,3.0,0.0,0.0,0.0,3.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,15.0,13.0,3.0,3.0,0.0,4.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,14455.0,197130.0,165861.0,191359.0,11568.0,5.0,25452.0,13501.0,14179.0,13654.0,12669.0,58480.0,72370.0,40508.0,40508.0,15445.0,82622.0


In [72]:
df_yelp_review.describe()

Unnamed: 0,stars,useful,funny,cool
count,8021122.0,8021122.0,8021122.0,8021122.0
mean,3.703575,1.322882,0.4596423,0.574562
std,1.490486,3.550831,2.188143,2.476906
min,1.0,-1.0,0.0,-1.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0
75%,5.0,1.0,0.0,0.0
max,5.0,1122.0,976.0,502.0


In [73]:
# Pandas Yelp Queries
# df_yelp_user[(df_yelp_user.review_count > 100)].count()
df_yelp_review['text'].sample(20)

2680134    this place would get all five stars if it were based purely on our experience at the bar. we first got here and left due to our game (angels playoff) not being played. but came...
6342463    Oh my goodness. I'm so happy to have met Shane and Mitch. They awesome with and my suv. I was treated like family. After they fixed my wrecked truck they detailed my car inside...
3516174    Roma has amazing Italian food cooked by Italians.  \nItalians that speak Italian and know what true Italian food should taste like.  If you can speak a little Italian and order...
4374601    Truly love European Wax. The employees are always very friendly and very professional. whenever I make an appointment they are always on time. I never have to worry about someo...
3332703    I was next door waiting for a haircut at sports clips. Felt really tired after a long long long day. Popped in here just to see if they had coffee. They did and saw the plethor...
5083361    I had an emergency tear along the 

In [74]:
# cleanup functions

def cleanup_text(text):
    text = re.sub(r"\n","", text)                              # remove newlines
    text = re.sub('[^\w\s]','', text)                          # remove punctuation
    text = text.lower()                                        # lower case all letters
    text_list = text.split()                                   # split it for the list comprehension
    text_list = [x for x in text_list if len(x) > 1            # we only want words longer than 1 character
                 & x.isdigit() == False ]                      # no numerics
    text = " ".join(text_list)                                 # put it back into a string    
    return text

def cleanup_attributes(column_value):
    column_value = re.sub(r"'True'",'1', column_value)
    column_value = re.sub(r"'False'",'0', column_value)
    column_value = re.sub(r"''","'", column_value)
    column_value = re.sub(r"''","'", column_value)
    return column_value

In [75]:
# total_rows['ColumnID'] = total_rows['ColumnID'].astype(str)
# df_yelp_business[df_yelp_business['categories'].str.contains('Restaurant')].count()
# df_yelp_business[df_yelp_business['categories'].str.contains('Restaurant')].sample(20)
# df_yelp_business_restaurants = df_yelp_business[df_yelp_business['categories'].str.contains('Restaurant')]
# Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
#        'latitude', 'longitude', 'stars', 'review_count', 'is_open',
#        'attributes', 'categories', 'hours']

# Cleanup the features: categories, attributes, city, and name

# Cleanup df_yelp_business
df_yelp_business.dropna(subset=['categories', 'attributes'], inplace=True)
df_yelp_business['attributes'] = df_yelp_business['attributes'].astype(str) # df['COLUMN1'].str.decode("utf-8") .str.decode("utf-8") 
df_yelp_business['city'] = df_yelp_business['city'].str.lower() # move to lowercase - title tries to be too smart
df_yelp_business['name'] = df_yelp_business['name'].str.lower()
df_yelp_business['categories'] = df_yelp_business['categories'].str.lower()
# df_yelp_business['attributes'] = df_yelp_business['attributes'].str.lower()
df_yelp_business['categories'] = df_yelp_business['categories'].astype(str)
# df_yelp_business['attributes'] = df_yelp_business['attributes'].astype(str) # could this be what is messing up attributes???
# df_yelp_business['attributes'] = df_yelp_business['attributes'].apply(lambda col: cleanup_attributes(col)) # get the primarygenre
df_yelp_business_restaurants = df_yelp_business[df_yelp_business['categories'].str.contains('restaurant')][['business_id','name','city', 
                                                                                                            'postal_code', 'state',
                                                                                                            'latitude','longitude',
                                                                                                            'review_count', 'attributes',
                                                                                                             'categories']]
df_yelp_biz_LV_Rest = df_yelp_business[
                                       (df_yelp_business['categories'].str.contains('restaurant') == True) &
                                       (df_yelp_business['city'].str.contains('vegas') == True) &
                                       (df_yelp_business.review_count > 40)
                                      ]

# We should only use df_yelp_biz_LV_Rest from here on...
df_yelp_biz_LV_Rest.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
130,Q_dh08clYUPj13GmCRzIVA,kulinarya express filipino kitchen,"7960 S Rainbow Blvd, Ste 8000A",las vegas,NV,89139,36.043663,-115.241881,4.0,82,0,"{'BusinessParking': ""{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}"", 'RestaurantsDelivery': 'True', 'HasTV': 'False', 'OutdoorSeating': 'F...","filipino, restaurants, breakfast & brunch, food, street vendors","{'Monday': '0:0-0:0', 'Tuesday': '10:0-20:0', 'Wednesday': '10:0-20:0', 'Thursday': '10:0-20:0', 'Friday': '10:0-20:0', 'Saturday': '10:0-20:0', 'Sunday': '10:0-20:0'}"
157,Yr_w9lakJrKMyEG_hI6zbA,fat moe's pizza & wings,"6125 W Tropicana Ave, Ste F",las vegas,NV,89103,36.099361,-115.226636,4.0,141,1,"{'RestaurantsAttire': ""u'casual'"", 'RestaurantsDelivery': 'True', 'NoiseLevel': ""'quiet'"", 'HasTV': 'True', 'RestaurantsPriceRange2': '1', 'Ambience': ""{'romantic': False, 'int...","pizza, salad, burgers, restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0', 'Wednesday': '11:0-22:0', 'Thursday': '11:0-22:0', 'Friday': '11:0-22:0', 'Saturday': '11:0-22:0', 'Sunday': '11:0-22:0'}"
238,AN0bWhisCf6LN9eHZ7DQ3w,los olivos ristorante,3759 E Desert Inn Rd,las vegas,NV,89121,36.129178,-115.092483,5.0,222,1,"{'WiFi': ""u'free'"", 'RestaurantsPriceRange2': '2', 'ByAppointmentOnly': 'False', 'BikeParking': 'True', 'RestaurantsGoodForGroups': 'True', 'RestaurantsDelivery': 'False', 'Has...","restaurants, italian","{'Monday': '0:0-0:0', 'Tuesday': '16:0-21:0', 'Wednesday': '16:0-21:0', 'Thursday': '16:0-21:0', 'Friday': '16:0-21:0', 'Saturday': '16:0-21:0', 'Sunday': '16:0-21:0'}"
246,AtD6B83S4Mbmq0t7iDnUVA,veggie house,"5115 Spring Mountain Rd, Ste 203",las vegas,NV,89146,36.125569,-115.210911,4.5,1142,1,"{'RestaurantsPriceRange2': '2', 'BikeParking': 'True', 'OutdoorSeating': 'False', 'RestaurantsGoodForGroups': 'True', 'Caters': 'True', 'Alcohol': ""u'none'"", 'GoodForKids': 'Tr...","restaurants, specialty food, japanese, sushi bars, dim sum, vegetarian, food, ramen, chinese, vegan","{'Monday': '11:30-21:30', 'Tuesday': '11:30-21:30', 'Wednesday': '11:30-21:30', 'Thursday': '11:30-21:30', 'Friday': '11:30-21:30', 'Saturday': '11:30-21:30', 'Sunday': '11:30-..."
308,oUX2bYbqjqST-urKbOHG6w,loftti cafe,"7729 S Rainbow Blvd, Ste 9B",las vegas,NV,89139,36.047942,-115.244167,4.5,284,1,"{'OutdoorSeating': 'True', 'BusinessParking': ""{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}"", 'HasTV': 'False', 'BusinessAcceptsCreditCa...","sandwiches, shaved ice, coffee & tea, desserts, cafes, themed cafes, shaved snow, food, bubble tea, juice bars & smoothies, restaurants","{'Monday': '0:0-0:0', 'Tuesday': '8:0-3:0', 'Wednesday': '8:0-3:0', 'Thursday': '8:0-3:0', 'Friday': '8:0-3:0', 'Saturday': '8:0-3:0', 'Sunday': '11:30-20:0'}"


In [76]:
# df_yelp_biz_LV_Rest.head(20) # there are 4,284 restaurants in Las Vegas, with 40 or more reviews! 24-Jun-2020 
# df_yelp_reviews_LV_Rest = 
# https://stackoverflow.com/questions/34055584/python-pandas-string-contains-and-doesnt-contain
# df_yelp_biz2_LV[(df_yelp_biz2_LV.review_count > 50)].sample(20)
# df_yelp_biz2.sample(20)
# df_yelp_biz_LV_Rest.count()
# df_yelp_business.count()
# df_yelp_business_restaurants.count()
# df_yelp_biz2_socal.head(30)
# df_yelp_biz_LV_Rest['attributes'].head(20)
df_yelp_business['attributes'].sample(30)

38045     {'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}", 'BusinessAcceptsCreditCards': 'True', 'RestaurantsPriceRange2': '2', ...
14020     {'NoiseLevel': "u'average'", 'GoodForKids': 'False', 'RestaurantsPriceRange2': '1', 'HasTV': 'True', 'RestaurantsGoodForGroups': 'True', 'OutdoorSeating': 'False', 'BusinessAcc...
179495       {'RestaurantsPriceRange2': '3', 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}", 'BusinessAcceptsCreditCards': 'True'}
171538    {'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'False', 'RestaurantsPriceRange2': '2', 'BikeParking': 'True', 'DogsAllowed': 'True', 'BusinessParking': "{'garage':...
18405     {'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}", 'RestaurantsPriceRange2': '2', 'BikeParking': 'False', 'BusinessAccep...
142170    {'BusinessParking': "{'garage': False, '

In [77]:
# df = pd.merge(df,df2[['Key_Column','Target_Column']],on='Key_Column', how='left')
df_yelp_review_LV_Rest = pd.merge(df_yelp_review.sample(100000),df_yelp_biz_LV_Rest['business_id'],on='business_id') # this worked! count=1,484,887!
# Cleanup df_yelp_review
df_yelp_review_LV_Rest['text'] = df_yelp_review_LV_Rest['text'].apply(lambda col: cleanup_text(col)) # Cleanup the yelp_review_LV_Rest only

In [78]:
# df_yelp_review_LV_Rest.head(5)
# df_yelp_biz_LV_Rest.sample(20)
df_yelp_business.sample(4)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
180699,xIGtS7kgAV6JTwnRNgWU-w,$10 dress,1616 S Las Vegas Blvd,las vegas,NV,89104,36.151508,-115.152535,4.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True', 'WheelchairAccessible': 'True', 'ByAppointmentOnly': 'False', 'BusinessParking': ""{...","jewelry, women's clothing, fashion, shopping, accessories","{'Monday': '10:0-20:0', 'Tuesday': '10:0-20:0', 'Wednesday': '10:0-20:0', 'Thursday': '10:0-20:0', 'Friday': '10:0-20:0', 'Saturday': '10:0-20:0', 'Sunday': '11:0-18:0'}"
153414,t4NBRC-d7rnzCgM7D1o9BQ,the wellness room,"7345 E Shoeman Ln, Ste 2",scottsdale,AZ,85251,33.500536,-111.92237,5.0,3,1,"{'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True'}","reiki, professional services, active life, life coach, yoga, health & medical, fitness & instruction",
159326,FY1tEzYi1fWBI-BH91HxRQ,las vegas pool services,,las vegas,NV,89131,36.36564,-115.224485,5.0,3,1,"{'ByAppointmentOnly': 'True', 'BusinessAcceptsCreditCards': 'True'}","home services, pool cleaners","{'Monday': '7:0-17:30', 'Tuesday': '7:0-17:30', 'Wednesday': '7:0-17:30', 'Thursday': '7:0-17:30', 'Friday': '7:0-17:30', 'Saturday': '7:0-17:30'}"
168003,4VQuPEpXTjJtJhmfo_QEgw,holiday inn express & suites newmarket,100 Pony Drive,newmarket,ON,L3Y 7B6,44.068806,-79.427972,3.0,12,1,"{'WiFi': ""u'free'"", 'RestaurantsPriceRange2': '2'}","venues & event spaces, hotels & travel, event planning & services, hotels","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}"


In [79]:
parameters = {
              'lowercase': True,
              'max_df': 0.65, 
              'min_df': 10,  
              'max_features': 100,
              'norm': u'l2',
              'smooth_idf' :True,
              'stop_words': 'english',
              'tokenizer': LemmaTokenizer(),
              'use_idf': True

}
df_yelp_review_LV_Rest_tfidf = run_tfidf_vectorizer(df_yelp_review_LV_Rest['text'],parameters=parameters)

  'stop_words.' % sorted(inconsistent))


In [80]:
# df_yelp_review_LV_Rest.count()
# df_yelp_review.count()
df_yelp_business.count()
# df_yelp_review_LV_Rest[['text']].sample(20)
# df_yelp_review_LV_Rest_tfidf[['ha','wa','u']].sample(10)
# df_yelp_review_LV_Rest_tfidf[['ha','wa','u']].describe()

business_id     180347
name            180347
address         180347
city            180347
state           180347
postal_code     180347
latitude        180347
longitude       180347
stars           180347
review_count    180347
is_open         180347
attributes      180347
categories      180347
hours           146796
dtype: int64

In [81]:
# sorted_df.tail(20)
df_yelp_biz_LV_Rest['categories'].sample(10)
df_yelp_biz_LV_Rest['attributes'].sample(10)

132046    {'BusinessAcceptsCreditCards': 'True', 'RestaurantsReservations': 'False', 'RestaurantsTakeOut': 'True', 'GoodForKids': 'True', 'RestaurantsAttire': "'casual'", 'RestaurantsGoo...
34104     {'NoiseLevel': "u'quiet'", 'RestaurantsAttire': "u'casual'", 'Caters': 'True', 'OutdoorSeating': 'False', 'RestaurantsReservations': 'False', 'BikeParking': 'False', 'Alcohol':...
146374    {'NoiseLevel': "u'average'", 'RestaurantsGoodForGroups': 'True', 'OutdoorSeating': 'False', 'Caters': 'True', 'BusinessAcceptsCreditCards': 'True', 'HasTV': 'True', 'Restaurant...
29331     {'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}", 'Ambience': "{'touristy': False, 'hipster': False, 'romantic': False...
13967     {'RestaurantsDelivery': 'False', 'WiFi': "u'no'", 'Alcohol': "u'beer_and_wine'", 'OutdoorSeating': 'False', 'GoodForKids': 'True', 'RestaurantsAttire': "u'casual'", 'Restaurant...
103423    {'GoodForKids': 'True', 'NoiseLevel': "u

In [82]:
# type(eval(df_yelp_business.categories[0]))

In [83]:
# def get_first_name_in_list(r, column_number, colname):
#     return eval(r.values.tolist()[column_number])[0]['{}'.format(colname)]

# df_movies_dataset['genre'] = df_movies_dataset.apply(lambda row: get_first_name_in_list(row, 1,'name'), axis=1) # get the primarygenre
# df_yelp_business['attributes'].head(5)
# print(type(df_yelp_business['attributes'][0]))
# df_yelp_biz_LV_Rest.count()
df_yelp_biz_LV_Rest.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
130,Q_dh08clYUPj13GmCRzIVA,kulinarya express filipino kitchen,"7960 S Rainbow Blvd, Ste 8000A",las vegas,NV,89139,36.043663,-115.241881,4.0,82,0,"{'BusinessParking': ""{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}"", 'RestaurantsDelivery': 'True', 'HasTV': 'False', 'OutdoorSeating': 'F...","filipino, restaurants, breakfast & brunch, food, street vendors","{'Monday': '0:0-0:0', 'Tuesday': '10:0-20:0', 'Wednesday': '10:0-20:0', 'Thursday': '10:0-20:0', 'Friday': '10:0-20:0', 'Saturday': '10:0-20:0', 'Sunday': '10:0-20:0'}"
157,Yr_w9lakJrKMyEG_hI6zbA,fat moe's pizza & wings,"6125 W Tropicana Ave, Ste F",las vegas,NV,89103,36.099361,-115.226636,4.0,141,1,"{'RestaurantsAttire': ""u'casual'"", 'RestaurantsDelivery': 'True', 'NoiseLevel': ""'quiet'"", 'HasTV': 'True', 'RestaurantsPriceRange2': '1', 'Ambience': ""{'romantic': False, 'int...","pizza, salad, burgers, restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0', 'Wednesday': '11:0-22:0', 'Thursday': '11:0-22:0', 'Friday': '11:0-22:0', 'Saturday': '11:0-22:0', 'Sunday': '11:0-22:0'}"
238,AN0bWhisCf6LN9eHZ7DQ3w,los olivos ristorante,3759 E Desert Inn Rd,las vegas,NV,89121,36.129178,-115.092483,5.0,222,1,"{'WiFi': ""u'free'"", 'RestaurantsPriceRange2': '2', 'ByAppointmentOnly': 'False', 'BikeParking': 'True', 'RestaurantsGoodForGroups': 'True', 'RestaurantsDelivery': 'False', 'Has...","restaurants, italian","{'Monday': '0:0-0:0', 'Tuesday': '16:0-21:0', 'Wednesday': '16:0-21:0', 'Thursday': '16:0-21:0', 'Friday': '16:0-21:0', 'Saturday': '16:0-21:0', 'Sunday': '16:0-21:0'}"
246,AtD6B83S4Mbmq0t7iDnUVA,veggie house,"5115 Spring Mountain Rd, Ste 203",las vegas,NV,89146,36.125569,-115.210911,4.5,1142,1,"{'RestaurantsPriceRange2': '2', 'BikeParking': 'True', 'OutdoorSeating': 'False', 'RestaurantsGoodForGroups': 'True', 'Caters': 'True', 'Alcohol': ""u'none'"", 'GoodForKids': 'Tr...","restaurants, specialty food, japanese, sushi bars, dim sum, vegetarian, food, ramen, chinese, vegan","{'Monday': '11:30-21:30', 'Tuesday': '11:30-21:30', 'Wednesday': '11:30-21:30', 'Thursday': '11:30-21:30', 'Friday': '11:30-21:30', 'Saturday': '11:30-21:30', 'Sunday': '11:30-..."
308,oUX2bYbqjqST-urKbOHG6w,loftti cafe,"7729 S Rainbow Blvd, Ste 9B",las vegas,NV,89139,36.047942,-115.244167,4.5,284,1,"{'OutdoorSeating': 'True', 'BusinessParking': ""{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}"", 'HasTV': 'False', 'BusinessAcceptsCreditCa...","sandwiches, shaved ice, coffee & tea, desserts, cafes, themed cafes, shaved snow, food, bubble tea, juice bars & smoothies, restaurants","{'Monday': '0:0-0:0', 'Tuesday': '8:0-3:0', 'Wednesday': '8:0-3:0', 'Thursday': '8:0-3:0', 'Friday': '8:0-3:0', 'Saturday': '8:0-3:0', 'Sunday': '11:30-20:0'}"


In [84]:
print("here comes the datatype for the attributes column")
type(df_yelp_biz_LV_Rest['attributes'])

here comes the datatype for the attributes column


pandas.core.series.Series

In [128]:
def remove_quotes_from_string(attr1):
    if type(attr1) == str and attr1.startswith("u'"):
        attr2 = attr1[2:-1]
    elif type(attr1) == str and attr1[0] == "'" and attr1[-1] == "'":
        attr2 = attr1[1:-1]
    else:
        attr2 = attr1
    return attr2

def make_list_of_true_valued_keys(column_value_string):
#  
    debug = False
    non_bool1 = []
    import ast
    
    column_value_string = cleanup_attributes(column_value_string)
    column_value = eval(column_value_string) # do this after you do all data cleanup
    
    
    # checkout AST abstract syntax tree... ast.literal_eval(dict_as_string)
#     column_value = ast.literal_eval(column_value_string)
    if debug:  print("column_value_string={}, column_value={}".format(column_value_string, column_value))
    temp_dict = {}
    # these are embedded dictionaries, that I am removing now, but may add back in later
    if 'BusinessParking' in column_value:
        del column_value['BusinessParking']
    if 'Ambience' in column_value:
        del column_value['Ambience']
    if 'Music' in column_value:
        del column_value['Music']
#         Business_parking = {}
#         Business_parking = column_value['BusinessParking']
#         column_value.pop('BusinessParking')
#         temp_dict = {**column_value, **Business_parking}

    return column_value
print("a-4")
# print("df_yelp_biz_LV_Rest['attributes'] is a {} object".format(type(df_yelp_biz_LV_Rest['attributes'][0])))
# df_yelp_biz_LV_Rest['attributes_text'] = df_yelp_biz_LV_Rest['attributes']
df_yelp_biz_LV_Rest['attributes_text'] = df_yelp_biz_LV_Rest['attributes'].apply(lambda col: make_list_of_true_valued_keys(col)) # get the primarygenre
print("b-4")
text1234 = df_yelp_biz_LV_Rest['attributes_text']
df2 = pd.DataFrame.from_dict(text1234, orient='columns') 
print("c-4")
# from: https://stackoverflow.com/questions/21104592/json-to-pandas-dataframe
# path1 = '42.974049,-81.205203|42.974298,-81.195755'
# request=Request('http://maps.googleapis.com/maps/api/elevation/json?locations='+path1+'&sensor=false')
# response = urlopen(request)
# elevations = response.read()
# data = json.loads(elevations)
# df = pd.json_normalize(data['results'])

# df_yelp_business_attributes = pd.json_normalize(df_yelp_biz_LV_Rest['attributes_text'])
df_yelp_business_attributes = pd.DataFrame(pd.json_normalize(df2.loc[:,'attributes_text'])) # this is throwing this error:
"""
df_yelp_business_attributes = pd.json_normalize(df2['attributes_text']) # this is throwing this error:

/Users/lou/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""

a-4
b-4
c-4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




In [98]:
# df_yelp_business_attributes.sample(10)
df_yelp_business_attributes.loc[1271,'RestaurantsAttire'] == "casual"

False

In [134]:
df_yelp_business_attributes['BikeParking'] = df_yelp_business_attributes['BikeParking'].fillna(0)
df_yelp_business_attributes['BusinessAcceptsCreditCards'] = df_yelp_business_attributes['BikeParking'].fillna(0)
df_yelp_business_attributes['GoodForKids'] = df_yelp_business_attributes['BikeParking'].fillna(0)
df_yelp_business_attributes['ByAppointmentOnly'] = df_yelp_business_attributes['BikeParking'].fillna(0)
df_yelp_business_attributes['RestaurantsAttire'] = df_yelp_business_attributes['RestaurantsAttire'].apply(lambda col: remove_quotes_from_string(col))
df_yelp_business_attributes['RestaurantsPriceRange2'] = df_yelp_business_attributes['RestaurantsAttire'].apply(lambda col: remove_quotes_from_string(col))

for biz_attrib in ['BusinessAcceptsCreditCards', 
        'BikeParking', 
          'GoodForKids', 
          'ByAppointmentOnly', 
          'RestaurantsPriceRange2', 
          'DogsAllowed', 
          'WiFi', 
          'RestaurantsAttire', 
          'RestaurantsTakeOut', 
          'NoiseLevel', 
          'RestaurantsReservations', 
          'RestaurantsGoodForGroups', 
          'HasTV', 
          'Alcohol', 
          'RestaurantsDelivery', 
          'OutdoorSeating', 
          'Caters', 
          'WheelchairAccessible', 
          'AcceptsInsurance', 
          'RestaurantsTableService',
#           'GoodForMeal', 
          'HappyHour', 
          'BusinessAcceptsBitcoin', 
          'BYOB', 
          'Corkage', 
          'GoodForDancing', 
          'CoatCheck', 
#           'BestNights': 0, 
          'Smoking', 
          'DietaryRestrictions', 
          'DriveThru', 
#           'HairSpecializesIn', 
          'BYOBCorkage', 
          'AgesAllowed', 
          'RestaurantsCounterService', 
          'Open24Hours']:
    df_yelp_business_attributes[biz_attrib] = df_yelp_business_attributes[biz_attrib].apply(lambda col: remove_quotes_from_string(col))
#     df_yelp_business_attributes[biz_attrib] = pd.to_numeric(df_yelp_business_attributes[biz_attrib])
    
    

# values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
# >>> df.fillna(value=values)
values2 = {'BusinessAcceptsCreditCards' : 0, 
          'BikeParking': 0, 
          'GoodForKids': 0, 
          'ByAppointmentOnly': 0, 
          'RestaurantsPriceRange2': 0, 
          'DogsAllowed': 0, 
          'WiFi': 0, 
          'RestaurantsAttire' : '0', 
          'RestaurantsTakeOut': 0, 
          'NoiseLevel' : 'unknown', 
          'RestaurantsReservations': 0, 
          'RestaurantsGoodForGroups': 0, 
          'HasTV': 0, 
          'Alcohol' : 'Nun', 
          'RestaurantsDelivery': 0, 
          'OutdoorSeating': 0, 
          'Caters': 0, 
          'WheelchairAccessible': 0, 
          'AcceptsInsurance': 0, 
          'RestaurantsTableService': 0,
#           'GoodForMeal', 
          'HappyHour': 0, 
          'BusinessAcceptsBitcoin': 0, 
          'BYOB': 0, 
          'Corkage': 0, 
          'GoodForDancing': 0, 
          'CoatCheck': 0, 
#           'BestNights': 0, 
          'Smoking': 0, 
          'DietaryRestrictions': 0, 
          'DriveThru': 0, 
          'HairSpecializesIn': 0, 
          'BYOBCorkage': 0, 
          'AgesAllowed': 0, 
          'RestaurantsCounterService': 0, 
          'Open24Hours': 0
         }
df_yelp_business_attributes.fillna(value=values2, inplace=True)
# df_yelp_business_attributes.sample(50)
# array([nan, "u'quiet'", "u'average'", "u'loud'", "u'very_loud'",
#        "'average'", "'quiet'", "'loud'", "'very_loud'", 'None'],
#       dtype=object)
# array([nan, "'none'", "u'none'", "u'full_bar'", "u'beer_and_wine'",
#        "'full_bar'", "'beer_and_wine'", 'None'], dtype=object)

# df['new column name'] = df['column name'].apply(lambda x: 'value if condition is met' if x condition else 'value if condition is not met')
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].str.contains('none')] = '0'
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].str.contains('none')] = 1
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].str.contains('none')] = 2
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].str.contains('none')] = 3

# df_yelp_business_attributes.loc[(df_yelp_business_attributes['RestaurantsAttire'] == 'none') | (df_yelp_business_attributes['RestaurantsAttire'] == 'None')] = 0
# df_yelp_business_attributes.loc[(df_yelp_business_attributes['RestaurantsAttire'] == 'casual') | (df_yelp_business_attributes['RestaurantsAttire'] == u"casual")] = 1
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'] == 'dressy'] = 2
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'] == 'formal'] = 3
# S.replace('(-d)','',regex=True, inplace = True)
# df_yelp_business_attributes.loc[df_yelp_business_attributes['RestaurantsAttire'].replace('casual','1', regex=False, inplace=True)]
# value = re.sub(r"[^0-9]+", "", value)
# df_yelp_business_attributes['RestaurantsAttire'] = df_yelp_business_attributes['RestaurantsAttire'].str.replace(u'casual','').astype('int')
# df['P'] = df['P'].str.replace(r'\D+', '').astype('int')
# df_yelp_business_attributes[df_yelp_business_attributes['RestaurantsAttire'].str.contains('*casual*') == True] = 1
# df[df['Country (region)'].str.contains('^I.*')==True]

# S.replace('(-d)','',regex=True, inplace = True)
df_yelp_business_attributes['RestaurantsAttire'].replace('None','0', inplace=True)
df_yelp_business_attributes['RestaurantsAttire'].replace('casual','1', inplace=True)
df_yelp_business_attributes['RestaurantsAttire'].replace('dressy','2', inplace=True)
df_yelp_business_attributes['RestaurantsAttire'].replace('formal','3', inplace=True)
# df["a"] = pd.to_numeric(df["a"])
df_yelp_business_attributes['RestaurantsAttire'] = pd.to_numeric(df_yelp_business_attributes['RestaurantsAttire'])

In [137]:
# df_yelp_business['attributes'].tail(4)
# df_yelp_business_attributes['RestaurantsAttire'].sample(100)
df_yelp_business_attributes['WiFi'].unique()
# text123.head(20)
# type(text123[0])
# df_yelp_business_attributes.columns
# df_yelp_business_attributes.RestaurantsAttire.unique()
# df_yelp_business_attributes['RestaurantsAttire'].dtype

array(['no', 'free', 0, 'paid', 'None'], dtype=object)

In [135]:
df_yelp_biz_LV_Rest['attributes_text'].tail(40)

207192    {'WiFi': ''free'', 'RestaurantsPriceRange2': '2', 'OutdoorSeating': 0, 'RestaurantsGoodForGroups': 1, 'HasTV': 0, 'Alcohol': ''none'', 'NoiseLevel': 'u'average'', 'Caters': 0, ...
207244    {'RestaurantsAttire': ''casual'', 'BikeParking': 1, 'GoodForMeal': '{'dessert': False, 'latenight': False, 'lunch': True, 'dinner': False, 'brunch': False, 'breakfast': False}'...
207292    {'OutdoorSeating': 0, 'GoodForKids': 1, 'WiFi': 'u'no'', 'HasTV': 1, 'RestaurantsReservations': 0, 'Caters': 1, 'BikeParking': 1, 'BusinessAcceptsCreditCards': 1, 'NoiseLevel':...
207310    {'Caters': 1, 'BusinessAcceptsCreditCards': 1, 'NoiseLevel': 'u'average'', 'RestaurantsDelivery': 0, 'RestaurantsAttire': 'u'casual'', 'RestaurantsReservations': 1, 'Restaurant...
207345    {'RestaurantsGoodForGroups': 1, 'GoodForKids': 1, 'WiFi': 'u'free'', 'BikeParking': 1, 'Alcohol': 'u'none'', 'BusinessAcceptsCreditCards': 1, 'RestaurantsTakeOut': 1, 'OutdoorS...
207457    {'OutdoorSeating': 1, 'RestaurantsPriceR

In [90]:
df_yelp_business['categories'].sample(20)

84428                                                                                  shopping, cosmetics & beauty supply, beauty & spas
137025    music venues, dive bars, arts & entertainment, pubs, restaurants, bars, american (new), canadian (new), local flavor, nightlife
33867                                                                                                 american (traditional), restaurants
20753                                                                                            gyms, active life, fitness & instruction
186527                                                                                  grocery, convenience stores, food, specialty food
70048                                                                     international, caribbean, nightlife, lounges, bars, restaurants
129744                                                          health & medical, health insurance offices, insurance, financial services
21431                             

In [91]:
import datetime
print(datetime.datetime.now())

2020-06-30 20:50:20.404171
