### Global variables that I use with different methods

In [33]:
# Print the current wall-clock time so the output records when this run started.
import datetime
print(datetime.datetime.now())

2020-06-28 14:31:25.565538


In [34]:
# Notebook-wide configuration constants.

RESTORE_PICKLE         = True               # True: restore the cached .pkl files; False: re-parse the JSON files and re-pickle them
RANDOM_STATE           = 20200427           # seed value for any step that accepts a random state
TEST_SIZE              = 0.20               # fraction of the data held out for testing
TRAIN_SIZE             = 1.00 - TEST_SIZE   # the remaining fraction

In [35]:
import numpy as np
import pandas as pd
import scipy
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

from matplotlib.colors import ListedColormap

from scipy.spatial.distance import cdist
import seaborn as sn
from sklearn import cluster, metrics
from sklearn import datasets, ensemble, metrics, linear_model
from sklearn.cluster import AffinityPropagation, KMeans, MeanShift, estimate_bandwidth, SpectralClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, FeatureHasher
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, pairwise_distances, mean_squared_error
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV,cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize, LabelEncoder
from sklearn.utils import shuffle

import json
import spacy
import statsmodels.api as sm
import unicodedata

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize 
# nltk.download('gutenberg')
from collections import Counter

import umap

from ast import literal_eval
import chardet, codecs, datetime, os, platform, random, re, string, sys, time, unicodedata
from IPython.display import Markdown, display
from zipfile import ZipFile

In [36]:
# Global pandas display settings.
# Only fully qualified option names are used. The short aliases "max_rows" and
# "max_columns" are resolved by pattern matching, so they raise OptionError on
# pandas releases where more than one option matches (for example the
# styler.render.max_rows / styler.render.max_columns options), and they only
# repeated the display.* settings below anyway.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 400)
pd.set_option('display.max_colwidth', 180)

In [37]:
# Tokenizer handed to the TF-IDF vectorizer through its `tokenizer` parameter.

class LemmaTokenizer:
    """Callable that tokenizes a document and lemmatizes every token."""

    def __init__(self):
        # A single WordNet lemmatizer is created once and reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens produced from the string `doc`."""
        tokens = word_tokenize(doc)
        return list(map(self.wnl.lemmatize, tokens))

In [38]:
def run_tfidf_vectorizer(df, parameters=None):
    """Fit a TfidfVectorizer on a column of text and return the dense TF-IDF matrix.

    Parameters
    ----------
    df : pandas.Series (or any object exposing ``.values``)
        The documents. Every value is cast to a unicode string before fitting.
    parameters : dict, optional
        Keyword arguments forwarded to ``TfidfVectorizer``. ``None`` (the
        default) or an empty dict means "use the vectorizer's defaults".

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term.
    """
    # A literal ``{}`` default would be one dict object shared by every call,
    # so the default is None and the empty dict is created per call instead.
    vectorizer = TfidfVectorizer(**(parameters or {}))
    vectors = vectorizer.fit_transform(df.values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2. Use its replacement
    # when available and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # NOTE: toarray() materialises the whole matrix densely in memory.
    return pd.DataFrame(vectors.toarray(), columns=feature_names)

In [39]:
# Read a CSV or JSON file -- straight from disk or from inside a Zip archive --
# into a DataFrame. For CSV the caller can restrict the columns and set dtypes.

def file_stuff(path, filename, filetype, zipfilename=None, dtypes=None, usecols=None):
    """Load ``path + '/' + filename`` into a DataFrame.

    Parameters
    ----------
    path, filename : str
        Joined with ``/`` to form the file name. When ``zipfilename`` is given
        the joined value is the member name inside the archive instead.
    filetype : str
        ``'csv'`` or ``'json'``.
    zipfilename : str, optional
        Zip archive to read the file from. Falsy means read from disk.
    dtypes, usecols : optional
        Forwarded to ``pd.read_csv`` as ``dtype`` / ``usecols``. Not used for
        JSON.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``filetype`` is neither ``'csv'`` nor ``'json'``.
    """
    fullfilename = "{}".format(path+'/'+filename)

    print("fullfilename = {}".format(fullfilename))

    # Choose the reader up front so an unsupported type fails with a clear
    # message instead of an unbound `df` further down.
    if filetype == 'csv':
        def read_source(source):
            return pd.read_csv(source, dtype=dtypes, usecols=usecols)
    elif filetype == 'json':
        read_source = pd.read_json
    else:
        raise ValueError(
            "Unsupported filetype {!r}; expected 'csv' or 'json'.".format(filetype))

    if zipfilename:
        # Context managers close both the archive and the member handle, even
        # when parsing raises.
        with ZipFile(zipfilename) as zip_file:
            with zip_file.open(fullfilename) as member:
                df = read_source(member)
    else:
        df = read_source(fullfilename)

    print("There are {} rows in this file.".format(df.shape[0]))

    return df

In [40]:
def pickle_it(mode, df, file_name):
    """Pickle a DataFrame to disk, or restore one from a pickle file.

    mode      -- 'to_pickle' writes `df` to `file_name` and returns None;
                 any other value restores `file_name` and returns the result.
    df        -- the object to pickle (not used when restoring).
    file_name -- pickle file path, usually ending in .pkl.

    Progress is reported through print_timestamp before and after the work.
    Only restore files you trust: unpickling can run arbitrary code.
    """
    if mode != 'to_pickle':
        # Restore path: read the pickle back and hand it to the caller.
        print_timestamp('Now restoring pickled file {}.'.format(file_name))
        restored = pd.read_pickle(file_name)
        print_timestamp('Restored pickled file {}.'.format(file_name))
        return restored

    # Save path.
    print_timestamp('Now pickling file {}.'.format(file_name))
    df.to_pickle(file_name)
    print_timestamp('File pickled successfully {}.'.format(file_name))

In [41]:
def print_timestamp(displaytext):
    """Emit `displaytext` prefixed with the current time and the caller's name.

    The timestamp is cut to its first 19 characters and the caller is the
    code object one frame up the stack from this function.
    """
    stamp = str(datetime.datetime.now())
    caller_name = sys._getframe(1).f_code.co_name
    line = "{:19.19}: In: {} {} ".format(stamp, caller_name, displaytext)
    printFormatted(line)

In [42]:
def printFormatted(string):
    """Render `string` as Markdown in the notebook, then append it to the log file.

    The logged copy gets a trailing newline; the displayed copy does not.
    """
    display(Markdown(string))
    log_line = string + '\n'
    write_to_logfile(log_line)

In [43]:
def write_to_logfile(message, mdformat=''):
    """Append one entry to ``TestResults.md`` in the current working directory.

    The text written is ``mdformat``, a single space, then ``message``. No
    newline is added here, so callers supply their own.

    Parameters
    ----------
    message : str
        Text to log.
    mdformat : str, optional
        Prefix placed before the message (for example a Markdown heading
        marker). Defaults to the empty string.
    """
    # Plain append mode: the file is never read back here. UTF-8 is stated
    # explicitly so non-ASCII text is logged the same way on every platform.
    with open('TestResults.md', 'a', encoding='utf-8') as the_file:
        the_file.write('{} {}'.format(mdformat, message))

In [44]:
def read_json_to_pandas(filename):
    """Load a JSON-lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the file to read. It is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line of the file.
    """
    with open(filename, encoding='utf-8') as json_file:
        # Parse line by line rather than readlines() + map, so the raw text
        # and the parsed objects are not both held in memory. Blank lines
        # (such as a trailing newline) are skipped instead of crashing
        # json.loads. Expect several minutes for multi-million-row files.
        records = [json.loads(line) for line in json_file if line.strip()]

    df = pd.DataFrame(records)

    print("There are {} rows in this file.".format(df.shape[0]))

    return df

In [45]:
# Directory holding the Yelp dataset files. Set the YELP_DATA_DIR environment
# variable to point somewhere else; the fallback is the original author's
# local directory, so existing runs behave as before.
path = os.environ.get(
    'YELP_DATA_DIR',
    '/Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive',
)
biz_file = 'yelp_academic_dataset_business'
user_file = 'yelp_academic_dataset_user'
review_file = 'yelp_academic_dataset_review'

# It took 10m to load this entire dataset via JSON files.
if RESTORE_PICKLE:
    # Fast path: restore the three frames from previously written pickles.
    # Only restore pickle files you created yourself -- unpickling can run
    # arbitrary code.
    print("here we go")
    df_yelp_business = pickle_it('from_pickle', None, "{}/{}.{}".format(path, biz_file,'pkl'))
    df_yelp_user = pickle_it('from_pickle', None, "{}/{}.{}".format(path, user_file,'pkl'))
    df_yelp_review = pickle_it('from_pickle', None, "{}/{}.{}".format(path, review_file,'pkl'))

else:
    # Slow path: parse the raw JSON files, then pickle each frame so the next
    # run can take the fast path.
    print_timestamp('Starting json_to_pandas on yelp_business')
    df_yelp_business = read_json_to_pandas("{}/{}.{}".format(path, biz_file,'json'))
    print_timestamp('Starting json_to_pandas on yelp_user')
    df_yelp_user = read_json_to_pandas("{}/{}.{}".format(path, user_file,'json'))
    print_timestamp('Starting json_to_pandas on yelp_review')
    df_yelp_review = read_json_to_pandas("{}/{}.{}".format(path, review_file,'json'))
    print_timestamp('Finished with json_to_pandas on yelp_review')

    pickle_it('to_pickle', df_yelp_business, "{}/{}.{}".format(path, biz_file,'pkl'))
    pickle_it('to_pickle', df_yelp_user, "{}/{}.{}".format(path, user_file,'pkl'))
    pickle_it('to_pickle', df_yelp_review, "{}/{}.{}".format(path, review_file,'pkl'))

print("that's all for now")

# 10m to load from JSON
# 46s to load from Pickle

here we go


2020-06-28 14:31:25: In: pickle_it Now restoring pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_business.pkl. 

2020-06-28 14:31:27: In: pickle_it Restored pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_business.pkl. 

2020-06-28 14:31:27: In: pickle_it Now restoring pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_user.pkl. 

2020-06-28 14:31:37: In: pickle_it Restored pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_user.pkl. 

2020-06-28 14:31:43: In: pickle_it Now restoring pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_review.pkl. 

2020-06-28 14:32:20: In: pickle_it Restored pickled file /Users/lou/GITHubProjects/Thinkful/Datafiles/Yelp/yelp_2020/10100_1035793_bundle_archive/yelp_academic_dataset_review.pkl. 

that's all for now


In [46]:
# df_yelp_business.sample(5)
# Preview the first five rows of the business table.
df_yelp_business.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True', 'GoodForKids': 'False', 'BusinessParking': '{'garage': False, 'street': False, 'validated': False, 'lot': True, 'v...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0', 'Wednesday': '10:0-18:0', 'Thursday': '11:0-20:0', 'Friday': '11:0-20:0', 'Saturday': '11:0-20:0', 'Sunday': '13:0-18:0'}"
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,1,"{'GoodForKids': 'True', 'ByAppointmentOnly': 'True'}","Health & Medical, Fitness & Instruction, Yoga, Active Life, Pilates",
2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,,"Pets, Pet Services, Pet Groomers",
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,89030,36.219728,-115.127725,2.5,3,0,"{'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'False', 'DogsAllowed': 'True', 'BikeParking': 'True', 'BusinessParking': '{'garage': False, 'street': False, 'valid...","Hardware Stores, Home Services, Building Supplies, Home & Garden, Shopping","{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', 'Wednesday': '7:0-16:0', 'Thursday': '7:0-16:0', 'Friday': '7:0-16:0'}"
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'True'}","Home Services, Plumbing, Electricians, Handyman, Contractors","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', 'Wednesday': '9:0-16:0', 'Thursday': '9:0-16:0', 'Friday': '9:0-16:0'}"


In [47]:
# Preview the name, location, attributes and categories columns of the first four businesses.
df_yelp_business[['name','city','state','latitude','longitude','attributes','categories']].head(4)

Unnamed: 0,name,city,state,latitude,longitude,attributes,categories
0,The Range At Lake Norman,Cornelius,NC,35.462724,-80.852612,"{'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True', 'GoodForKids': 'False', 'BusinessParking': '{'garage': False, 'street': False, 'validated': False, 'lot': True, 'v...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping"
1,"Carlos Santo, NMD",Scottsdale,AZ,33.569404,-111.890264,"{'GoodForKids': 'True', 'ByAppointmentOnly': 'True'}","Health & Medical, Fitness & Instruction, Yoga, Active Life, Pilates"
2,Felinus,Montreal,QC,45.479984,-73.58007,,"Pets, Pet Services, Pet Groomers"
3,Nevada House of Hose,North Las Vegas,NV,36.219728,-115.127725,"{'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'False', 'DogsAllowed': 'True', 'BikeParking': 'True', 'BusinessParking': '{'garage': False, 'street': False, 'valid...","Hardware Stores, Home Services, Building Supplies, Home & Garden, Shopping"


In [48]:
# print(df_yelp_business.shape)
# Count the distinct values in the city column.
print(df_yelp_business.city.nunique())

1251


In [49]:
# Preview a subset of the user columns for the first four users.
df_yelp_user[['name','review_count','yelping_since','useful','funny','cool', 'elite','friends','fans']].head(4)

Unnamed: 0,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans
0,Rafael,553,2007-07-06 03:27:11,628,225,227,,"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX...",14
1,Michelle,564,2008-04-28 01:29:25,790,316,400,200820092010201120122013,"ly7EnE8leJmyqyePVYFlug, pRlR63iDytsnnniPb3AOug, kc-rnN-ndnFTdHG4TfIgeQ, GYndf-h6dAwpGP0lDBz2Wg, FPo3SwQuAK53QVZm_eIyBg, 9fF_T3pQu3ay1oA7h_VYNA, G5T3bd6dUs5zkQ2VMZtRUw, tufuEc5f...",27
2,Martin,60,2008-08-28 23:40:05,151,125,103,2010,"Uwlk0txjQBPw_JhHsQnyeg, Ybxr1tSCkv3lYA0I1qmnPQ, DNmeLov3wXNxlxjN5feBoQ, x7n69vEsYFh9xnW3D5lPPQ, -AaBjWJYiQxXkCMDlXfPGw, COXnA2hnzFDai3ywx_iM8A, dUFoyswTt5ZQbleF3_4TCg, uj2AWSvs...",5
3,John,206,2008-09-20 00:08:14,233,160,84,2009,"iog3Nyg1i4jeumiTVG_BSA, M92xWY2Vr9w0xoH8bPplfQ, So46aZ3y7zRl2VmFK35vCQ, vrZmtsiaIZBr42KwAve5qA, SaNDaz5rBQs-5gyhOkO1MA, xTcuKbp7ocDcZDD_bcK9hw, PpzliPkE_fzsI6r15UMZFA, Ygr_c6So...",6


In [50]:
# Show four randomly chosen reviews. The draw is seeded with RANDOM_STATE so
# the same rows come back on every run.
df_yelp_review.sample(4, random_state=RANDOM_STATE)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
5930637,Ov9t-nMmwPWjnqZmpFkVXQ,JdBDOdVIDW5EYkreVwoDoA,zswdqM8Av3KxVv9o7_6ORA,2.0,0,0,0,"Ordered delivery and received it in 40ish minutes. Everything we ordered that was fried was pretty much soggy mush. Might be better if we dined in, but stay away if you plan to...",2018-03-22 22:08:06
2616751,2iLmRpO-mb7cc0ZCsqDKSw,TFTv6Rzd3iSesmafK6UbKA,lfJGLcaX9Wcf0JO5ERMCnQ,5.0,0,0,0,Morales serviced my Highlander and did wonderful job. Got what I wanted with no wait. Pulled out in less than 20 minutes on a Saturday.,2018-05-13 00:34:51
5886641,XNpPVRsD4RbjA3qqQFPtNw,4VvvaddDORTcmkUS8zQE6w,x1ClWF6HF3X_7TFFnLED_Q,2.0,2,0,1,"Food was very good. We had the prime rib sandwich, and an order of chicken wings. The sandwich was very good, and the wings were nice and spicy. Unfortunately we did not enj...",2015-08-02 02:41:54
5800870,HKWjarvyJ7IEnPGBP33K-Q,pbj0UnJYZlrKbvKAbZtz5A,TAEMcgLcCLebzn8Jd3nVRw,5.0,0,0,0,Today was my first time trying wimpys I ordered a bacon cheese burger by this is one of the best burgers I've ever had thay have great service it's a very clean restaurant and...,2016-03-21 21:31:37


In [51]:
# Preview the text and date columns of the first four reviews.
df_yelp_review[['text','date']].head(4)

Unnamed: 0,text,date
5798916,"Just not impressed. I found the quality of their subs left something to be desired, especially for the price. They aren't bad but there are better options in the area. Staff w...",2016-11-13 01:35:23
3345484,This company was at my apartment complex today and put a green sticker on my car stating that my car will be towed due to invalid registration. I have California plates due to ...,2017-04-10 04:19:51
5762073,"We were standing in line at chipotle and decided we weren't really feeling it... Found this place (thanks yelp!) and were happy to!!! Friendly service, they give you a taste if...",2014-12-06 03:28:39
7366623,Don't know what made us stop in here but I'm so grateful we did. The Pho is served in huge bowls that could probably feed 2 people easily. My son has a serious gluten allergy s...,2015-02-20 23:38:00


In [52]:
# Summary statistics for the numeric columns of the business table.
df_yelp_business.describe()

Unnamed: 0,latitude,longitude,stars,review_count,is_open
count,209393.0,209393.0,209393.0,209393.0,209393.0
mean,38.579934,-97.390217,3.538055,36.937505,0.806632
std,4.940448,16.718535,1.023543,123.343597,0.39494
min,21.497258,-158.025525,1.0,3.0,0.0
25%,33.638658,-112.269476,3.0,4.0,1.0
50%,36.147408,-111.743531,3.5,9.0,1.0
75%,43.611693,-79.972679,4.5,27.0,1.0
max,51.299943,-72.80655,5.0,10129.0,1.0


In [53]:
# Summary statistics for the numeric columns of the user table.
df_yelp_user.describe()

Unnamed: 0,review_count,useful,funny,cool,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
count,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0,1968703.0
mean,22.1693,39.82709,17.03435,21.70796,1.458824,3.648087,1.980362,0.3004191,0.1873203,0.1614454,0.07019241,1.372517,2.849952,2.819167,2.819167,1.058364,1.114744
std,76.74226,513.3536,355.0568,445.7187,16.67521,1.172525,72.29082,12.75309,15.07502,11.62099,9.967903,58.82533,96.94462,86.51499,86.51499,31.54894,92.26612
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,3.0,0.0,0.0,0.0,3.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,15.0,13.0,3.0,3.0,0.0,4.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,14455.0,197130.0,165861.0,191359.0,11568.0,5.0,25452.0,13501.0,14179.0,13654.0,12669.0,58480.0,72370.0,40508.0,40508.0,15445.0,82622.0


In [54]:
# Summary statistics for the numeric columns of the review table.
df_yelp_review.describe()

Unnamed: 0,stars,useful,funny,cool
count,8021122.0,8021122.0,8021122.0,8021122.0
mean,3.703575,1.322882,0.4596423,0.574562
std,1.490486,3.550831,2.188143,2.476906
min,1.0,-1.0,0.0,-1.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0
75%,5.0,1.0,0.0,0.0
max,5.0,1122.0,976.0,502.0


In [55]:
# Pandas Yelp Queries
# df_yelp_user[(df_yelp_user.review_count > 100)].count()
# Eyeball 20 raw review texts. Seeded with RANDOM_STATE so the same rows come
# back on every run.
df_yelp_review['text'].sample(20, random_state=RANDOM_STATE)

3999026    Alliance Residential is just another Slave Owners corporation. One of largest developers & property mgmt companies in the nation. Renting & paying for any property owned by thi...
4533655    I recently had an excellent experience with Vitaly from Vital Data Recovery.\n\nMy Sony Vaio suffered some water damage. I took it to several computer repair businesses in Mont...
7681165    As a french person, I expected greatness but was quite disappointed.\n\nThe menu has a slight twisted to the traditional french cuisine I am used to.\n\nThe gratin was great al...
4119796    Staying in Vegas for a few days and decided to get some spa treatments. For waxing I wanted a spot that specialized in waxing so I choose Studio B. So I just got a Brazilian wa...
7144300    I'm not a fan of visiting Las Vegas, but this place has changed my mind. Had to visit this city for business and I'm glad we stay at this hotel. We ended up eating a Freedom Be...
1271109    Great food and great vibes. What e

In [56]:
# cleanup functions

def cleanup_text(text):
    """Normalise one review text before it is vectorized.

    Steps: newlines become spaces, punctuation is removed, everything is
    lower-cased, and tokens that are a single character or consist only of
    digits are dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Replace (not delete) newlines so the words on either side of a line
    # break stay separate words.
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"[^\w\s]", "", text)                        # remove punctuation
    text = text.lower()                                        # lower case all letters
    # `and` / `not` are required here. With `&` and `== False` the expression
    # parsed as `len(x) > (1 & x.isdigit()) == False`, which kept every
    # one-character word.
    kept_words = [word for word in text.split()
                  if len(word) > 1 and not word.isdigit()]
    return " ".join(kept_words)

def cleanup_attributes(column_value):
    """Rewrite quoted booleans and doubled quotes in an attributes string.

    The substitutions run in order: 'True' (with its quotes) becomes 1,
    'False' (with its quotes) becomes 0, and two consecutive single quotes
    collapse into one. The collapse runs twice, so a run of four quotes ends
    up as a single quote.
    """
    substitutions = (
        (r"'True'", '1'),
        (r"'False'", '0'),
        (r"''", "'"),
        (r"''", "'"),
    )
    for pattern, replacement in substitutions:
        column_value = re.sub(pattern, replacement, column_value)
    return column_value

In [58]:
# total_rows['ColumnID'] = total_rows['ColumnID'].astype(str)
# df_yelp_business[df_yelp_business['categories'].str.contains('Restaurant')].count()
# df_yelp_business[df_yelp_business['categories'].str.contains('Restaurant')].sample(20)
# df_yelp_business_restaurants = df_yelp_business[df_yelp_business['categories'].str.contains('Restaurant')]
# Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
#        'latitude', 'longitude', 'stars', 'review_count', 'is_open',
#        'attributes', 'categories', 'hours']

# Cleanup the features: categories, attributes, city, and name

# Cleanup df_yelp_business: drop rows with no categories or no attributes.
df_yelp_business.dropna(subset=['categories', 'attributes'], inplace=True)

# city and name are put into Title Case. Author's note: consider moving to
# lowercase instead, because title() tries to be too smart.
df_yelp_business['city'] = df_yelp_business['city'].str.title()
df_yelp_business['name'] = df_yelp_business['name'].str.title()
df_yelp_business['attributes'] = df_yelp_business['attributes'].astype(str)
df_yelp_business['categories'] = df_yelp_business['categories'].astype(str)
# df_yelp_business['attributes'] = df_yelp_business['attributes'].apply(lambda col: cleanup_attributes(col)) # get the primarygenre

# Boolean mask computed once and shared by both filters below. na=False makes
# rows with a missing value count as "no match", which is what the previous
# `== True` comparison did.
is_restaurant = df_yelp_business['categories'].str.contains('Restaurant', na=False)

# .copy() makes each result an independent frame. Without it, adding a column
# to the result later is an assignment onto a slice of df_yelp_business and
# triggers SettingWithCopyWarning.
df_yelp_business_restaurants = df_yelp_business.loc[
    is_restaurant,
    ['business_id', 'name', 'city',
     'postal_code', 'state',
     'latitude', 'longitude',
     'review_count', 'attributes',
     'categories'],
].copy()

# Restaurants whose city contains "Vegas" and that have more than 40 reviews.
df_yelp_biz_LV_Rest = df_yelp_business[
    is_restaurant
    & df_yelp_business['city'].str.contains('Vegas', na=False)
    & (df_yelp_business.review_count > 40)
].copy()

# We should only use df_yelp_biz_LV_Rest from here on...

In [None]:
# df_yelp_biz_LV_Rest.head(20) # there are 4,284 restaurants in Las Vegas with more than 40 reviews! 24-Jun-2020
# df_yelp_reviews_LV_Rest = 
# https://stackoverflow.com/questions/34055584/python-pandas-string-contains-and-doesnt-contain
# df_yelp_biz2_LV[(df_yelp_biz2_LV.review_count > 50)].sample(20)
# df_yelp_biz2.sample(20)
# Non-null count per column of the filtered frame.
df_yelp_biz_LV_Rest.count()
# df_yelp_biz2_socal.head(30)

In [None]:
# Pattern used: df = pd.merge(df, df2[['Key_Column','Target_Column']], on='Key_Column', how='left')
# Inner-join a 100,000-row random sample of the reviews onto the business ids
# in df_yelp_biz_LV_Rest, keeping only the sampled reviews of those businesses.
# The sample is seeded with RANDOM_STATE so every downstream result can be
# reproduced on a re-run.
# NOTE(review): an earlier comment here recorded count=1,484,887, which cannot
# come from a 100,000-row sample -- presumably measured on the full review
# table; confirm.
df_yelp_review_LV_Rest = pd.merge(
    df_yelp_review.sample(100000, random_state=RANDOM_STATE),
    df_yelp_biz_LV_Rest['business_id'],
    on='business_id',
)
# Clean up the review text of the merged frame only.
df_yelp_review_LV_Rest['text'] = df_yelp_review_LV_Rest['text'].apply(cleanup_text)

In [None]:
# df_yelp_review_LV_Rest.head(5)

In [None]:
# TF-IDF settings for the cleaned review text: the vocabulary is capped at
# 100 features and tokens come from LemmaTokenizer.
parameters = dict(
    lowercase=True,
    max_df=0.65,
    min_df=10,
    max_features=100,
    norm='l2',
    smooth_idf=True,
    stop_words='english',
    tokenizer=LemmaTokenizer(),
    use_idf=True,
)
review_text = df_yelp_review_LV_Rest['text']
df_yelp_review_LV_Rest_tfidf = run_tfidf_vectorizer(review_text, parameters=parameters)

In [None]:
# df_yelp_review_LV_Rest.count()
# df_yelp_review.count()
# Non-null count per column of the business table.
df_yelp_business.count()
# df_yelp_review_LV_Rest[['text']].sample(20)
# df_yelp_review_LV_Rest_tfidf[['ha','wa','u']].sample(10)
# df_yelp_review_LV_Rest_tfidf[['ha','wa','u']].describe()

In [None]:
# sorted_df.tail(20)
# Ten category strings from the filtered restaurants. Seeded with RANDOM_STATE
# so the same rows come back on every run.
df_yelp_biz_LV_Rest['categories'].sample(10, random_state=RANDOM_STATE)

In [None]:
# type(eval(df_yelp_business.categories[0]))

In [None]:
# def get_first_name_in_list(r, column_number, colname):
#     return eval(r.values.tolist()[column_number])[0]['{}'.format(colname)]

# df_movies_dataset['genre']      = df_movies_dataset.apply(lambda row: get_first_name_in_list(row, 1,'name'), axis=1) # get the primarygenre
# df_yelp_business['attributes'].head(5)
# print(type(df_yelp_business['attributes'][0]))

In [None]:
def make_list_of_true_valued_keys(column_value_string):
    """Parse one stringified ``attributes`` value into a dict.

    Despite the name, this returns a dict (attribute name -> value), not a
    list. The string is passed through cleanup_attributes, reduced to ASCII,
    parsed as a Python literal, and the 'BusinessParking', 'Ambience' and
    'Music' entries are removed if present.

    Parameters
    ----------
    column_value_string : str
        The string form of an attributes dict.

    Returns
    -------
    dict
        The parsed attributes without the three removed keys.

    Raises
    ------
    ValueError, SyntaxError
        From ``ast.literal_eval`` when the cleaned string is not a valid
        Python literal.
    """
    import ast

    column_value_string = cleanup_attributes(column_value_string)

    # Reduce to plain ASCII: NFKD splits accented characters into base letter
    # plus combining mark, and encode(..., 'ignore') then drops everything
    # that is not ASCII.
    # https://stackoverflow.com/questions/1207457/convert-a-unicode-string-to-a-string-in-python-containing-extra-symbols
    ascii_bytes = unicodedata.normalize('NFKD', column_value_string).encode('ascii', 'ignore')
    column_value_string = ascii_bytes.decode('ascii')

    # Parse only after all cleanup is done. literal_eval accepts nothing but
    # Python literals, so -- unlike eval() -- a malformed or hostile value in
    # the data cannot execute code.
    column_value = ast.literal_eval(column_value_string)

    # These hold embedded dictionaries. They are removed for now and may be
    # added back in later.
    for nested_key in ('BusinessParking', 'Ambience', 'Music'):
        column_value.pop(nested_key, None)

    return column_value

# Parse every restaurant's attributes string into a dict.
parsed_attributes = df_yelp_biz_LV_Rest['attributes'].apply(make_list_of_true_valued_keys)
df_yelp_biz_LV_Rest['attributes_text'] = parsed_attributes
df2 = pd.DataFrame.from_dict(df_yelp_biz_LV_Rest['attributes_text'], orient='columns')

# Expand the dicts into one column per attribute key with json_normalize.
# Approach taken from: https://stackoverflow.com/questions/21104592/json-to-pandas-dataframe
df_yelp_business_attributes = pd.json_normalize(df_yelp_biz_LV_Rest['attributes_text'])

In [None]:
# Defaults for attributes a restaurant did not report. Each column is filled
# from its own entry below and nothing is copied between columns.
# BusinessAcceptsCreditCards, GoodForKids and ByAppointmentOnly must NOT be
# assigned from the BikeParking column: that overwrites the three attributes
# with BikeParking's values.
# fillna(value=dict) skips keys that are not columns of the frame.
# values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
# >>> df.fillna(value=values)
values = {'BusinessAcceptsCreditCards' : 0,
          'BikeParking': 0,
          'GoodForKids': 0,
          'ByAppointmentOnly': 0,
          'RestaurantsPriceRange2': 0,
          'DogsAllowed': 0,
          'WiFi': 0,
          'RestaurantsAttire' : '0',
          'RestaurantsTakeOut': 0,
          'NoiseLevel' : 'unknown',
          'RestaurantsReservations': 0,
          'RestaurantsGoodForGroups': 0,
          'HasTV': 0,
          'Alcohol' : 'Nun',  # NOTE(review): looks like a placeholder or typo for 'none' -- confirm before relying on it
          'RestaurantsDelivery': 0,
          'OutdoorSeating': 0,
          'Caters': 0,
          'WheelchairAccessible': 0,
          'AcceptsInsurance': 0,
          'RestaurantsTableService': 0,
#           'GoodForMeal',
          'HappyHour': 0,
          'BusinessAcceptsBitcoin': 0,
          'BYOB': 0,
          'Corkage': 0,
          'GoodForDancing': 0,
          'CoatCheck': 0,
#           'BestNights': 0,
          'Smoking': 0,
          'DietaryRestrictions': 0,
          'DriveThru': 0,
          'HairSpecializesIn': 0,
          'BYOBCorkage': 0,
          'AgesAllowed': 0,
          'RestaurantsCounterService': 0,
          'Open24Hours': 0
         }
df_yelp_business_attributes = df_yelp_business_attributes.fillna(value=values)
# df_yelp_business_attributes.sample(50)

# Raw values observed in two of the string-valued columns (presumably
# NoiseLevel and Alcohol):
# array([nan, "u'quiet'", "u'average'", "u'loud'", "u'very_loud'",
#        "'average'", "'quiet'", "'loud'", "'very_loud'", 'None'],
#       dtype=object)
# array([nan, "'none'", "u'none'", "u'full_bar'", "u'beer_and_wine'",
#        "'full_bar'", "'beer_and_wine'", 'None'], dtype=object)

# Encode RestaurantsAttire labels as digit strings. regex=True replaces the
# label wherever it occurs inside a value, so any quoting around it is left in
# place (a value such as "u'casual'" becomes "u'1'").
# Earlier attempts with .loc masks and .str.replace(...).astype('int') were
# abandoned in favour of this substring replace.
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on a column selection is not guaranteed to update the parent frame.
attire = df_yelp_business_attributes['RestaurantsAttire']
for label, code in (('None', '0'), ('casual', '1'), ('dressy', '2'), ('formal', '3')):
    attire = attire.replace(label, code, regex=True)
df_yelp_business_attributes['RestaurantsAttire'] = attire


In [None]:
# df_yelp_business['attributes'].tail(4)
# Spot-check 40 RestaurantsAttire values. Seeded with RANDOM_STATE so the same
# rows come back on every run.
df_yelp_business_attributes['RestaurantsAttire'].sample(40, random_state=RANDOM_STATE)
# text123.head(20)
# type(text123[0])
# df_yelp_business_attributes.columns
# df_yelp_business_attributes.RestaurantsAttire.unique()

In [None]:
# Inspect the last 40 values of the attributes_text column.
df_yelp_biz_LV_Rest['attributes_text'].tail(40)

In [None]:
# Spot-check 20 category strings from the business table. Seeded with
# RANDOM_STATE so the same rows come back on every run.
df_yelp_business['categories'].sample(20, random_state=RANDOM_STATE)

In [None]:
# Print the current wall-clock time so the output records when this run reached the last cell.
import datetime
print(datetime.datetime.now())