In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
import time
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

%matplotlib inline

In [134]:
# for business review text
reviews=pd.read_csv("yelp_review.csv")

# for business categories
businesses=pd.read_csv("yelp_business.csv")

In [135]:
# get stopwords like "the", "a", etc. to ignore in review word counting
stop_words = set(stopwords.words('english'))
stop_words.update(['food', 'restaurant', 'restaurants', 'plannning', 'shop', 'service'])

In [136]:
# combine all reviews for each business into a dictionary of business_id keys and concatenated review values
review_dict = defaultdict(str)
for i,x in reviews.iterrows():
    review_dict[x["business_id"]] += x["text"]

In [137]:
review_dict.get('AEx2SYEUJmTxVVB18LlCwA')



In [138]:
# create dictionary of most common words (in decreasing order) in business reviews. keys are business_id strings
count_dict = defaultdict(Counter)
for key, value in review_dict.items():
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenize to ignore punctuation and split strings into words
    words = tokenizer.tokenize(value.lower())
    words = [x for x in words if x not in stop_words]
    counts = Counter(words).most_common()
    count_dict[key] = counts

In [139]:
count_dict.get('AEx2SYEUJmTxVVB18LlCwA')

[('place', 78),
 ('sandwich', 78),
 ('special', 66),
 ('wilensky', 64),
 ('one', 50),
 ('mustard', 48),
 ('bologna', 40),
 ('like', 38),
 ('montreal', 36),
 ('go', 33),
 ('soda', 31),
 ('salami', 30),
 ('cheese', 30),
 ('de', 30),
 ('le', 29),
 ('counter', 27),
 ('old', 25),
 ('un', 25),
 ('good', 24),
 ('order', 24),
 ('lunch', 23),
 ('time', 23),
 ('cherry', 23),
 ('since', 22),
 ('made', 21),
 ('swiss', 21),
 ('la', 21),
 ('beef', 20),
 ('back', 20),
 ('get', 18),
 ('really', 18),
 ('est', 18),
 ('simple', 17),
 ('thing', 17),
 ('great', 17),
 ('et', 17),
 ('way', 16),
 ('well', 16),
 ('staff', 15),
 ('sandwiches', 15),
 ('l', 15),
 ('friendly', 14),
 ('pressed', 14),
 ('make', 14),
 ('du', 14),
 ('history', 13),
 ('know', 13),
 ('would', 13),
 ('pickles', 13),
 ('years', 13),
 ('light', 13),
 ('sour', 13),
 ('ever', 13),
 ('much', 13),
 ('definitely', 13),
 ('eat', 13),
 ('pour', 13),
 ('grilled', 13),
 ('served', 12),
 ('nothing', 12),
 ('delicious', 12),
 ('people', 12),
 ('first

In [140]:
print(businesses[businesses.business_id == 'AEx2SYEUJmTxVVB18LlCwA']['name'])
print(list(businesses[businesses.business_id == 'AEx2SYEUJmTxVVB18LlCwA']['categories']))

141797    "Wilensky's"
Name: name, dtype: object
['Diners;Food;Restaurants;Delis']


In [141]:
count_dict.get('VR6GpWIda3SfvPC-lg9H3w')

[('tuck', 51),
 ('de', 34),
 ('one', 31),
 ('place', 30),
 ('good', 28),
 ('et', 27),
 ('us', 25),
 ('delicious', 25),
 ('great', 24),
 ('made', 22),
 ('really', 21),
 ('meal', 20),
 ('pork', 19),
 ('go', 19),
 ('amazing', 19),
 ('montreal', 19),
 ('went', 19),
 ('back', 19),
 ('le', 19),
 ('belly', 18),
 ('little', 18),
 ('time', 18),
 ('best', 18),
 ('like', 16),
 ('well', 16),
 ('salad', 15),
 ('la', 15),
 ('5', 15),
 ('also', 15),
 ('pour', 15),
 ('bit', 14),
 ('loved', 14),
 ('perfect', 14),
 ('menu', 13),
 ('fish', 13),
 ('dinner', 13),
 ('would', 13),
 ('night', 13),
 ('even', 13),
 ('wine', 12),
 ('table', 12),
 ('ever', 12),
 ('super', 12),
 ('try', 12),
 ('une', 12),
 ('reservation', 11),
 ('could', 11),
 ('steak', 11),
 ('excellent', 11),
 ('que', 11),
 ('came', 11),
 ('crispy', 11),
 ('way', 11),
 ('friends', 11),
 ('bar', 11),
 ('say', 11),
 ('les', 11),
 ('à', 11),
 ('dish', 10),
 ('lamb', 10),
 ('friend', 10),
 ('sauce', 10),
 ('everyone', 10),
 ('un', 10),
 ('top', 10),

In [142]:
print(businesses[businesses.business_id == 'VR6GpWIda3SfvPC-lg9H3w']['name'])
print(list(businesses[businesses.business_id == 'VR6GpWIda3SfvPC-lg9H3w']['categories']))

110508    "Tuck Shop"
Name: name, dtype: object
['Restaurants;Canadian (New);Italian']


In [143]:
businesses[businesses.state == 'NC'][business.city == 'Charlotte']

  """Entry point for launching an IPython kernel.


Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
14,fNMVV_ZX7CJSDWQGdOM8Nw,"""Showmars Government Center""",Uptown,"""600 E 4th St""",Charlotte,NC,28202,35.221647,-80.839345,3.5,7,1,Restaurants;American (Traditional)
56,ykJM7EuGziATZ5u2qIT08g,"""My Fitness World""",South End,"""1225 S Church St, Ste B""",Charlotte,NC,28203,35.220211,-80.855785,5.0,4,1,Active Life;Fitness & Instruction;Gyms
77,NFTh6mj3X2AnHJCKIFUk5Q,"""Fairfield Inn & Suites""",,"""8540 E Independence Blvd""",Charlotte,NC,28105,35.147422,-80.726487,4.0,17,1,Hotels;Event Planning & Services;Hotels & Travel
84,nJ3mXjItS8WcwhYbzbfDQw,"""Scott's Karat Patch""",South Park,"""5110 Park Rd, Ste 2F""",Charlotte,NC,28209,35.160889,-80.849260,4.0,5,1,Watch Repair;Jewelry Repair;Shopping;Local Ser...
97,HAX1zec191t7QkT2sBZ76A,"""La Isla Cuban Restaurant""",,"""1816 Galerea Blvd, Ste D""",Charlotte,NC,28270,35.137223,-80.734594,3.0,4,0,Restaurants;Cuban
102,c6Q3HP4cmWZbD9GX8kr4IA,"""Pep Boys""",,"""4837 N Tryon St""",Charlotte,NC,28213,35.258682,-80.785106,3.5,8,1,Auto Parts & Supplies;Auto Repair;Tires;Automo...
103,c7X2SdKxVJMaOnFROO8WEg,"""Finga Lickin' Caribbean Eatery""",,"""2838 The Plz""",Charlotte,NC,28205,35.236823,-80.801084,4.5,21,1,Pizza;Food;Internet Cafes;Restaurants;Caribbean
126,WUiDaFQRZ8wKYGLvmjFjAw,"""China Buffet""",University City,"""8630 University Executive Park Dr""",Charlotte,NC,28262,35.306173,-80.752672,3.5,76,1,Buffets;Restaurants;Sushi Bars;Chinese
128,5q6Xh-UcJa78bp6dzyaE7w,"""Duck Donuts""",Dilworth,"""1710 Kenilworth Ave, Ste 220""",Charlotte,NC,28203,35.202624,-80.844419,4.5,373,1,Breakfast & Brunch;Food;Coffee & Tea;Donuts;Re...
155,mtTxLi9CZNOsDqOTJH3pQw,"""Felicitea""",NoDa,"""516 E 15th St, Ste 11D""",Charlotte,NC,28206,35.232883,-80.825610,3.0,4,0,Food;Coffee & Tea;Massage;Beauty & Spas


In [144]:
count_dict.get('c7X2SdKxVJMaOnFROO8WEg')

[('chicken', 25),
 ('jerk', 19),
 ('good', 19),
 ('place', 13),
 ('curry', 12),
 ('cheese', 12),
 ('caribbean', 11),
 ('plantains', 11),
 ('rice', 11),
 ('great', 10),
 ('mac', 10),
 ('peas', 9),
 ('like', 7),
 ('got', 7),
 ('back', 7),
 ('really', 7),
 ('cabbage', 6),
 ('best', 5),
 ('go', 5),
 ('n', 5),
 ('clean', 4),
 ('friendly', 4),
 ('flavor', 4),
 ('stars', 4),
 ('would', 4),
 ('time', 4),
 ('goat', 4),
 ('portions', 4),
 ('spot', 4),
 ('lot', 3),
 ('well', 3),
 ('owner', 3),
 ('even', 3),
 ('know', 3),
 ('also', 3),
 ('much', 3),
 ('jamaican', 3),
 ('price', 3),
 ('bit', 3),
 ('way', 3),
 ('definitely', 3),
 ('cash', 3),
 ('us', 3),
 ('get', 3),
 ('spicy', 3),
 ('sauce', 3),
 ('put', 3),
 ('came', 3),
 ('ordered', 3),
 ('fried', 3),
 ('ox', 3),
 ('tail', 3),
 ('charlotte', 3),
 ('wanted', 3),
 ('super', 3),
 ('went', 3),
 ('used', 3),
 ('delicious', 3),
 ('order', 3),
 ('pretty', 3),
 ('beef', 3),
 ('small', 3),
 ('provide', 2),
 ('fresh', 2),
 ('macaroni', 2),
 ('love', 2),
 (

In [145]:
categories = str(businesses[businesses.business_id == 'Wcrq2gwUCOH9FPyQPnoUiQ']['categories']).lower().split()


categories = [x.split(';') for x in categories]
categories = [item for sublist in categories for item in sublist]
print(categories)

['174420', 'fitness', '&', 'instruction', 'gyms', 'active', 'life', 'name:', 'categories,', 'dtype:', 'object']


In [146]:
# create a repository for the simplified categories where the key is business_id
clean_category_dict = defaultdict(str)

# iterate through each business (key in count_dict is business_id)
for key in count_dict.keys():
    # split categories into separate strings and make all lower case
    categories = str(businesses[businesses.business_id == key]['categories']).lower().split()
    # split remaining semicolon separated strings
    categories = [x.split(';') for x in categories]
    # consecutive splits created nested lists, so flattening here
    categories = [item for sublist in categories for item in sublist]
    
    # count_dict has tuples of (word, count) as values
    for tup in count_dict.get(key):
        for cat in categories:
            if tup[0] == cat:
                clean_category_dict[key] = tup[0]
                break
    


In [147]:
categories = str(businesses[businesses.business_id == 'c7X2SdKxVJMaOnFROO8WEg']['categories']).lower().split()


categories = [x.split(';') for x in categories]
categories = [item for sublist in categories for item in sublist]

for tup in count_dict.get('c7X2SdKxVJMaOnFROO8WEg'):
    for cat in categories:
        if tup[0] == cat:
            print(tup[0])
            break

caribbean


In [None]:
type(clean_category_dict.get['c6Q3HP4cmWZbD9GX8kr4IA'])