# **Get Data** 


In [1]:
# get Data from Github repo
!git clone https://github.com/AndreaJJCC/CategorySuggestion.git
  
# Or to get data from Google Drive, enable next two lines and
# comment out previous line
#from google.colab import drive
#drive.mount('/content/gdrive')

fatal: destination path 'CategorySuggestion' already exists and is not an empty directory.


# **Import Necessary Libraries**

In [2]:
# Other necessary installations/downloads
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Imports
import json
import pandas as pd
import re 
import os
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [0]:
# Define file paths
# To modify file paths, click on > symbol on the top left corner of the screen
# (under the CO symbol), click on the "Files" tab, and click on the "CategorySuggestion"
# folder. You can then right click on the file and choose "Copy path"
main_dir = '/content/CategorySuggestion/'
business_dir = main_dir + 'yelp_academic_dataset_business.json'
reviews_dir = main_dir + 'temp_reviews.json'

In [5]:
# Unzip the business json file
os.chdir('/content/CategorySuggestion/')
!unzip -o /content/CategorySuggestion/yelp_academic_dataset_business.zip

Archive:  /content/CategorySuggestion/yelp_academic_dataset_business.zip
  inflating: yelp_academic_dataset_business.json  


#**Load and Visualize Data**

In [0]:
# Define function to load files
# Input: file directory
# Output: list of json objects (dictionaries)
def load_data( directory):
  with open(directory) as f:
    data = []
    for line in f:
      data.append(json.loads(line))
  return data

In [7]:
# Load business records as dictionaries
# and convert dictionaries to pandas dataframe
business_df = pd.DataFrame.from_dict(load_data(business_dir))
business_records = business_df.shape[0]
print('Business data\n' + 'Loaded ' + str(business_records) + ' records.')
business_df.head(2)

Business data
Loaded 188593 records.


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV


In [8]:
# Load reviews records as dictionaries
# and convert dictionaries to pandas dataframe
# NOTE: the file being read only contains the first 5,000 records provided by
# Yelp. The entire reviews file provided by Yelp cannot be loaded due to memory issues.
reviews_df = pd.DataFrame.from_dict(load_data(reviews_dir))
print('Reviews data\n' + 'Loaded ' + str(reviews_df.shape[0]) + ' records.')
reviews_df.head(2)

Reviews data
Loaded 5000 records.


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25,0,x7mDIiDB3jEiPGPHOmDzyw,2,The pizza was okay. Not the best I've had. I p...,0,msQe1u7Z_XuqjGoqhB0J5g
1,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13,0,dDl8zu1vWPdKGihJrwQbpw,5,I love this place! My fiance And I go here atl...,0,msQe1u7Z_XuqjGoqhB0J5g


# **Preprocess Data**

In [0]:
# Define function to convert text to lowercase and  remove punctuation
def to_lower_and_punc( col ):
  return col.astype(str)\
            .str.lower()\
            .str.replace('[^a-z\s]', '') # Only leave letters and whitespaces


## **Preprocess Categories List**

In [10]:
# Convert categories column to lowercase
business_df['categories'] = to_lower_and_punc(business_df.categories)
# Tokenize categories column
business_df['categories'] = business_df['categories'].apply(nltk.word_tokenize)
print(business_df.categories[0:4])

0    [tours, breweries, pizza, restaurants, food, h...
1    [chicken, wings, burgers, caterers, street, ve...
2    [breakfast, brunch, restaurants, french, sandw...
3                     [insurance, financial, services]
Name: categories, dtype: object


## **Preprocess Business Dataframe**

In [11]:
# Filter out only necessary information from business dataframe
business_df = business_df[['business_id', 'categories', 'name', 'review_count']]

# Filter out businesses that have n or more reviews
n = 5 # This is an arbitrary number and can be modify.
business_df = business_df[business_df.review_count > n]
rev_filtered_business = business_df.shape[0]
print('Number of businesses with more than ' + str(n) + ' reviews = ' + str(rev_filtered_business))

print('Percentage of filtered businesses = %2.2f%% (%2d/%2d)' % ( ((rev_filtered_business/business_records) * 100), rev_filtered_business, business_records ) )

Number of businesses with more than 5 reviews = 122186
Percentage of filtered businesses = 64.79% (122186/188593)


## **Preprocess Reviews dataframe and Merge with Business dataframe**

In [12]:
# Filter out only necessary information from  reviews dataframe
reviews_df = reviews_df[['business_id', 'text']]
reviews_df.head(5)

Unnamed: 0,business_id,text
0,iCQpiavjjPzJ5_3gPD5Ebg,The pizza was okay. Not the best I've had. I p...
1,pomGBqfbxcqPv14c3XH-ZQ,I love this place! My fiance And I go here atl...
2,jtQARsP6P-LbkyjbO1qNGg,Terrible. Dry corn bread. Rib tips were all fa...
3,elqbBhBfElMNSrjFqW3now,Back in 2005-2007 this place was my FAVORITE t...
4,Ums3gaP2qM3W1XcA5r6SsQ,Delicious healthy food. The steak is amazing. ...


In [13]:
# Join business_df and reviews_df by business_id
# This will give us a dataframe with only necessary
# Information and filtered out businesses with
# Predefined number of reviews
data = pd.merge(business_df, reviews_df, how = 'inner')
print('The total number of records in the dataframe is ' + str(data.shape[0]))
data.head(5)

The total number of records in the dataframe is 4740


Unnamed: 0,business_id,categories,name,review_count,text
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",Allwyn's Bakery,105,Love the jerk chicken sandwich and jerk chicke...
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",Luckee,171,Came here for a lovely dinner with husband be...
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...",Costco,121,Got $1000 worth of tires today. They told me i...
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...",Planet Organic Market,14,I have given this store so many chances becaus...
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",Jayde Fuzion,246,We are locals and decided to try Jayde since w...


In [14]:
# For each record: Combined the 'name' and 'text' columns into the 'text' column
data['text'] = data['name'] + ' ' + data['text']
# For each record: Remove punctuation from 'text' and convert to lowercase
data['text'] = to_lower_and_punc(data.text)
data.head(5)

Unnamed: 0,business_id,categories,name,review_count,text
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",Allwyn's Bakery,105,allwyns bakery love the jerk chicken sandwich ...
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",Luckee,171,luckee came here for a lovely dinner with husb...
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...",Costco,121,costco got worth of tires today they told me ...
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...",Planet Organic Market,14,planet organic market i have given this store ...
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",Jayde Fuzion,246,jayde fuzion we are locals and decided to try ...


In [15]:
# For each record: remove stopwords from 'text'
stop_list = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_list)]))
data.head(5)

Unnamed: 0,business_id,categories,name,review_count,text
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",Allwyn's Bakery,105,allwyns bakery love jerk chicken sandwich jerk...
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",Luckee,171,luckee came lovely dinner husband weeks ago gr...
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...",Costco,121,costco got worth tires today told would long w...
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...",Planet Organic Market,14,planet organic market given store many chances...
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",Jayde Fuzion,246,jayde fuzion locals decided try jayde since lo...


In [16]:
# For each record: apply stemming to 'text'
ps = PorterStemmer()
data['text'] = data['text'].apply(lambda x: ' '.join([ps.stem(word) for word in x.split()]))
data.head(5)

Unnamed: 0,business_id,categories,name,review_count,text
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",Allwyn's Bakery,105,allwyn bakeri love jerk chicken sandwich jerk ...
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",Luckee,171,lucke came love dinner husband week ago great ...
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...",Costco,121,costco got worth tire today told would long wa...
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...",Planet Organic Market,14,planet organ market given store mani chanc liv...
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",Jayde Fuzion,246,jayd fuzion local decid tri jayd sinc love hot...


# Create Features with CountVectorizer
https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

In [17]:
# Initizalize the transform
cnt_vectorizer = CountVectorizer()
# Tokenize text column and build vocabulary
cnt_vectorizer.fit(data['text'])
# Create sparse matrix
cnt_vector = cnt_vectorizer.transform(data['text'])


# Create column count_features and add the array values
# returned from CountVectorizer for each record
data['count_features'] = list(cnt_vector.toarray())
print('The size of the Count sparse matrix is ' + str(cnt_vector.shape))
data.head(5)

The size of the Count sparse matrix is (4740, 16148)


Unnamed: 0,business_id,categories,name,review_count,text,count_features
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",Allwyn's Bakery,105,allwyn bakeri love jerk chicken sandwich jerk ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",Luckee,171,lucke came love dinner husband week ago great ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...",Costco,121,costco got worth tire today told would long wa...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...",Planet Organic Market,14,planet organ market given store mani chanc liv...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",Jayde Fuzion,246,jayd fuzion local decid tri jayd sinc love hot...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Create Features with TfidfVectorizer

In [18]:
# Create the transform
vectorizer = TfidfVectorizer()
# Tokenize text column and build vocabulary
vectorizer.fit(data['text']) 
# Create sparse matrix
vector = vectorizer.transform(data['text'])


# Create column tfidf_features and add the array values
# returned from TfidfVectorizer for each record
data['tfidf_features'] = list(vector.toarray())
print('The size of the TFIDF sparse matrix is ' + str(vector.shape))
data.head(5)

The size of the TFIDF sparse matrix is (4740, 16148)


Unnamed: 0,business_id,categories,name,review_count,text,count_features,tfidf_features
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",Allwyn's Bakery,105,allwyn bakeri love jerk chicken sandwich jerk ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",Luckee,171,lucke came love dinner husband week ago great ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...",Costco,121,costco got worth tire today told would long wa...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...",Planet Organic Market,14,planet organ market given store mani chanc liv...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",Jayde Fuzion,246,jayd fuzion local decid tri jayd sinc love hot...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [21]:
# Print the vocabulary list and the idf values
print('TFIDF Vocabulary: (word:tokenID) ', vectorizer.vocabulary_)
print('The IDF values are: ', vectorizer.idf_)
# summarize encoded vector
print('The first element of the TFIDF sparse matrix is ' + str(vector.toarray()[0]))

The IDF values are:  [8.77085618 8.36539107 8.77085618 ... 8.77085618 8.77085618 8.77085618]
The first element of the TFIDF sparse matrix is [0. 0. 0. ... 0. 0. 0.]


# Save to File

In [0]:
#data.to_json( path_or_buf = main_dir + 'features.json', orient = 'records', lines = True, index = True)