# **Get Data** 


In [1]:
#from google.colab import drive
#drive.mount('/content/gdrive')
!git clone https://github.com/AndreaJJCC/CategorySuggestion.git

fatal: destination path 'CategorySuggestion' already exists and is not an empty directory.


# **Import Necessary Libraries**

In [2]:
# Other necessary installations/downloads
import nltk
nltk.download('punkt')
!pip install efficient-apriori
from efficient_apriori import apriori

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# Imports
import json #lines
import pandas as pd
import re 
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import random

In [0]:
# Define file paths
main_dir = '/content/CategorySuggestion/'
business_dir = main_dir + 'yelp_academic_dataset_business.json'
reviews_dir = main_dir + 'temp_reviews.json'

In [5]:
os.chdir('/content/CategorySuggestion/')
!unzip -o /content/CategorySuggestion/yelp_academic_dataset_business.zip

Archive:  /content/CategorySuggestion/yelp_academic_dataset_business.zip
  inflating: yelp_academic_dataset_business.json  


#**Load and Visualize Data**

In [0]:
# Define function to load files
# Input: file directory
# Output: dictionary of json objects
def load_data( directory):
  with open(directory) as f:
    data = []
    for line in f:
      data.append(json.loads(line))
  return data

In [7]:
# Load json files as dictionaries
# and convert dictionaries to pandas dataframe
business_df = pd.DataFrame.from_dict(load_data(business_dir))
business_records = business_df.shape[0]
print('Business data\n' + 'Loaded ' + str(business_records) + ' records.')
business_df.head(2) 


Business data
Loaded 188593 records.


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV


In [8]:
# Load json files as dictionaries
# and convert dictionaries to pandas dataframe
reviews_df = pd.DataFrame.from_dict(load_data(reviews_dir))
print('Reviews data\n' + 'Loaded ' + str(reviews_df.shape[0]) + ' records.')
reviews_df.head(2)

Reviews data
Loaded 5000 records.


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25,0,x7mDIiDB3jEiPGPHOmDzyw,2,The pizza was okay. Not the best I've had. I p...,0,msQe1u7Z_XuqjGoqhB0J5g
1,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13,0,dDl8zu1vWPdKGihJrwQbpw,5,I love this place! My fiance And I go here atl...,0,msQe1u7Z_XuqjGoqhB0J5g


# **Preprocess Data**

In [0]:
# Define function to convert text to lowercase and  remove punctuation
def to_lower_and_punc( col ):
  return col.astype(str)\
            .str.lower()\
            .str.replace('[^a-z\s]', '')


## **Begin Preprocessing of Categories List**

In [10]:
# Convert categories column to lowercase
business_df['categories'] = to_lower_and_punc(business_df.categories)
# Tokenize categories column
business_df['categories'] = business_df['categories'].apply(nltk.word_tokenize)
print(business_df.categories[0:4])

0    [tours, breweries, pizza, restaurants, food, h...
1    [chicken, wings, burgers, caterers, street, ve...
2    [breakfast, brunch, restaurants, french, sandw...
3                     [insurance, financial, services]
Name: categories, dtype: object


## **Begin Preprocessing of business dataframe**

In [11]:
# Filter out only necessary information from business dataframe
business_df = business_df[['business_id', 'categories', 'name', 'review_count']]

# Filter out businesses that have n or more reviews
n = 5
business_df = business_df[business_df.review_count > n]
rev_filtered_business = business_df.shape[0]
print('Number of businesses with more than ' + str(n) + ' reviews = ' + str(rev_filtered_business))

# Filter out businesses that do not have a category assigned
business_df = business_df.dropna(subset = ['categories'])
cat_filtered_business = business_df.shape[0]
print('Number of businesses with categories (i.e. category not null) ', cat_filtered_business)

print('Percentage of filtered businesses = %2.2f%% (%2d/%2d)' % ( ((cat_filtered_business/business_records) * 100), cat_filtered_business, business_records ) )

# View dataframe
#business_df.head(5)

Number of businesses with more than 5 reviews = 122186
Number of businesses with categories (i.e. category not null)  122186
Percentage of filtered businesses = 64.79% (122186/188593)


## **Preprocess of reviews dataframe and Merge with Business dataframe**

In [12]:
# Filter out only necessary information from  reviews dataframe
reviews_df = reviews_df[['business_id', 'text']]
reviews_df.head(5)

Unnamed: 0,business_id,text
0,iCQpiavjjPzJ5_3gPD5Ebg,The pizza was okay. Not the best I've had. I p...
1,pomGBqfbxcqPv14c3XH-ZQ,I love this place! My fiance And I go here atl...
2,jtQARsP6P-LbkyjbO1qNGg,Terrible. Dry corn bread. Rib tips were all fa...
3,elqbBhBfElMNSrjFqW3now,Back in 2005-2007 this place was my FAVORITE t...
4,Ums3gaP2qM3W1XcA5r6SsQ,Delicious healthy food. The steak is amazing. ...


In [13]:
# Join business_df and reviews_df by business_id
# This will give us a dataframe with only necessary
# Information and filtered out businesses with
# Predefined number of reviews

data = pd.merge(business_df, reviews_df, how = 'inner')
print('The total number of records in the dataframe is ' + str(data.shape[0]))
data.head(5)

The total number of records in the dataframe is 4740


Unnamed: 0,business_id,categories,name,review_count,text
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",Allwyn's Bakery,105,Love the jerk chicken sandwich and jerk chicke...
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",Luckee,171,Came here for a lovely dinner with husband be...
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...",Costco,121,Got $1000 worth of tires today. They told me i...
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...",Planet Organic Market,14,I have given this store so many chances becaus...
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",Jayde Fuzion,246,We are locals and decided to try Jayde since w...


In [14]:
# For each review: Remove punctuation and convert to lowercase
data['text'] = to_lower_and_punc(data.text)
# Also normalize the names of businesses
data['name'] = to_lower_and_punc(data.name)
data.head(5)

Unnamed: 0,business_id,categories,name,review_count,text
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",allwyns bakery,105,love the jerk chicken sandwich and jerk chicke...
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171,came here for a lovely dinner with husband be...
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...",costco,121,got worth of tires today they told me it woul...
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...",planet organic market,14,i have given this store so many chances becaus...
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",jayde fuzion,246,we are locals and decided to try jayde since w...


In [15]:
data['name_text'] = data['name'] + ' ' + data['text']
data.head(5)

Unnamed: 0,business_id,categories,name,review_count,text,name_text
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",allwyns bakery,105,love the jerk chicken sandwich and jerk chicke...,allwyns bakery love the jerk chicken sandwich ...
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171,came here for a lovely dinner with husband be...,luckee came here for a lovely dinner with husb...
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...",costco,121,got worth of tires today they told me it woul...,costco got worth of tires today they told me ...
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...",planet organic market,14,i have given this store so many chances becaus...,planet organic market i have given this store ...
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",jayde fuzion,246,we are locals and decided to try jayde since w...,jayde fuzion we are locals and decided to try ...


## Duplicate Data and assign one label

In [0]:
# Define function to duplicate data from main dataframe
# Input: dataframe
# Output: dataframe with duplicated records (one copy of record per category in categories column)
def duplicate_data(dataframe):
  
  i = 0
  cat_list = []   # list of single categories for duplicated records
  duplicate = []  # list of number of times a record must be duplicated
  # For the category list of each record
  for lst in dataframe.categories:
    if lst == dataframe.categories[i]:
      # save the number of times a record will be duplicate -> len(lst) times
      # (appends an int to duplicate list, which is used later)
      duplicate.append(len(lst))
      # Also form a list of individual category labels for duplicated records
      # i.e. Record 1) [restautarant, bars]
      #      Record 2) [food, tea] 
      # --> [[restaurant],[bars], [food], [tea]]
      for category in lst:
        cat_list.append(category)
      i = i + 1
      
  # duplicate each record based on the value of duplicate list    
  dataframe = dataframe.loc[np.repeat(dataframe.index.values, duplicate)].reset_index(drop = True)
  # Create column of individual labels for each record, including duplicated records
  dataframe['label'] = cat_list
  print("The number of single categories is ", len(cat_list))
  return  dataframe

# **Create Features with TfidfVectorizer**

In [17]:
# Create the transform
tfidf_vectorizer = TfidfVectorizer()

# Tokenize text column and build vocabulary
tfidf_vectorizer.fit(data['text'])

# Print the vocabulary list and the idf values
print('Vocabulary: (word:tokenID) ', tfidf_vectorizer.vocabulary_)
#myidf = tfidf_vectorizer.idf_
#print('The IDF values are: ', myidf)

# Create sparse matrix
tfidf_vector = tfidf_vectorizer.transform(data['text'])

# summarize encoded vector
print('The size of the sparse matrix is ' + str(tfidf_vector.shape))
print('The first element of the sparse matrix is ' + str(tfidf_vector.toarray()[0]))

# Create column tfidf_features and add the array values
# returned from TfidfVectorizer for each record
data['tfidf_features'] = list(tfidf_vector.toarray())

The size of the sparse matrix is (4740, 20475)
The first element of the sparse matrix is [0. 0. 0. ... 0. 0. 0.]


In [0]:
# Initizalize the transform
name_tfidf_vectorizer = TfidfVectorizer() # uses text + business_name as features
# Tokenize text column and build vocabulary
name_tfidf_vectorizer.fit(data['name_text']) # uses text + business_name as features
# Create sparse matrix
name_tfidf_vector = name_tfidf_vectorizer.transform(data['name_text'])
# Create column tfidf_features and add the array values
# returned from TfidfVectorizer for each record
data['name_tfidf_features'] = list(name_tfidf_vector.toarray())

In [19]:
# duplicate each record and assign single category to each copy of a specific record
dup_data = duplicate_data(data)
dup_data.head(15)

The number of single categories is  30189


Unnamed: 0,business_id,categories,name,review_count,text,name_text,tfidf_features,name_tfidf_features,label
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",allwyns bakery,105,love the jerk chicken sandwich and jerk chicke...,allwyns bakery love the jerk chicken sandwich ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",caribbean
1,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",allwyns bakery,105,love the jerk chicken sandwich and jerk chicke...,allwyns bakery love the jerk chicken sandwich ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",food
2,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",allwyns bakery,105,love the jerk chicken sandwich and jerk chicke...,allwyns bakery love the jerk chicken sandwich ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",bakeries
3,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",allwyns bakery,105,love the jerk chicken sandwich and jerk chicke...,allwyns bakery love the jerk chicken sandwich ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",restaurants
4,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171,came here for a lovely dinner with husband be...,luckee came here for a lovely dinner with husb...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",restaurants
5,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171,came here for a lovely dinner with husband be...,luckee came here for a lovely dinner with husb...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",chinese
6,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171,came here for a lovely dinner with husband be...,luckee came here for a lovely dinner with husb...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",dim
7,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171,came here for a lovely dinner with husband be...,luckee came here for a lovely dinner with husb...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",sum
8,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171,came here for a lovely dinner with husband be...,luckee came here for a lovely dinner with husb...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",breakfast
9,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171,came here for a lovely dinner with husband be...,luckee came here for a lovely dinner with husb...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",brunch


## Train Model

In [20]:
# Split dataset into 8:2 ratio
training_data = dup_data.sample(frac = 0.8, random_state = np.random.RandomState(seed = None)) #random.randint(0, dup_data.shape[0]))

print('Training data has ' + str(training_data.shape[0]) + ' records.')

testing_data = dup_data[~dup_data.isin(training_data)].dropna()

print('Testing data has ' + str(testing_data.shape[0]) + ' records.')

Training data has 24151 records.
Testing data has 6038 records.


In [21]:
# Initialize model
tfidf_model = MultinomialNB()
# Train model with training data
tfidf_model.fit(training_data.tfidf_features.tolist(), training_data.label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
# Initialize model
name_tfidf_model = MultinomialNB()
# Train model with training data
name_tfidf_model.fit(training_data.name_tfidf_features.tolist(), training_data.label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# Creates a list of lists of the top N classes
# Input: model, from which we get the classes/labels
#        probabilities list, list of lists
#        N, the number of classes to return
# Output: returns a series of the top N classes of a given list
def getNclasses( model, probabilities, N):

  topNclasses = [] # list that will contain top N classes for a given record
  # for each probabaility vector (for each record)
  for value in probabilities:
    # get tuple (probability, class/label)
    # sort probabilities in descending order
    # get top N tuples i.e. the tuples with higher probabilities
    tmp = sorted(zip(value, model.classes_), reverse=True)[:N]

    topN = []
    # for each tuple (probabilities, class/label) only append the class/label
    # to the topN list for each record
    # i.e. for a given record: [(0.25, food), (0.2, tea), (0.1, bakery)]
    # it returns [food, tea, bakery]
    for e in tmp:
      topN.append(e[1])
    # then append this to the topNclasses list, which if list of lists
    # i.e. list of top N categories list
    # i.e topNClasses = [ [(record1) food, tea, bakery], [(record2) restaurant, bar] ] 
    topNclasses.append(topN)
    
  return pd.Series(topNclasses)

In [0]:
# Compares two lists to check if the element in one list
# is contained in another list.
# Input: two list to be compared namely predicted and actual
#        measure, used to write to a text file. Measure specifies substring to be 
#        used for the file name
# Output: in_list -> list of values, 0 => values in list1 are not in list2
#                         # => # of values of list1 that exist in list2
#         length-> number of actual labels for a given business
#         output file in the format [predicted labels] [actual labels]
def compareLists( predicted, actual, measure):
 
  tp_list, fp_list, length = [], [], [] # saves values for each record (tp/fp/len(actual))
#  outfile = open(main_dir + '../' + measure + '_comparisons.txt', mode='wt', encoding='utf-8')
#  Predicted, Actual = 'Predicted Labels', 'Actual'
#  outfile.write(f'{Predicted:56} {Actual}\n')
  
  j = 0 # Index of predicted labels
  
  # For each actual_label, iterate and check if any of the predicted labels
  # are in the actual_label list
  for actual_label in actual:
    tp = 0 # True positives - Keep track of how many pred_label are actual_label
    fp = 0 # False negavtives - Keep track of how many pred_label are not actual_label
    # write to file the predicted list and the actual label
 #   outfile.write(str(predicted[j]) + " " + str(actual_label) + "\n")

    # iterate over predicted list. If any of the predicted labels appear
    # in actual_label cnt += 1, else cnt = 0
    for pred_label in predicted.iloc[j]:
      if str(pred_label) in actual_label:
        tp = tp + 1
      else:
        fp = fp +1
    # append the results for a given record to in_list
    tp_list.append(tp)
    fp_list.append(fp)
    length.append(len(actual_label))
    j = j + 1
#  outfile.close()
  return tp_list, fp_list, length

In [25]:
testing_data = testing_data.reset_index(drop = True)

# Get probabilities of all classes for each review
tfidf_prob = tfidf_model.predict_proba(testing_data.tfidf_features.tolist())
name_tfidf_prob = name_tfidf_model.predict_proba(testing_data.name_tfidf_features.tolist())
# Get topNclasses for each review
testing_data['tfidf_topNclasses'] = getNclasses(tfidf_model, tfidf_prob, 10) 
testing_data['name_tfidf_topNclasses'] = getNclasses(name_tfidf_model, name_tfidf_prob, 10)

testing_data.head(5)

Unnamed: 0,business_id,categories,name,review_count,text,name_text,tfidf_features,name_tfidf_features,label,tfidf_topNclasses,name_tfidf_topNclasses
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]",allwyns bakery,105.0,love the jerk chicken sandwich and jerk chicke...,allwyns bakery love the jerk chicken sandwich ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",caribbean,"[restaurants, food, bars, american, nightlife,...","[restaurants, food, bars, american, nightlife,..."
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171.0,came here for a lovely dinner with husband be...,luckee came here for a lovely dinner with husb...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",chinese,"[restaurants, bars, food, american, nightlife,...","[restaurants, bars, food, american, nightlife,..."
2,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...",luckee,171.0,came here for a lovely dinner with husband be...,luckee came here for a lovely dinner with husb...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",sum,"[restaurants, bars, food, american, nightlife,...","[restaurants, bars, food, american, nightlife,..."
3,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",jayde fuzion,246.0,we are locals and decided to try jayde since w...,jayde fuzion we are locals and decided to try ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",day,"[restaurants, bars, food, american, services, ...","[restaurants, bars, food, american, services, ..."
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...",jayde fuzion,246.0,we are locals and decided to try jayde since w...,jayde fuzion we are locals and decided to try ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",casinos,"[restaurants, bars, food, american, services, ...","[restaurants, bars, food, american, services, ..."


In [26]:
# Create list of tuples (categories) to input to apriori algorithm
items_list = []
for item in testing_data.categories:
  items_list.append(tuple(item))
print(items_list[0:2])

[('caribbean', 'food', 'bakeries', 'restaurants'), ('restaurants', 'chinese', 'dim', 'sum', 'breakfast', 'brunch')]


In [0]:
# Input: list[tuples], min_support float, min_confidence float-max = 1, max_length int
# output: dict{ numOfValuesInSets:itemsets}, list[rules]
itemsets, rules = apriori(items_list, min_support=0.003,  min_confidence=1, max_length = 5)

In [28]:
# Create a list of sets from dict{ numOfValuesInSets:{itemsets}}
# i.e Go from {1: {('accessories',): 44, ('yogurt',): 145},  <-set size = 1
#              2: {('accessories', 'computer'): 29}}  <-set size = 2
# to [{'accessories'}, {'yogurt'}, {'accessories', 'computer'}]
sets_list = []
for value in itemsets.values():
  for element in value:
    sets_list.append(set(element))
print(sets_list[0:5])

[{'accessories'}, {'active'}, {'activities'}, {'acupuncture'}, {'adult'}]


In [0]:
# Compare the predicted classes from Naive Bayes
# to the sets returned by apriori algorithm
# Return the best(longest) matched sub/set
def compare_to_set(predicted, sets_list):
  myList = []
  prev = int()
  for pred in predicted: # prediction from Naive Bayes
      pred_vals = set(pred) # cast list to set
      longest_set = set()
      for item in reversed(sets_list): # itemsets returned by apriori
        if pred_vals >= item:  # if itemset is a subset of prediction by Naive Bayes
          temp_set = item & pred_vals # Intersection of both sets, returns elements in item and pred_vals
          longest_set.update(temp_set) # Update subset to get longest possible subset
          """"# If size of prev_subset > current subset break because
          # current subset is a subset of the previous subset
          # i.e. {'breakfast', 'cafes'} is a subset of {'breakfast', 'cafes', 'brunch'}
          if prev > len(item):
            myList.append(list(longest_set)) # append longest set to list that will be returned
            break
          prev = len(item)
          """
      myList.append(list(longest_set))
  return myList

In [0]:
tfidf_pred_sets = compare_to_set(testing_data.tfidf_topNclasses, sets_list)
name_pred_sets = compare_to_set(testing_data.name_tfidf_topNclasses, sets_list)

In [0]:
# compare the list of predicted labels to the actual labels
# compare the list of predicted labels to the actual labels
tfidf_tp, tfidf_fp, tfidf_length = compareLists(testing_data.tfidf_topNclasses, testing_data.categories, 'count')
name_tfidf_tp, name_tfidf_fp, name_tfidf_length = compareLists(testing_data.name_tfidf_topNclasses, testing_data.categories, 'count')

tfidf_set_tp, tfidf_set_fp, tfidf_set_length = compareLists(pd.Series(tfidf_pred_sets), testing_data.categories, 'count')
name_set_tp, name_set_fp, name_set_length = compareLists(pd.Series(name_pred_sets), testing_data.categories, 'count')

In [32]:
# Display precison and recall
print("TfidfVectorizer raw (stopwords not removed)")
tfidf_recall = (np.mean(np.divide(tfidf_tp,tfidf_length)) * 100)
tfidf_precision = (np.mean(np.divide(tfidf_tp, np.add(tfidf_tp, tfidf_fp)))) * 100
print("TFIDF Recall: %2.4f               |  TFIDF Precision: %2.4f" %(tfidf_recall,tfidf_precision))

name_tfidf_recall = (np.mean(np.divide(name_tfidf_tp,name_tfidf_length)) * 100)
name_tfidf_precision = (np.mean(np.divide(name_tfidf_tp, np.add(name_tfidf_tp, name_tfidf_fp)))) * 100
print("TFIDF w/ Name Recall: %2.4f       |  TFIDF w/ Name Precision: %2.4f" %(name_tfidf_recall,name_tfidf_precision))

print("===========================================================================")

tfidf_set_recall = (np.mean(np.divide(tfidf_set_tp,tfidf_set_length)) * 100)
tfidf_set_precision = (np.mean(np.divide(tfidf_set_tp, np.add(tfidf_set_tp, tfidf_set_fp)))) * 100
print("TFIDF Set Recall: %2.4f           |  TFIDF Set Precision: %2.4f" %(tfidf_set_recall,tfidf_set_precision))

name_set_recall = (np.mean(np.divide(name_set_tp,name_set_length)) * 100)
name_set_precision = (np.mean(np.divide(name_set_tp, np.add(name_set_tp, name_set_fp)))) * 100
print("TFIDF Set w/ Name Recall: %2.4f   |  TFIDF Set w/ Name Precision: %2.4f" %(name_set_recall,name_set_precision))

TfidfVectorizer raw (stopwords not removed)
TFIDF Recall: 36.6145               |  TFIDF Precision: 28.0308
TFIDF w/ Name Recall: 37.8548       |  TFIDF w/ Name Precision: 28.9185
TFIDF Set Recall: 36.6145           |  TFIDF Set Precision: 28.0308
TFIDF Set w/ Name Recall: 37.8548   |  TFIDF Set w/ Name Precision: 28.9185
