# **Get Data** 


In [1]:
#from google.colab import drive
#drive.mount('/content/gdrive')
!git clone https://github.com/AndreaJJCC/CategorySuggestion.git

Cloning into 'CategorySuggestion'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 18 (delta 4), reused 14 (delta 2), pack-reused 0[K
Unpacking objects: 100% (18/18), done.


# **Import Necessary Libraries**

In [0]:
# Imports
import json #lines
import pandas as pd
import re 
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import random

In [3]:
os.chdir('/content/CategorySuggestion/')
!unzip -o /content/CategorySuggestion/features.zip

Archive:  /content/CategorySuggestion/features.zip
  inflating: features.json           


In [0]:
# Define file paths
main_dir = '/content/CategorySuggestion/'
features_dir = main_dir + 'features.json'
sets_dir = main_dir + 'sets.txt'

# **Helper Functions**

In [0]:
# Define function to duplicate data from main dataframe
# Input: dataframe
# Output: dataframe with duplicated records (one copy of record per category in categories column)
def duplicate_data(dataframe):
  
  i = 0
  cat_list = []   # list of single categories for duplicated records
  duplicate = []  # list of number of times a record must be duplicated
  # For the category list of each record
  for lst in dataframe.categories:
    if lst == dataframe.categories[i]:
      # save the number of times a record will be duplicate -> len(lst) times
      # (appends an int to duplicate list, which is used later)
      duplicate.append(len(lst))
      # Also form a list of individual category labels for duplicated records
      # i.e. Record 1) [restautarant, bars]
      #      Record 2) [food, tea] 
      # --> [[restaurant],[bars], [food], [tea]]
      for category in lst:
        cat_list.append(category)
      i = i + 1
      
  # duplicate each record based on the value of duplicate list    
  dataframe = dataframe.loc[np.repeat(dataframe.index.values, duplicate)].reset_index(drop = True)
  # Create column of individual labels for each record, including duplicated records
  dataframe['label'] = cat_list
  print("The number of single categories is ", len(cat_list))
  return  dataframe

In [0]:
# Creates a list of lists of the top N classes
# Input: model, from which we get the classes/labels
#        probabilities list, list of lists
#        N, the number of classes to return
# Output: returns a series of the top N classes of a given list
def getNclasses( model, probabilities, N):

  topNclasses = [] # list that will contain top N classes for a given record
  # for each probabaility vector (for each record)
  for value in probabilities:
    # get tuple (probability, class/label)
    # sort probabilities in descending order
    # get top N tuples i.e. the tuples with higher probabilities
    tmp = sorted(zip(value, model.classes_), reverse=True)[:N]
    topN = []
    # for each tuple (probabilities, class/label) only append the class/label
    # to the topN list for each record
    # i.e. for a given record: [(0.25, food), (0.2, tea), (0.1, bakery)]
    # it returns [food, tea, bakery]
    for e in tmp:
      topN.append(e[1])
    # then append this to the topNclasses list, which if list of lists
    # i.e. list of top N categories list
    # i.e topNClasses = [ [(record1) food, tea, bakery], [(record2) restaurant, bar] ] 
    topNclasses.append(topN)
    
  return pd.Series(topNclasses)

In [0]:
# Compares two lists to check if the element in one list
# is contained in another list.
# Input: two list to be compared namely predicted and actual
# Output: tp_list -> list of true positive values for each record,
#                    # of pred_labels that are actual_labels
#         fp_list -> list of false   
#                         # => # of values of list1 that exist in list2
#         length-> number of actual labels for a given business
def compareLists( predicted, actual):
 
  tp_list, fp_list, length = [], [], [] # saves values for each record (tp/fp/len(actual))
  
  j = 0 # Index of predicted labels
  
  # For each actual_label, iterate and check if any of the predicted labels
  # are in the actual_label list
  for actual_label in actual:
    tp = 0 # True positives - Keep track of how many pred_label are actual_label
    fp = 0 # False negavtives - Keep track of how many pred_label are not actual_label
    
    # iterate over predicted list. If any of the predicted labels appear
    # in actual_label cnt += 1, else cnt = 0
    for pred_label in predicted.iloc[j]:
      if str(pred_label) in actual_label:
        tp = tp + 1
      else:
        fp = fp +1
    # append the results for a given record to in_list
    tp_list.append(tp)
    fp_list.append(fp)
    length.append(len(actual_label))
    j = j + 1

  return tp_list, fp_list, length

In [0]:
# Compare the predicted classes from Naive Bayes
# to the sets returned by apriori algorithm
# Return the best(longest) matched sub/set
def compare_to_set(predicted, sets_list):
  myList = []
  prev = int()
  for pred in predicted: # prediction from Naive Bayes
      pred_vals = set(pred) # cast list to set
      longest_set = set()
      for item in reversed(sets_list): # itemsets returned by apriori
        if pred_vals >= item:  # if itemset is a subset of prediction by Naive Bayes
          temp_set = item & pred_vals # Intersection of both sets, returns elements in item and pred_vals
          longest_set.update(temp_set) # Update subset to get longest possible subset
          """"# If size of prev_subset > current subset break because
          # current subset is a subset of the previous subset
          # i.e. {'breakfast', 'cafes'} is a subset of {'breakfast', 'cafes', 'brunch'}
          if prev > len(item):
            myList.append(list(longest_set)) # append longest set to list that will be returned
            break
          prev = len(item)
          """
      myList.append(list(longest_set))
  return myList

#**Load and Visualize Data**

In [0]:
# Define function to load files
# Input: file directory
# Output: dictionary of json objects
def load_data( directory):
  with open(directory) as f:
    data = []
    for line in f:
      data.append(json.loads(line))
  return data

In [10]:
data_df = pd.DataFrame.from_dict(load_data(features_dir))
data_df = data_df.drop(columns =['tfidf_features'])
data_df.head(5)

Unnamed: 0,business_id,categories,count_features,name,review_count,text
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",allwyns bakery,105,allwyns bakery love jerk chicken sandwich jerk...
1,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",luckee,171,luckee came lovely dinner husband weeks ago gr...
2,2ktKjN5z8EcqmUv6EDiDgA,"[fashion, department, stores, automotive, shop...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",costco,121,costco got worth tires today told would long w...
3,ADmJgVJ82zdLzffdaH1gVw,"[food, specialty, food, organic, stores, healt...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",planet organic market,14,planet organic market given store many chances...
4,V90fC_aF-_DNYzQvUtbLww,"[hotels, travel, asian, fusion, day, spas, cas...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",jayde fuzion,246,jayde fuzion locals decided try jayde since lo...


In [11]:
# duplicate each record and assign single category to each copy of a specific record
dup_data = duplicate_data(data_df)
dup_data.head(11)

The number of single categories is  30189


Unnamed: 0,business_id,categories,count_features,name,review_count,text,label
0,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",allwyns bakery,105,allwyns bakery love jerk chicken sandwich jerk...,caribbean
1,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",allwyns bakery,105,allwyns bakery love jerk chicken sandwich jerk...,food
2,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",allwyns bakery,105,allwyns bakery love jerk chicken sandwich jerk...,bakeries
3,YIez_A3WOt9J2SXN7OMa2Q,"[caribbean, food, bakeries, restaurants]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",allwyns bakery,105,allwyns bakery love jerk chicken sandwich jerk...,restaurants
4,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",luckee,171,luckee came lovely dinner husband weeks ago gr...,restaurants
5,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",luckee,171,luckee came lovely dinner husband weeks ago gr...,chinese
6,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",luckee,171,luckee came lovely dinner husband weeks ago gr...,dim
7,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",luckee,171,luckee came lovely dinner husband weeks ago gr...,sum
8,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",luckee,171,luckee came lovely dinner husband weeks ago gr...,breakfast
9,YkAIlxYZ1guSqbbowU9X4g,"[restaurants, chinese, dim, sum, breakfast, br...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",luckee,171,luckee came lovely dinner husband weeks ago gr...,brunch


## Split Data

In [12]:
# Split dataset into 8:2 ratio
training_data = dup_data.sample(frac = 0.8, random_state = np.random.RandomState(seed = None)) #random.randint(0, dup_data.shape[0]))

print('Training data has ' + str(training_data.shape[0]) + ' records.')

testing_data = dup_data[~dup_data.isin(training_data)].dropna()

print('Testing data has ' + str(testing_data.shape[0]) + ' records.')

Training data has 24151 records.
Testing data has 6038 records.


In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
parameters = {'n_estimators': [10, 20, 25], 
              'min_samples_split': [2, 3, 4], 
              'max_features': ['sqrt', 'auto', 'log2'], 
              'bootstrap': [True, False]}

In [15]:
parameters

{'bootstrap': [True, False],
 'max_features': ['sqrt', 'auto', 'log2'],
 'min_samples_split': [2, 3, 4],
 'n_estimators': [10, 20, 25]}

In [0]:
RF = RandomForestClassifier(n_estimators = 10)

In [0]:
clf = GridSearchCV(RF,parameters,'recall_micro', n_jobs = -1, cv = 5, refit = 'recall_micro') 
#clf = GridSearchCV('estimator' = RF, 'param_grid' = parameters, 'scoring' = ['recall', 'precision'], 'n_jobs' = -1, 'iid' = False, 'cv'=5, refit = 'recall')

In [0]:
x_train = training_data.count_features.tolist()
y_train = training_data.label

In [20]:
clf.fit(x_train, y_train)



KeyboardInterrupt: ignored

In [0]:
sorted(clf.cv_results_.keys())

In [0]:
clf.best_score_ * 100

In [0]:
clf.best_params_

## Train Model

In [0]:
# Initialize model
model = RandomForestClassifier(n_estimators=10, max_depth=2, min_samples_split=2, min_samples_leaf=1, bootstrap=True)

In [0]:
from sklearn.model_selection import cross_validate
cv_results = cross_validate(model, x_train, y_train, cv=5, return_train_score=True)

In [0]:
cv_results['train_score']

In [0]:
# Train model with training data
x_train = training_data.count_features.tolist()
y_train = training_data.label
model.fit(x_train, y_train)

In [0]:
testing_data = testing_data.reset_index(drop = True)

# Get probabilities of all classes for each review
#clean_cnt_prob = clean_count_model.predict_proba(testing_data.clean_count_features.tolist())
x_test = testing_data.count_features.tolist()
cnt_prob = model.predict_proba(x_test)
# Get topNclasses for each review
#testing_data['count_topNclasses'] = getNclasses(clean_count_model, clean_cnt_prob, 10) 
testing_data['count_topNclasses'] = getNclasses(model, cnt_prob, 10)

testing_data.head(5)

In [0]:
"""with open(sets_dir) as sets_file:
  sets_list = []
  for item in sets_file:
    #print(set([item]))
    sets_list.append(item)
    
print(sets_list[0:5])
"""

In [0]:
#pred_sets = compare_to_set(testing_data.count_topNclasses, sets_list)

In [0]:
# compare the list of predicted labels to the actual labels
cnt_tp, cnt_fp, cnt_length = compareLists(testing_data.count_topNclasses, testing_data.categories)
#set_tp, set_fp, set_length = compareLists(pd.Series(pred_sets), testing_data.categories)

In [0]:
# Display precison and recall
cnt_recall = (np.mean(np.divide(cnt_tp,cnt_length)) * 100)
cnt_precision = (np.mean(np.divide(cnt_tp, np.add(cnt_tp, cnt_fp)))) * 100
print("Count w/ Name Recall: %2.4f       |  Count w/ Name Precision: %2.4f" %(cnt_recall,cnt_precision))

"""
print("===========================================================================")

set_recall = (np.mean(np.divide(set_tp,set_length)) * 100)
set_precision = (np.mean(np.divide(set_tp, np.add(set_tp, set_fp)))) * 100
print("Count Set w/ Name Recall: %2.4f   |  Count Set w/ Name Precision: %2.4f" %(set_recall,set_precision))
"""