In [1]:
import pandas as pd
import numpy as np
import json
import time 

dataset_path = '../data-mining-3/' # Change this path to match your local dataset folder path

business_filename = 'yelp_academic_dataset_business.json'
review_filename = 'yelp_academic_dataset_review.json'

# Reading business file
start = time.time()

list_business = []
categories_order = ['Beauty & Spas', 'Shopping', 'Bars']

with open(dataset_path + business_filename, 'r', encoding="utf8") as buisness_file:
    for line in buisness_file:
        json_dict = json.loads(line)
        if  json_dict['city'] == 'Toronto' and \
            json_dict['review_count'] >= 10 and \
            json_dict['categories'] is not None: # and \
            #any(word in json_dict['categories'] for word in categories_order):
                cat = [x.strip(" ") for x in json_dict['categories'].split(",")]
                if "Beauty & Spas" in cat:
                    temp_cat = "Beauty & Spas"
                    list_business.append([json_dict['business_id'], temp_cat])
                elif "Shopping" in cat:
                    temp_cat = "Shopping"
                    list_business.append([json_dict['business_id'], temp_cat])
                elif "Bars" in cat:
                    temp_cat = "Bars"
                    list_business.append([json_dict['business_id'], temp_cat]) 
                # The following line makes sure that every buisness categoy is
                # set to one of the categories_order list in that specific order.
                # temp_cat = [cat for cat in categories_order if cat in json_dict['categories']][0]
                # list_business.append([json_dict['business_id'], temp_cat])

stop = time.time()
print("Read file in {:.3f} seconds".format(stop-start))

df_buisness = pd.DataFrame(list_business, columns=['business_id', 'categories'])
display(df_buisness.head())
print('{} businesses in df_buisness'.format(df_buisness['business_id'].size))

Read file in 2.228 seconds


Unnamed: 0,business_id,categories
0,cicPsia8Wj-DNRkmLbD_xg,Bars
1,xVXyrTWbG8U3szze-aA7eg,Bars
2,e-tRKAC-q40SqQfAOwYa-A,Beauty & Spas
3,C9keC4mWuXdl2mYFHZXudQ,Shopping
4,PFS9kf3U-ZCvpqay3AaNnQ,Shopping


2991 businesses in df_buisness


In [2]:
# Extracting unique buisness ids
np_businesses = np.array(list_business)
np_businesses_ids = np_businesses[:,0]
np_businesses_ids

array(['cicPsia8Wj-DNRkmLbD_xg', 'xVXyrTWbG8U3szze-aA7eg',
       'e-tRKAC-q40SqQfAOwYa-A', ..., 'wjqOdj0XJUDOOtU9LjRlWQ',
       'AqpB2IoLkUupDCuH-hmVdg', '0hudPyuCBlKg79OwKBw-eQ'], dtype='<U22')

In [3]:
def find_buisness_index(business_id):
    index = np.where(np_businesses_ids == business_id)[0]
    return index

# Reading reveiws file
start = time.time()
business_reviews = [[] for i in range(int(len(np_businesses_ids)))]

with open(dataset_path + review_filename, 'r', encoding="utf8") as reviews_file:
    for line in reviews_file:
        json_dict = json.loads(line)
        index = find_buisness_index(json_dict['business_id'])
        if index.size > 0:
            if len(business_reviews[index[0]]) == 0:
                business_reviews[index[0]] = json_dict['text']
            else:
                business_reviews[index[0]] += json_dict['text']

stop = time.time()
print("Read file in {:.3f} mins".format((stop-start)/60))

Read file in 5.049 mins


In [4]:
df_business_reviews = pd.DataFrame(business_reviews, columns=['reviews as a single string for each buisness'])
df_business_reviews

Unnamed: 0,reviews as a single string for each buisness
0,"Consistently good, as the Keg tends to be.\n\n..."
1,I would give zero stars. I came here with a gr...
2,A blissful experience! I highly recommended th...
3,If you're a boy and you want to wear some hot ...
4,"As a country girl, I often find myself missing..."
...,...
2986,Good tacos in the downtown core are hard to co...
2987,This used to be my favourite place. It was alw...
2988,Very welcoming place. Great setup and super fr...
2989,I can't beleive I am saying this... but I left...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

# Load custom stopwords 
# source: https://github.com/kavgan/nlp-in-practice/blob/master/tf-idf/resources/stopwords.txt
with open('stopwords.txt', 'r') as text_file:
    lines = text_file.read().split('\n')

stop_words = text.ENGLISH_STOP_WORDS.union(lines)

# Allowing words that are alpharithmetics more than 2 chars, excluding
# common words that exist in more than max_df of docs and rare words that
# exist in less than min_df of docs.
tfidf = TfidfVectorizer(min_df=0.25,
                        max_df=0.85,
                        max_features=8000,
                        token_pattern=r'(?u)\b[A-Za-z][A-Za-z]+\b',
                        lowercase=True,
                        stop_words=stop_words)

document_term_matrix = tfidf.fit_transform(business_reviews)



In [6]:
import random 
samples_num = 30
print('Sample of words included in the features:\n{}\n\n'.format(random.sample(tfidf.get_feature_names(), samples_num)))
print('Sample of effective stop words list.:\n{}'.format(random.sample(tfidf.get_stop_words(), samples_num)))

Sample of words included in the features:
['worth', 'tasty', 'ago', 'ambience', 'decent', 'advice', 'mention', 'affordable', 'saying', 'wouldn', 'general', 'quick', 'bit', 'checking', 'home', 'true', 'crispy', 'finding', 'conversation', 'tried', 'deep', 'knowledgeable', 'recommend', 'liked', 'servers', 'eye', 'usual', 'expect', 'including', 'skin']


Sample of effective stop words list.:
['cest', 'provide', 'g', 'inc', 'eleven', 'somewhere', 'these', 'their', 'sincere', 'accordingly', 'qua', 'against', 'stopped', 'theirs', 'among', 'allest', 'whom', 'again', 'otherwiser', 'five', 'astraddler', 'appears', 'rathest', 'due', 'underneath', 'appropriate', 'till', 'owt', 'every', 'woulded']


In [7]:
# True values table: Μatching categories to numbers
true_labels = np_businesses[:,1]
for i in range(len(true_labels)):
    if true_labels[i]=='Bars':
        true_labels[i]=0
    if true_labels[i]=='Beauty & Spas':
        true_labels[i]=1
    if true_labels[i]=='Shopping':
        true_labels[i]=2
    
true_labels = [int(i) for i in true_labels] 

In [8]:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 5,shuffle = True)

In [9]:
input_train = []
input_test = []
output_train = []
output_test = []

for train_index, test_index in kf.split(document_term_matrix):
    input_train.append(document_term_matrix[train_index])
    input_test.append(document_term_matrix[test_index])
    output_train.append(np.asarray(true_labels)[train_index])
    output_test.append(np.asarray(true_labels)[test_index])

In [10]:
# Importing the libraries
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

average_acc = 0
average_f1 = 0 
average_precision = 0
average_recall = 0
average_conf_matrix = np.zeros([3,3])

for i in range(5):
    
    classifier = KNeighborsClassifier(n_neighbors = 5, p=2) #Euclidean distance
    classifier = classifier.fit(input_train[i], output_train[i])
    
    y_pred = classifier.predict(input_test[i])
    
    average_conf_matrix += confusion_matrix(output_test[i], y_pred)
    average_acc += accuracy_score(output_test[i], y_pred)
    average_f1 += f1_score(output_test[i], y_pred, average='weighted')
    average_precision += precision_score(output_test[i], y_pred, average='weighted')
    average_recall += recall_score(output_test[i], y_pred, average='weighted')
    
print("Average Confusion Matrix is")
print(average_conf_matrix//5)
print("Average accuracy is", average_acc/5)
print("Average F1 Score is", average_f1/5)
print("Average Precision Score is", average_precision/5)
print("Average recall score is", average_recall/5)

Average Confusion Matrix is
[[248.   0.   2.]
 [  1. 159.   4.]
 [ 15.   4. 161.]]
Average accuracy is 0.9525256698734234
Average F1 Score is 0.9521427455487471
Average Precision Score is 0.9532264260275054
Average recall score is 0.9525256698734234


In [11]:
from sklearn.svm import SVC

average_acc = 0
average_f1 = 0 
average_precision = 0
average_recall = 0
average_conf_matrix = np.zeros([3,3])

for i in range(5):
    
    svm = SVC(kernel='rbf', gamma='scale') # Gaussian Kernel
    svm = svm.fit(input_train[i],output_train[i])
    
    y_pred = svm.predict(input_test[i])
    
    average_conf_matrix += confusion_matrix(output_test[i], y_pred)
    average_acc += accuracy_score(output_test[i], y_pred)
    average_f1 += f1_score(output_test[i], y_pred, average='weighted')
    average_precision += precision_score(output_test[i], y_pred, average='weighted', zero_division=0)
    average_recall += recall_score(output_test[i], y_pred, average='weighted')
    
print("Average Confusion Matrix is")
print(average_conf_matrix//5)
print("Average accuracy is", average_acc/5)
print("Average F1 Score is", average_f1/5)
print("Average Precision Score is", average_precision/5)
print("Average recall score is", average_recall/5)

Average Confusion Matrix is
[[249.   0.   2.]
 [  0. 158.   5.]
 [ 11.   1. 168.]]
Average accuracy is 0.9632218692246275
Average F1 Score is 0.9630746093431268
Average Precision Score is 0.9635723022741441
Average recall score is 0.9632218692246275


In [12]:
from sklearn.svm import SVC

average_acc = 0
average_f1 = 0 
average_precision = 0
average_recall = 0
average_conf_matrix = np.zeros([3,3])

for i in range(5):
    
    svm = SVC(kernel='linear', gamma='scale') # Linear Kernel
    svm = svm.fit(input_train[i],output_train[i])
    
    y_pred = svm.predict(input_test[i])
    
    average_conf_matrix += confusion_matrix(output_test[i], y_pred)
    average_acc += accuracy_score(output_test[i], y_pred)
    average_f1 += f1_score(output_test[i], y_pred, average='weighted')
    average_precision += precision_score(output_test[i], y_pred, average='weighted', zero_division=0)
    average_recall += recall_score(output_test[i], y_pred, average='weighted')

print("Average Confusion Matrix is")
print(average_conf_matrix//5)
print("Average accuracy is", average_acc/5)
print("Average F1 Score is", average_f1/5)
print("Average Precision Score is", average_precision/5)
print("Average recall score is", average_recall/5)

Average Confusion Matrix is
[[249.   0.   2.]
 [  1. 158.   6.]
 [ 11.   1. 168.]]
Average accuracy is 0.962218524743022
Average F1 Score is 0.9621030442620884
Average Precision Score is 0.9626689012630552
Average recall score is 0.962218524743022


In [13]:
from sklearn.naive_bayes import GaussianNB

average_acc = 0
average_f1 = 0 
average_precision = 0
average_recall = 0
average_conf_matrix = np.zeros([3,3])

for i in range(5):
    
    nb = GaussianNB()
    nb = nb.fit(input_train[i].todense(),output_train[i])
    
    y_pred = nb.predict(input_test[i].todense())
    
    average_conf_matrix += confusion_matrix(output_test[i], y_pred)
    average_acc += accuracy_score(output_test[i], y_pred)
    average_f1 += f1_score(output_test[i], y_pred, average='weighted')
    average_precision += precision_score(output_test[i], y_pred, average='weighted', zero_division=0)
    average_recall += recall_score(output_test[i], y_pred, average='weighted')

print("Average Confusion Matrix is")
print(average_conf_matrix//5)
print("Average accuracy is", average_acc/5)
print("Average F1 Score is", average_f1/5)
print("Average Precision Score is", average_precision/5)
print("Average recall score is", average_recall/5)

Average Confusion Matrix is
[[246.   0.   5.]
 [  1. 155.   8.]
 [ 12.   9. 160.]]
Average accuracy is 0.938481638851821
Average F1 Score is 0.9380893707112037
Average Precision Score is 0.9382657380521602
Average recall score is 0.938481638851821


In [14]:
# We use LBFGS algorithm for optimization or newton-cg

In [15]:
import sklearn.linear_model as linear_model

average_acc = 0
average_f1 = 0 
average_precision = 0
average_recall = 0
average_conf_matrix = np.zeros([3,3])

for i in range(5):
    
    linear_clf = linear_model.LogisticRegression(solver='newton-cg')
    linear_clf.fit(input_train[i], output_train[i])
    y_pred = linear_clf.predict(input_test[i])
    
    average_conf_matrix += confusion_matrix(output_test[i], y_pred)
    average_acc += accuracy_score(output_test[i], y_pred)
    average_f1 += f1_score(output_test[i], y_pred, average='weighted')
    average_precision += precision_score(output_test[i], y_pred, average='weighted', zero_division=0)
    average_recall += recall_score(output_test[i], y_pred, average='weighted')

print("Average Confusion Matrix is")
print(average_conf_matrix//5)
print("Average accuracy is", average_acc/5)
print("Average F1 Score is", average_f1/5)
print("Average Precision Score is", average_precision/5)
print("Average recall score is", average_recall/5)

Average Confusion Matrix is
[[248.   0.   2.]
 [  0. 157.   6.]
 [ 11.   2. 167.]]
Average accuracy is 0.960212394123986
Average F1 Score is 0.9600498677695212
Average Precision Score is 0.9604558712835848
Average recall score is 0.960212394123986


In [16]:
from sklearn import tree

average_acc = 0
average_f1 = 0 
average_precision = 0
average_recall = 0
average_conf_matrix = np.zeros([3,3])

for i in range(5):
    
    decision_tree = tree.DecisionTreeClassifier()
    decision_tree = decision_tree.fit(input_train[i], output_train[i])

    y_pred = decision_tree.predict(input_test[i])

    average_conf_matrix += confusion_matrix(output_test[i], y_pred)
    average_acc += accuracy_score(output_test[i], y_pred)
    average_f1 += f1_score(output_test[i], y_pred, average='weighted')
    average_precision += precision_score(output_test[i], y_pred, average='weighted', zero_division=0)
    average_recall += recall_score(output_test[i], y_pred, average='weighted')

print("Average Confusion Matrix is")
print(average_conf_matrix//5)
print("Average accuracy is", average_acc/5)
print("Average F1 Score is", average_f1/5)
print("Average Precision Score is", average_precision/5)
print("Average recall score is", average_recall/5)

Average Confusion Matrix is
[[234.   1.  15.]
 [  2. 152.  10.]
 [ 12.  10. 159.]]
Average accuracy is 0.9127386223415839
Average F1 Score is 0.9128872693159515
Average Precision Score is 0.9135174534050053
Average recall score is 0.9127386223415839


## ΕΡΩΤΗΜΑ Β

In [19]:
import gensim

filepath = '/home/left/github/data-mining-3/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(filepath, binary=True)


In [31]:
vectors = [model[x] for x in business_reviews[0].split(' ')]

KeyError: "word 'good,' not in vocabulary"

In [29]:
business_reviews[0].split(' ')

['Consistently',
 'good,',
 'as',
 'the',
 'Keg',
 'tends',
 'to',
 'be.\n\nHighlights:',
 'great',
 'lunchtime',
 'filet',
 'and',
 'side',
 'vegetables;',
 'great',
 'service',
 'that',
 'checked',
 'in',
 'with',
 'just',
 'the',
 'right',
 'frequency;',
 'bowls',
 'and',
 'bowls',
 'of',
 'those',
 'great',
 'chocolate',
 'mints',
 'scattered',
 'throughout',
 'the',
 'restaurant',
 'so',
 'that',
 'you',
 'can',
 'pocket',
 'a',
 'few',
 'extra',
 ';)',
 'directly',
 'across',
 'from',
 'the',
 'south',
 'entrance/exit',
 'of',
 'the',
 'Eglinton',
 'TTC',
 'stop.\n\nMedium:',
 'parking',
 'available',
 'underground,',
 'but',
 'it',
 'was',
 'a',
 'bit',
 'expensive;',
 'side',
 'caesar',
 "wasn't",
 'served',
 'a',
 'lemon',
 'wedge.\n\nLow:',
 'opens',
 'exactly',
 'at',
 '11:30',
 'am,',
 'or',
 'shortly',
 'after,',
 'so',
 'if',
 'you',
 'arrive',
 'a',
 'bit',
 'early',
 'be',
 'prepared',
 'to',
 'wait',
 'outside',
 'and',
 'not',
 'in',
 'the',
 'lounge.Second',
 'visit'

In [30]:
business_reviews[0]

'Consistently good, as the Keg tends to be.\n\nHighlights: great lunchtime filet and side vegetables; great service that checked in with just the right frequency; bowls and bowls of those great chocolate mints scattered throughout the restaurant so that you can pocket a few extra ;) directly across from the south entrance/exit of the Eglinton TTC stop.\n\nMedium: parking available underground, but it was a bit expensive; side caesar wasn\'t served a lemon wedge.\n\nLow: opens exactly at 11:30 am, or shortly after, so if you arrive a bit early be prepared to wait outside and not in the lounge.Second visit to The Keg Steakhouse Yonge/Eglinton after being unimpressed last time. This time - it\'s just slightly better. At least the steak was cooked right.\n\nI can\'t pin it as to why but even a good juicy piece such as the New York cut could turn out dry and somewhat bland while served lukewarm. The cook was about right which is medium rare with some redness inside. Still though - again, in