### Eρώτηση 4
#### Μάριος Ιακωβίδης Α.Μ. 4063

#### Useful Imports

In [46]:
import pandas as pd
import os
import glob
import numpy as np
import sklearn.feature_extraction.text as sk_text
import sklearn.cluster as sk_cluster
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
import gensim
from gensim.models import Word2Vec
from gensim import utils

#### Data Preprocessing

Load a Yelp dataset that contains some restaurants in Philladelphia

In [2]:
restaurants_df = pd.read_csv('philly_restaurants_categories.csv')
display(restaurants_df.shape)

(951, 2)

Load the Yelp reviews for all businesses reviewed in Yelp

In [3]:
# load yelp reviews
json_dir = "C:/Users/mariosjkb/Desktop/3η Σειρα Ασκησεων/reviews_dataset"

json_pattern = os.path.join(json_dir,'*.json')
file_list = glob.glob(json_pattern)

dfs = []

for file in file_list:
  json_data = pd.read_json(file, lines=True)
  dfs.append(json_data)
reviews_df = pd.concat(dfs,ignore_index=True)
display(reviews_df.shape)

(6990280, 9)

Keep only the reviews for the restaurants we loaded

In [5]:
data_df = pd.merge(reviews_df,restaurants_df,how="inner",on="business_id")
data_df = data_df[['business_id','text','stars']]
display(data_df.shape)

(155680, 3)

For each restaurant join all the reviews and find the mean of the star rating

In [7]:
data_df = data_df.groupby(['business_id']).agg(({'text':' '.join,'stars':'mean'})).reset_index()
data_df.to_csv('data.csv')
display(data_df.shape)

(951, 3)

Label restaurants as good(represented as 1) if the mean star rating is above 4.5 and bad(represented as 0) if the rating is less that 2. We ignore the rest of the restaurants in order to keep the very best and the very worst of the them and get reviews with plenty of positive/negative comments. Also this helps to have well shaped and seperated clusters.

In [70]:
data = pd.read_csv('data.csv')
data = data.loc[(data['stars'] >= 4.5) | (data['stars'] <= 2)].reset_index()
data['quality'] = np.where(data['stars'] >= 4.5,1,0)
data.shape

(164, 6)

Vectorize data using td-idf vectorizer and ignore a list of stop words we created

The stop_words_list was created progressively after plenty of runs of the K-means algorythm. We checked the 10 most significant words in the cluster centers and if they weren't describing a restaurant in a positive or negative way, we considered them as stop-words in order to be ignored in the next run. After plenty of runs and after getting the result we wanted we stopped that process and moved forward.

In [81]:
stop_words_list = ['pizza', 'food', 'great', 'sushi', 'good', 'place', 'delicious', 'italian', 'best', 'just', 'amazing', 'like', 'service', 'order', 'time', 'pasta', 'really', 'ordered', 'fresh', 'definitely','food', 'order', 'drive', 'fries', 'service', 'time', 'place', 'just', 'like', 'location', 'burger', 'don', 'minutes', 'got', 'ordered', 'people', 've', 'good', 'chicken', 'wait','the', 'and', 'to', 'it', 'was', 'this', 'they', 'my', 'in', 'is', 'of', 'for', 'that', 'you', 'not', 'at', 'me', 'on', 'but', 'with','the', 'and', 'to', 'was', 'it', 'of', 'is', 'for', 'in', 'with', 'we', 'this', 'my', 'they', 'you', 'that', 'but', 'on', 'so', 'had','have', 'no', 'there', 'be', 'are', 'up', 'were', 'when', 'one', 'get', 'here', 'out', 'if', 'from', 'all', 'she', 'or', 'never', 'what', 'as','were', 'are', 'have', 'very', 'as', 'be', 'their', 'all', 'out', 'also', 'our', 'here', 'there', 'which', 'friendly', 'can', 'from', 'if', 'back', 'try','roll', 'go', 'will', 'philly', 'sauce', 'menu', 'by', 'cheese', 'about', 'been', 'would','go', 'even', 'an', 'your', 'do', 'he', 'them', 'because', 'after', 'only', 'then', 'been', 'will', 'would', 'her', 'about', 'said', 'give', 'ever', 'again','through', 'didn', 'always', 'over', 'other', 'know', 'went', 'who', 'has', 'sandwich', 'come', 'down', 'how', '10', 'staff', 'could', 'two', 'long', 'some', 'more','rolls','us', 'little', 'restaurant', 'everything', 'too', 'made', 'spicy','should', 'still', 'times', 'any', 'around', 'did', 'want', 'took', 'right', 'came', 'before', 'another', 'something', 'us', 'meal', 'than', 'night', 're','mcdonald', 'customer', 'told', 'asked', 'every', 'first', 'while', 'off', 'these', 'going', 'say', 'better', 'eat', 'hot', 'take', 'day', 'most', 'see', 'make','hour', 'way', 'gave', 'inside', 'wasn', 'away', 'well', 'now', 'much', 'll', 'his', 'am', 'experience', 'being', 'last', 'new', 'think', 'big', 'need', 'wanted', 'stars', 'its', 'sure', 'pretty', 'home', 'nice', 'into', 'area', 'thing', 'though', 'tried', 'since', 'taste', 'point', 'why', 'ice', 'open', 'where', 'large', 'next', 'each', 'lot',  'many', 'find', 'things', 'check', 'items', 'lunch', 'dining','byob', 'pork', 'lunch',]
vectorizer = sk_text.TfidfVectorizer(stop_words=stop_words_list,max_features=100)
reviews = data.loc[:,"text"].to_list()
clustering_data = vectorizer.fit_transform(reviews)
clustering_data.shape

(164, 100)

Use a K-means clustering algorythm using the vectorized data into 2 clusters(good restaurants and bad restaurants)

In [82]:
kmeans = sk_cluster.KMeans(n_clusters=2,init="k-means++",n_init=50)
kmeans_result = kmeans.fit_transform(clustering_data)
kmeans_labels = kmeans.labels_

Function to help with the mapping of true labels and K-means labels

In [83]:
def cluster_class_mapping(kmeans_labels,true_labels):
    C= metrics.confusion_matrix(kmeans_labels,true_labels)
    mapping = list(np.argmax(C,axis=1)) #for each row (cluster) find the best class in the confusion matrix
    mapped_kmeans_labels = [mapping[l] for l in kmeans_labels]
    C2= metrics.confusion_matrix(mapped_kmeans_labels,true_labels)
    return mapped_kmeans_labels,C2

Metrics to evaluate the quality of the clustering

In [99]:
labels, C = cluster_class_mapping(kmeans_labels,data.quality)
print("Confusion matrix:\n",C)

accuracy = metrics.accuracy_score(labels,data.quality)
print("Clustering accuracy = ",accuracy)

precision = metrics.precision_score(labels,data.quality,average=None)
print("Precision score per class = ",precision)

recall = metrics.recall_score(labels,data.quality,average=None)
print("Recall score per class = ",recall)

f1_score = metrics.f1_score(labels,data.quality,average=None)
print("F1-score per class = ",f1_score)

Confusion matrix:
 [[87  0]
 [ 5 72]]
Clustering accuracy =  0.9695121951219512
Precision score per class =  [0.94565217 1.        ]
Recall score per class =  [1.         0.93506494]
F1-score per class =  [0.97206704 0.96644295]


We conclude that the result of the clustering is almost 100% accurate, since all the metrics are above 93% and the confusion matrix is almost ideal. So our goal to create well seperated clusters was met.

Find the 10 most important words in the cluster centers

In [101]:
cluster_centers = kmeans.cluster_centers_
cluster_centers_word_indices = (-cluster_centers).argsort()
top_10_word_indices = []
top_10_words_all = []

for i in range(0,cluster_centers_word_indices.shape[0]):
    top_10_word_indices.append(cluster_centers_word_indices[i][:10])

words = vectorizer.get_feature_names_out()

for i in range(0,len(top_10_word_indices)):
    top_10_words = []
    for j in range(0,10):
        top_10_words.append(words[top_10_word_indices[i][j]])

    print("Top 10 words for cluster " + str(i) + " are: " + str(top_10_words))
    top_10_words_all.append(top_10_words)

Top 10 words for cluster 0 are: ['thru', 'worst', 'manager', 'fast', 'cold', 'waiting', 'bad', 'delivery', 'wrong', 'work']
Top 10 words for cluster 1 are: ['spot', 'love', 'recommend', 'favorite', 'salad', 'excellent', 'dinner', 'perfect', 'special', 'small']


Split the data in train and test set in order to use a Word Embedding

In [102]:
X = data.text
y = data.quality
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

Use Skipgram embedding to get the 5 more frequent context words of the positive and negative words given by the cluster centers

In [103]:
top_phrases = []
# preprocess train data
train_gsim = [gensim.utils.simple_preprocess(x) for x in X_train]
train_data_labels = [(x,y) for (x,y) in zip(train_gsim,y_train) if len(x) > 0]
X_train_gsim = [x for (x,y) in train_data_labels]
y_train_gsim = [y for (x,y) in train_data_labels]

# preprocess test data
test_gsim = [gensim.utils.simple_preprocess(x) for x in X_test]
test_data_labels = [(x,y) for (x,y) in zip(test_gsim,y_test) if len(x) > 0]
X_test_gsim = [x for (x,y) in test_data_labels]
y_test_gsim = [y for (x,y) in test_data_labels]

# train a Skipgram model and get the new train set and test set
skipgram_model = gensim.models.Word2Vec(X_train_gsim,min_count=1,vector_size=50,window=10,sg=1)

# for each word find the 5 most frequent context words
for i in range(0,2):
    for j in range(0,len(top_10_words_all[i])):
        for k in range(0,5):
            phrase = top_10_words_all[i][j] + " " + skipgram_model.wv.most_similar(top_10_words_all[i][j])[k][0]
            top_phrases.append(phrase)

print(top_phrases)

['thru drive', 'thru line', 'thru speaker', 'thru through', 'thru cars', 'worst slowest', 'worst mcdonalds', 'worst poor', 'worst mcdonald', 'worst worse', 'manager spoke', 'manager phone', 'manager refused', 'manager situation', 'manager male', 'fast attitudes', 'fast mediocre', 'fast atrocious', 'fast dominos', 'fast horrendous', 'cold lukewarm', 'cold soggy', 'cold stale', 'cold french', 'cold nuggets', 'waiting fifteen', 'waiting eventually', 'waiting twenty', 'waiting mins', 'waiting min', 'bad compelled', 'bad dominos', 'bad poorly', 'bad warn', 'bad poisoning', 'delivery grubhub', 'delivery online', 'delivery delivered', 'delivery estimated', 'delivery via', 'wrong correct', 'wrong label', 'wrong repeat', 'wrong didnt', 'wrong turns', 'work showing', 'work purpose', 'work running', 'work wonder', 'work wants', 'spot kinme', 'spot rotation', 'spot nolibs', 'spot jewelers', 'spot fairmount', 'love suppli', 'love lucatelli', 'love officially', 'love treats', 'love awesome', 'recomm

From the top phrases list we evaluated which words or phrases are describing a restaurant in a positive or a negative way and we created the resprective vocabularies

In [1]:
positive_vocabulary = ['spot lovers','love','awesome','highly recommend','suggest','strongly recommend','favorite','excellent','outstanding','phenomenal','superb',
                       'valentine dinner','fiance dinner','celebration dinner','perfect','decadent','special','suggest','become favorite','outstanding','fantastic',
                       'perfect light']

negative_vocabulary = ['drive thru','worst','worse','slowest','manager refused','spoke to manager','fast','atrocious','mediocre','cold food','stale food','soggy food','waiting twenty mins',
                       'bad','terrible','awful','wrong everytime','garbage','horrible','bad poisoning','wrong label','horrendous']

Positive and negative restaurant vocabulary

In [2]:
print("Positive vocabulary for restaurants: ",positive_vocabulary)
print("\n")
print("Negative vocabulary for restaurants: ",negative_vocabulary)

Positive vocabulary for restaurants:  ['spot lovers', 'love', 'awesome', 'highly recommend', 'suggest', 'strongly recommend', 'favorite', 'excellent', 'outstanding', 'phenomenal', 'superb', 'valentine dinner', 'fiance dinner', 'celebration dinner', 'perfect', 'decadent', 'special', 'suggest', 'become favorite', 'outstanding', 'fantastic', 'perfect light']


Negative vocabulary for restaurants:  ['drive thru', 'worst', 'worse', 'slowest', 'manager refused', 'spoke to manager', 'fast', 'atrocious', 'mediocre', 'cold food', 'stale food', 'soggy food', 'waiting twenty mins', 'bad', 'terrible', 'awful', 'wrong everytime', 'garbage', 'horrible', 'bad poisoning', 'wrong label', 'horrendous']
