In [232]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import json
import collections
import re, string
import sys
import time
from nltk.corpus import stopwords
from wordcloud import WordCloud
from subprocess import check_output
import matplotlib.pyplot as plt
import math

In [123]:
def read_json(file):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    The keys of the first record define the columns. Every later record must
    provide all of those keys; keys that only appear in later records are
    ignored.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON Lines file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record and one column per key of the first record.
        Empty when the file contains no records.

    Raises
    ------
    KeyError
        If a record lacks one of the keys of the first record.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    columns = None
    # JSON text is UTF-8 by specification, so do not rely on the platform's
    # default encoding.
    with open(file, encoding="utf-8") as file_lines:
        for line in file_lines:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            if columns is None:
                # Built inline so the function does not depend on a helper
                # defined in some other cell.
                columns = {key: [] for key in data}
            for key, values in columns.items():
                values.append(data[key])
    return pd.DataFrame(columns or {})

In [124]:
# Load every review (one JSON object per line) into a DataFrame.
yelp_review = read_json("/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json")
# The raw `date` strings carry a time of day (e.g. "2018-07-07 22:09:11"), so
# the format has to include it: pandas >= 2.0 matches the format exactly and
# rejects a date-only format with "unconverted data remains".
yelp_review["date"] = pd.to_datetime(yelp_review["date"], format="%Y-%m-%d %H:%M:%S")

In [125]:
# Preview the first rows to sanity-check the parsed columns.
yelp_review.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [None]:
# Extra words to drop during tokenization, in addition to NLTK's English stop
# words: punctuation marks, very common function words, and a few generic
# review words.
stopwords_set = {
    '.', ',',
    'i', 'a', 'and', 'the', 'to', 'was', 'it', 'of', 'for', 'in', 'my',
    'that', 'so', 'do', 'our', 'we', 'you', 'are', 'is', 'be', 'me',
    'like', 'get', 'time', 'place',
}

In [127]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def tokenize(s):
    """Lowercase a string, split it into words and drop stop words.

    Words are maximal runs of ``\\w`` characters, so punctuation never appears
    in the result. A word is dropped when it is in NLTK's English stop word
    list or in the notebook-level ``stopwords_set``.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        The remaining lowercase words, in their original order.
    """
    # Looking the NLTK list up once per call (and loading it once per kernel)
    # replaces a `stopwords.words('english')` call for every single token,
    # and the frozenset makes each membership test constant-time.
    english_stopwords = _english_stopwords()
    return [
        word
        for word in re.findall(r'\w+', s.lower())
        if word not in english_stopwords and word not in stopwords_set
    ]


def count_ngrams(lines, min_length=1, max_length=1):
    """Count n-grams of every length from min_length to max_length.

    Each line is split with ``tokenize`` and the tokens of all lines are fed
    through one sliding window, so an n-gram may span the boundary between
    two consecutive lines.

    Parameters
    ----------
    lines : iterable of str
        Lines of text (a file object, a list, an array of strings, ...).
    min_length : int, optional
        Shortest n-gram length to count.
    max_length : int, optional
        Longest n-gram length to count.

    Returns
    -------
    dict
        Maps each length n to a ``collections.Counter`` whose keys are
        n-tuples of words and whose values are occurrence counts.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    # Sliding window holding the most recent max_length tokens.
    queue = collections.deque(maxlen=max_length)

    # Helper function to add n-grams at start of current queue to dict
    def add_queue():
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Loop through all lines and words and add n-grams to dict
    for line in lines:
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # If the whole input was shorter than max_length the window never filled,
    # so nothing has been counted yet. Count the n-grams that start at the
    # first token before the tail loop below discards it.
    if 0 < len(queue) < max_length:
        add_queue()

    # Make sure we get the n-grams at the tail end of the queue
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return ngrams

def print_most_frequent(ngrams, num=10):
    """Print the `num` most frequent n-grams for each n-gram length.

    `ngrams` maps an n-gram length to a Counter keyed by word tuples.
    Lengths are reported in ascending order, each as a header line, one
    "words: count" line per n-gram, and a trailing blank line.
    """
    header = '----- {} most common {}-word phrase -----'
    entry = '{0}: {1}'
    for size in sorted(ngrams):
        print(header.format(num, size))
        top_grams = ngrams[size].most_common(num)
        for words, frequency in top_grams:
            print(entry.format(' '.join(words), frequency))
        print('')

## Most common words in the reviews of the first 3 businesses

In [128]:
num_business_analysis = 3 # number of businesses to analyse; a larger value costs more compute but covers a more varied sample
# Load the business records (one JSON object per line) into a DataFrame.
# NOTE(review): the analysis cell below reads `business_data`, not
# `yelp_business` — confirm how the two are related.
yelp_business = read_json("/kaggle/input/yelp-dataset/yelp_academic_dataset_business.json")

In [129]:
# NOTE(review): `business_data` (with its "Business name" column) is not
# created by any cell visible in this notebook, so this cell fails on a fresh
# kernel — confirm where it comes from and add that step above.
selected_businesses = business_data[:num_business_analysis]
business_ids = selected_businesses.business_id.values
business_names = selected_businesses["Business name"].values

# Report the most frequent words in each selected business's reviews.
for business_id, business_name in zip(business_ids, business_names):
    print("Analysing business: ", business_name)
    # Keep only the review texts written for this business.
    belongs_to_business = yelp_review['business_id'] == business_id
    reviews = yelp_review.loc[belongs_to_business].text.values
    most_used_text = count_ngrams(reviews, max_length=1)
    print_most_frequent(most_used_text, num=10)

Analysing business:  Turning Point of North Wales
----- 10 most common 1-word phrase -----
food: 176
service: 98
coffee: 83
good: 82
time: 81
wait: 79
place: 77
great: 76
breakfast: 71
back: 69

Analysing business:  Body Cycle Spinning Studio
----- 10 most common 1-word phrase -----
class: 232
classes: 160
great: 140
studio: 140
instructors: 122
body: 113
cycle: 111
bikes: 100
spinning: 100
love: 85

Analysing business:  Kettle Restaurant
----- 10 most common 1-word phrase -----
food: 59
buffet: 58
good: 28
place: 28
restaurant: 28
service: 21
breakfast: 20
great: 18
friendly: 17
staff: 16



## TF/IDF

In [364]:
# Small sample for the TF-IDF walkthrough: the text of the first 10 reviews.
doc = yelp_review.text[0:10]

# Distinct terms in order of first appearance. dict.fromkeys de-duplicates
# while preserving order, avoiding a linear list scan for every token.
unique = list(dict.fromkeys(word for line in doc for word in tokenize(line)))
print(unique)


['decide', 'eat', 'aware', 'going', 'take', '2', 'hours', 'beginning', 'end', 'tried', 'multiple', 'times', 'want', 'locations', 'nj', 'never', 'bad', 'experience', 'food', 'good', 'takes', 'long', 'time', 'come', 'waitstaff', 'young', 'usually', 'pleasant', 'many', 'experiences', 'spent', 'way', 'waiting', 'opt', 'another', 'diner', 'restaurant', 'weekends', 'order', 'done', 'quicker', 'taken', 'lot', 'spin', 'classes', 'years', 'nothing', 'compares', 'body', 'cycle', 'nice', 'clean', 'space', 'amazing', 'bikes', 'welcoming', 'motivating', 'instructors', 'every', 'class', 'top', 'notch', 'work', 'anyone', 'struggles', 'fit', 'workouts', 'online', 'scheduling', 'system', 'makes', 'easy', 'plan', 'ahead', 'need', 'line', 'advanced', 'gyms', 'make', 'write', 'review', 'without', 'giving', 'russell', 'owner', 'shout', 'passion', 'fitness', 'cycling', 'evident', 'desire', 'clients', 'succeed', 'always', 'dropping', 'check', 'provide', 'encouragement', 'open', 'ideas', 'recommendations', 'w

In [369]:
# Tokenize every review once and reuse the tokens for all statistics below,
# instead of re-tokenizing every review for every unique term.
tokenized_docs = [tokenize(line) for line in doc]

# Occurrences of each term across the whole sample (all reviews pooled).
pooled_counts = collections.Counter(
    word for words in tokenized_docs for word in words
)
word_count = {word: pooled_counts[word] for word in unique}

# TF: the term's share of all counted tokens in the sample.
total_words = sum(word_count.values())
tf = {word: count / total_words for word, count in word_count.items()}

# IDF: natural log of (number of reviews / number of reviews containing the term).
num_docs = len(doc)
doc_freq = collections.Counter(
    word for words in tokenized_docs for word in set(words)
)
idf = {word: math.log(num_docs / doc_freq[word]) for word in unique}

tf_idf = {word: tf_value * idf[word] for word, tf_value in tf.items()}

df = pd.DataFrame({'Term': list(tf.keys()), 'TF': list(tf.values()), 'IDF': list(idf.values()), 'TF-IDF': list(tf_idf.values())})
print(df)


       Term        TF       IDF    TF-IDF
0    decide  0.002083  2.302585  0.004797
1       eat  0.002083  2.302585  0.004797
2     aware  0.002083  2.302585  0.004797
3     going  0.008333  0.916291  0.007636
4      take  0.004167  1.609438  0.006706
..      ...       ...       ...       ...
356   fishy  0.002083  2.302585  0.004797
357      im  0.002083  2.302585  0.004797
358  hoping  0.002083  2.302585  0.004797
359   night  0.002083  2.302585  0.004797
360   money  0.002083  2.302585  0.004797

[361 rows x 4 columns]
