# Dataset Analysis

### Import libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
from matplotlib import pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
from tqdm import tqdm
import requests
import ast
import os
import random
from collections import Counter
import heapq

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [2]:
plt.style.use('seaborn')
%matplotlib inline

### Read in dataset_20

In [3]:
df_20 = pd.DataFrame()
with open("reviewSamples20.json", "r") as ins:
    for line in ins:
        df_20 = df_20.append(ast.literal_eval(line), ignore_index=True)   

In [4]:
df_20.head() # view first 5 rows

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,eU_713ec6fTGNO4BegRaww,0.0,2013-01-20 13:25:59,0.0,fdiNeiN_hoCxCMy2wTRW9g,4.0,I'll be the first to admit that I was not exci...,0.0,w31MKYsNFMrjhWxxAb5wIw
1,3fw2X5bZYeW9xCz_zGhOHg,5.0,2016-05-07 01:21:02,4.0,G7XHMxG0bx9oBJNECG4IFg,3.0,Tracy dessert had a big name in Hong Kong and ...,5.0,jlu4CztcSxrKx56ba1a5AQ
2,6lj2BJ4tJeu7db5asGHQ4w,0.0,2017-05-26 01:23:19,0.0,rEITo90tpyKmEfNDp3Ou3A,5.0,We've been a huge Slim's fan since they opened...,0.0,6Fz_nus_OG4gar721OKgZA
3,XTzKRvWciP_BZ9imk8mmPQ,0.0,2014-06-27 21:32:31,1.0,bjD0Dqn3k-fi00BXatrytg,1.0,I tried this place because my girls are away f...,5.0,1fi6x4tnJtlVWaJmoIO9XA
4,CDpoJiqgM04wqQTZ6QkTqQ,0.0,2015-12-05 02:37:03,0.0,CelUWzp-GnJIiiV1mDUb-g,1.0,Love this place downtown but the Scottsdale lo...,0.0,tFICmdLtwgFIRcwtlbYQOg


### Writing Style

In [5]:
pd.set_option('display.max_colwidth', -1) # display full dataframe information, else truncated
print(df_20['text'])
pd.reset_option('display.max_colwidth')

0     I'll be the first to admit that I was not excited about going to La Tavolta. Being a food snob, when a group of friends suggested we go for dinner I looked online at the menu and to me there was nothing special and it seemed overpriced.  Im also not big on ordering pasta when I go out. Alas, I was outnumbered. Thank goodness! I ordered the sea bass special. It was to die for. Cooked perfectly, seasoned perfectly, perfect portion. I can not say enough good things about this dish. When the server asked how it was he seemed very proud of the dish and said, " doesn't she (the chef) do an incredible job?" She does. \n\nMy hubby got the crab tortellini and also loved his. I heard "mmmm this is so good" from all around the table. Our waiter was super nice and even gave us free desserts because we were some of the last people in the restaurant. Service was very slow and the place was PACKED but we had our jugs of wine and a large group with good conversation so it didn't seem to bother a

Name: text, dtype: object


In [6]:
# Save to disk for easier analysis
# Maybe randomly select 5 to write to file later
with open("text_20.txt", 'w') as f:
    for idx, entry in df_20['text'].iteritems():
        f.write("### Review " + str(idx+1))
        f.write(os.linesep)
        f.write(entry)
        f.write(os.linesep*2)

In [7]:
# Compare with sample Straits Times article
url = r"https://www.straitstimes.com/business/tougher-job-market-as-employers-post-fewer-vacancies-in-q2-unemployment-for-locals-creeps"
soup = BeautifulSoup(requests.get(url).content)
article = soup.find("div", {"itemprop":"articleBody"}).findAll('p')
with open("sample_straits_times.txt", 'w', encoding="utf-8") as f:
    for element in article:
        f.write(element.getText())
        f.write(os.linesep)

### Read in full dataset

In [8]:
df = pd.DataFrame()
if os.path.isfile("reviews.csv"):
    df = pd.read_csv("reviews.csv")
else:
    with open("reviewSelected100.json", "r", encoding = "latin-1") as ins:
        for i, line in enumerate(ins):
            df = df.append(ast.literal_eval(line), ignore_index=True)
    df.to_csv("reviews.csv", index=False)

### Sentence Segmentation (Part 1 - plotting)

In [9]:
# Initilize list
# For each review -> list.append([star_level, num_sentences])
segmented_reviews_with_star_level = []
for _, row in df.iterrows():
    segmented_reviews_with_star_level.append([int(row['stars']),\
                  len(nltk.tokenize.sent_tokenize(row['text']))])

In [10]:
lengths = {1: [], 2: [], 3: [], 4: [], 5: []}

In [11]:
for i in segmented_reviews_with_star_level:
    star_level = i[0]
    num_sentences = i[1]
    lengths[star_level].append(num_sentences)

In [12]:
star_counts_lengths = {}
for star, lengths_of_star in lengths.items():
    star_counts_lengths[star] = dict(Counter(lengths_of_star))

In [13]:
# Save the relevant figures
count = []
for star in star_counts_lengths:
    lengths = []
    num_occurences = []
    for k, v in star_counts_lengths[star].items():
        lengths.append(k)
        num_occurences.append(v)
    # to numpy
    sum_occurences = sum(num_occurences) # used to scale the plot
    count.append(sum_occurences)

    lengths = np.array(lengths)
    
    # plot
    plt.scatter(lengths, num_occurences, c='green', edgecolor='black',
                linewidth=1, alpha=0.75)
    plt.xlabel("Length of review (number of sentences)")
    plt.ylabel("Number of reviews")
    plt.title(f"Number of reviews of a given length (Star {star})")
    plt.xlim(left = 0, right = 80)
    plt.ylim(bottom = -0.01 * sum_occurences, top = 0.2 * sum_occurences)
    plt.savefig(f'star level {star} num reviews against num sentences.png', dpi=300, bbox_inches="tight")
    plt.clf()
    
    # stats
    mean = np.sum(num_occurences * lengths / sum_occurences, axis = 0)
    print("Star {}, avg length: {:1.2f}".format(star, mean))

Star 1, avg length: 9.76
Star 2, avg length: 9.87
Star 3, avg length: 8.88
Star 4, avg length: 8.21
Star 5, avg length: 6.81


<Figure size 432x288 with 0 Axes>

In [14]:
# Draw pie chart of number of reviews / star
name = ["1", "2", "3", "4", "5"]

plt.pie(count, labels = name, autopct='%1.2f%%')

plt.title("Number of reviews given star")
plt.savefig('Number_of_reviews_given_stars.png', dpi=300, bbox_inches="tight")
plt.clf()


<Figure size 432x288 with 0 Axes>

### Sentence Segmentation (Part 2 - randomly sample 5)

In [15]:
# Need to sample long and short reviews.
# Assume long/short based on length of 'text' field in review.
text_list = df['text'].tolist()
text_list.sort(key = len) # sort on length

In [16]:
five_samples = []
for i in [0.1, 0.3, 0.5, 0.7, 0.9]:
    five_samples.append(text_list[round(i*len(text_list))]) # get 10/30/50/70/90 percentile in length

In [17]:
five_samples_segmented = [nltk.tokenize.sent_tokenize(sample) for sample in five_samples]

In [18]:
with open ("5_reviews_and_their_segmentation.txt", 'w') as f:
    for i in range(5):
        f.write(f"### Review {i+1} ###" + '\n')
        for sent in five_samples_segmented[i]:
            f.write(f'{sent}' + '\n')
        f.write('\n')

### Tokenization and Stemming (Part 1 - plotting)

In [19]:
# Segment into sentence before tokenize
reviews = df['text']
segmented_reviews = [nltk.tokenize.sent_tokenize(review)\
                     for review in reviews]

In [20]:
# Tokenize
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenized_reviews = []
for segmented_review in segmented_reviews:
    tokens_in_review = []
    for sentence in segmented_review:
        # remove punctuation and set lowercase
        tokens = tokenizer.tokenize(sentence)
        tokens = [token.lower() for token in tokens if token.isalpha()]
        tokens_in_review.extend(tokens)
    tokenized_reviews.append(tokens_in_review)

In [21]:
# Stem
stemmer = nltk.stem.SnowballStemmer('english')
stemmed_reviews = []
for sentence in tokenized_reviews:
    stemmed_reviews.append([])
    for token in sentence:
        stemmed_reviews[-1].append(stemmer.stem(token))

In [22]:
# Plot and save Tokenization
lengths_of_tokenized_reviews = [len(tokenized_review) for tokenized_review in tokenized_reviews]
token_counts_lengths = dict(Counter(lengths_of_tokenized_reviews)) # number of tokens : num_reviews
token_lengths = []
num_occurences = []
for k, v in token_counts_lengths.items():
    token_lengths.append(k)
    num_occurences.append(v)
plt.scatter(token_lengths, num_occurences, c='green', edgecolor='black',
            linewidth=1, alpha=0.75)
plt.xlabel("Length of review (number of tokens)")
plt.ylabel("Number of reviews")
plt.title(f"Number of reviews of a given length (Tokenization)")
plt.savefig('num reviews against num tokens (Tokenization).png', dpi=300, bbox_inches="tight")
plt.clf()

<Figure size 432x288 with 0 Axes>

In [23]:
# Plot and save Stemming
lengths_of_stemmed_reviews = [len(stemmed_review) for stemmed_review in stemmed_reviews]
stem_counts_lengths = dict(Counter(lengths_of_stemmed_reviews)) # number of tokens : num_reviews
stem_lengths = []
num_occurences = []
for k, v in stem_counts_lengths.items():
    stem_lengths.append(k)
    num_occurences.append(v)
plt.scatter(stem_lengths, num_occurences, c='green', edgecolor='black',
            linewidth=1, alpha=0.75)
plt.xlabel("Length of review (number of stems)")
plt.ylabel("Number of reviews")
plt.title(f"Number of reviews of a given length (Stemming)")
plt.savefig('num reviews against num tokens (Stemming).png', dpi=300, bbox_inches="tight")
plt.clf()

<Figure size 432x288 with 0 Axes>

### Tokenization and Stemming (Part 2 - List top 20 most frequent words)

In [24]:
# First we remove stop words from tokenized reviews
# Then we find top 20 words
# Then we stem, and find top 20 words

stop_words = nltk.corpus.stopwords.words('english')
# Save stop_words for report
with open("stop_words.txt", "w") as f:
    for stop_word in stop_words:
        f.write(stop_word + "\n")

In [25]:
# Remove stop words
for i in range(len(tokenized_reviews)):
    tokenized_reviews[i] = [word for word in tokenized_reviews[i]\
                            if word not in stop_words]

In [26]:
# Find top 20 tokens for tokenized_reviews
word_dict = {}
for tokenized_review in tokenized_reviews:
    for word in tokenized_review:
        if word not in word_dict:
            word_dict[word] = 1
        else:
            word_dict[word] += 1
top_20 = heapq.nlargest(20, word_dict.items(), key=lambda item: item[1])
# Save to disk
with open ("top_20_tokens.txt", 'w') as f:
    for word in top_20:
        f.write(f"{word[0]}: {word[1]}" + '\n')
# To compare to top 20 stems
with open ("top_20_tokens (stemmed).txt", 'w') as f:
    for word in top_20:
        f.write(f"{stemmer.stem(word[0])}: {word[1]}" + '\n')

In [27]:
# Remove stop words
for i in range(len(stemmed_reviews)):
    stemmed_reviews[i] = [word for word in stemmed_reviews[i]\
                          if word not in stop_words]

In [28]:
# Find top 20 stems for stemmed_reviews
word_dict = {}
for stemmed_review in stemmed_reviews:
    for word in stemmed_review:
        if word not in word_dict:
            word_dict[word] = 1
        else:
            word_dict[word] += 1
top_20 = heapq.nlargest(20, word_dict.items(), key=lambda item: item[1])
print(top_20)
# Save to disk
with open ("top_20_stems.txt", 'w') as f:
    for word in top_20:
        f.write(f"{word[0]}: {word[1]}" + '\n')

[('place', 9407), ('food', 8731), ('good', 8055), ('time', 6588), ('get', 6390), ('servic', 6338), ('great', 6326), ('like', 6231), ('order', 6152), ('go', 5993), ('one', 5278), ('would', 5168), ('back', 4744), ('friend', 3982), ('tri', 3977), ('come', 3784), ('realli', 3731), ('also', 3333), ('love', 3297), ('got', 3171)]


### POS Tagging

In [29]:
# Randomly select 5 sentences.
# Approach: randomly select 5 reviews, and within each review - randomly select 1 sentence.

In [30]:
five_reviews =  reviews.sample(5).tolist()
five_segmented_reviews = [nltk.tokenize.sent_tokenize(review) for review in five_reviews]
five_sentences = [random.choice(segmented_review) for segmented_review in five_segmented_reviews]

In [31]:
# POS Tagging (refer upenn_tagset.txt for tagset)
# Tokenize first, then run POS Tagger
five_POS_tagged_sentences = []
for sentence in five_sentences:
    tokens = tokenizer.tokenize(sentence)
    tokens = [token.lower() for token in tokens if token.isalpha()] # remove punctuation and set lower case
    five_POS_tagged_sentences.append(nltk.pos_tag(tokens))
# Save POS Tagging results (each sentence is separated by a line)
with open("POS_tagging_result.txt", 'w') as f:
    for POS_tagged_sentence in five_POS_tagged_sentences:
        for word_and_POS_tag in POS_tagged_sentence:
            f.write(f"{word_and_POS_tag[0]} : {word_and_POS_tag[1]}" + '\n')
        f.write('\n')

### Most Frequent Adjectives for each Rating (Part 1 - top 10 frequent adjectives)

In [32]:
# From upenn_tagset.txt, the POS Tags for adjective are JJ, JJR, JJS
# We first generate the counts of all adj words for all ratings
rating_freq_adj = {}
ratings = [1.0, 2.0, 3.0, 4.0, 5.0]
for rating in tqdm(ratings):
    freq_adj = {}
    reviews = df[df['stars'] == rating]['text'].tolist()
    segmented_reviews = [nltk.tokenize.sent_tokenize(review) for review in reviews]
    for segmented_review in segmented_reviews:
        for sentence in segmented_review:
            tokens = tokenizer.tokenize(sentence)
            # remove punctuation and set lower case
            # remove i as JJ take i as adjective
            tokens = [token.lower() for token in tokens\
                      if(token.isalpha() and token != 'i' and token != 'I')]
            POS_tags = nltk.pos_tag(tokens)
            for POS_tag in POS_tags:
                word, tag = POS_tag[0], POS_tag[1]
                if tag in ["JJR", "JJS","JJ"]:
                    if word not in freq_adj:
                        freq_adj[word] = 1
                    else:
                        freq_adj[word] += 1
    rating_freq_adj[rating] = freq_adj

100%|██████████| 5/5 [02:05<00:00, 25.19s/it]


In [33]:
# Get top 10 counts of adj words for all ratings
rating_top_10 = {}
for rating, freq_adj in rating_freq_adj.items():
    # item is (word, count)
    top_10 = heapq.nlargest(10, freq_adj.items(), key=lambda item: item[1])
    rating_top_10[rating] = top_10

In [34]:
# Save the results
with open("most_frequent_adjectives.txt", 'w') as f:
    for rating, top_10 in rating_top_10.items():
        f.write(f"### {rating} ###" + '\n')
        for item in top_10:
            f.write(f"{item[0]} : {item[1]}" + '\n')
        f.write('\n')

#### Examining when and why 'good' exists in reviews with star rating 1

In [35]:
with open("good_in_1.0.txt", 'w') as f:
    for _, row in df.iterrows():
        if 'good' in row['text'].lower() and row['stars'] == 1.0:
            f.write(f"###### Review {_} ######\n")
            f.write(row['text'])
            f.write("\n\n")

### Most Frequent Adjectives for each Rating (Part 2 - top 10 indicative adjectives)

In [36]:
# First, get number of words for each rating
rating_number_words = {}
ratings = [1.0, 2.0, 3.0, 4.0, 5.0]
for rating in ratings:
    count = 0
    reviews = df[df['stars'] == rating]['text'].tolist()
    segmented_reviews = [nltk.tokenize.sent_tokenize(review) for review in reviews]
    for segmented_review in tqdm(segmented_reviews):
        for sentence in segmented_review:
            tokens = tokenizer.tokenize(sentence)
            tokens = [token.lower() for token in tokens if token.isalpha()] # remove punctuation and set lower case
            count += len(tokens)
    rating_number_words[rating] = count
# Total number of words
total_number_of_words = sum([v for k, v in rating_number_words.items()])

100%|██████████| 2306/2306 [00:02<00:00, 1089.60it/s]
100%|██████████| 1372/1372 [00:01<00:00, 1124.80it/s]
100%|██████████| 1904/1904 [00:01<00:00, 1370.50it/s]
100%|██████████| 3559/3559 [00:02<00:00, 1437.20it/s]
100%|██████████| 6159/6159 [00:03<00:00, 1878.74it/s]


In [37]:
# Next, get P(w|R) for each rating
pwr_dict = {}
ratings = [1.0, 2.0, 3.0, 4.0, 5.0]
for rating in ratings:
    word_probability_given_rating = {}
    for word, count in rating_freq_adj[rating].items():
        word_probability_given_rating[word] = count/rating_number_words[rating]
    pwr_dict[rating] = word_probability_given_rating

In [38]:
# Next get P(w) for each unique adj in all reviews
adj_total_count = {}
ratings = [1.0, 2.0, 3.0, 4.0, 5.0]
for rating in ratings:
    for adj, count in rating_freq_adj[rating].items():
        if adj not in adj_total_count:
            adj_total_count[adj] = count
        else:
            adj_total_count[adj] += count
pw_dict = {}
for adj, count in adj_total_count.items():
    pw_dict[adj] = count/total_number_of_words

In [39]:
# Get relative entropy of adj words for all ratings
rating_relative_entropy = {}
ratings = [1.0, 2.0, 3.0, 4.0, 5.0]
for rating in ratings:
    relative_entropy = {}
    for word, pwr in pwr_dict[rating].items():
        relative_entropy[word] = pwr * np.log(pwr/pw_dict[word])
    rating_relative_entropy[rating] = relative_entropy

In [40]:
# Get top 10 indicative adj words for all ratings
rating_top_10_indicative = {}
for rating, relative_entropy in rating_relative_entropy.items():
    top_10 = heapq.nlargest(10, relative_entropy.items(), key=lambda item: item[1]) # item is (word, count)
    rating_top_10_indicative[rating] = top_10

In [41]:
# Save the results
with open("most_indicative_adjectives.txt", 'w') as f:
    for rating, top_10_indicative in rating_top_10_indicative.items():
        f.write(f"### {rating} ###" + '\n')
        for item in top_10_indicative:
            f.write(f"{item[0]} : {item[1]}" + '\n')
        f.write('\n')