# Comment Analysis

Use jupyter notebook to conduct interactive data analysis.

Also output crucial files for data visualization on web.

In [1]:

import numpy as np
import pandas as pd
import json
import math
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# json encoder for numpy
class NpEncoder(json.JSONEncoder):
    """``json.JSONEncoder`` that also serialises numpy scalars and arrays.

    Conversions applied by :meth:`default`:

    * ``np.bool_``    -> ``bool``
    * ``np.integer``  -> ``int``
    * ``np.floating`` -> ``float``, or ``None`` (JSON ``null``) when the value is NaN
    * ``np.ndarray``  -> nested ``list`` via ``tolist()``

    Note: ``default`` is only consulted for objects the stock encoder cannot
    handle.  ``np.float64`` subclasses Python ``float`` and is therefore
    serialised natively without reaching this method, so a float64 NaN is
    still written as the non-standard ``NaN`` token.  Replace such values
    with ``None`` before dumping if strict JSON is required.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Falls back to the base implementation, which raises ``TypeError``
        for types it does not know.
        """
        # np.bool_ is not a subclass of Python bool, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return None if math.isnan(obj) else float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
    
# Strip HTML markup from a string, keeping only its text content
def remove_html_tags(text):
    """Return ``text`` with every HTML tag removed.

    The input is parsed with BeautifulSoup's ``"html.parser"`` backend and
    the extracted text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

# Number of example comments sampled for each category of the exported stats
SAMPLE_N = 10

# Input CSV files (relative paths)
customer_csv_path = '../data/csv/aug_customers.csv'
product_csv_path = '../data/csv/aug_products.csv'
comment_csv_path = '../data/csv/aug_comments.csv'

# Directory the JSON statistics files are written into
output_base_dir = '../stats/comment'

In [2]:

def _nan_to_none(value):
    """Map a float NaN (how pandas marks a missing cell) to None; pass anything else through."""
    return None if isinstance(value, float) and math.isnan(value) else value


# parse selected comments to json form
# and join info from products & customers
def parse_joined_comments_to_json(selected_comment_df, customers=None, products=None):
    """Convert comment rows into JSON-ready dicts enriched with customer and product info.

    Parameters
    ----------
    selected_comment_df : pandas.DataFrame
        Comment rows to export.  The columns read are ``summary``, ``review``,
        ``timestamp``, ``overall``, ``sentiment``, ``sentimentWithRating``,
        ``wordCount``, ``customerId`` and ``asin``.
    customers : pandas.DataFrame, optional
        Customer table looked up by ``customerId``.  Defaults to the
        notebook-level ``customer_df``.
    products : pandas.DataFrame, optional
        Product table looked up by ``asin``.  Defaults to the notebook-level
        ``product_df``.

    Returns
    -------
    list of dict
        One dict per comment, with nested ``customer`` and ``product`` dicts.
        Missing text fields are emitted as ``None`` and a missing
        ``categories`` cell as an empty list, so the result serialises to
        valid JSON.

    Raises
    ------
    IndexError
        If a comment references a customer or product that is not present in
        the lookup table (raised by ``.iloc[0]`` on the empty match).
    """
    # Resolve the globals at call time so existing one-argument callers keep working.
    if customers is None:
        customers = customer_df
    if products is None:
        products = product_df

    comments = []
    for _, comment_ds in selected_comment_df.iterrows():
        customer_ds = customers[customers['customerId'] == comment_ds['customerId']].iloc[0]
        product_ds = products[products['asin'] == comment_ds['asin']].iloc[0]

        # A missing categories cell is a float NaN, which has no .split().
        raw_categories = _nan_to_none(product_ds['categories'])

        comments.append({
            'summary': _nan_to_none(comment_ds['summary']),
            'review': _nan_to_none(comment_ds['review']),
            'timestamp': comment_ds['timestamp'],
            'rating': comment_ds['overall'],
            'sentiment': comment_ds['sentiment'],
            'overall': comment_ds['sentimentWithRating'],
            'wordCount': comment_ds['wordCount'],
            'customer': {
                'customerId': customer_ds['customerId'],
                'name': _nan_to_none(customer_ds['name']),
                'cluster': customer_ds['cluster'],
            },
            'product': {
                'asin': product_ds['asin'],
                'title': _nan_to_none(product_ds['title']),
                'brand': _nan_to_none(product_ds['brand']),
                'imageUrl': _nan_to_none(product_ds['imageUrl']),
                'price': product_ds['price'],
                'categories': [] if raw_categories is None else raw_categories.split(','),
                'avgRating': product_ds['avgOverall'],
                'avgSentiment': product_ds['avgSentiment'],
                'avgOverall': product_ds['avgSentimentWithRating'],
                'reviewCount': product_ds['reviewCount'],
            },
        })
    return comments

## Load Data

In [3]:
# Read all comments and remember how many there are; later cells use
# total_count as the denominator when printing percentages.
comment_df = pd.read_csv(comment_csv_path)
total_count = comment_df.shape[0]

print(f'Number of comments: {total_count}')

Number of comments: 8554087


In [4]:
# Load the customer table; parse_joined_comments_to_json looks customers up in it
customer_df = pd.read_csv(customer_csv_path)

# Preview the first rows
customer_df.head()

Unnamed: 0,customerId,name,cluster,distanceToCenter,vector2dX,vector2dY
0,A3478QRKQDOPQ2,jacki,2,13.761846,6.972486,13.825474
1,A2VHSG6TZHU1OB,Ken P,3,33.170107,15.408235,-2.924636
2,A23EJWOW1TLENE,Reina Berumen,0,8.186462,-1.312076,5.83658
3,A1KM9FNEJ8Q171,N Coyle,0,12.849782,-5.78806,5.27656
4,A38LY2SSHVHRYB,Jodie Vesely,0,12.660837,-0.643088,-1.748742


In [5]:
# Load the product table; parse_joined_comments_to_json looks products up in it by asin.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# warning pointing at this line (possibly a mixed-dtype DtypeWarning) - confirm,
# and consider passing dtype= explicitly if so.
product_df = pd.read_csv(product_csv_path)

# Preview the first rows
product_df.head()

  product_df = pd.read_csv(product_csv_path)


Unnamed: 0,asin,title,brand,description,imageUrl,rank,price,categories,avgOverall,avgSentiment,avgSentimentWithRating,reviewCount
0,695009,Understanding Seizures and Epilepsy,,,,886503,-1.0,Movies,4.0,-0.10625,6.234375,1
1,791156,Spirit Led&mdash;Moving By Grace In The Holy S...,,,https://images-na.ssl-images-amazon.com/images...,342688,-1.0,Movies,5.0,0.65,9.125,2
2,143529,My Fair Pastry (Good Eats Vol. 9),Alton Brown,Disc 1: Flour Power (Scones; Shortcakes; South...,https://images-na.ssl-images-amazon.com/images...,370026,-1.0,Movies,5.0,0.161111,7.902778,1
3,143588,"Barefoot Contessa (with Ina Garten), Entertain...",Ina Garten,Barefoot Contessa Volume 2: On these three dis...,,342914,74.95,Movies,4.75,0.448383,8.370957,12
4,143502,Rise and Swine (Good Eats Vol. 7),Alton Brown,Rise and Swine (Good Eats Vol. 7) includes bon...,https://images-na.ssl-images-amazon.com/images...,351684,-1.0,Movies,5.0,0.525,8.8125,1


In [6]:
# Preview the first rows of the comment table
comment_df.head()

Unnamed: 0,overall,customerId,asin,summary,review,timestamp,sentiment,sentimentWithRating,wordCount
0,1.0,A00013803RVZPCZKTT9U,B003ZTNT2Y,One Star,Crap!!!!,1485129600,-1.0,1.0,1
1,5.0,A0001392IVCRENBEIEYS,6302409365,I'm so glad I watched this,This made my night. I'm so glad I watched this.,1488240000,0.5,8.75,9
2,5.0,A0001598OL7FAN6XNMK9,B00BMRTPEM,its riveting. i have to keep myself controlled...,its riveting. i have to keep myself controlled...,1478649600,0.25,8.125,22
3,4.0,A0001598OL7FAN6XNMK9,B00IV3FLO8,Four Stars,Great action. Good twists. It's sexy it's hot!,1475625600,0.4825,7.70625,8
4,5.0,A0001598OL7FAN6XNMK9,B00OGL6S64,its riveting. i have to keep myself controlled...,its riveting. i have to keep myself controlled...,1478649600,0.25,8.125,22


In [7]:
# Summary statistics of the numeric comment columns
comment_df.describe()

Unnamed: 0,overall,timestamp,sentiment,sentimentWithRating,wordCount
count,8554087.0,8554087.0,8554087.0,8554087.0,8554087.0
mean,4.233573,1391790000.0,0.3286066,7.55509,38.41055
std,1.221733,111604500.0,0.3309448,1.773322,58.34319
min,1.0,871948800.0,-1.0,1.0,0.0
25%,4.0,1369786000.0,0.1134615,6.893015,6.0
50%,5.0,1424650000.0,0.30625,8.004563,20.0
75%,5.0,1462493000.0,0.5190476,8.75,43.0
max,5.0,1538525000.0,1.0,10.0,2266.0


## Comment Length

In [8]:
# Order the comments from the fewest to the most words
comment_df = comment_df.sort_values(by='wordCount')

# Shortest comments come first after sorting
comment_df.head()

Unnamed: 0,overall,customerId,asin,summary,review,timestamp,sentiment,sentimentWithRating,wordCount
5935125,5.0,A3L5A9L7QELM4M,B00LOCLBWO,Five Stars,":,)",1425168000,0.0,7.5,0
146141,2.0,A128YUO41J374B,B00M0GM4V0,watch it only if it's your last choice,"#%<<>*%<><~,",1428192000,0.0,4.5,0
8503038,5.0,AWVACZAYADITP,B00BC36UBM,Good action/suspense,*******,1484179200,0.0,7.5,0
3184267,5.0,A2DWU3V9M6TFES,B000067S1H,:-),+ + + + +,1449100800,0.0,7.5,0
7375697,1.0,AFA2SA85AX3PS,B004PP3IM8,; $++%)%!,;$++%)%!,1468800000,0.0,3.5,0


In [9]:
# The frame was sorted by wordCount above, so the tail holds the longest comments
comment_df.tail()

Unnamed: 0,overall,customerId,asin,summary,review,timestamp,sentiment,sentimentWithRating,wordCount
6852980,5.0,A75MSQQ3LN86Z,B00005OATO,SIEGFRIED; THIRD PART OF THE GREAT METAPHOR,In many ways this is the most difficult of all...,1227139200,0.11596,7.789899,1741
4223526,4.0,A2UBSGFCP2QFNV,B00BCRRA8U,"Emmy-winning ""Beetlejuice"" series is like two ...","For the first time, the long-running animated ...",1370736000,0.085393,6.713482,1743
5860513,4.0,A3JZOITOIP90EW,B001DDY6O4,whats here,Its hard to figure out whats in these new echo...,1247270400,-0.071768,6.32058,1748
982802,5.0,A1FDW1SPYKB354,B00028G748,A 25-Disc Set Containing Every Single Episode ...,If you're looking for a spectacularly-massive ...,1155168000,0.154667,7.886667,1901
1309886,2.0,A1KI9RNOIBG6XT,B0058KGHX6,"this review will fill in ALL the blanks, lol",Just as all film and most documentaries are de...,1392076800,-0.110417,4.223957,2266


In [10]:
# Bucket the comments by word count and keep a few joined example comments per bucket.
comment_length_stat = []
# Lower bounds of the buckets; each bucket ends just before the next bound
# and the last one is open-ended.
thresholds = [0, 1, 10, 100, 1000]

for idx, thres in enumerate(thresholds):
    if idx < len(thresholds) - 1:
        upper = thresholds[idx+1]
        cat = f'{thres} ~ {upper-1} words'
        # inclusive='left' keeps thres and excludes upper, matching the label
        subset_df = comment_df[comment_df["wordCount"].between(thres, upper, inclusive='left')]
    else:
        cat = f'>= {thres} words'
        subset_df = comment_df[comment_df["wordCount"] >= thres]
    count = len(subset_df)
    print(f'{count} comments in category {cat}, accounts for {100*count/total_count:.2f}%')
    comment_length_stat.append({
        'category': cat,
        'count': count,
        # Never request more rows than the bucket holds: sample() raises
        # ValueError when n exceeds the population (without replacement).
        'samples': parse_joined_comments_to_json(subset_df.sample(n=min(SAMPLE_N, count)))
    })
    

287 comments in category 0 ~ 1 words, accounts for 0.00%
2749824 comments in category 1 ~ 10 words, accounts for 32.15%
5005924 comments in category 10 ~ 100 words, accounts for 58.52%
797769 comments in category 100 ~ 1000 words, accounts for 9.33%
283 comments in category >= 1000 words, accounts for 0.00%


More than 200 comments contain no words at all: their authors rely purely on punctuation and symbols to express their feelings. The majority of comments (nearly 60%) have 10 to 99 words, and another 30+% use fewer than 10 words. This is consistent with most product reviews we see online: they are seldom long and descriptive.

In [11]:
# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError after the expensive computation above.
os.makedirs(output_base_dir, exist_ok=True)
with open(os.path.join(output_base_dir, 'length.json'), 'w') as f:
    json.dump(comment_length_stat, f, cls=NpEncoder)

## Rating from Customer

In [12]:
# Store the star rating as an integer; the next cell filters on the integer values 1-5
comment_df['overall'] = comment_df['overall'].astype(int)

In [13]:
# Count the comments for each star rating and keep a few joined examples per rating.
comment_rating_stat = []

for rating in range(1, 6):
    subset_df = comment_df[comment_df["overall"] == rating]
    count = len(subset_df)
    print(f'{count} comments has {rating} star(s) rating, accounts for {100*count/total_count:.2f}%')
    comment_rating_stat.append({
        'rating': rating,
        'count': count,
        # Never request more rows than are available: sample() raises
        # ValueError when n exceeds the population (without replacement).
        'samples': parse_joined_comments_to_json(subset_df.sample(n=min(SAMPLE_N, count)))
    })

618714 comments has 1 star(s) rating, accounts for 7.23%
396550 comments has 2 star(s) rating, accounts for 4.64%
716448 comments has 3 star(s) rating, accounts for 8.38%
1458679 comments has 4 star(s) rating, accounts for 17.05%
5363696 comments has 5 star(s) rating, accounts for 62.70%


Now we have evidence that most people are indeed generous when it comes to reviews: more than half of the comments are 5-star ratings.

In [14]:
# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError after the expensive computation above.
os.makedirs(output_base_dir, exist_ok=True)
with open(os.path.join(output_base_dir, 'rating.json'), 'w') as f:
    json.dump(comment_rating_stat, f, cls=NpEncoder)

## Sentiment

In [15]:
# Split the comments into negative / neutral / positive by sentiment score
# and keep a few joined example comments per class.
comment_sentiment_stat = []
thresholds = [-0.1, 0.1]
categories = ['negative (< -0.1)', 'neutral (-0.1 ~ 0.1)', 'positive (> 0.1)']

# Single source of truth for the two cut-offs used by all three masks.
neg_max, pos_min = thresholds
sentiment = comment_df["sentiment"]
masks = [
    sentiment < neg_max,
    sentiment.between(neg_max, pos_min),  # inclusive on both ends
    sentiment > pos_min,
]

for cat, mask in zip(categories, masks):
    subset_df = comment_df[mask]
    count = len(subset_df)
    print(f'{count} comments in category {cat}, accounts for {100*count/total_count:.2f}%')
    comment_sentiment_stat.append({
        'category': cat,
        'count': count,
        # Never request more rows than the class holds: sample() raises
        # ValueError when n exceeds the population (without replacement).
        'samples': parse_joined_comments_to_json(subset_df.sample(n=min(SAMPLE_N, count)))
    })

519326 comments in category negative (< -0.1), accounts for 6.07%
1519371 comments in category neutral (-0.1 ~ 0.1), accounts for 17.76%
6515390 comments in category positive (> 0.1), accounts for 76.17%


After subjectively reviewing the sentiment results, we chose -0.1 and 0.1 as the thresholds between negative/neutral and neutral/positive respectively.

The sentiment analysis result is consistent with the customer ratings in terms of proportions. However, we may find some wrongly inferred sentiments among the sample comments, which calls for a better sentiment analysis model.

In [16]:
# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError after the expensive computation above.
os.makedirs(output_base_dir, exist_ok=True)
with open(os.path.join(output_base_dir, 'sentiment.json'), 'w') as f:
    json.dump(comment_sentiment_stat, f, cls=NpEncoder)

: 