Amazon Musical Instruments Reviews - Kaggle

In [1]:
import json
import pathlib

In [2]:
original_data_filepath = pathlib.Path('Musical_Instruments_5.json')
corrected_data_filepath = pathlib.Path('Musical_Instruments_5_Correct.json')

In [3]:
original_data_filepath.exists()

True

In [4]:
corrected_data_filepath.exists()

False

In [5]:
if not corrected_data_filepath.exists():
    with open(original_data_filepath) as fin:
        incorrect_data = fin.readlines()

    data = {}
    for idx, idx_string in enumerate(incorrect_data):
        data[idx] = json.loads(idx_string.replace('\n',''))

    with open(corrected_data_filepath, 'w') as fout:
        json.dump(data, fout)
else:
    with open(corrected_data_filepath) as fin:
        data = json.load(fin)

In [6]:
corrected_data_filepath.exists()

True

In [7]:
len(data)

10261

In [8]:
keys = list(data.keys())

In [9]:
keys[0]

0

In [10]:
import random

In [11]:
random.seed(1)
random_key = random.choice(keys)
random_key

2201

In [12]:
data[random_key]

{'reviewerID': 'AFFH3F0W6N3MY',
 'asin': 'B0002FOBJY',
 'reviewerName': 'mjosephl',
 'helpful': [0, 0],
 'reviewText': 'The M48 is well built, sturdy and well balanced enough that I have no concerns about tipping, even when it is loaded with several music books and a folder of sheet music. The tray is wide with an appropriately sized lip, and is moveable up and down with a moderate effort. Once in place it stays there with no slipping. For all of the solid construction it does not feel heavy and it is easy to move around, although it would be a little awkward to transport it in a small car.This is one of those purchases where there are a number of alternatives, and I always have some concern that it will not be what I want when I get it... in this case it is a winner. Delivered when promised in good condition and appropriately packed.',
 'overall': 5.0,
 'summary': 'Just right',
 'unixReviewTime': 1320883200,
 'reviewTime': '11 10, 2011'}

In [13]:
import pprint

In [14]:
test = list(data.values())

In [15]:
test[0]

{'reviewerID': 'A2IBPI20UZIR0U',
 'asin': '1384719342',
 'reviewerName': 'cassandra tu "Yeah, well, that\'s just like, u...',
 'helpful': [0, 0],
 'reviewText': "Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,",
 'overall': 5.0,
 'summary': 'good',
 'unixReviewTime': 1393545600,
 'reviewTime': '02 28, 2014'}

In [16]:
pprint.pprint(data[random_key])

{'asin': 'B0002FOBJY',
 'helpful': [0, 0],
 'overall': 5.0,
 'reviewText': 'The M48 is well built, sturdy and well balanced enough that I '
               'have no concerns about tipping, even when it is loaded with '
               'several music books and a folder of sheet music. The tray is '
               'wide with an appropriately sized lip, and is moveable up and '
               'down with a moderate effort. Once in place it stays there with '
               'no slipping. For all of the solid construction it does not '
               'feel heavy and it is easy to move around, although it would be '
               'a little awkward to transport it in a small car.This is one of '
               'those purchases where there are a number of alternatives, and '
               'I always have some concern that it will not be what I want '
               'when I get it... in this case it is a winner. Delivered when '
               'promised in good condition and appropriately packed.

## Can we identify the 30 products with the most reviews?

In [17]:
%%timeit -n 30

asin_counts = {}
for review_data in data.values():
    current_asin = review_data['asin']
    if current_asin in asin_counts:
        asin_counts[current_asin] += 1
    else:
        asin_counts[current_asin] = 1
        
sorted_asin_tuples = sorted(asin_counts.items(), key=lambda item: item[1])
top_30 = dict(sorted_asin_tuples[-30:])

2 ms ± 574 µs per loop (mean ± std. dev. of 7 runs, 30 loops each)


In [18]:
import collections

In [19]:
%%timeit -n 30

asin_list = []
for review_data in data.values():
    current_asin = review_data['asin']
    asin_list.append(current_asin)
    
asin_list_counts = collections.Counter(asin_list)
top_30_from_counter = asin_list_counts.most_common(30)

1.94 ms ± 745 µs per loop (mean ± std. dev. of 7 runs, 30 loops each)


## Aside

In [20]:
random_key

2201

In [21]:
test = dict(data[random_key])

In [22]:
test == data[random_key]

True

In [23]:
test is data[random_key]

False

In [24]:
data[random_key]

{'reviewerID': 'AFFH3F0W6N3MY',
 'asin': 'B0002FOBJY',
 'reviewerName': 'mjosephl',
 'helpful': [0, 0],
 'reviewText': 'The M48 is well built, sturdy and well balanced enough that I have no concerns about tipping, even when it is loaded with several music books and a folder of sheet music. The tray is wide with an appropriately sized lip, and is moveable up and down with a moderate effort. Once in place it stays there with no slipping. For all of the solid construction it does not feel heavy and it is easy to move around, although it would be a little awkward to transport it in a small car.This is one of those purchases where there are a number of alternatives, and I always have some concern that it will not be what I want when I get it... in this case it is a winner. Delivered when promised in good condition and appropriately packed.',
 'overall': 5.0,
 'summary': 'Just right',
 'unixReviewTime': 1320883200,
 'reviewTime': '11 10, 2011'}

In [25]:
test['overall'] = 3.0

In [26]:
data[random_key]

{'reviewerID': 'AFFH3F0W6N3MY',
 'asin': 'B0002FOBJY',
 'reviewerName': 'mjosephl',
 'helpful': [0, 0],
 'reviewText': 'The M48 is well built, sturdy and well balanced enough that I have no concerns about tipping, even when it is loaded with several music books and a folder of sheet music. The tray is wide with an appropriately sized lip, and is moveable up and down with a moderate effort. Once in place it stays there with no slipping. For all of the solid construction it does not feel heavy and it is easy to move around, although it would be a little awkward to transport it in a small car.This is one of those purchases where there are a number of alternatives, and I always have some concern that it will not be what I want when I get it... in this case it is a winner. Delivered when promised in good condition and appropriately packed.',
 'overall': 5.0,
 'summary': 'Just right',
 'unixReviewTime': 1320883200,
 'reviewTime': '11 10, 2011'}

## What are the top 10 reviewers by helpfulness

In [27]:
temp = list(data.values())

In [28]:
num_helpful, num_viewed = temp[50]['helpful']

In [29]:
temp[50]['helpful']

[2, 2]

In [30]:
num_helpful

2

In [31]:
num_viewed

2

In [32]:
reviews_stats = {}
for review_data in data.values():
    current_reviewer = review_data['reviewerID']
    num_helpful, num_viewed = review_data['helpful']
    if current_reviewer in reviews_stats:
        reviews_stats[current_reviewer]['count_helpful'] += num_helpful
        reviews_stats[current_reviewer]['total'] += num_viewed
    else:
        reviews_stats[current_reviewer] = {
            'count_helpful': num_helpful,
            'total': num_viewed,
        }
        
reviewer_helpfulness = {}
for reviewer, reviewer_stats in reviews_stats.items():
    if reviewer_stats['total'] > 0:
        helpfulness_ratio = reviewer_stats['count_helpful']/reviewer_stats['total']
        reviewer_helpfulness[reviewer] = helpfulness_ratio
        
sorted_reviewers = sorted(reviewer_helpfulness.items(), key=lambda item: item[1], reverse=True)
top_10 = dict(sorted_reviewers[:10])

top_10

{'A2IBPI20UZIR0U': 1.0,
 'A94QU4C90B1AX': 1.0,
 'A35QFQI0M46LWO': 1.0,
 'A17SLR18TUMULM': 1.0,
 'A67OJZLHBBUQ9': 1.0,
 'A2W3CLAYZLDPTV': 1.0,
 'AXP9CF1UTFRSU': 1.0,
 'A27DR1VO079F1V': 1.0,
 'A1LQC225SE8UNI': 1.0,
 'A1S0HTDO0P4N5V': 1.0}