Amazon Musical Instruments Reviews - Kaggle

In [1]:
import json
import pathlib

In [2]:
original_data_filepath = pathlib.Path('Musical_Instruments_5.json')
corrected_data_filepath = pathlib.Path('Musical_Instruments_5_Correct.json')

In [3]:
original_data_filepath.exists()

True

In [4]:
corrected_data_filepath.exists()

True

In [5]:
if not corrected_data_filepath.exists():
    with open(original_data_filepath) as fin:
        incorrect_data = fin.readlines()

    data = {}
    for idx, idx_string in enumerate(incorrect_data):
        data[idx] = json.loads(idx_string.replace('\n',''))

    with open(corrected_data_filepath, 'w') as fout:
        json.dump(data, fout)
else:
    with open(corrected_data_filepath) as fin:
        data = json.load(fin)

In [6]:
corrected_data_filepath.exists()

True

In [7]:
keys = list(data.keys())

In [8]:
import random

In [9]:
random.seed(2)
random_key = random.choice(keys)
random_key

'926'

In [10]:
print(data[random_key])

{'reviewerID': 'A37KK1CLMULHE4', 'asin': 'B0002D0CLC', 'reviewerName': 'Wynn "Nemesis"', 'helpful': [0, 0], 'reviewText': "I've used these for more than two years but recently shifted to its big brother, the jazz 3 XL. Same thickness and feel /tone, but in a larger form factor. I would definitely recommend the XL version if rhythm+lead work is required. For strictly solo work with easy to achieve pinch harmonics, this one is better.", 'overall': 3.0, 'summary': 'Nice pick, but try the XL', 'unixReviewTime': 1383004800, 'reviewTime': '10 29, 2013'}


In [11]:
import pprint

In [12]:
random.seed(1)

pprint.pprint(
    data[random_key], 
    width=150,
)

{'asin': 'B0002D0CLC',
 'helpful': [0, 0],
 'overall': 3.0,
 'reviewText': "I've used these for more than two years but recently shifted to its big brother, the jazz 3 XL. Same thickness and feel /tone, but "
               'in a larger form factor. I would definitely recommend the XL version if rhythm+lead work is required. For strictly solo work with '
               'easy to achieve pinch harmonics, this one is better.',
 'reviewTime': '10 29, 2013',
 'reviewerID': 'A37KK1CLMULHE4',
 'reviewerName': 'Wynn "Nemesis"',
 'summary': 'Nice pick, but try the XL',
 'unixReviewTime': 1383004800}


In [13]:
len(data)

10261

### Can we identify the 30 products (ASIN) with the most reviews?

In [14]:
asin_counts = {}
for review_idx, review_data in data.items():
    if 'asin' in review_data:
        asin = review_data['asin']
        if asin in asin_counts:
            asin_counts[asin] += 1
        else:
            asin_counts[asin] = 1
            
asin_counts = {k: v for k, v in sorted(asin_counts.items(), key=lambda item: item[1], reverse=True)}

top_30 = {key: val for idx, (key, val) in enumerate(asin_counts.items()) if idx < 30}
top_30

{'B003VWJ2K8': 163,
 'B0002E1G5C': 143,
 'B0002F7K7Y': 116,
 'B003VWKPHC': 114,
 'B0002H0A3S': 93,
 'B0002CZVXM': 74,
 'B0006NDF8A': 71,
 'B0009G1E0K': 69,
 'B0002E2KPC': 68,
 'B0002GLDQM': 67,
 'B004XNK7AI': 65,
 'B005FKF1PY': 63,
 'B00646MZHK': 62,
 'B0002GMGYA': 58,
 'B001PGXHX0': 58,
 'B000978D58': 57,
 'B0018TIADQ': 57,
 'B0002D0CEO': 55,
 'B001PGXKC8': 55,
 'B000068NW5': 52,
 'B0002IHFVM': 49,
 'B000RNB720': 47,
 'B0002CZW0Y': 46,
 'B0002M6CVC': 46,
 'B0002OOMU8': 46,
 'B0002E2XCW': 45,
 'B0002E3CHC': 44,
 'B000PO30QM': 44,
 'B005CX4GLE': 42,
 'B0002D0E8S': 41}

In [15]:
%%timeit -n 100

asin_counts = {}
for review_idx, review_data in data.items():
    if 'asin' in review_data:
        asin = review_data['asin']
        if asin in asin_counts:
            asin_counts[asin] += 1
        else:
            asin_counts[asin] = 1
            
asin_counts = {k: v for k, v in sorted(asin_counts.items(), key=lambda item: item[1], reverse=True)}

top_30 = {key: val for idx, (key, val) in enumerate(asin_counts.items()) if idx < 30}

1.43 ms ± 200 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
import collections

In [17]:
asin_list = []
for review_idx, review_data in data.items():
    if 'asin' in review_data:
        asin_list.append(review_data['asin'])
asin_counter = collections.Counter(asin_list)
top_30_from_counter = asin_counter.most_common(30)
top_30_from_counter

[('B003VWJ2K8', 163),
 ('B0002E1G5C', 143),
 ('B0002F7K7Y', 116),
 ('B003VWKPHC', 114),
 ('B0002H0A3S', 93),
 ('B0002CZVXM', 74),
 ('B0006NDF8A', 71),
 ('B0009G1E0K', 69),
 ('B0002E2KPC', 68),
 ('B0002GLDQM', 67),
 ('B004XNK7AI', 65),
 ('B005FKF1PY', 63),
 ('B00646MZHK', 62),
 ('B0002GMGYA', 58),
 ('B001PGXHX0', 58),
 ('B000978D58', 57),
 ('B0018TIADQ', 57),
 ('B0002D0CEO', 55),
 ('B001PGXKC8', 55),
 ('B000068NW5', 52),
 ('B0002IHFVM', 49),
 ('B000RNB720', 47),
 ('B0002CZW0Y', 46),
 ('B0002M6CVC', 46),
 ('B0002OOMU8', 46),
 ('B0002E2XCW', 45),
 ('B0002E3CHC', 44),
 ('B000PO30QM', 44),
 ('B005CX4GLE', 42),
 ('B0002D0E8S', 41)]

### Which reviewers that have received at least 10 review votes are most "helpful"?

In [18]:
pprint.pprint(
    data[random_key], 
    width=150,
)

{'asin': 'B0002D0CLC',
 'helpful': [0, 0],
 'overall': 3.0,
 'reviewText': "I've used these for more than two years but recently shifted to its big brother, the jazz 3 XL. Same thickness and feel /tone, but "
               'in a larger form factor. I would definitely recommend the XL version if rhythm+lead work is required. For strictly solo work with '
               'easy to achieve pinch harmonics, this one is better.',
 'reviewTime': '10 29, 2013',
 'reviewerID': 'A37KK1CLMULHE4',
 'reviewerName': 'Wynn "Nemesis"',
 'summary': 'Nice pick, but try the XL',
 'unixReviewTime': 1383004800}


In [19]:
test = data[random_key]

In [20]:
test == data[random_key]

True

In [21]:
test is data[random_key]

True

In [22]:
# test['overall'] = 5.0

In [23]:
# data[random_key]

In [24]:
test = dict(data[random_key])

In [25]:
test == data[random_key]

True

In [26]:
test is data[random_key]

False

In [27]:
test['overall'] = 5.0

In [28]:
data[random_key]

{'reviewerID': 'A37KK1CLMULHE4',
 'asin': 'B0002D0CLC',
 'reviewerName': 'Wynn "Nemesis"',
 'helpful': [0, 0],
 'reviewText': "I've used these for more than two years but recently shifted to its big brother, the jazz 3 XL. Same thickness and feel /tone, but in a larger form factor. I would definitely recommend the XL version if rhythm+lead work is required. For strictly solo work with easy to achieve pinch harmonics, this one is better.",
 'overall': 3.0,
 'summary': 'Nice pick, but try the XL',
 'unixReviewTime': 1383004800,
 'reviewTime': '10 29, 2013'}

In [29]:
data[random_key]['helpful']

[0, 0]

In [30]:
type(data[random_key]['helpful'])

list

In [31]:
type(data[random_key]['helpful'][0])

int

In [32]:
sum(data[random_key]['helpful'])

0

In [33]:
data[random_key]['helpful']

[0, 0]

In [34]:
reviewer_helpful_scores = {}
for review_idx, review_data in data.items():
    if ('helpful' in review_data) and ('reviewerID' in review_data):
        reviewer = review_data['reviewerID']
        num_helpful, num_total = review_data['helpful']
        if reviewer in reviewer_helpful_scores:
            reviewer_helpful_scores[reviewer]['num_helpful'] += num_helpful
            reviewer_helpful_scores[reviewer]['num_total'] += num_total
        else:
            reviewer_helpful_scores[reviewer] = {
                'num_helpful': num_helpful,
                'num_total': num_total,
            }
            
for reviewer, review_stats in reviewer_helpful_scores.items():
    if review_stats['num_total'] > 0:
        helpful_score = review_stats['num_helpful']/review_stats['num_total']
    else:
        helpful_score = 0
    reviewer_helpful_scores[reviewer]['score'] = helpful_score
    
sorted_reviewers = {k: v for k, v in sorted(reviewer_helpful_scores.items(), key=lambda item: item[1]['score'], reverse=True)}

top_reviewers = {}
count = 0
for reviewer in sorted_reviewers:
    if reviewer_helpful_scores[reviewer]['num_total'] >= 10:
        count += 1
        top_reviewers[reviewer] = dict(reviewer_helpful_scores[reviewer])
    if count >= 30:
        break
        
top_reviewers

{'A1LQC225SE8UNI': {'num_helpful': 11, 'num_total': 11, 'score': 1.0},
 'ALUTHT4U058KZ': {'num_helpful': 53, 'num_total': 53, 'score': 1.0},
 'AOINAOO0NQRGN': {'num_helpful': 17, 'num_total': 17, 'score': 1.0},
 'A1YAAMQT5G88XE': {'num_helpful': 10, 'num_total': 10, 'score': 1.0},
 'ABZYVME9NYCQG': {'num_helpful': 17, 'num_total': 17, 'score': 1.0},
 'A2MIP3AQVSF2SS': {'num_helpful': 12, 'num_total': 12, 'score': 1.0},
 'A3R8TPM1N7HFS4': {'num_helpful': 10, 'num_total': 10, 'score': 1.0},
 'A2Z548GT5948WH': {'num_helpful': 10, 'num_total': 10, 'score': 1.0},
 'A1MTN9XWJSA5PI': {'num_helpful': 25, 'num_total': 25, 'score': 1.0},
 'A34QGH6PW5UPA2': {'num_helpful': 18, 'num_total': 18, 'score': 1.0},
 'A2MPM6M93OXIJT': {'num_helpful': 17, 'num_total': 17, 'score': 1.0},
 'A32BCTL0HLOXRV': {'num_helpful': 15, 'num_total': 15, 'score': 1.0},
 'AFS3FQR5JSDVJ': {'num_helpful': 13, 'num_total': 13, 'score': 1.0},
 'A1C92SAQFUBJSZ': {'num_helpful': 14, 'num_total': 14, 'score': 1.0},
 'A2CIR2MO

In [35]:
import datetime

In [36]:
dt = datetime.datetime.fromtimestamp(data[random_key]['unixReviewTime'])
dt

datetime.datetime(2013, 10, 28, 19, 0)

In [37]:
dt.hour

19

In [38]:
dt.weekday()

0

In [39]:
day_name_mapper = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday',
}
day_name_mapper[dt.weekday()]

'Monday'

In [40]:
scores_with_date_info = {}
for review_idx, review_data in data.items():
    dt = datetime.datetime.fromtimestamp(review_data['unixReviewTime'])
    day_of_week = day_name_mapper[dt.weekday()]
    hour = dt.hour
    scores_with_date_info[review_idx] = {
        'day_of_week': day_of_week,
        'hour': hour,
        'overall': review_data['overall'],
    }

In [41]:
hour_scores = {}
for review_idx, time_data in scores_with_date_info.items():
    if time_data['hour'] in hour_scores:
        hour_scores[time_data['hour']].append(time_data['overall'])
    else:
        hour_scores[time_data['hour']] = [time_data['overall']]

hour_average = {}
for hour, hour_score_list in hour_scores.items():
    hour_average[hour] = sum(hour_score_list)/len(hour_score_list)
    
hour_average

{18: 4.4859382203780545, 19: 4.490798581799764}

In [42]:
day_scores = {}
for review_idx, time_data in scores_with_date_info.items():
    if time_data['day_of_week'] in day_scores:
        day_scores[time_data['day_of_week']].append(time_data['overall'])
    else:
        day_scores[time_data['day_of_week']] = [time_data['overall']]

day_average = {}
for day, day_score_list in day_scores.items():
    day_average[day] = sum(day_score_list)/len(day_score_list)
    
day_average

{'Thursday': 4.474802671523983,
 'Friday': 4.492101105845181,
 'Tuesday': 4.504901960784314,
 'Saturday': 4.489397794741306,
 'Wednesday': 4.472727272727273,
 'Monday': 4.503415300546448,
 'Sunday': 4.485322896281801}