In [15]:
import csv
import gzip
from collections import defaultdict

In [2]:
path = '../1_Basic_Data_Processing_and_Visualization/amazon_reviews_us_Gift_Card_v1_00.tsv.gz'
f = gzip.open(path, 'rt')

In [3]:
reader = csv.reader(f, delimiter = '\t')
header = next(reader)

In [4]:
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [5]:
# Reading into Key-Value pairs
dataset = []

for line in reader:
    d = dict(zip(header, line))
    
    #convert numeric and boolean types into python
    for field in ['helpful_votes', 'star_rating', 'total_votes']:
        d[field] = int(d[field])
        
    for field in ['verified_purchase', 'vine']:
        if d[field] == 'Y':
            d[field] = True
        else:
            d[field] = False
    dataset.append(d)

In [7]:
len(dataset)

148310

In [8]:
dataset[0]

{'marketplace': 'US',
 'customer_id': '24371595',
 'review_id': 'R27ZP1F1CD0C3Y',
 'product_id': 'B004LLIL5A',
 'product_parent': '346014806',
 'product_title': 'Amazon eGift Card - Celebrate',
 'product_category': 'Gift Card',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': False,
 'verified_purchase': True,
 'review_headline': 'Five Stars',
 'review_body': 'Great birthday gift for a young adult.',
 'review_date': '2015-08-31'}

In [10]:
# filter only rows with "reviewYear" there

dataset = [d for d in dataset if 'review_date' in d]
len(dataset)

148309

In [11]:
# filter by date

for d in dataset:
    d['yearInt'] = int(d['review_date'][:4])

In [12]:
dataset_2009 = [d for d in dataset if d['yearInt'] > 2009]
len(dataset_2009)

148095

In [13]:
# filter by "helpfulness"
# d['total_votes'] < 3 - keep reviews that havent received many votes yet
# d['helpful_votes']/d['total_votes'] >= 0.5 - delete any with less than 50% helpfulness

dataset_help = [d for d in dataset if d['total_votes'] < 3 or d['helpful_votes']/d['total_votes'] >= 0.5]
len(dataset_help)

147959

In [16]:
# filter by user activity

n_reviews_users = defaultdict(int)

for d in dataset:
    n_reviews_users[d['customer_id']] += 1

In [17]:
dataset_reviews = [d for d in dataset if n_reviews_users[d['customer_id']] >= 2]
len(dataset_reviews)

11221

In [19]:
# fiter by review lenght

dataset_review_len = [d for d in dataset_reviews if len(d['review_body'].split()) >= 10]
len(dataset_review_len)

7073