In [2]:
import gzip
import time

<p> Filter the reviews with the date range : <br />
    Start date : 01-01-2013 <br/>
    End date : 31-12-2013
</p>

In [3]:
start_date = time.strptime("01 01, 2013", "%m %d, %Y")
end_date = time.strptime("12 31, 2013", "%m %d, %Y")

<p> Data files : </p>
<i> 1) reviews_Automotive_5.json.gz  (Category : Automatives) </i> <br/>
<i> 2) reviews_Office_Products_5.json.gz (Category: Baby) <i> <br/>
<i> 3) reviews_Digital_Music_5.json.gz (Category: Digital Music) </i>

In [10]:
file1 = "./data/reviews_Automotive_5.json.gz"
file2 = "./data/reviews_Office_Products_5.json.gz"
file3 = "./data/reviews_Digital_Music_5.json.gz"

In [5]:
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        review = eval(l)
        review_date = time.strptime(review["reviewTime"], "%m %d, %Y")
        if review_date >= start_date and review_date <= end_date:
            yield review

### Task 1:

In [6]:
import re
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

from nltk.tag import pos_tag

In [7]:
def get_term_info(token):
    '''
    1) Convert to lower case
    2) Remove special chars
    3) Remove stop words 
    4) Lemmatize
    '''
    token_info = { 'EMPTY' : False, 'STOP_WORD' : False, 'POS' : '.', 'WORD' : token, 'LM_WORD' : token}
    
    token = token.lower()
    token = re.sub(r'\W', '', token)
    token.strip()
    if token == '':
        token_info['EMPTY'] = True
        return token_info
        
    if token in stop_words:
        token_info['STOP_WORD'] = True
        
    (w, pos) = pos_tag([token])[0]
    if pos[:2] == 'NN':
        token_info['POS'] = 'NOUN'
    elif pos[:2] == 'JJ' or pos[:2] == 'RR':
        token_info['POS'] = 'ADJ/ADV'
     
    token_info['WORD'] = token
    token_info['LM_WORD'] = lemmatizer.lemmatize(token)
    return token_info

#### Fetch top 20 terms, nouns and adjectives:

In [8]:
from heapq import nlargest

def increment(dictionary, key, by=1):
    if key in dictionary:
        dictionary[key] += by
    else:
        dictionary[key] = 1

def get_stats(file_path, k=20):
    '''
    Process the file by tokenizing each review
    and get top k terms, nouns and adjectives
    '''
    term_freq = {'TERM' : {}, 'NOUN' : {}, 'ADJ/ADV' : {}}
    review_gen = parse(file_path)
    total_reviews = 0
    for review in review_gen:
        total_reviews += 1
        for token in word_tokenize(review["reviewText"]):
            token_info = get_term_info(token)
            if not(token_info['EMPTY'] or token_info['STOP_WORD']):
                increment(term_freq['TERM'], token_info['LM_WORD'])
                
            if token_info['POS'] == 'NOUN':
                increment(term_freq['NOUN'], token_info['WORD'])
            elif token_info['POS'] == 'ADJ/ADV':
                increment(term_freq['ADJ/ADV'], token_info['WORD'])
    
    print("Total reviews : {}".format(total_reviews))
    topk_stats = {'TERM' : {}, 'NOUN' : {}, 'ADJ/ADV' : {}}
    topk_stats['TERM'] = {key : term_freq['TERM'][key] for key in nlargest(k, term_freq['TERM'], key = term_freq['TERM'].get)}
    topk_stats['NOUN'] = {key : term_freq['NOUN'][key] for key in nlargest(k, term_freq['NOUN'], key = term_freq['NOUN'].get)}
    topk_stats['ADJ/ADV'] = {key : term_freq['ADJ/ADV'][key] for key in nlargest(k, term_freq['ADJ/ADV'], key = term_freq['ADJ/ADV'].get)}
    
    return topk_stats

In [94]:
get_stats(file1)

Total reviews : 9022


{'TERM': {'nt': 4153,
  'car': 3499,
  'use': 3307,
  'one': 3221,
  'work': 3002,
  'product': 2680,
  'good': 2507,
  'great': 2496,
  'well': 2377,
  'like': 2339,
  'get': 2180,
  'would': 2167,
  'used': 2020,
  'time': 1898,
  'battery': 1734,
  'easy': 1632,
  'light': 1507,
  'make': 1485,
  'need': 1411,
  'much': 1356},
 'NOUN': {'i': 20714,
  'nt': 4153,
  's': 3666,
  'use': 3307,
  'car': 2929,
  'product': 2103,
  'time': 1545,
  'works': 1510,
  'work': 1492,
  'battery': 1409,
  'need': 1193,
  'price': 1072,
  've': 1027,
  'bought': 996,
  'light': 994,
  'water': 970,
  'oil': 964,
  'quality': 946,
  'fit': 908,
  'put': 849},
 'ADJ/ADV': {'great': 2496,
  'good': 2494,
  'easy': 1632,
  'other': 1381,
  'much': 1356,
  'little': 1135,
  'nice': 1057,
  'new': 874,
  'small': 708,
  'last': 698,
  'few': 651,
  'best': 634,
  'most': 596,
  'old': 596,
  'same': 580,
  'many': 542,
  'black': 516,
  'high': 451,
  'hard': 437,
  'different': 418}}

In [11]:
get_stats(file2)

Total reviews : 12391


{'TERM': {'nt': 10323,
  'printer': 9048,
  'use': 7827,
  'one': 7342,
  'paper': 6266,
  'like': 6026,
  'work': 5061,
  'would': 5003,
  'ink': 4915,
  'pen': 4721,
  'great': 4611,
  'print': 4546,
  'good': 4490,
  'well': 4312,
  'get': 3995,
  'color': 3964,
  'need': 3866,
  'time': 3834,
  'also': 3683,
  'easy': 3679},
 'NOUN': {'i': 47074,
  'nt': 10323,
  's': 9508,
  'use': 7827,
  'printer': 7657,
  'paper': 5514,
  'ink': 4578,
  'print': 3447,
  'quality': 3267,
  'time': 3217,
  'tape': 3204,
  'need': 3079,
  'work': 2840,
  've': 2752,
  'price': 2603,
  'color': 2458,
  'm': 2418,
  'pens': 2398,
  'product': 2363,
  'printing': 2306},
 'ADJ/ADV': {'great': 4611,
  'good': 4471,
  'easy': 3679,
  'other': 3335,
  'much': 3059,
  'nice': 2642,
  'little': 2586,
  'small': 2004,
  'most': 1767,
  'black': 1626,
  'many': 1492,
  'few': 1447,
  'old': 1227,
  'same': 1210,
  'new': 1189,
  'last': 1153,
  'big': 1143,
  'scan': 1120,
  'best': 1107,
  'different': 1107

In [12]:
get_stats(file3)

Total reviews : 4589


{'TERM': {'song': 6391,
  'album': 6129,
  'nt': 2884,
  'like': 2860,
  'one': 2689,
  'music': 2326,
  'love': 1944,
  'great': 1915,
  '34': 1884,
  'sound': 1791,
  'track': 1763,
  'good': 1729,
  'band': 1594,
  'time': 1512,
  'cd': 1459,
  'really': 1278,
  'get': 1121,
  'would': 1118,
  'best': 1115,
  'first': 1114},
 'NOUN': {'i': 10537,
  's': 6410,
  'album': 5286,
  'song': 3947,
  'nt': 2884,
  'songs': 2444,
  'music': 2326,
  'love': 1882,
  'cd': 1367,
  'band': 1350,
  'sound': 1218,
  'time': 1210,
  'track': 967,
  'albums': 843,
  'rock': 841,
  'listen': 820,
  'tracks': 796,
  'm': 736,
  'lyrics': 691,
  'way': 688},
 'ADJ/ADV': {'great': 1913,
  'good': 1721,
  'best': 1114,
  'new': 907,
  'much': 894,
  'most': 882,
  'other': 847,
  'many': 571,
  'little': 506,
  'classic': 467,
  'few': 436,
  'last': 411,
  'same': 404,
  'live': 391,
  'such': 386,
  'single': 368,
  'original': 365,
  'own': 364,
  'hard': 361,
  'nice': 358}}