In [18]:
import numpy as np
import pandas as pd
import json
import gzip
import math
import nltk
import string 
import scipy
from nltk.corpus import cmudict
from nltk.corpus import stopwords
import sklearn.metrics as skmetrics
from collections import defaultdict
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from nltk.corpus import sentiwordnet as swn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR
from sklearn import ensemble
from nltk.corpus import cmudict 

%matplotlib inline 

# Utilities 

In [19]:
# Read from gzip file
def read_gzip(filename):
    """Yield one evaluated record per line of a gzip-compressed file.

    Parameters
    ----------
    filename : path of the gzip file to read.

    Yields
    ------
    The object produced by ``eval`` for each line, in file order.

    The file handle is closed when the generator is exhausted or
    garbage-collected, instead of being left open.
    """
    # NOTE(review): eval() executes whatever expression each line contains.
    # Only use this on trusted files; if the lines are plain literals,
    # ast.literal_eval would be the safer choice — confirm the data format.
    with gzip.open(filename) as handle:
        for line in handle:
            yield eval(line)

In [20]:
# Get (positive/negative) opinion words from corpus
def get_opinion_words(filename):
    """Lazily yield every raw line of a text file, line terminator included.

    The file is opened in text mode and closed once iteration finishes.
    """
    with open(filename, 'r') as lexicon_file:
        for raw_line in lexicon_file:
            yield raw_line
            
positive_words = set()
negative_words = set()

# Strip only the line terminator. Blindly dropping the last two characters is
# right for "\r\n" endings but eats a real letter when a line ends in "\n"
# alone (or has no terminator at all, e.g. the last line of the file).
for pword in get_opinion_words('positive-words.txt'):
    pword = pword.rstrip('\r\n')
    if pword:  # blank lines carry no word
        positive_words.add(pword)

for nword in get_opinion_words('negative-words.txt'):
    nword = nword.rstrip('\r\n')
    if nword:  # blank lines carry no word
        negative_words.add(nword)

In [21]:
# Compute number of syllables for a given word
# Pronunciation table loaded once at module level; nsyl() below indexes it by
# lowercase word and iterates each entry as a list of pronunciations.
d = cmudict.dict() 
def nsyl(word):
    """Return the largest syllable count among the pronunciations of ``word``.

    The word is looked up case-insensitively in the module-level table ``d``.
    A phoneme counts as a syllable when its last character is a digit.
    Words missing from the table yield 0.
    """
    key = word.lower()
    if key not in d:
        return 0
    best = 0
    for pronunciation in d[key]:
        syllables = sum(1 for phoneme in pronunciation if str(phoneme[-1]).isdigit())
        best = max(best, syllables)
    return best

# Data Pruning

- Removed data points with greater than 150 votes as the test data has very few data points in that range. 
- Prepared two separate datasets: one for highly voted reviews (more than 10 votes) and one for reviews that received few votes (the code below keeps reviews with 2 to 80 votes, so the two ranges overlap).

In [22]:
# Training reviews whose total vote count lies strictly between 10 and 150.
high_dataset = [
    line for line in read_gzip("train.json.gz")
    if 10 < (line['helpful'])['outOf'] < 150
]

In [23]:
# Training reviews with more than 1 and at most 80 total votes.
low_dataset = [
    line for line in read_gzip("train.json.gz")
    if 1 < (line['helpful'])['outOf'] <= 80
]

### Compute USER specific data

In [24]:
# Per-user accumulators, filled from every training review whose vote total
# ('outOf') is non-zero. All four structures are appended to in lockstep.
train_user_ratings_dict = defaultdict(list)          # reviewerID -> ratings
train_all_helpful_review = []                        # every kept 'helpful' record
train_user_helpful_review_dict = defaultdict(list)   # reviewerID -> 'helpful' records
train_user_review_content_dict = defaultdict(list)   # reviewerID -> review texts

for data_point in read_gzip("train.json.gz"):
    if (data_point['helpful'])['outOf'] != 0:
        train_user_ratings_dict[data_point['reviewerID']].append(data_point['rating'])
        train_all_helpful_review.append(data_point['helpful'])
        train_user_helpful_review_dict[data_point['reviewerID']].append(data_point['helpful'])
        train_user_review_content_dict[data_point['reviewerID']].append(data_point['reviewText'])

### Compute ITEM specific data

In [25]:
#  Initialize ITEM specific data structures
# itemID -> every rating given to that item (all training reviews, voted or not)
train_items_ratings_dict = defaultdict(list)
# reviewerID -> every itemID that reviewer has a training review for
train_user_purchased_items_dict = defaultdict(list)
for line in read_gzip("train.json.gz"):
    user, item = line['reviewerID'], line['itemID']
    train_user_purchased_items_dict[user].append(item)
    train_items_ratings_dict[item].append(line['rating'])

In [26]:
# Per-item mean rating and per-item review count, derived from the ratings
# collected in train_items_ratings_dict.
train_average_items_ratings_dict = {}
train_item_review_count = {}
for item in train_items_ratings_dict:
    train_average_items_ratings_dict[item] = np.mean(train_items_ratings_dict[item])
    train_item_review_count[item] = len(train_items_ratings_dict[item])
# Single-argument print(...) produces the same output under Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print("Average ratings computed for " + str(len(train_average_items_ratings_dict)) + " items")
print("Number of reviews computed for " + str(len(train_item_review_count)) + " items")

Average ratings computed for 19913 items
Number of reviews computed for 19913 items


### Compute OVERALL average helpfulness

In [27]:
# Pooled helpfulness ratio: total helpful votes divided by total votes over
# every record in train_all_helpful_review.
global_average_helpfulness = (
    sum(votes['nHelpful'] for votes in train_all_helpful_review) * 1.0
    / sum(votes['outOf'] for votes in train_all_helpful_review)
)
print("Average Helpfulness : %s" % global_average_helpfulness)

Average Helpfulness : 0.851972088653


### USER SPECIFIC FEATURES

In [28]:
def get_user_review_experience_count_feature(user_review_text_dict):
    """Return ``{user: number of review texts stored for that user}``."""
    return {
        reviewer: len(texts)
        for reviewer, texts in user_review_text_dict.items()
    }

In [29]:
def get_user_average_ratings_feature(user_ratings_dict, train_global_average_ratings):
    """Return ``{user: mean rating}`` for every user in ``user_ratings_dict``.

    Parameters
    ----------
    user_ratings_dict : mapping of user -> list of ratings.
    train_global_average_ratings : value stored for users whose rating list
        is empty.

    The averages are computed from the ``user_ratings_dict`` argument itself;
    the function no longer reaches for the module-level
    ``train_user_ratings_dict``, so it works for any mapping passed in.
    """
    user_average_ratings = {}
    for user, ratings in user_ratings_dict.items():
        total_user_ratings = len(ratings)
        if total_user_ratings > 0:
            user_average_ratings[user] = sum(ratings) * 1.0 / total_user_ratings
        else:
            user_average_ratings[user] = train_global_average_ratings
    return user_average_ratings

In [30]:
def get_user_average_helpfulness_feature(train_user_helpful_review_dict, train_global_average_helpfulness):
    """Return ``{user: helpful votes / total votes}`` pooled over the user's records.

    Each record is indexed with ``'nHelpful'`` and ``'outOf'``. Users whose
    pooled ``'outOf'`` total is not positive receive
    ``train_global_average_helpfulness`` instead.
    """
    averages = {}
    for reviewer, vote_records in train_user_helpful_review_dict.items():
        votes_cast = sum(record['outOf'] for record in vote_records)
        if votes_cast > 0:
            votes_helpful = sum(record['nHelpful'] for record in vote_records)
            averages[reviewer] = votes_helpful * 1.0 / votes_cast
        else:
            averages[reviewer] = train_global_average_helpfulness
    return averages

In [31]:
def get_user_rating_deviation_feature(user_ratings_dict, average_items_ratings_dict, user_purchased_items_dict):
    """Map each user in ``user_ratings_dict`` to a mean squared rating deviation.

    For every item listed under the user in ``user_purchased_items_dict`` that
    also has an entry in ``average_items_ratings_dict``, the item's average is
    subtracted from ``user_ratings_dict[user]`` and the difference is squared;
    ``np.mean`` over those terms is stored for the user.
    """
    user_rating_deviation = {}
    for user in user_ratings_dict:
        # NOTE(review): user_ratings_dict[user] is subtracted as a whole, not one
        # rating per item. If it is a plain list, this presumably only works when
        # the item average is a numpy scalar (so the subtraction broadcasts over
        # all of the user's ratings). The intent looks like "this user's rating
        # for this item minus the item average" — confirm against the caller.
        user_rating_deviation[user] = np.mean([(user_ratings_dict[user] - average_items_ratings_dict[item])**2 
         for item in user_purchased_items_dict[user] if item in average_items_ratings_dict])
    return user_rating_deviation

In [32]:
# FEATURE 14 : USER REVIEW EXPERIENCE
# Number of stored review texts per user, computed from train_user_review_content_dict.
train_user_review_experience = get_user_review_experience_count_feature(train_user_review_content_dict)
# Call-form print works on Python 2 and Python 3 alike (the statement form is
# a SyntaxError on Python 3).
print("Extracted user review experience for " + str(len(train_user_review_experience)) + " users")

Extracted user review experience for 29005 users


### DATASET DEPENDENT FEATURES

In [33]:
def get_average_helpfulness(dataset):
    """Return a list holding ``global_average_helpfulness`` once per row of ``dataset``."""
    return [global_average_helpfulness for _row in dataset]

In [34]:
def get_rating(dataset):
    """Return the ``'rating'`` value of every row, in dataset order."""
    return [row['rating'] for row in dataset]

In [35]:
def get_square_rating(dataset):
    """Return the squared ``'rating'`` of every row, in dataset order."""
    return [row['rating'] ** 2 for row in dataset]

In [36]:
def get_log_ratings(dataset):
    """Return ``log(max_rating + 1 - rating)`` for every row as a numpy array.

    ``max_rating`` is the largest rating found in ``dataset`` itself, so the
    top-rated rows map to ``log(1) == 0``.
    """
    ratings = np.array([row['rating'] for row in dataset])
    highest = ratings.max()
    return np.log(highest + 1 - ratings)

In [37]:
def get_helpfulness_votes(dataset):
    """Return ``log(outOf + 1)`` for every row, where ``outOf`` is read from ``row['helpful']``."""
    return [np.log(row['helpful']['outOf'] + 1) for row in dataset]

In [38]:
def get_review_word_count(dataset):
    """Return ``log(word count + 1)`` of each row's ``'reviewText'``.

    Words are the whitespace-separated tokens of the lowercased text.
    """
    log_counts = []
    for row in dataset:
        tokens = row['reviewText'].lower().split()
        log_counts.append(np.log(len(tokens) + 1))
    return log_counts

In [39]:
def get_sentence_count(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` is the number of pieces
    obtained by splitting the lowercased ``'reviewText'`` on ``'.'``."""
    log_counts = []
    for row in dataset:
        pieces = row['reviewText'].lower().split('.')
        log_counts.append(np.log(len(pieces) + 1))
    return log_counts

In [40]:
def get_review_allcaps_count(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` counts the whitespace-separated
    tokens of ``'reviewText'`` for which ``str.isupper()`` is true."""
    log_counts = []
    for row in dataset:
        shouted = sum(1 for token in row['reviewText'].split() if token.isupper())
        log_counts.append(np.log(shouted + 1))
    return log_counts

In [41]:
def get_review_char_count(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` is the total length of the
    whitespace-separated tokens of the lowercased ``'reviewText'``."""
    log_counts = []
    for row in dataset:
        tokens = row['reviewText'].lower().split()
        letters = sum(len(token) for token in tokens)
        log_counts.append(np.log(letters + 1))
    return log_counts

In [42]:
def get_review_specialchar_count(dataset):
    """Return, per row, how many tokens of ``'reviewText'`` contain ``!`` or ``?``.

    Unlike most sibling features this count is returned raw, without a log.
    """
    counts = []
    for row in dataset:
        tokens = row['reviewText'].lower().split()
        counts.append(sum(1 for token in tokens if "!" in token or "?" in token))
    return counts

In [43]:
def get_item_rating_deviation(dataset):
    """Return ``|rating - item average|`` for every row.

    The item average is read from the module-level
    ``train_average_items_ratings_dict`` by direct indexing, so a row whose
    ``'itemID'`` is absent from that mapping raises ``KeyError``.
    """
    deviations = []
    for row in dataset:
        item_average = train_average_items_ratings_dict[row['itemID']]
        deviations.append(np.abs(row['rating'] - item_average))
    return deviations

In [44]:
def get_flesch_reading_ease_score(dataset):
    """Return a Flesch-style reading-ease score for each row's ``'reviewText'``.

    score = 206.835 - 1.015 * words / (1 + sentences)
                    - 84.6  * syllables / (1 + words)

    Words are whitespace tokens of the lowercased text, sentences are the
    pieces of a split on ``'.'``, and syllables come from ``nsyl`` per word.
    """
    scores = []
    for row in dataset:
        text = row['reviewText'].lower()
        tokens = text.split()
        total_words = len(tokens)
        total_sent = len(text.split('.'))
        total_syllable = sum(nsyl(token) for token in tokens)
        words_per_sentence = total_words * 1.0 / (1 + total_sent)
        syllables_per_word = total_syllable * 1.0 / (1 + total_words)
        scores.append(206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word))
    return scores

In [45]:
def get_readability_index(dataset):
    """Return a readability index for each row's ``'reviewText'``.

    index = 4.71 * chars / (1 + words) + 0.5 * words / (1 + sentences) - 21.43

    Characters and words come from the whitespace tokens of the lowercased
    text; sentences are the pieces of a split on ``'.'``.
    """
    indices = []
    for row in dataset:
        text = row['reviewText'].lower()
        tokens = text.split()
        total_char = sum(len(token) for token in tokens)
        total_words = len(tokens)
        total_sent = len(text.split('.'))
        chars_per_word = total_char * 1.0 / (1 + total_words)
        words_per_sentence = total_words * 1.0 / (1 + total_sent)
        indices.append((4.71 * chars_per_word) + (0.5 * words_per_sentence) - 21.43)
    return indices

In [46]:
def get_summary_word_count(dataset):
    """Return the raw number of whitespace-separated tokens in each row's ``'summary'``."""
    return [len(row['summary'].lower().split()) for row in dataset]

In [47]:
def get_summary_char_count(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` is the total length of the
    whitespace-separated tokens of the lowercased ``'summary'``."""
    log_counts = []
    for row in dataset:
        tokens = row['summary'].lower().split()
        letters = sum(len(token) for token in tokens)
        log_counts.append(np.log(letters + 1))
    return log_counts

In [48]:
def get_summary_allcaps_count(dataset):
    """Return, per row, the raw number of ``'summary'`` tokens for which
    ``str.isupper()`` is true."""
    counts = []
    for row in dataset:
        shouted = [token for token in row['summary'].split() if token.isupper()]
        counts.append(len(shouted))
    return counts

In [49]:
def get_summary_specialchar_count(dataset):
    """Return, per row, how many tokens of ``'summary'`` contain ``!`` or ``?``."""
    counts = []
    for row in dataset:
        tokens = row['summary'].lower().split()
        counts.append(sum(1 for token in tokens if "!" in token or "?" in token))
    return counts

In [50]:
def get_review_sentiment_score(dataset):
    """Sum SentiWordNet scores over the words of each row's ``'reviewText'``.

    For every whitespace token of the lowercased text, the positive, negative
    and objective scores of all synsets returned by ``swn.senti_synsets`` are
    added up.

    Returns
    -------
    (pos, neg, obj) : three lists aligned with ``dataset``, one total per row.

    The synsets of a word are looked up once and reused for all three scores,
    instead of querying the corpus three times per word.
    """
    data_review_pos_sentiment = []
    data_review_neg_sentiment = []
    data_review_obj_sentiment = []

    for data_point in dataset:
        pos_total = 0
        neg_total = 0
        obj_total = 0
        for word in data_point['reviewText'].lower().split():
            # Materialise the result so it can be traversed three times even
            # if senti_synsets hands back a one-shot iterator.
            synsets = list(swn.senti_synsets(word))
            pos_total += sum([synset.pos_score() for synset in synsets])
            neg_total += sum([synset.neg_score() for synset in synsets])
            obj_total += sum([synset.obj_score() for synset in synsets])
        data_review_pos_sentiment.append(pos_total)
        data_review_neg_sentiment.append(neg_total)
        data_review_obj_sentiment.append(obj_total)
    return data_review_pos_sentiment, data_review_neg_sentiment, data_review_obj_sentiment

In [51]:
def get_summary_sentiment_score(dataset):
    """Sum SentiWordNet scores over the words of each row's ``'summary'``.

    For every whitespace token of the lowercased summary, the positive,
    negative and objective scores of all synsets returned by
    ``swn.senti_synsets`` are added up.

    Returns
    -------
    (pos, neg, obj) : three lists aligned with ``dataset``, one total per row.

    The synsets of a word are looked up once and reused for all three scores,
    instead of querying the corpus three times per word.
    """
    data_summary_pos_sentiment = []
    data_summary_neg_sentiment = []
    data_summary_obj_sentiment = []
    for data_point in dataset:
        pos_total = 0
        neg_total = 0
        obj_total = 0
        for word in data_point['summary'].lower().split():
            # Materialise the result so it can be traversed three times even
            # if senti_synsets hands back a one-shot iterator.
            synsets = list(swn.senti_synsets(word))
            pos_total += sum([synset.pos_score() for synset in synsets])
            neg_total += sum([synset.neg_score() for synset in synsets])
            obj_total += sum([synset.obj_score() for synset in synsets])
        data_summary_pos_sentiment.append(pos_total)
        data_summary_neg_sentiment.append(neg_total)
        data_summary_obj_sentiment.append(obj_total)

    return data_summary_pos_sentiment, data_summary_neg_sentiment, data_summary_obj_sentiment

In [52]:
def get_user_review_experience(dataset):
    """Return ``log(review count + 1)`` for each row's reviewer.

    Counts come from the module-level ``train_user_review_experience``;
    reviewers missing from it get a plain ``0``.
    """
    experience = []
    for row in dataset:
        reviewer = row['reviewerID']
        if reviewer not in train_user_review_experience:
            experience.append(0)
            continue
        experience.append(np.log(train_user_review_experience[reviewer] + 1))
    return experience

In [53]:
def get_price(dataset):
    """Return the cube root of each row's price, imputing missing prices.

    A price is usable when the row has a ``'price'`` key whose value is not
    NaN. Rows without a usable price receive the cube root of the mean of all
    usable prices in ``dataset``.

    Compared with the previous version:
    * the unused maximum-price computation is gone, so a dataset without any
      usable price no longer raises (every row is NaN in that case);
    * ``dataset`` is traversed once, so a generator works as well as a list.
    """
    # One entry per row: the usable price, or None when it has to be imputed.
    row_prices = []
    for data_point in dataset:
        if ('price' in data_point) and (not math.isnan(data_point['price'])):
            row_prices.append(data_point['price'])
        else:
            row_prices.append(None)

    known_prices = [price for price in row_prices if price is not None]
    if known_prices:
        global_average_price = np.mean(np.array(known_prices))
    else:
        # Nothing to average: keep NaN as the "unknown" marker without
        # triggering numpy's empty-mean warning.
        global_average_price = float('nan')

    return [
        np.cbrt(global_average_price if price is None else price)
        for price in row_prices
    ]

In [54]:
def get_category_id(dataset):
    """One-hot encode each row's ``'categoryID'`` into five indicator columns.

    Returns
    -------
    (category_0, ..., category_4) : five lists aligned with ``dataset``;
    column ``k`` holds 1 where ``categoryID == k`` and 0 elsewhere.

    Every row contributes exactly one entry to every column. A row whose
    ``categoryID`` is outside 0..4 becomes an all-zero row rather than being
    skipped, which would otherwise shift all later rows out of alignment with
    the rest of the feature matrix.
    """
    columns = [[] for _ in range(5)]
    for data_point in dataset:
        cat_id = data_point['categoryID']
        for index, column in enumerate(columns):
            column.append(1 if cat_id == index else 0)
    category_0, category_1, category_2, category_3, category_4 = columns
    return category_0, category_1, category_2, category_3, category_4

In [55]:
def get_tag_category(dataset):
    """Flag rows whose ``'categories'`` mention men / women / boys / girls.

    ``'categories'`` is walked as a sequence of sequences of strings; a flag is
    1 when any entry, lowercased, equals the tag exactly, else 0.

    Returns ``(man, woman, boy, girl)``, four lists aligned with ``dataset``.
    """
    tags = ('men', 'women', 'boys', 'girls')
    man, woman, boy, girl = [], [], [], []
    columns = (man, woman, boy, girl)
    for data_point in dataset:
        labels = [elem.lower() for cat in data_point['categories'] for elem in cat]
        for tag, column in zip(tags, columns):
            column.append(1 if tag in labels else 0)
    return man, woman, boy, girl

In [56]:
def get_adult_category(dataset):
    """Flag rows as adult (``men``/``women``) and/or kid (``boys``/``girls``).

    A flag is 1 when any entry of the row's nested ``'categories'``,
    lowercased, equals one of the group's tags, else 0.

    Returns ``(adult, kid)``, two lists aligned with ``dataset``.
    """
    adult, kid = [], []
    for data_point in dataset:
        labels = [elem.lower() for cat in data_point['categories'] for elem in cat]
        is_adult = 'men' in labels or 'women' in labels
        is_kid = 'boys' in labels or 'girls' in labels
        adult.append(1 if is_adult else 0)
        kid.append(1 if is_kid else 0)
    return adult, kid

In [57]:
def get_sex_category(dataset):
    """Flag rows as male (``men``/``boys``) and/or female (``women``/``girls``).

    A flag is 1 when any entry of the row's nested ``'categories'``,
    lowercased, equals one of the group's tags, else 0.

    Returns ``(male, female)``, two lists aligned with ``dataset``.
    """
    male, female = [], []
    for data_point in dataset:
        labels = [elem.lower() for cat in data_point['categories'] for elem in cat]
        is_male = 'men' in labels or 'boys' in labels
        is_female = 'women' in labels or 'girls' in labels
        male.append(1 if is_male else 0)
        female.append(1 if is_female else 0)
    return male, female

In [58]:
def get_review_year(dataset):
    """Return the review year minus 2002 for every row.

    The year is the integer after the last comma of ``'reviewTime'``.
    """
    offsets = []
    for row in dataset:
        year_text = row['reviewTime'].split(',')[-1].strip()
        offsets.append(int(year_text) - 2002)
    return offsets

In [59]:
def get_user_average_ratings(dataset):
    """Return each row's reviewer average rating.

    Values are read from the module-level ``train_user_ratings``; reviewers
    missing from it receive ``train_global_average_ratings``.
    """
    averages = []
    for row in dataset:
        reviewer = row['reviewerID']
        if reviewer not in train_user_ratings:
            averages.append(train_global_average_ratings)
            continue
        averages.append(train_user_ratings[reviewer])
    return averages

In [60]:
def get_user_average_rating_deviation(dataset):
    """Return each row's reviewer rating-deviation value.

    Values are read from the module-level ``train_user_mean_rating_deviation``;
    for reviewers missing from it the signed difference
    ``train_global_average_ratings - rating`` of the row is used instead.
    """
    deviations = []
    for row in dataset:
        reviewer = row['reviewerID']
        if reviewer not in train_user_mean_rating_deviation:
            deviations.append(train_global_average_ratings - row['rating'])
            continue
        deviations.append(train_user_mean_rating_deviation[reviewer])
    return deviations

In [61]:
def get_review_stopwords(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` counts the tokens of the
    lowercased ``'reviewText'`` found in ``stopwords.words('english')``."""
    english_stop_words = set(stopwords.words('english'))
    log_counts = []
    for row in dataset:
        tokens = row['reviewText'].lower().split()
        hits = sum(1 for token in tokens if token in english_stop_words)
        log_counts.append(np.log(hits + 1))
    return log_counts

In [62]:
def get_review_non_stopwords(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` counts the tokens of the
    lowercased ``'reviewText'`` NOT found in ``stopwords.words('english')``."""
    english_stop_words = set(stopwords.words('english'))
    log_counts = []
    for row in dataset:
        tokens = row['reviewText'].lower().split()
        misses = sum(1 for token in tokens if token not in english_stop_words)
        log_counts.append(np.log(misses + 1))
    return log_counts

In [63]:
def get_review_positive_words(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` counts the tokens of the
    lowercased ``'reviewText'`` contained in the module-level ``positive_words``."""
    log_counts = []
    for row in dataset:
        tokens = row['reviewText'].lower().split()
        hits = sum(1 for token in tokens if token in positive_words)
        log_counts.append(np.log(hits + 1))
    return log_counts

In [64]:
def get_review_negative_words(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` counts the tokens of the
    lowercased ``'reviewText'`` contained in the module-level ``negative_words``."""
    log_counts = []
    for row in dataset:
        tokens = row['reviewText'].lower().split()
        hits = sum(1 for token in tokens if token in negative_words)
        log_counts.append(np.log(hits + 1))
    return log_counts

In [65]:
def get_review_posneg_diff_words(dataset):
    """Return ``log(|negatives - positives| + 1)`` per row.

    Both counts are taken over the tokens of the lowercased ``'reviewText'``
    using the module-level ``negative_words`` and ``positive_words`` sets.
    """
    log_gaps = []
    for row in dataset:
        tokens = row['reviewText'].lower().split()
        negatives = sum(1 for token in tokens if token in negative_words)
        positives = sum(1 for token in tokens if token in positive_words)
        log_gaps.append(np.log(abs(negatives - positives) + 1))
    return log_gaps

In [66]:
def get_summary_positive_words(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` counts the tokens of the
    lowercased ``'summary'`` contained in the module-level ``positive_words``."""
    log_counts = []
    for row in dataset:
        tokens = row['summary'].lower().split()
        hits = sum(1 for token in tokens if token in positive_words)
        log_counts.append(np.log(hits + 1))
    return log_counts

In [67]:
def get_summary_negative_words(dataset):
    """Return ``log(n + 1)`` per row, where ``n`` counts the tokens of the
    lowercased ``'summary'`` contained in the module-level ``negative_words``."""
    log_counts = []
    for row in dataset:
        tokens = row['summary'].lower().split()
        hits = sum(1 for token in tokens if token in negative_words)
        log_counts.append(np.log(hits + 1))
    return log_counts

In [68]:
def get_summary_posneg_diff_words(dataset):
    """Return ``log(|negatives - positives| + 1)`` per row.

    Both counts are taken over the tokens of the lowercased ``'summary'``
    using the module-level ``negative_words`` and ``positive_words`` sets.
    """
    log_gaps = []
    for row in dataset:
        tokens = row['summary'].lower().split()
        negatives = sum(1 for token in tokens if token in negative_words)
        positives = sum(1 for token in tokens if token in positive_words)
        log_gaps.append(np.log(abs(negatives - positives) + 1))
    return log_gaps

In [69]:
def get_review_season(dataset):
    """One-hot encode the review month into four three-month seasons.

    The month is the first whitespace token before the first comma of
    ``'reviewTime'``. Months 11, 12 and 1 (and any value above 10) fall in the
    first column, 2-4 in the second, 5-7 in the third and 8-10 in the fourth.
    A month below 1 matches no season and contributes no row.

    Returns ``(nov_dec_jan, feb_mar_apr, may_jun_jul, aug_sep_oct)``.
    """
    nov_dec_jan, feb_mar_apr, may_jun_jul, aug_sep_oct = [], [], [], []
    seasons = (nov_dec_jan, feb_mar_apr, may_jun_jul, aug_sep_oct)

    for data_point in dataset:
        date_part = data_point['reviewTime'].split(',')[0]
        month = int(date_part.split()[0])
        if month == 1 or month > 10:
            active = 0
        elif 2 <= month <= 4:
            active = 1
        elif 5 <= month <= 7:
            active = 2
        elif 8 <= month <= 10:
            active = 3
        else:
            continue
        for position, column in enumerate(seasons):
            column.append(1 if position == active else 0)

    return nov_dec_jan, feb_mar_apr, may_jun_jul, aug_sep_oct

In [70]:
def get_review_month(dataset):
    """One-hot encode the review month as twelve columns.

    Returns a list of 12 lists, each ``len(dataset)`` long; list ``m - 1``
    holds 1 at the positions of rows reviewed in month ``m``. The month is the
    first whitespace token before the first comma of ``'reviewTime'``.
    ``dataset`` must support ``len()``.
    """
    row_count = len(dataset)
    review_months = [[0] * row_count for _ in range(12)]
    for position, row in enumerate(dataset):
        month_text = row['reviewTime'].split(',')[0].split()[0]
        review_months[int(month_text) - 1][position] = 1
    return review_months

In [71]:
def get_item_review_count(dataset):
    """Return, per row, the review count stored for its ``'itemID'`` in the
    module-level ``train_item_review_count``, or 0 when the item is absent."""
    counts = []
    for row in dataset:
        item_id = row['itemID']
        if item_id not in train_item_review_count:
            counts.append(0)
            continue
        counts.append(train_item_review_count[item_id])
    return counts

In [72]:
def get_rating_category(dataset):
    """One-hot encode each rating as bad (< 3), ok (3 <= r < 5) or good (== 5).

    A rating matching none of the three bands (e.g. above 5) contributes no
    row to any column.

    Returns ``(bad, ok, good)``.
    """
    bad, ok, good = [], [], []
    for data_point in dataset:
        r = data_point['rating']
        if r < 3.0:
            flags = (1, 0, 0)
        elif 3.0 <= r < 5.0:
            flags = (0, 1, 0)
        elif r == 5.0:
            flags = (0, 0, 1)
        else:
            continue
        bad.append(flags[0])
        ok.append(flags[1])
        good.append(flags[2])
    return bad, ok, good

In [73]:
def get_outof_group(dataset):
    """Bucket each row's total vote count (``helpful['outOf']``) into four bands.

    Bands are ``< 10``, ``< 50``, ``< 140`` and ``< 600``; the matching column
    receives the band's marker (1, 2, 3 or 4 respectively) and the other
    columns 0. A count of 600 or more matches no band and contributes no row.

    Returns ``(outof_low, outof_mid1, outof_mid2, outof_high)``.
    """
    outof_low, outof_mid1, outof_mid2, outof_high = [], [], [], []
    groups = (outof_low, outof_mid1, outof_mid2, outof_high)
    # (exclusive upper bound, marker written into the matching column)
    bands = ((10, 1), (50, 2), (140, 3), (600, 4))
    for data_point in dataset:
        out_of = (data_point['helpful'])['outOf']
        for slot, (upper_bound, marker) in enumerate(bands):
            if out_of < upper_bound:
                for position, column in enumerate(groups):
                    column.append(marker if position == slot else 0)
                break

    return outof_low, outof_mid1, outof_mid2, outof_high

In [74]:
def get_unixtime(dataset):
    """Return the raw ``'unixReviewTime'`` of every row, in dataset order.

    An empty dataset yields an empty list.
    """
    # NOTE(review): the previous version also built log(max_time - t + 1) per
    # row but never returned it; that dead computation (and the max()/min()
    # calls that raised on an empty dataset) has been removed. If the log-age
    # transform was the intended feature, the return value needs to change —
    # confirm with the model that consumes this column.
    return [data_point['unixReviewTime'] for data_point in dataset]

In [75]:
def get_price_range(dataset):
    """One-hot encode each row's price into four bands.

    Bands: cheap (<= 10), low (<= 50), mid (<= 150), high (> 150).

    Returns
    -------
    (cheap, low, mid, high) : four lists aligned with ``dataset``.

    Rows without a ``'price'`` key, or whose price is NaN, get an all-zero
    row. A NaN price compares false against every band, so without the
    explicit check such a row would append nothing and push all later rows
    out of alignment.
    """
    cheap = []
    low = []
    mid = []
    high = []
    for data_point in dataset:
        flags = [0, 0, 0, 0]
        if 'price' in data_point and not math.isnan(data_point['price']):
            price = data_point['price']
            if price <= 10.:
                band = 0
            elif price <= 50.:
                band = 1
            elif price <= 150.:
                band = 2
            else:
                band = 3
            flags[band] = 1
        cheap.append(flags[0])
        low.append(flags[1])
        mid.append(flags[2])
        high.append(flags[3])
    return cheap,low,mid,high

# Helpfulness Prediction Features

### Prepare Training Feature Set for Prediction

In [82]:
def get_features(dataset):
    """Assemble the feature matrix used by the helpfulness predictors.

    Every extractor below is called once on ``dataset``, in a fixed order,
    and a progress line is printed after each call (for all but the audience
    categories, followed by the shape of the extracted column as seen by
    ``np.matrix``).

    Parameters
    ----------
    dataset : sequence of review records
        Passed unchanged to every extractor; ``len(dataset)`` sizes the
        constant bias column.

    Returns
    -------
    numpy.ndarray
        ``np.stack(columns, axis=1)`` over the 39 selected columns.
    """

    def _report(message, column):
        # Progress line: message immediately followed by the column's shape.
        print(message + str(np.matrix(column).shape))

    ratings = get_rating(dataset)
    _report("Ratings extracted..", ratings)

    price = get_price(dataset)
    _report("Price feature extracted.. ", price)

    square_ratings = get_square_rating(dataset)
    _report("Squared Ratings extracted.. ", square_ratings)

    total_helpfulness_votes = get_helpfulness_votes(dataset)
    _report("Helpfulness extracted..", total_helpfulness_votes)

    review_word_count = get_review_word_count(dataset)
    _report("Review word count extracted..", review_word_count)

    review_sentence_count = get_sentence_count(dataset)
    _report("Review Sentence count extracted..", review_sentence_count)

    review_word_allcaps_count = get_review_allcaps_count(dataset)
    _report("Review word all caps extracted..", review_word_allcaps_count)

    review_char_count = get_review_char_count(dataset)
    _report("Review character count extracted..", review_char_count)

    item_rating_deviation = get_item_rating_deviation(dataset)
    _report("Item rating deviation extracted..", item_rating_deviation)

    summary_word_count = get_summary_word_count(dataset)
    _report("Summary word count extracted..", summary_word_count)

    summary_word_allcaps_count = get_summary_allcaps_count(dataset)
    _report("Summary word all caps extracted..", summary_word_allcaps_count)

    user_review_experience = get_user_review_experience(dataset)
    _report("User review experience extracted..", user_review_experience)

    category_0, category_1, category_2, category_3, category_4 = get_category_id(dataset)
    _report("One hot encoding of category ID complete..", category_0)

    review_readability_index = get_readability_index(dataset)
    _report("Review readability score extracted..", review_readability_index)

    review_posneg_diff = get_review_posneg_diff_words(dataset)
    _report("Review positive-negative difference extracted..", review_posneg_diff)

    review_stopwords = get_review_stopwords(dataset)
    _report("Review stop words extracted.. ", review_stopwords)

    summary_neg_words = get_summary_negative_words(dataset)
    _report("Summary negative words extracted..", summary_neg_words)

    summary_specialchar_count = get_summary_specialchar_count(dataset)
    _report("Summary special character count extracted..", summary_specialchar_count)

    man_cat, woman_cat, boy_cat, girl_cat = get_tag_category(dataset)
    print("Extracted audience categories..")

    # The next four extractions are still run and logged, but their results
    # are not part of the returned matrix.
    summary_pos_words = get_summary_positive_words(dataset)
    _report("Summary positive words extracted..", summary_pos_words)

    summary_posneg_words = get_summary_posneg_diff_words(dataset)
    _report("Summary posneg difference extracted.. ", summary_posneg_words)

    rating_bad, rating_ok, rating_good = get_rating_category(dataset)
    _report("Extracted rating category..", rating_bad)

    review_nonstopwords = get_review_non_stopwords(dataset)
    _report("Review non-stop words extracted..", review_nonstopwords)

    review_year = get_review_year(dataset)
    _report("Review years extracted..", review_year)

    outOf_low, outOf_mid1, outOf_mid2, outOf_high = get_outof_group(dataset)
    _report("Extracted one-hot encoded outOf categories..", outOf_low)

    unixTime = get_unixtime(dataset)
    _report("Extracted unix time of review..", unixTime)

    price_cheap, price_low, price_mid, price_high = get_price_range(dataset)
    _report("Extracted one-hot encoded price ranges..", price_cheap)

    # Experimental features that are currently switched off (neither extracted
    # nor stacked): item review count, adult/kid and male/female categories,
    # review positive/negative word counts, average helpfulness, user average
    # rating and its deviation, log ratings, review special-character count,
    # Flesch reading-ease score, summary character count, review and summary
    # sentiment scores, review month.

    # Column order defines the layout of the returned matrix.
    feature_columns = [
        np.ones(len(dataset)),  # constant bias column
        ratings,
        price,
        square_ratings,
        total_helpfulness_votes,
        review_word_count,
        review_sentence_count,
        review_word_allcaps_count,
        review_char_count,
        item_rating_deviation,
        summary_word_count,
        summary_word_allcaps_count,
        user_review_experience,
        category_0,
        category_1,
        category_2,
        category_3,
        category_4,
        review_readability_index,
        review_posneg_diff,
        review_stopwords,
        summary_neg_words,
        summary_specialchar_count,
        man_cat,
        woman_cat,
        boy_cat,
        girl_cat,
        outOf_low,
        outOf_mid1,
        outOf_mid2,
        outOf_high,
        rating_bad,
        rating_ok,
        rating_good,
        unixTime,
        price_cheap,
        price_low,
        price_mid,
        price_high,
    ]

    return np.stack(feature_columns, axis=1)

In [83]:
# Build the feature matrix for the high-vote training reviews (prints extraction progress).
train_high_dataset = get_features(high_dataset)

Ratings extracted..(1L, 4621L)
Price feature extracted.. (1L, 4621L)
Squared Ratings extracted.. (1L, 4621L)
Helpfulness extracted..(1L, 4621L)
Review word count extracted..(1L, 4621L)
Review Sentence count extracted..(1L, 4621L)
Review word all caps extracted..(1L, 4621L)
Review character count extracted..(1L, 4621L)
Item rating deviation extracted..(1L, 4621L)
Summary word count extracted..(1L, 4621L)
Summary word all caps extracted..(1L, 4621L)
User review experience extracted..(1L, 4621L)
One hot encoding of category ID complete..(1L, 4621L)
Review readability score extracted..(1L, 4621L)
Review positive-negative difference extracted..(1L, 4621L)
Review stop words extracted.. (1L, 4621L)
Summary negative words extracted..(1L, 4621L)
Summary special character count extracted..(1L, 4621L)
Extracted audience categories..
Summary positive words extracted..(1L, 4621L)
Summary posneg difference extracted.. (1L, 4621L)
Extracted rating category..(1L, 4621L)
Review non-stop words extracted

In [84]:
# Build the feature matrix for the low-vote training reviews (prints extraction progress).
train_low_dataset = get_features(low_dataset)

Ratings extracted..(1L, 34300L)
Price feature extracted.. (1L, 34300L)
Squared Ratings extracted.. (1L, 34300L)
Helpfulness extracted..(1L, 34300L)
Review word count extracted..(1L, 34300L)
Review Sentence count extracted..(1L, 34300L)
Review word all caps extracted..(1L, 34300L)
Review character count extracted..(1L, 34300L)
Item rating deviation extracted..(1L, 34300L)
Summary word count extracted..(1L, 34300L)
Summary word all caps extracted..(1L, 34300L)
User review experience extracted..(1L, 34300L)
One hot encoding of category ID complete..(1L, 34300L)
Review readability score extracted..(1L, 34300L)
Review positive-negative difference extracted..(1L, 34300L)
Review stop words extracted.. (1L, 34300L)
Summary negative words extracted..(1L, 34300L)
Summary special character count extracted..(1L, 34300L)
Extracted audience categories..
Summary positive words extracted..(1L, 34300L)
Summary posneg difference extracted.. (1L, 34300L)
Extracted rating category..(1L, 34300L)
Review non

In [88]:
# Regression target for the high-vote model: nHelpful / outOf for each review.
# Every review in high_dataset gets a label so that label i corresponds to
# row i of train_high_dataset (get_features(high_dataset) keeps every review).
# The former `outOf > 10` re-check was redundant with the filter used to build
# high_dataset; had it ever dropped a review, labels and feature rows would
# have silently gone out of step.
train_high_helpfulness = []
for data_point in high_dataset:
    data_helpfulness = data_point['helpful']
    train_high_helpfulness.append(data_helpfulness['nHelpful'] * 1.0/data_helpfulness['outOf'])

# Column vector: one row per review.
train_high_helpfulness = np.matrix(train_high_helpfulness).T
assert len(train_high_helpfulness) == len(train_high_dataset), \
    "helpfulness labels and feature rows are misaligned"
print("Extracted helpfulness score for " + str(len(train_high_helpfulness)) + " data points")

Extracted helpfulness score for 4621 data points


In [89]:
# Regression target for the low-vote model: nHelpful / outOf for each review,
# stored as a column vector (one row per review in low_dataset).
train_low_helpfulness = np.matrix([
    review['helpful']['nHelpful'] * 1.0/review['helpful']['outOf']
    for review in low_dataset
]).T
print("Extracted helpfulness score for " + str(len(train_low_helpfulness)) + " data points")

Extracted helpfulness score for 34300 data points


### Split Train and Validation

In [90]:
# Hold-out split for the high-vote data: first half trains, second half
# validates. Rows are taken in their existing order (no shuffling).
high_x_cut = int(0.5*len(train_high_dataset))
high_y_cut = int(0.5*len(train_high_helpfulness))

train_high_x, valid_high_x = train_high_dataset[:high_x_cut], train_high_dataset[high_x_cut:]
train_high_y, valid_high_y = train_high_helpfulness[:high_y_cut], train_high_helpfulness[high_y_cut:]

for high_part in (train_high_x, valid_high_x, train_high_y, valid_high_y):
    print(high_part.shape)

(2310L, 39L)
(2311L, 39L)
(2310L, 1L)
(2311L, 1L)


In [91]:
# Hold-out split for the low-vote data: first half trains, second half
# validates. Rows are taken in their existing order (no shuffling).
low_x_cut = int(0.5*len(train_low_dataset))
low_y_cut = int(0.5*len(train_low_helpfulness))

train_low_x, valid_low_x = train_low_dataset[:low_x_cut], train_low_dataset[low_x_cut:]
train_low_y, valid_low_y = train_low_helpfulness[:low_y_cut], train_low_helpfulness[low_y_cut:]

for low_part in (train_low_x, valid_low_x, train_low_y, valid_low_y):
    print(low_part.shape)

(17150L, 39L)
(17150L, 39L)
(17150L, 1L)
(17150L, 1L)


### Prediction Models

In [95]:
# ElasticNet regressor for the high-vote reviews: fit on the training half,
# then predict the validation half.
# (The import stays in this cell because a later cell uses the bare name ElasticNet.)
from sklearn.linear_model import ElasticNet
predictor_high = ElasticNet(alpha=0.09, l1_ratio=0.005).fit(train_high_x, train_high_y)
predict_high_y = predictor_high.predict(valid_high_x)

In [94]:
# Gradient-boosted regressor for the low-vote reviews: fit on the training
# half, then predict the validation half.
params = dict(n_estimators=220, max_depth=4, min_samples_split=2, loss='ls')
predictor_low = ensemble.GradientBoostingRegressor(**params).fit(train_low_x, train_low_y)
predict_low_y = predictor_low.predict(valid_low_x)

In [505]:
# # # Linear Regression Model
# predictor = linear_model.LinearRegression()
# predictor_low.fit((train_low_x), (train_low_y))
# predict_low_y = predictor_low.predict((valid_low_x))

### Model Evaluation

In [96]:
# Mean Absolute Error of the high-vote model on its validation half
mae_high = skmetrics.mean_absolute_error(y_true=valid_high_y, y_pred=predict_high_y)
print("Mean Absolute Error of Predictor : %s" % mae_high)

Mean Absolute Error of Predictor : 0.0832317539593


In [97]:
# Mean Absolute Error of the low-vote model on its validation half
mae_low = skmetrics.mean_absolute_error(y_true=valid_low_y, y_pred=predict_low_y)
print("Mean Absolute Error of Predictor : %s" % mae_low)

Mean Absolute Error of Predictor : 0.189705696104


# Prepare Complete Dataset for Test prediction

In [99]:
# Refit the high-vote ElasticNet (same hyper-parameters as the validation run)
# on every high-vote training row; this is the model used for test predictions.
final_high_params = {'alpha': 0.09, 'l1_ratio': 0.005}
predictor = ElasticNet(**final_high_params)
predictor.fit(train_high_dataset, train_high_helpfulness)

ElasticNet(alpha=0.09, copy_X=True, fit_intercept=True, l1_ratio=0.005,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [100]:
# Refit the low-vote gradient-boosting model on every low-vote training row;
# this rebinds predictor_low to the model used for test predictions.
# NOTE(review): this uses n_estimators=150 while the validated model used 220 -
# confirm which setting is intended, since the reported MAE is for 220.
params = dict(n_estimators=150, max_depth=4, min_samples_split=2, loss='ls')
predictor_low = ensemble.GradientBoostingRegressor(**params)
predictor_low.fit(train_low_dataset, train_low_helpfulness)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=150,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [101]:
def test_predict(m_predictor, x_test):
    return m_predictor.predict(np.matrix(x_test))

In [102]:
# Load every record of the test file into memory, in file order.
test_dataset = list(read_gzip("test_Helpful.json.gz"))

In [103]:
# Same feature pipeline as for training, applied to the test records.
test_feature_set = get_features(test_dataset)
print(test_feature_set.shape)

Ratings extracted..(1L, 14000L)
Price feature extracted.. (1L, 14000L)
Squared Ratings extracted.. (1L, 14000L)
Helpfulness extracted..(1L, 14000L)
Review word count extracted..(1L, 14000L)
Review Sentence count extracted..(1L, 14000L)
Review word all caps extracted..(1L, 14000L)
Review character count extracted..(1L, 14000L)
Item rating deviation extracted..(1L, 14000L)
Summary word count extracted..(1L, 14000L)
Summary word all caps extracted..(1L, 14000L)
User review experience extracted..(1L, 14000L)
One hot encoding of category ID complete..(1L, 14000L)
Review readability score extracted..(1L, 14000L)
Review positive-negative difference extracted..(1L, 14000L)
Review stop words extracted.. (1L, 14000L)
Summary negative words extracted..(1L, 14000L)
Summary special character count extracted..(1L, 14000L)
Extracted audience categories..
Summary positive words extracted..(1L, 14000L)
Summary posneg difference extracted.. (1L, 14000L)
Extracted rating category..(1L, 14000L)
Review non

### Prepare results file for KAGGLE UPLOAD

In [105]:
# Write the submission file: the header line is copied through, and every
# "user-item-outOf" pair becomes "user-item-outOf,predicted_nHelpful".
# Row idx of test_feature_set is assumed to correspond to the idx-th
# non-header line of pairs_Helpful.txt - TODO confirm both files share the
# same ordering.
# Both files are opened in a `with` block so they are closed even if a
# prediction raises part-way through.
with open("pairs_Helpful.txt") as pairs, open("predictions_Helpful.txt", 'w') as predictions:
    idx = 0
    for l in pairs:
        if l.startswith("userID"):
            #header
            predictions.write(l)
            continue
        u,i,outOf = l.strip().split('-')
        outOf = int(outOf)
        # Reviews with more than 10 votes use the high-vote model, the rest the low-vote model.
        model = predictor if outOf > 10 else predictor_low
        # predict() returns a one-element array; take the scalar explicitly
        # instead of relying on round() coercing a size-1 array.
        ratio = float(np.ravel(test_predict(model, test_feature_set[idx]))[0])
        pred = int(round(outOf * ratio))
        # A helpful-vote count can never be negative or exceed the total votes;
        # clamp regression outputs that fall outside that range.
        pred = max(0, min(outOf, pred))
        predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(pred) + '\n')
        idx += 1

=====================================================================================================================