## Nirmal Budhathoki- DSE220 Final_ Selected Submission 1

## Importing Libraries

In [58]:
import numpy as np
import pandas as pd
import json
import gzip
import math
import string
import sklearn.metrics as skmetrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn import ensemble
from nltk.stem.porter import PorterStemmer
import string
%matplotlib inline

In [59]:
# Read from gzip file
def read_gzip(filename):
    for line in gzip.open(filename):
        yield eval(line)

In [60]:
# Get (positive/negative) opinion words from corpus
# I downloaded .txt files for positive and negative words from http://positivewordsresearch.com/sentiment-analysis-resources/

def retrieve_words(filename):
    with open(filename,'r',encoding='latin-1') as f:
        for line in f:
            yield line
            
positive_words = []
negative_words = []

for pword in retrieve_words('positive-words.txt'): # collecting positive words in the list
    positive_words.append(pword[:-1])
    
for nword in retrieve_words('negative-words.txt'): # collecting negative words in the list
    negative_words.append(nword[:-1])

In [61]:
positive_words[:5]

['a+', 'abound', 'abounds', 'abundance', 'abundant']

In [62]:
negative_words[:5]

['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']

## Splitting dataset based on number of votes

Because of the long tail problem with the dataset, I splitted the data between two sets one for higher votes and one for lower number of votes, and used ensembling models. 

In [63]:
#data with high votes
high_vote_dataset = []
for line in read_gzip("train.json.gz"):
    if (line['helpful'])['outOf'] >15 and (line['helpful'])['outOf'] <150: #during EDA 15 is the best boundary number noticed
        high_vote_dataset.append(line)

In [74]:
len(high_vote_dataset)

2767

In [65]:
#data with low votes, leaving outOf= 0 votes out
low_vote_dataset = []
for line in read_gzip("train.json.gz"):
    if (line['helpful'])['outOf'] <= 40  and (line['helpful'])['outOf'] > 1 : #again EDA gave me insight for boundary numbers
        low_vote_dataset.append(line)

In [83]:
len(low_vote_dataset)

33876

## Generating Features

In [91]:
#get the rating
def get_rating(review_data):
    ratings = []
    for x in review_data:
        ratings.append(x['rating'])
    return ratings

#get helpful votes
def get_helpful_votes(review_data):
    helpfulness_votes = []
    for x in review_data:
        votes = x['helpful']
        helpfulness_votes.append(np.log(votes['outOf'] + 1))
    return helpfulness_votes

#get word count in review
# punctuation = set(string.punctuation)
# from collections import defaultdict

#get word count in review
def get_reviewText_word_count(review_data):
    data_review_word_count = []
    for x in review_data:
        data_review_word_count.append(np.log(len(x['reviewText'].lower().split())+1))
    return data_review_word_count
    
#get allcaps words in reviewText
def get_reviewText_allcaps_count(review_data):
    data_review_word_allcaps_count = []
    for x in review_data:
        data_review_word_allcaps_count.append(np.log(len([word for word in x['reviewText'].split() if word.isupper()])+1))
    return data_review_word_allcaps_count

def get_reviewText_specialchar_count(dataset):
    spec_chars=['!','#','$','%','^','&','*','?']
    data_review_specialchar_count = []
    for data_point in dataset:
        data_review_specialchar_count.append(np.log(len([word for word in data_point['reviewText'].lower().split() 
                                           if word in spec_chars])+1))
    return data_review_specialchar_count

#get flesch reading ease score of review text
from textstat.textstat import textstat
def get_flesch_reading_ease_score(review_data):
    data_review_flesch_reading_score = []
    for x in review_data:
        data_review_flesch_reading_score.append(textstat.flesch_reading_ease(x['reviewText']))
    return data_review_flesch_reading_score

In [92]:
#one hot encoding for category
def get_category_id(review_data):
    cat_0 = []
    cat_1 = []
    cat_2 = []
    cat_3 = []
    cat_4 = []
    
    for x in review_data:
        cat_id = x['categoryID']
        if cat_id == 0:
            cat_0.append(1)
            cat_1.append(0)
            cat_2.append(0)
            cat_3.append(0)
            cat_4.append(0)
        if cat_id == 1:
            cat_0.append(0)
            cat_1.append(1)
            cat_2.append(0)
            cat_3.append(0)
            cat_4.append(0)
        if cat_id == 2:
            cat_0.append(0)
            cat_1.append(0)
            cat_2.append(1)
            cat_3.append(0)
            cat_4.append(0)
        if cat_id == 3:
            cat_0.append(0)
            cat_1.append(0)
            cat_2.append(0)
            cat_3.append(1)
            cat_4.append(0)
        if cat_id == 4:
            cat_0.append(0)
            cat_1.append(0)
            cat_2.append(0)
            cat_3.append(0)
            cat_4.append(1)
    return cat_0, cat_1, cat_2, cat_3, cat_4

In [93]:
def get_reviewText_posneg_diff_words(review_data):
    review_posneg_diff_words = []
    for x in review_data:
        review = x['reviewText'].lower().split()
        neg = sum([1 if word in negative_words else 0 for word in review])
        pos = sum([1 if word in positive_words else 0 for word in review])
        review_posneg_diff_words.append(np.log(abs(pos-neg)+1))
    return review_posneg_diff_words

In [94]:

def collect_features(review_data):

    ratings = get_rating(review_data)
    
    helpful_votes = get_helpful_votes(review_data)
    
    review_word_count = get_reviewText_word_count(review_data)
    
    review_word_allcaps_count = get_reviewText_allcaps_count(review_data)
    
    review_specialchar_count = get_reviewText_specialchar_count(review_data)
    
    ease_of_reading_score=get_flesch_reading_ease_score(review_data)
    
    cat_0, cat_1, cat_2, cat_3, cat_4 = get_category_id(review_data)
    
    review_posneg_diff = get_reviewText_posneg_diff_words(review_data)
    
    feature_set = [np.ones(len(review_data)),ratings,helpful_votes,review_word_count,review_word_allcaps_count,\
                   review_specialchar_count,ease_of_reading_score,cat_0,cat_1, cat_2, cat_3,\
                   cat_4,review_posneg_diff]
    
    review_data = np.stack(feature_set, axis=1)
    print ("All Features are successfully extracted...")
    return review_data


In [95]:
train_high_helpfulness = []
for x in high_vote_dataset:
    data_helpfulness = x['helpful']
    train_high_helpfulness.append(data_helpfulness['nHelpful'] * 1.0/data_helpfulness['outOf'])

train_high_helpfulness = np.matrix(train_high_helpfulness).T
print ("Extracted helpfulness score for " + str(len(train_high_helpfulness)) + " rows of data")

Extracted helpfulness score for 2767 rows of data


In [96]:
train_low_helpfulness = []
for x in low_vote_dataset:
    data_helpfulness = x['helpful']
    train_low_helpfulness.append(data_helpfulness['nHelpful'] * 1.0/data_helpfulness['outOf'])

train_low_helpfulness = np.matrix(train_low_helpfulness).T
print ("Extracted helpfulness score for " + str(len(train_low_helpfulness)) + " rows of data")

Extracted helpfulness score for 33876 rows of data


In [97]:
#Collecting features for high votes data
train_high_dataset = collect_features(high_vote_dataset)

All Features are successfully extracted...


In [98]:
#collecting features for low votes data
train_low_dataset = collect_features(low_vote_dataset)

All Features are successfully extracted...


## k-fold Validation

In [99]:
#Cross Validation
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error as mae
kf=KFold(n_splits=10, random_state=None, shuffle=True)
for train_index, test_index in kf.split(train_high_dataset):
#print("TRAIN:", train_index, "TEST:", test_index)
#Fitting model
    model_1=ElasticNet(alpha=0.09, l1_ratio=0.005)
    model_1.fit(train_high_dataset[train_index], train_high_helpfulness[train_index])
#predict    
    pred_1=model_1.predict(train_high_dataset[test_index])
#mae
    mae_1 = mae(train_high_helpfulness[test_index], pred_1)
    print ("Mean Absolute Error of Predictor : " + str(mae_1))

Mean Absolute Error of Predictor : 0.0689207493827
Mean Absolute Error of Predictor : 0.0780129975504
Mean Absolute Error of Predictor : 0.0768868528261
Mean Absolute Error of Predictor : 0.0706008441347
Mean Absolute Error of Predictor : 0.0692037430554
Mean Absolute Error of Predictor : 0.0816918792378
Mean Absolute Error of Predictor : 0.0765967238911
Mean Absolute Error of Predictor : 0.0713873756136
Mean Absolute Error of Predictor : 0.0717808813845
Mean Absolute Error of Predictor : 0.0734490812551


In [133]:
kf=KFold(n_splits=10, random_state=None, shuffle=True)
for train_index, test_index in kf.split(train_low_dataset):
    #print("TRAIN:", train_index, "TEST:", test_index)
#Fitting model
    model_2=ensemble.GradientBoostingRegressor(n_estimators= 220, max_depth= 4, min_samples_split= 2, loss= 'ls')
    model_2.fit(train_low_dataset[train_index], train_low_helpfulness[train_index].A1)
#predict    
    pred_2=model_2.predict(train_low_dataset[test_index])
#mae
    mae_2 = mae(train_low_helpfulness[test_index], pred_2)
    print ("Mean Absolute Error of Predictor : " + str(mae_2))

Mean Absolute Error of Predictor : 0.191622257551
Mean Absolute Error of Predictor : 0.192149175583
Mean Absolute Error of Predictor : 0.196824634686
Mean Absolute Error of Predictor : 0.192415327003
Mean Absolute Error of Predictor : 0.19371851015
Mean Absolute Error of Predictor : 0.191916719755
Mean Absolute Error of Predictor : 0.191900746561
Mean Absolute Error of Predictor : 0.194314356039
Mean Absolute Error of Predictor : 0.194778132462
Mean Absolute Error of Predictor : 0.191928261529


## Split train and validation data

In [101]:
train_high_x,valid_high_x,train_high_y,valid_high_y=train_test_split(train_high_dataset,\
                                        train_high_helpfulness, test_size=0.3, random_state=42)

In [102]:
print (train_high_x.shape)
print (valid_high_x.shape)
print (train_high_y.shape)
print (valid_high_y.shape)

(1936, 13)
(831, 13)
(1936, 1)
(831, 1)


In [103]:
train_low_x,valid_low_x,train_low_y,valid_low_y=train_test_split(train_low_dataset,\
                                        train_low_helpfulness, test_size=0.3, random_state=42)

In [124]:
print (train_low_x.shape)
print (valid_low_x.shape)
print (train_low_y.shape)
print (valid_low_y.shape)

(23713, 13)
(10163, 13)
(23713, 1)
(10163, 1)


##  Parameter Tuning Models

In [126]:
train_low_dataset.shape
train_low_helpfulness.reshape()

(33876, 1)

In [136]:
#VALIDATION DATA TO TUNE PARAMETERS
from sklearn.grid_search import GridSearchCV
param_grid={'learning_rate': [0.25,0.1,0.05,0.02,0.01],
            'max_depth': [3,4,5,6],
            'min_samples_leaf':[3,5,9,13,19],
            'max_features':[1.0,0.3,0.1]
           }
est=ensemble.GradientBoostingRegressor(n_estimators=200)
gs_cv=GridSearchCV(est,param_grid,scoring='neg_mean_absolute_error',cv=5)
gs_cv.fit(valid_low_x,valid_low_y.A1)
gs_cv.best_params_,gs_cv.best_score_

({'learning_rate': 0.05,
  'max_depth': 6,
  'max_features': 0.3,
  'min_samples_leaf': 9},
 -0.1948817182989291)

In [137]:
param_test1 = {'n_estimators':[100,120,140,160,180,200,220,240,260,280,300]}
gsearch1 = GridSearchCV(estimator = ensemble.GradientBoostingRegressor(learning_rate=0.05,max_features=0.3,min_samples_split=100,min_samples_leaf=9,max_depth=6,subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='neg_mean_absolute_error',n_jobs=4,iid=False, cv=5)
gsearch1.fit(valid_low_x,valid_low_y.A1)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: -0.19511, std: 0.00220, params: {'n_estimators': 100},
  mean: -0.19496, std: 0.00215, params: {'n_estimators': 120},
  mean: -0.19483, std: 0.00217, params: {'n_estimators': 140},
  mean: -0.19486, std: 0.00218, params: {'n_estimators': 160},
  mean: -0.19489, std: 0.00226, params: {'n_estimators': 180},
  mean: -0.19484, std: 0.00226, params: {'n_estimators': 200},
  mean: -0.19486, std: 0.00226, params: {'n_estimators': 220},
  mean: -0.19492, std: 0.00222, params: {'n_estimators': 240},
  mean: -0.19492, std: 0.00217, params: {'n_estimators': 260},
  mean: -0.19493, std: 0.00215, params: {'n_estimators': 280},
  mean: -0.19495, std: 0.00211, params: {'n_estimators': 300}],
 {'n_estimators': 140},
 -0.19483087167319926)

In [144]:
# ElasticNet Regressor performed good with high votes
# For Elastic Net we can use ElasticNetCV to get better results
from sklearn.linear_model import ElasticNetCV
predictor_high = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1])
predictor_high.fit(train_high_x,train_high_y.A1)
predict_high_y = predictor_high.predict(valid_high_x)

In [194]:
# gradientboosting
#params = {n_estimators= 220, max_depth= 4, min_samples_split= 2, loss= 'ls'}
predictor_low = ensemble.GradientBoostingRegressor(n_estimators= 240, max_depth= 4, min_samples_split= 100,learning_rate=0.05)
predictor_low.fit(train_low_x,train_low_y.A1)
predict_low_y = predictor_low.predict(valid_low_x)

## Evaluating Models

In [146]:
# Mean Absolute Error for Elasticnet
from sklearn.metrics import mean_absolute_error as mae
mae_high = mae(valid_high_y, predict_high_y)
print ("Mean Absolute Error of Predictor : " + str(mae_high))

Mean Absolute Error of Predictor : 0.0749424255468


In [195]:
# Mean Absolute Error for GradientBoost
mae_low = mae(valid_low_y, predict_low_y)
print ("Mean Absolute Error of Predictor : " + str(mae_low))

Mean Absolute Error of Predictor : 0.195411163339


## Now using high performing params on the full training dataset to train the model again 

In [186]:
predictor_high = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],cv=5)
predictor_high.fit(train_high_dataset,train_high_helpfulness.A1)

ElasticNetCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
       l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=1000,
       n_alphas=100, n_jobs=1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic',
       tol=0.0001, verbose=0)

In [187]:
predictor_low = ensemble.GradientBoostingRegressor(n_estimators= 240, max_depth= 4, min_samples_split= 100,learning_rate=0.05)
predictor_low.fit(train_low_dataset,train_low_helpfulness.A1)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=100,
             min_weight_fraction_leaf=0.0, n_estimators=240,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [188]:
def predict_test_samples(predictor, test_data):
    return predictor.predict(np.matrix(test_data))

In [189]:
#Getting the test samples
test_dataset = []
for line in read_gzip("test_Helpful.json.gz"):
    test_dataset.append(line)

In [190]:
#Collecting features for test samples
test_feature_set = collect_features(test_dataset)
print (test_feature_set.shape)

All Features are successfully extracted...
(14000, 13)


## Preparing results for Kaggle

In [191]:
predictions = open("predictions5_Helpful.txt", 'w')
index = 0
for l in open("pairs_Helpful.txt"):
    if l.startswith("userID"):
        
        predictions.write(l)
        continue
    u,i,outOf = l.strip().split('-')
    outOf = int(outOf)
    if outOf==0:
        pred=0
    if (outOf > 15):
        pred = int(np.around(outOf*predict_test_samples(predictor_high, test_feature_set[index])))
    else:
        pred = int(np.around(outOf*predict_test_samples(predictor_low, test_feature_set[index])))
    predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(pred) + '\n')
    index += 1
predictions.close()