In [None]:
#import the necessary libraries
import pandas as pd
import numpy as np
import gzip
import scipy.sparse as sp
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from pyparsing import anyOpenTag, anyCloseTag
from xml.sax.saxutils import unescape as unescape
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

In [None]:
#read the json file to pandas dataframe
def parse(path):
  """Yield one record per non-blank line of the gzip file at `path`.

  Each line is evaluated as a Python expression, so the file must contain
  one Python-literal record per line.

  SECURITY NOTE: eval() executes arbitrary code found in the file. Only use
  this on files from a trusted source; for untrusted data prefer
  ast.literal_eval or json.loads (if the file is strict JSON).
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted or discarded, instead of leaking it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      if not l.strip():
        continue  # a blank line would make eval() raise SyntaxError
      yield eval(l)

def getDF(path):
  """Load every record produced by parse(path) into a pandas DataFrame.

  Records are keyed by their 0-based position in the file, which becomes
  the row index of the returned frame.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Load the gzipped review file (path is relative to the working directory) into a DataFrame.
df = getDF('reviews_Health_and_Personal_Care_5.json.gz')

In [None]:
#Drop the columns not required
# DataFrame.drop returns a new frame, so the result must be assigned back;
# errors='ignore' keeps the cell re-runnable after the columns are gone.
df = df.drop(['helpful','reviewerName','reviewerID','unixReviewTime','reviewTime', 'asin'],
             axis=1, errors='ignore')

In [None]:
#Extract the reviews from dataframe and store it in list
# One review text per DataFrame row, as a plain Python list.
list_reviews = df['reviewText'].tolist()

In [None]:
#Extract the ratings from dataframe and store it in list
# One 'overall' rating per DataFrame row, as a plain Python list.
list_rating = df['overall'].tolist()

In [None]:
#remove punctuation marks: every character that is not a letter or digit becomes a space
import re
non_alnum = re.compile('[^a-zA-Z0-9]')
# Iterate over every review (the list is updated in place, element by element).
for i, review in enumerate(list_reviews):
    list_reviews[i] = non_alnum.sub(' ', review)

In [None]:
#remove words of length less than four
def filterLen(docs, minlen):
    r""" Filter out terms that are too short.

    docs is a list of documents. Each document is either
      * a list of words -> returned as a list of the words kept, or
      * a string        -> split on whitespace, filtered, and re-joined
                           with single spaces (returned as a string).
    minlen is the minimum length of a word to keep.

    Returns a new list with one filtered document per input document.
    """
    def keep_long_terms(doc):
        # Strings must be tokenized first; iterating a string directly
        # yields single characters, which would all be discarded.
        if hasattr(doc, 'split'):
            return ' '.join(t for t in doc.split() if len(t) >= minlen)
        return [t for t in doc if len(t) >= minlen]

    return [keep_long_terms(d) for d in docs]
# Run the length filter over the review texts with minlen=4.
docs_final = filterLen(list_reviews, 4)

In [None]:
#split each and every word in the list
# Whitespace-tokenize each document into a list of words.
docs1 = [doc.split() for doc in docs_final]

In [None]:
#compute CSR matrix
def build_matrix(docs):
    r""" Build a sparse term-count matrix from a list of documents,
    each of which is a list of words/terms in the document.

    Row i corresponds to docs[i]; column ids are assigned to terms in order
    of first appearance. Each stored value is the number of times the term
    occurs in the document.

    Returns a scipy.sparse.csr_matrix of dtype double with sorted indices.
    """
    nrows = len(docs)
    idx = {}   # term -> column id
    nnz = 0    # total number of distinct (document, term) pairs
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = len(idx)
    ncols = len(idx)

    # set up memory (builtin int: the np.int alias was removed in NumPy 1.24)
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=int)
    n = 0  # non-zero counter: next free slot in ind/val
    # transfer values
    for i, d in enumerate(docs):
        cnt = Counter(d)
        for j, (k, c) in enumerate(cnt.items()):
            ind[n + j] = idx[k]
            val[n + j] = c
        n += len(cnt)
        ptr[i + 1] = n  # row i occupies ind/val[ptr[i]:ptr[i+1]]

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    # entries were written in Counter order; put column ids in ascending order
    mat.sort_indices()

    return mat

In [None]:
#csr info
def csr_info(mat, name="", non_empy=False):
    r""" Print a one-line summary of a CSR matrix: name, row count,
    column count and number of stored values. With non_empy set, the
    counts of non-empty rows and columns are printed as well.
    """
    n_rows, n_cols = mat.shape[0], mat.shape[1]
    stored = len(mat.data)

    if not non_empy:
        print("%s [nrows %d, ncols %d, nnz %d]" % (name, n_rows, n_cols, stored))
        return

    # a row is non-empty when its indptr range has positive length
    filled_rows = sum(1 for r in range(n_rows)
                      if mat.indptr[r + 1] > mat.indptr[r])
    # a column is non-empty when its id appears among the stored indices
    filled_cols = len(np.unique(mat.indices))
    print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
        name, n_rows, filled_rows, n_cols, filled_cols, stored))

In [None]:
#build CSR matrix
# Term-count matrix: one row per tokenized document in docs1.
mat = build_matrix(docs1)

# Print the matrix dimensions and number of stored values.
csr_info(mat)

In [None]:
def csr_idf(mat, copy=False, **kargs):
    r""" Scale the stored values of a CSR matrix by inverse document
    frequency, log(nrows / document frequency of the column).

    If copy is True the input is left untouched and the scaled copy is
    returned. If copy is False the matrix is scaled in place and the
    per-column idf factors are returned as a dict keyed by column id.
    """
    if copy is True:
        mat = mat.copy()
    n_rows = mat.shape[0]
    col_ids, weights = mat.indices, mat.data

    # document frequency: every stored entry counts once for its column
    df = defaultdict(int)
    for col in col_ids:
        df[col] += 1

    # replace each count with its idf, reusing the same dict
    for col, count in df.items():
        df[col] = np.log(n_rows / float(count))

    # scale every stored value by the idf of its column
    for pos in range(mat.nnz):
        weights[pos] *= df[col_ids[pos]]

    if copy is False:
        return df
    return mat

In [None]:
#l2 normalization
def csr_l2normalize(mat, copy=False, **kargs):
    r""" Divide every row of a CSR matrix by its L-2 norm.

    If copy is True, a normalized copy is returned and the input is left
    untouched. Otherwise the matrix is normalized in place and nothing is
    returned. Rows whose squared norm is zero are left as they are.
    """
    if copy is True:
        mat = mat.copy()
    val, ptr = mat.data, mat.indptr

    for row in range(mat.shape[0]):
        start, stop = ptr[row], ptr[row + 1]
        # squared norm of the row, accumulated in storage order
        sq_norm = sum(v ** 2 for v in val[start:stop])
        if sq_norm == 0.0:
            continue  # do not normalize empty rows
        val[start:stop] *= 1.0 / np.sqrt(sq_norm)

    if copy is True:
        return mat

In [None]:
# copy=True makes each step return a new matrix, so `mat` (raw counts) and
# `mat2` (idf-scaled) are left unmodified.
mat2 = csr_idf(mat, copy=True)
mat3 = csr_l2normalize(mat2, copy=True)

In [None]:
#split train and test data
# First 320000 rows are the training set, the remaining rows the test set.
train_mat = mat3[:320000, :]
test_mat = mat3[320000:, :]
# print(...) with a single argument works on both Python 2 and Python 3
print(train_mat.shape)
print(test_mat.shape)

In [None]:
#split the ratings for test and train data
# Same 320000 cut-off as the feature matrix so ratings stay aligned with rows.
train_rating = list_rating[:320000]
test_rating = list_rating[320000:]
# print(...) with a single argument works on both Python 2 and Python 3
print(len(train_rating))
print(len(test_rating))

In [None]:
#SVD dimensionality reduction
# Fixed random_state: TruncatedSVD's default solver is randomized, so without
# a seed the reduced features (and every downstream score) change per run.
svd = TruncatedSVD(n_components=800, random_state=42)
# Fit on the training rows only, then project both splits with that fit.
svfit = svd.fit(train_mat)
train_final = svfit.transform(train_mat)
test_final = svfit.transform(test_mat)

In [None]:
#linear regression
from sklearn.linear_model import LinearRegression
# The `normalize` argument is omitted: it was removed in scikit-learn 1.2,
# and its old default (False) is what was being passed anyway.
# NOTE(review): fit_intercept=False forces predictions through the origin;
# confirm this is intended for rating targets.
linearreg = LinearRegression(fit_intercept=False, copy_X=True, n_jobs=1)
linearreg.fit(train_final, train_rating)
test_ratings_linear = linearreg.predict(test_final)

In [None]:
#ridge regression
from sklearn.linear_model import Ridge
# fit() returns the estimator itself, so clf is the fitted Ridge model.
clf = Ridge(alpha=1.0).fit(train_final, train_rating)
test_ratings_ridge = clf.predict(test_final)

In [None]:
#KNN Regression
from sklearn.neighbors import KNeighborsRegressor
# `p` is the power parameter of the Minkowski metric only; the previous p=3
# had no effect next to metric='euclidean', so it is left out.
neigh = KNeighborsRegressor(n_neighbors=20, weights='distance', algorithm='auto', leaf_size=30,
                            metric='euclidean', metric_params=None, n_jobs=10)
neigh.fit(train_final, train_rating)

test_ratings_knn = neigh.predict(test_final)

In [None]:
#logistic regression
from sklearn.linear_model import LogisticRegression
# fit() returns the estimator itself, so regr is the fitted classifier;
# each distinct rating value is treated as a class label.
regr = LogisticRegression().fit(train_final, train_rating)
test_ratings_logistic = regr.predict(test_final)

In [None]:
#calculating mean square error
from sklearn.metrics import mean_squared_error
# signature: mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average')
# Printed in this order: KNN, linear, ridge, logistic.
# print(...) with a single argument works on both Python 2 and Python 3
print(mean_squared_error(test_rating, test_ratings_knn, sample_weight=None))
print(mean_squared_error(test_rating, test_ratings_linear, sample_weight=None))
print(mean_squared_error(test_rating, test_ratings_ridge, sample_weight=None))
print(mean_squared_error(test_rating, test_ratings_logistic, sample_weight=None))


