In [2]:
import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model
import operator
from sklearn.metrics import mean_squared_error

In [3]:
def parseData(fname):
    for l in urllib.request.urlopen(fname):
        yield eval(l)

### Just the first 5000 reviews

print ("Reading data...")
data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))[:5000]
print ("done")

Reading data...
done


In [16]:
### How many unique words are there?

wordCount = defaultdict(int)
for d in data:
    for w in d['review/text'].split():
        wordCount[w] += 1

print(len(wordCount))

### Ignore capitalization and remove punctuation

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
    #w = stemmer.stem(w) # with stemming
        wordCount[w] += 1

### Just take the most popular words...

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

### Bigram counts
translator = str.maketrans(dict.fromkeys(string.punctuation))
bigramCount = defaultdict(int)
reviewBigramContent = []
for d in data:
    text = d['review/text']
    removed = text.translate(translator)
    lowered = removed.lower()
    words = lowered.split()
    prev = None
    reviewSet = []
    for w in words:
        if (prev is None):
            bigramCount[('/',w)] += 1
            reviewSet.append(('/',w))
        else:
            bigramCount[(prev,w)] += 1
            reviewSet.append((prev,w))
        prev = w
    reviewBigramContent.append(reviewSet)
frequentWords = sorted(list(bigramCount.items()), key=operator.itemgetter(1))
frequentWords.reverse()
bigrams = set(bigramCount.keys())
bigramNoDup = list(bigrams)
print('')
print('Total number of unique bigrams: {}'.format(len(frequentWords)))
print('Top 5 bigrams:')
for i in range(5):
    print('Bigram {} with count {}'.format(frequentWords[i][0],frequentWords[i][1]))

wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def feature(datum, wordKey):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordKey[w]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d,wordId) for d in data]
y = [d['review/overall'] for d in data]

#No regularization
#theta,residuals,rank,s = numpy.linalg.lstsq(X, y)

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
err = mean_squared_error(y,predictions)
print('\nMSE of unigram model: {}'.format(err))

### Bigram model
def featureBi(bigramList,wordKey):
    feat = [0] * len(bigrams)
    for w in bigramList:
        feat[wordKey[w]] += 1
    feat.append(1)
    return feat
wordId = dict(zip(bigramNoDup, range(len(bigramNoDup))))
X = [featureBi(d,wordId) for d in reviewBigramContent]
y = [d['review/overall'] for d in data]
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X,y)
theta = clf.coef_
predictions = clf.predict(X)
err = mean_squared_error(y,predictions)
print('\nMSE of bigram model: {}'.format(err))

36225

Total number of unique bigrams: 182902
Top 5 bigrams:
Bigram ('with', 'a') with count 4587
Bigram ('in', 'the') with count 2595
Bigram ('of', 'the') with count 2245
Bigram ('is', 'a') with count 2056
Bigram ('on', 'the') with count 2033

MSE of unigram model: 0.48462143628455434

MSE of bigram model: 0.000416364843711807
