In [22]:
import numpy as np
import urllib
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model
import operator
from math import log
from sklearn.metrics import mean_squared_error

In [3]:
def parseData(fname):
    """Lazily yield one parsed record for every line of the resource at ``fname``.

    Parameters
    ----------
    fname : str
        A URL that ``urllib.request.urlopen`` can open (e.g. ``http://`` or
        ``file://``).

    Yields
    ------
    object
        The value obtained by evaluating each line as a Python expression.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly instead of relying on some other package having done so.
    import urllib.request

    # The context manager closes the response when the stream is exhausted or
    # when the generator itself is closed.
    with urllib.request.urlopen(fname) as response:
        for l in response:
            # SECURITY NOTE(review): eval() executes whatever code the remote
            # text contains, so this must only be pointed at a trusted source.
            # Presumably each line is a plain dict literal, in which case
            # ast.literal_eval on the decoded line would be a safer parser;
            # confirm against the data before switching.
            yield eval(l)

### Just the first 5000 reviews

print ("Reading data...")
# zip() stops as soon as range(5000) is exhausted, so only the first 5000
# records are fetched and parsed, instead of materialising every record in the
# file and then discarding everything after the slice.
data = [record for _, record in zip(range(5000), parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))]
print ("done")

Reading data...
done


In [4]:
### How many unique words are there?

# Tally every whitespace-delimited token exactly as written: case and
# punctuation are kept, so differently cased or punctuated spellings count as
# different words.
wordCount = defaultdict(int)
for review in data:
    tokens = review['review/text'].split()
    for token in tokens:
        wordCount[token] = wordCount[token] + 1

print(len(wordCount))

### Ignore capitalization and remove punctuation

punctuation = set(string.punctuation)
stemmer = PorterStemmer()
wordCount = defaultdict(int)
for review in data:
    lowered = review['review/text'].lower()
    # Drop every punctuation character before splitting on whitespace.
    cleaned = ''.join(ch for ch in lowered if ch not in punctuation)
    for token in cleaned.split():
        # token = stemmer.stem(token)  # with stemming
        wordCount[token] = wordCount[token] + 1

### Just take the most popular words...

punctuation = set(string.punctuation)
wordCount = defaultdict(int)
for review in data:
    lowered = review['review/text'].lower()
    for token in ''.join(ch for ch in lowered if ch not in punctuation).split():
        wordCount[token] += 1

# (count, word) pairs from most to least frequent; equal counts are ordered by
# the word itself, descending.
counts = sorted(((n, token) for token, n in wordCount.items()), reverse=True)

# The 1000 most frequent words make up the unigram vocabulary.
words = [token for _, token in counts[:1000]]

### Bigram counts
# Translation table that deletes every punctuation character.
translator = str.maketrans(dict.fromkeys(string.punctuation))
bigramCount = defaultdict(int)
reviewBigramContent = []
for d in data:
    wordList = d['review/text'].translate(translator).lower().split()
    # Pair each word with its predecessor. The first word of a review is paired
    # with the '/' sentinel, which cannot clash with a real token because
    # punctuation has already been stripped.
    reviewSet = list(zip(['/'] + wordList[:-1], wordList))
    for bigram in reviewSet:
        bigramCount[bigram] += 1
    reviewBigramContent.append(reviewSet)

# Ascending sort by count, then flipped, so the most frequent bigram is first.
frequentWords = sorted(bigramCount.items(), key=lambda item: item[1])[::-1]
bigrams = set(bigramCount.keys())
bigramNoDup = list(bigrams)
print('')
print('Total number of unique bigrams: {}'.format(len(frequentWords)))
print('Top 5 bigrams:')
for i in range(5):
    bigram, count = frequentWords[i]
    print('Bigram {} with count {}'.format(bigram, count))

# Column position of each vocabulary word inside a feature vector.
wordId = {token: position for position, token in enumerate(words)}
# The same vocabulary as a set.
wordSet = set(words)

def feature(datum, wordKey):
    """Build the bag-of-words count vector for a single review.

    The vector layout is taken entirely from ``wordKey`` rather than from
    notebook-level globals, so the function keeps working even if a later cell
    rebinds ``words`` or ``punctuation``.

    Parameters
    ----------
    datum : dict
        A review record; its ``'review/text'`` entry is lower-cased and
        stripped of ``string.punctuation`` before tokenising on whitespace.
    wordKey : dict
        Maps each vocabulary word to its column index. Indices are expected to
        cover ``0 .. len(wordKey) - 1``.

    Returns
    -------
    list
        ``len(wordKey)`` occurrence counts followed by a constant ``1`` that
        serves as the offset term.
    """
    feat = [0] * len(wordKey)
    text = datum['review/text'].lower()
    cleaned = ''.join(c for c in text if c not in string.punctuation)
    for w in cleaned.split():
        # One dict lookup both tests vocabulary membership and yields the
        # column, replacing a linear scan of the vocabulary list per token.
        position = wordKey.get(w)
        if position is not None:
            feat[position] += 1
    feat.append(1) #offset
    return feat

# Design matrix (one count vector per review) and the ratings to predict.
X = []
y = []
for review in data:
    X.append(feature(review, wordId))
    y.append(review['review/overall'])

#No regularization
#theta,residuals,rank,s = numpy.linalg.lstsq(X, y)

#With regularization
# Ridge penalty of 1.0; no separate intercept is fitted because every feature
# vector already ends with a constant 1.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
# The error is measured on the same reviews the model was fitted on.
err = mean_squared_error(y, predictions)
print('\nMSE of unigram model: {}'.format(err))

### Bigram model
def featureBi(bigramList,wordKey):
    """Build the bigram count vector for a single review.

    The vector length comes from ``wordKey`` itself rather than from the
    notebook-level ``bigrams`` set, so the result always matches the index map
    that was actually passed in.

    Parameters
    ----------
    bigramList : list
        The review's bigrams (hashable keys of ``wordKey``), repeats included.
    wordKey : dict
        Maps each bigram to its column index. Indices are expected to cover
        ``0 .. len(wordKey) - 1``.

    Returns
    -------
    list
        ``len(wordKey)`` occurrence counts followed by a constant ``1`` offset.

    Raises
    ------
    KeyError
        If ``bigramList`` contains a bigram that is missing from ``wordKey``.
    """
    feat = [0] * len(wordKey)
    for w in bigramList:
        feat[wordKey[w]] += 1
    feat.append(1)
    return feat
# Give every distinct bigram its own column, then featurise each review.
wordId = {bigram: column for column, bigram in enumerate(bigramNoDup)}
X = []
for reviewBigrams in reviewBigramContent:
    X.append(featureBi(reviewBigrams, wordId))
y = [review['review/overall'] for review in data]

# Same ridge setup as the unigram model: penalty 1.0 and no fitted intercept,
# since featureBi already appends a constant 1.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
# The error is measured on the same reviews the model was fitted on.
err = mean_squared_error(y, predictions)
print('\nMSE of bigram model: {}'.format(err))

36225

Total number of unique bigrams: 182902
Top 5 bigrams:
Bigram ('with', 'a') with count 4587
Bigram ('in', 'the') with count 2595
Bigram ('of', 'the') with count 2245
Bigram ('is', 'a') with count 2056
Bigram ('on', 'the') with count 2033

MSE of unigram model: 0.48462143628455434

MSE of bigram model: 0.0004163648437118068


In [8]:
# Tokenise every review (punctuation stripped, lower-cased, split on
# whitespace) and collect the overall vocabulary.
reviewsAndWords = []
allWords = set()
for d in data:
    # Deliberately not named `words`: that name already holds the top-1000
    # vocabulary list, and rebinding it here would silently break any re-run
    # of code that reads it.
    reviewWords = d['review/text'].translate(translator).lower().split()
    reviewsAndWords.append(reviewWords)
    allWords.update(reviewWords)

In [12]:
checkWords = ['foam','smell','banana','lactic','tart']
totalDocuments = len(reviewsAndWords)

# Document frequency: the number of reviews that contain each word at least
# once. A single pass over the corpus replaces re-scanning every review for
# every vocabulary word.
documentFrequency = defaultdict(int)
for rev in reviewsAndWords:
    for word in set(rev):
        documentFrequency[word] += 1

# IDF(word) = log base 10 of (number of reviews / document frequency).
# A word in allWords that occurs in no review still raises ZeroDivisionError.
inverseFrequencies = defaultdict(float)
for word in allWords:
    inverseFrequencies[word] = log(totalDocuments/documentFrequency[word], 10)

In [14]:
# Report the inverse document frequency of each probe word.
for word in checkWords:
    idf = inverseFrequencies[word]
    print('IDF score for word "{}": {}'.format(word, idf))

IDF score for word "foam": 1.1378686206869628
IDF score for word "smell": 0.5379016188648442
IDF score for word "banana": 1.6777807052660807
IDF score for word "lactic": 2.920818753952375
IDF score for word "tart": 1.8068754016455382


In [18]:
# TF-IDF of each probe word in the first review: raw term count times IDF.
for word in checkWords:
    c = reviewsAndWords[0].count(word)
    tfidf = c * inverseFrequencies[word]
    print('TF-IDF score for word "{}" in first document: {}'.format(word, tfidf))

TF-IDF score for word "foam" in first document: 2.2757372413739256
TF-IDF score for word "smell" in first document: 0.5379016188648442
TF-IDF score for word "banana" in first document: 3.3555614105321614
TF-IDF score for word "lactic" in first document: 5.84163750790475
TF-IDF score for word "tart" in first document: 1.8068754016455382


In [40]:
# TF-IDF vectors of the first two reviews, one entry per vocabulary word.
# Both comprehensions walk allWords in the same order, so entries line up.
firstReview, secondReview = reviewsAndWords[0], reviewsAndWords[1]
v1 = [firstReview.count(word) * inverseFrequencies[word] for word in allWords]
v2 = [secondReview.count(word) * inverseFrequencies[word] for word in allWords]
def cos_sim(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two Euclidean
    norms. If either vector has zero length the similarity is reported as 0
    instead of dividing by zero.
    """
    numerator = np.dot(a, b)
    magnitude_a, magnitude_b = np.linalg.norm(a), np.linalg.norm(b)
    # A zero norm is falsy, so this guard fires when either norm equals zero.
    if not (magnitude_a and magnitude_b):
        return 0
    return numerator / (magnitude_a * magnitude_b)
# Compare the TF-IDF vectors of the first two reviews.
similarity = cos_sim(v1, v2)
print('Cosine similarity of first and second review: {}'.format(similarity))

Cosine similarity of first and second review: 0.0658819397474438


In [41]:
# TF-IDF vector of every review, with entries in allWords iteration order.
vectors = []
for review in reviewsAndWords:
    # Count each term once per review, so building the vector needs one dict
    # lookup per vocabulary word instead of a full list scan (list.count).
    termCounts = defaultdict(int)
    for token in review:
        termCounts[token] += 1
    # .get() keeps absent words at a count of 0 without inserting new keys.
    vectors.append([termCounts.get(word, 0) * inverseFrequencies[word] for word in allWords])

In [55]:
# Find the review whose TF-IDF vector is most similar to the first review's.
v1 = vectors[0]

maxCos = 0
maxIndex = -1  # -1 means no review has beaten the initial similarity of 0
for index in range(1, len(vectors)):
    cos = cos_sim(v1, vectors[index])
    # Strict comparison keeps the earliest review when similarities tie.
    if cos > maxCos:
        maxCos = cos
        maxIndex = index

if maxIndex == -1:
    # Without this guard data[-1] (the last review) would be reported as the
    # best match even though nothing was actually similar.
    print('No other review has a positive cosine similarity to the first review')
else:
    print('Review with highest cosine similarity to first review has beerId "{}" and profileName "{}"'.format(data[maxIndex]['beer/beerId'], data[maxIndex]['user/profileName']))

Review with highest cosine similarity to first review has beerId "72146" and profileName "spicelab"
