In [173]:
import numpy as np
from collections import defaultdict
import string
from sklearn.linear_model import Ridge
import pandas as pd
from sklearn.metrics import mean_squared_error
from nltk.stem.porter import *
import math
from sklearn.metrics.pairwise import cosine_similarity

Using the code provided on the webpage, read the first 10,000 reviews from the corpus, and read the reviews without capitalization or punctuation.

1. How many unique bigrams are there amongst the reviews? List the 5 most-frequently-occurring bigrams along with their number of occurrences in the corpus (1 mark).

In [8]:

def parseData(fname):
    for l in open(fname):
        yield eval(l)

data = list(parseData("train_Category.json"))


wordCount = defaultdict(int)
punctuation = set(string.punctuation)

for d in data[:10000]:
    r = ''.join([
        c for c in d['review_text'].lower() 
        if not c in punctuation])
    
    text = r.split()
    n = 2
    bigram = [' '.join(text[i:i+n]) for i in range(0,len(text))]
    
    for w in bigram:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

<b> Answer to Q1:</b>

In [13]:
counts[:5]

[(7927, 'of the'),
 (5850, 'this book'),
 (5627, 'in the'),
 (3189, 'and the'),
 (3183, 'is a')]

2. The code provided performs least squares using the 1000 most common unigrams. Adapt it to use the 1000 most common bigrams and report the MSE obtained using the new predictor (use bigrams only, i.e., not unigrams+bigrams) (1 mark). Note that the code performs regularized regression with a regularization parameter of 1.0. The prediction target should be the ‘rating’ field in each review.

In [86]:
top_1000_words = [x[1] for x in counts[:1000]]
wordId = dict(zip(top_1000_words, range(len(top_1000_words))))
wordSet = set(top_1000_words)
data = list(parseData("train_Category.json"))

def feature(datum):
    feat = [0]*len(top_1000_words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    
    text = r.split()
    n = 2
    bigram = [''.join(text[i:i+n]) for i in range(0,len(text))]
    
    for w in bigram:
        if w in top_1000_words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d) for d in data]
y = [d['rating'] for d in data]

In [95]:
clf = Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

<B>Answer to Q2</B>

In [97]:
mean_squared_error(y, predictions)

1.3584004213814633

3. Repeat the above experiment using unigrams and bigrams, still considering the 1000 most common. That is, your model will still use 1000 features (plus an offset), but those 1000 features will be some combination of unigrams and bigrams. Report the MSE obtained using the new predictor (1 mark).

In [99]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)

for d in data[:10000]:
    r = ''.join([
        c for c in d['review_text'].lower() 
        if not c in punctuation])
    
    text = r.split()
    
    #creating unigram
    for u in text:
        wordCount[u] +=1
    
    #creating bigram
    n = 2
    bigram = [' '.join(text[i:i+n]) for i in range(0,len(text))]
    for w in bigram:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [101]:
top_1000_words = [x[1] for x in counts[:1000]]
wordId = dict(zip(top_1000_words, range(len(top_1000_words))))
wordSet = set(top_1000_words)
data = list(parseData("train_Category.json"))

def feature(datum):
    feat = [0]*len(top_1000_words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    
    text = r.split()
    n = 2
    bigram = [''.join(text[i:i+n]) for i in range(0,len(text))]
    
    for w in bigram:
        if w in top_1000_words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d) for d in data]
y = [d['rating'] for d in data]

In [102]:
clf = Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

<B>Answer to Q3</B>

In [103]:
mean_squared_error(y, predictions)

1.327923922181295

4. What is the inverse document frequency of the words ‘stories’, ‘magician’, ‘psychic’, ‘writing’, and ‘wonder’? What are their tf-idf scores in the first review (using log base 10, following the first definition of tf-idf given in the slides) (1 mark)?

In [128]:
wordCountTotal = defaultdict(int)
punctuation = set(string.punctuation)
# stemmer = PorterStemmer()
for d in data[:10000]:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in r.split():
#         w = stemmer.stem(w)
        wordCountTotal[w] += 1

In [150]:
wordCountTotal

defaultdict(int,
            {'genuinely': 45,
             'enthralling': 23,
             'if': 3520,
             'collins': 39,
             'or': 3514,
             'bernard': 4,
             'did': 2109,
             'invent': 3,
             'this': 19353,
             'out': 4057,
             'of': 32552,
             'whole': 763,
             'cloth': 9,
             'they': 4498,
             'deserve': 43,
             'a': 39577,
             'medal': 12,
             'for': 12076,
             'imagination': 136,
             'lets': 199,
             'leave': 281,
             'the': 73431,
             'veracity': 1,
             'aside': 127,
             'moment': 260,
             'always': 1054,
             'touchy': 5,
             'subject': 102,
             'when': 3666,
             'it': 20110,
             'comes': 541,
             'to': 36821,
             'real': 890,
             'life': 1967,
             'stories': 1053,
             'occult': 7,
    

In [156]:
#IDF = (Total number of documents / Number of documents with word t in it)
def idf(word, num_docs):
    return math.log(num_docs/abs(wordCountTotal[word]), 10)

def tf(word, document):
    return document.count(word)

words = ['stories', 'magician','psychic', 'writing','wonder']
for i in words:
    IDF = idf(i, len(data[:10000]))
    TF = tf(i, data[0]['review_text'])
    print(i+' idf is: '+str(IDF))
    print(i+' tf-idf is: '+str(TF*IDF))
    print()

stories idf is: 0.9775716288145134
stories tf-idf is: 0.9775716288145134

magician idf is: 2.5228787452803374
magician tf-idf is: 2.5228787452803374

psychic idf is: 2.481486060122112
psychic tf-idf is: 4.962972120244224

writing idf is: 0.9115095298176037
writing tf-idf is: 0.9115095298176037

wonder idf is: 1.7144426909922261
wonder tf-idf is: 1.7144426909922261



5. Adapt your unigram model to use the tfidf scores of words, rather than a bag-of-words representation. That is, rather than your features containing the word counts for the 1000 most common unigrams, it should contain tfidf scores for the 1000 most common unigrams. Report the MSE of this new model (1 mark).

In [157]:
def idf(word, num_docs):
    return math.log(num_docs/abs(wordCountTotal[word]),10)

def tf(word, document):
    return document.count(word)

wordTFIDF = defaultdict(int)
punctuation = set(string.punctuation)

for d in data[:10000]:
    r = ''.join([
        c for c in d['review_text'].lower() 
        if not c in punctuation])
    
    text = r.split()
    #creating unigram
    for u in text:
        wordTFIDF[u] += idf(u, len(data[:10000]))*tf(u, d['review_text'])

In [161]:
TFIDFs = [(wordTFIDF[w], w) for w in wordTFIDF]
TFIDFs.sort()
TFIDFs.reverse()

In [162]:
TFIDFs

[(371047.7024455447, 'e'),
 (109590.36674497082, 'o'),
 (71213.07455950932, 'he'),
 (38940.21970262443, 'an'),
 (35730.23498541466, 'l'),
 (35334.89288680381, 'di'),
 (31245.425221149064, 'de'),
 (29697.752108140925, 'or'),
 (26884.29054152944, 'en'),
 (25242.120145823887, 'at'),
 (24594.36058641295, 'la'),
 (21673.913607286955, 's'),
 (18741.84810971184, 'que'),
 (17834.880269664754, 'on'),
 (17803.11446444847, 'n'),
 (17744.284752056956, 'che'),
 (16130.531378217607, 'es'),
 (14598.341823962923, 'un'),
 (13780.274595808653, 'w'),
 (13413.573018771187, 'le'),
 (13024.921479739949, 'me'),
 (12941.97744547872, 'his'),
 (12644.834667205227, 'no'),
 (12392.808254580488, 'da'),
 (12305.419351521097, 'y'),
 (11851.22848624911, 'na'),
 (11849.935488137104, 'z'),
 (11540.198767605, 'be'),
 (11454.196189649218, 'er'),
 (11181.125836531904, 'yang'),
 (11087.876846502615, 'r'),
 (10959.910289111649, 'si'),
 (10846.606214952697, 'm'),
 (9757.112669321063, 'u'),
 (9659.034392188563, 'je'),
 (9584.

In [165]:
tfidf_1000_words = [x[1] for x in TFIDFs[:1000]]
wordIdq5 = dict(zip(tfidf_1000_words, range(len(tfidf_1000_words))))
wordSetq5 = set(tfidf_1000_words)

def feature(datum):
    feat = [0]*len(tfidf_1000_words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    for w in r.split():
        if w in tfidf_1000_words:
            feat[wordIdq5[w]] += 1
    feat.append(1) #offset
    return feat

#     feat = [0]*len(tfidf_1000_words)
#     r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    
#     text = r.split()
#     n = 2
#     bigram = [''.join(text[i:i+n]) for i in range(0,len(text))]
    
#     for w in bigram:
#         if w in top_1000_words:
#             feat[wordId[w]] += 1
#     feat.append(1) #offset
#     return feat

X = [feature(d) for d in data]
y = [d['rating'] for d in data]

In [166]:
clf = Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [167]:
mean_squared_error(y, predictions)

1.1093979996952812

6. Which other review has the highest cosine similarity compared to the first review (provide the review id, or the text of the review) (1 mark)?

In [168]:
first_review = data[0]
X_first_review = feature(first_review)

In [170]:
len(X_first_review)

1001

In [190]:
coses = []
for i in data[1:10000]:
    X_i_review = feature(i)
    dot = np.dot(X_first_review, X_i_review)
    norma = np.linalg.norm(X_first_review)
    normb = np.linalg.norm(X_i_review)
    cos = dot / (norma * normb)
#     print(cos)
    coses.append((cos, i['review_id']))

In [191]:
coses.sort()
coses.reverse()
coses

[(0.38426608984854055, 'r13136801'),
 (0.3835690309508412, 'r42982550'),
 (0.3806847767518744, 'r71755146'),
 (0.38035984058519634, 'r49529747'),
 (0.37990088317648774, 'r50747756'),
 (0.37606889417164235, 'r80509777'),
 (0.3723285197856431, 'r25474155'),
 (0.37208135368768197, 'r92212881'),
 (0.3709498896050606, 'r81665095'),
 (0.368670265907498, 'r06874956'),
 (0.3670905431948272, 'r06555296'),
 (0.36559247549190665, 'r56907707'),
 (0.36367687521372233, 'r51025703'),
 (0.36275617127899423, 'r13824279'),
 (0.3601470287992685, 'r92360181'),
 (0.3598457312185919, 'r41213114'),
 (0.3594844762144827, 'r05868461'),
 (0.3554458669657, 'r01103808'),
 (0.3551797055174278, 'r76841163'),
 (0.3547552035986036, 'r36336272'),
 (0.35260338061586866, 'r69067391'),
 (0.3518567769552858, 'r82741631'),
 (0.35171389622338484, 'r23612581'),
 (0.3512216442029781, 'r81978797'),
 (0.3505426036290943, 'r27564484'),
 (0.35053297152514906, 'r42522678'),
 (0.34826153135958615, 'r84382801'),
 (0.3480049656834667

7. Implement a validation pipeline for this same data, by randomly shuffling the data, using 10,000 reviews for training, another 10,000 for validation, and another 10,000 for testing.1 Consider regularization parameters in the range {0.01, 0.1, 1, 10, 100}, and report MSEs on the test set for the model that performs best on the validation set. Using this pipeline, compare the following alternatives in terms of their performance:


• Unigrams vs. bigrams


• Removing punctuation vs. preserving it. The model that preserves punctuation should treat punc-
tuation characters as separate words, e.g. “Amazing!” would become [‘amazing’, ‘!’]


• tfidf scores vs. word counts


In total you should compare 2 × 2 × 2 = 8 models, and produce a table comparing their performance (2 marks)

In [197]:
data_df = pd.DataFrame(data)
training = data_df.sample(len(data_df), replace = True)
validation = data_df.sample(len(data_df), replace = True)
test = data_df.sample(len(data_df), replace = True)

In [217]:
def unigram_punc_cnt(datum, top_1000_words_uni):
    
    feat = [0]*len(top_1000_words_uni)
    r = datum['review_text'].lower()
    text = re.split('(\W)',r)
    
    for w in text:
        if w in top_1000_words_uni:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [None]:
def unigram_punc_tfidf(datum, top_1000_words_uni):
    feat = [0]*len(top_1000_words_uni)
    r = datum['review_text'].lower()
    text = re.split('(\W)',r)
    text = list(filter(None, text)) ##removing empty string
    
    for w in text:
        if w in top_1000_words_uni:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [218]:
def bigram(datum, top_1000_words_bi):
    feat = [0]*len(top_1000_words_bi)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    text = r.split()
    n = 2
    bigram = [''.join(text[i:i+n]) for i in range(0,len(text))]

    for w in bigram:
        if w in top_1000_words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [None]:
#unigram & removing punc & tfidf scores
def uni_nopunc_top_tfidf():
    wordNTT = defaultdict(int)
    # punctuation = set(string.punctuation)

    for d in data[:10000]:
        r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
        
        text = r.split()
        for w in text:
            wordNTT[w] += idf(w, len(data[:10000]))*tf(w, d['review_text'])

    counts = [(wordNTT[w], w) for w in wordNTT]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [224]:
#unigram & removing punc & counts
def uni_nopunc_top_counts():
    wordUNC = defaultdict(int)
    # punctuation = set(string.punctuation)

    for d in data[:10000]:
        r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
        text = r.split()

        for w in text:
            wordUNC[w] += 1

    counts = [(wordUNC[w], w) for w in wordUNC]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [225]:
#unigram & keeping punc & tfidf scores
def uni_punc_top_tfidf():
    wordUPT = defaultdict(int)
    # punctuation = set(string.punctuation)

    for d in data[:10000]:
        r = d['review_text'].lower()
        text = re.split('(\W)',r)
        text = list(filter(None, text)) ##removing empty string
        
        for w in text:
            wordNTT[w] += idf(w, len(data[:10000]))*tf(w, d['review_text'])

    counts = [(wordNTT[w], w) for w in wordNTT]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [None]:
#unigram & keeping punc & counts

def uni_punc_top_counts():
    wordUKC = defaultdict(int)
    # punctuation = set(string.punctuation)

    for d in data[:10000]:
        r = d['review_text'].lower()
        text = re.split('(\W)',r)
        text = list(filter(None, text)) ##removing empty string

        for w in text:
            wordUKC[w] += 1

    counts = [(wordUKC[w], w) for w in wordUKC]
    counts.sort()
    counts.reverse()
    return counts[:1000]

def unigram_punc_cnt(datum, top_1000_words_uni):
    feat = [0]*len(top_1000_words_uni)
    r = datum['review_text'].lower()
    text = re.split('(\W)',r)
    
    for w in text:
        if w in top_1000_words_uni:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat



In [200]:
#bigram & removing punc & tfidf scores
def bi_nopunc_top_tfidf():
    wordBNT = defaultdict(int)
    # punctuation = set(string.punctuation)

    for d in data[:10000]:
        r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
        
        text = r.split()
        
        n = 2
        bigram = [' '.join(text[i:i+n]) for i in range(0,len(text))]
        for w in bigram:
            wordBNT[w] += idf(w, len(data[:10000]))*tf(w, d['review_text'])

    counts = [(wordBNT[w], w) for w in wordBNT]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [None]:
#bigram & removing punc & counts
def bi_nopunc_top_counts():
    wordBNC = defaultdict(int)
    # punctuation = set(string.punctuation)

    for d in data[:10000]:
        r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
        text = r.split()
        
        n = 2
        bigram = [' '.join(text[i:i+n]) for i in range(0,len(text))]
    
        for w in bigram:
            wordBNC[w] += 1

    counts = [(wordBNC[w], w) for w in wordBNC]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [None]:
#bigram & keeping punc & tfidf scores

In [None]:
#bigram & keeping punc & counts
def bi_punc_top_counts():
    wordBKC = defaultdict(int)
    # punctuation = set(string.punctuation)

    for d in data[:10000]:
        r = d['review_text'].lower()
        text = re.split('(\W)',r)
        text = list(filter(None, text)) ##removing empty string
        
        n = 2
        bigram = [' '.join(text[i:i+n]) for i in range(0,len(text))]
    
        for w in bigram:
            wordBKC[w] += 1

    counts = [(wordBKC[w], w) for w in wordBKC]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [223]:
a = re.split('(\W)','a,s,  4')
list(filter(None, a))

['a', ',', 's', ',', ' ', ' ', '4']