In [173]:
import numpy as np
from collections import defaultdict
import string
from sklearn.linear_model import Ridge
import pandas as pd
from sklearn.metrics import mean_squared_error
from nltk.stem.porter import *
import math
from sklearn.metrics.pairwise import cosine_similarity

Using the code provided on the webpage, read the first 10,000 reviews from the corpus, and read the reviews without capitalization or punctuation.

1. How many unique bigrams are there amongst the reviews? List the 5 most-frequently-occurring bigrams along with their number of occurrences in the corpus (1 mark).

In [8]:

def parseData(fname):
    for l in open(fname):
        yield eval(l)

data = list(parseData("train_Category.json"))


wordCount = defaultdict(int)
punctuation = set(string.punctuation)

for d in data[:10000]:
    r = ''.join([
        c for c in d['review_text'].lower() 
        if not c in punctuation])
    
    text = r.split()
    n = 2
    bigram = [' '.join(text[i:i+n]) for i in range(0,len(text))]
    
    for w in bigram:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

<b> Answer to Q1:</b>

In [13]:
counts[:5]

[(7927, 'of the'),
 (5850, 'this book'),
 (5627, 'in the'),
 (3189, 'and the'),
 (3183, 'is a')]

2. The code provided performs least squares using the 1000 most common unigrams. Adapt it to use the 1000 most common bigrams and report the MSE obtained using the new predictor (use bigrams only, i.e., not unigrams+bigrams) (1 mark). Note that the code performs regularized regression with a regularization parameter of 1.0. The prediction target should be the ‘rating’ field in each review.

In [227]:
top_1000_words = [x[1] for x in counts[:1000]]
wordId = dict(zip(top_1000_words, range(len(top_1000_words))))
wordSet = set(top_1000_words)
data = list(parseData("train_Category.json"))

def feature(datum):
    feat = [0]*len(top_1000_words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    
    text = r.split()
    n = 2
    bigram = [''.join(text[i:i+n]) for i in range(0,len(text))]
    
    for w in bigram:
        if w in top_1000_words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d) for d in data[:10000]]
y = [d['rating'] for d in data[:10000]]

In [228]:
clf = Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

<B>Answer to Q2</B>

In [229]:
mean_squared_error(y, predictions)

1.2581931882592643

3. Repeat the above experiment using unigrams and bigrams, still considering the 1000 most common. That is, your model will still use 1000 features (plus an offset), but those 1000 features will be some combination of unigrams and bigrams. Report the MSE obtained using the new predictor (1 mark).

In [99]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)

for d in data[:10000]:
    r = ''.join([
        c for c in d['review_text'].lower() 
        if not c in punctuation])
    
    text = r.split()
    
    #creating unigram
    for u in text:
        wordCount[u] +=1
    
    #creating bigram
    n = 2
    bigram = [' '.join(text[i:i+n]) for i in range(0,len(text))]
    for w in bigram:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [230]:
top_1000_words = [x[1] for x in counts[:1000]]
wordId = dict(zip(top_1000_words, range(len(top_1000_words))))
wordSet = set(top_1000_words)
data = list(parseData("train_Category.json"))

def feature(datum):
    feat = [0]*len(top_1000_words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    
    text = r.split()
    n = 2
    bigram = [''.join(text[i:i+n]) for i in range(0,len(text))]
    
    for w in bigram:
        if w in top_1000_words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d) for d in data[:10000]]
y = [d['rating'] for d in data[:10000]]

In [231]:
clf = Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

<B>Answer to Q3</B>

In [103]:
mean_squared_error(y, predictions)

1.327923922181295

4. What is the inverse document frequency of the words ‘stories’, ‘magician’, ‘psychic’, ‘writing’, and ‘wonder’? What are their tf-idf scores in the first review (using log base 10, following the first definition of tf-idf given in the slides) (1 mark)?

In [128]:
wordCountTotal = defaultdict(int)
punctuation = set(string.punctuation)
# stemmer = PorterStemmer()
for d in data[:10000]:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in r.split():
#         w = stemmer.stem(w)
        wordCountTotal[w] += 1


In [150]:
wordCountTotal

defaultdict(int,
            {'genuinely': 45,
             'enthralling': 23,
             'if': 3520,
             'collins': 39,
             'or': 3514,
             'bernard': 4,
             'did': 2109,
             'invent': 3,
             'this': 19353,
             'out': 4057,
             'of': 32552,
             'whole': 763,
             'cloth': 9,
             'they': 4498,
             'deserve': 43,
             'a': 39577,
             'medal': 12,
             'for': 12076,
             'imagination': 136,
             'lets': 199,
             'leave': 281,
             'the': 73431,
             'veracity': 1,
             'aside': 127,
             'moment': 260,
             'always': 1054,
             'touchy': 5,
             'subject': 102,
             'when': 3666,
             'it': 20110,
             'comes': 541,
             'to': 36821,
             'real': 890,
             'life': 1967,
             'stories': 1053,
             'occult': 7,
    

In [266]:
#IDF = (Total number of documents / Number of documents with word t in it)
def idf(word, num_docs):
    return math.log(num_docs/abs(wordCountTotal[word]), 10)

def tf(word, document):
    return document.count(word)

words = ['stories', 'magician','psychic', 'writing','wonder']
for i in words:
    IDF = idf(i, len(data[:10000]))
    TF = tf(i, data[0]['review_text'])
    print(i+' idf is: '+str(IDF))
    print(i+' tf-idf is: '+str(TF*IDF))
    print()

stories idf is: 0.9775716288145134
stories tf-idf is: 0.9775716288145134

magician idf is: 2.5228787452803374
magician tf-idf is: 2.5228787452803374

psychic idf is: 2.481486060122112
psychic tf-idf is: 4.962972120244224

writing idf is: 0.9115095298176037
writing tf-idf is: 0.9115095298176037

wonder idf is: 1.7144426909922261
wonder tf-idf is: 1.7144426909922261



5. Adapt your unigram model to use the tfidf scores of words, rather than a bag-of-words representation. That is, rather than your features containing the word counts for the 1000 most common unigrams, it should contain tfidf scores for the 1000 most common unigrams. Report the MSE of this new model (1 mark).

In [157]:
def idf(word, num_docs):
    return math.log(num_docs/abs(wordCountTotal[word]),10)

def tf(word, document):
    return document.count(word)

wordTFIDF = defaultdict(int)
punctuation = set(string.punctuation)

for d in data[:10000]:
    r = ''.join([
        c for c in d['review_text'].lower() 
        if not c in punctuation])
    
    text = r.split()
    #creating unigram
    for u in text:
        wordTFIDF[u] += idf(u, len(data[:10000]))*tf(u, d['review_text'])

In [161]:
TFIDFs = [(wordTFIDF[w], w) for w in wordTFIDF]
TFIDFs.sort()
TFIDFs.reverse()

In [162]:
TFIDFs

[(371047.7024455447, 'e'),
 (109590.36674497082, 'o'),
 (71213.07455950932, 'he'),
 (38940.21970262443, 'an'),
 (35730.23498541466, 'l'),
 (35334.89288680381, 'di'),
 (31245.425221149064, 'de'),
 (29697.752108140925, 'or'),
 (26884.29054152944, 'en'),
 (25242.120145823887, 'at'),
 (24594.36058641295, 'la'),
 (21673.913607286955, 's'),
 (18741.84810971184, 'que'),
 (17834.880269664754, 'on'),
 (17803.11446444847, 'n'),
 (17744.284752056956, 'che'),
 (16130.531378217607, 'es'),
 (14598.341823962923, 'un'),
 (13780.274595808653, 'w'),
 (13413.573018771187, 'le'),
 (13024.921479739949, 'me'),
 (12941.97744547872, 'his'),
 (12644.834667205227, 'no'),
 (12392.808254580488, 'da'),
 (12305.419351521097, 'y'),
 (11851.22848624911, 'na'),
 (11849.935488137104, 'z'),
 (11540.198767605, 'be'),
 (11454.196189649218, 'er'),
 (11181.125836531904, 'yang'),
 (11087.876846502615, 'r'),
 (10959.910289111649, 'si'),
 (10846.606214952697, 'm'),
 (9757.112669321063, 'u'),
 (9659.034392188563, 'je'),
 (9584.

In [232]:
tfidf_1000_words = [x[1] for x in TFIDFs[:1000]]
wordIdq5 = dict(zip(tfidf_1000_words, range(len(tfidf_1000_words))))
wordSetq5 = set(tfidf_1000_words)

def feature(datum):
    feat = [0]*len(tfidf_1000_words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    for w in r.split():
        if w in tfidf_1000_words:
            feat[wordIdq5[w]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d) for d in data[:10000]]
y = [d['rating'] for d in data[:10000]]


In [233]:
clf = Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [234]:
mean_squared_error(y, predictions)

0.993503475098554

6. Which other review has the highest cosine similarity compared to the first review (provide the review id, or the text of the review) (1 mark)?

In [168]:
first_review = data[0]
X_first_review = feature(first_review)

In [190]:
coses = []
for i in data[1:10000]:
    X_i_review = feature(i)
    dot = np.dot(X_first_review, X_i_review)
    norma = np.linalg.norm(X_first_review)
    normb = np.linalg.norm(X_i_review)
    cos = dot / (norma * normb)
#     print(cos)
    coses.append((cos, i['review_id']))

In [191]:
coses.sort()
coses.reverse()
coses

[(0.38426608984854055, 'r13136801'),
 (0.3835690309508412, 'r42982550'),
 (0.3806847767518744, 'r71755146'),
 (0.38035984058519634, 'r49529747'),
 (0.37990088317648774, 'r50747756'),
 (0.37606889417164235, 'r80509777'),
 (0.3723285197856431, 'r25474155'),
 (0.37208135368768197, 'r92212881'),
 (0.3709498896050606, 'r81665095'),
 (0.368670265907498, 'r06874956'),
 (0.3670905431948272, 'r06555296'),
 (0.36559247549190665, 'r56907707'),
 (0.36367687521372233, 'r51025703'),
 (0.36275617127899423, 'r13824279'),
 (0.3601470287992685, 'r92360181'),
 (0.3598457312185919, 'r41213114'),
 (0.3594844762144827, 'r05868461'),
 (0.3554458669657, 'r01103808'),
 (0.3551797055174278, 'r76841163'),
 (0.3547552035986036, 'r36336272'),
 (0.35260338061586866, 'r69067391'),
 (0.3518567769552858, 'r82741631'),
 (0.35171389622338484, 'r23612581'),
 (0.3512216442029781, 'r81978797'),
 (0.3505426036290943, 'r27564484'),
 (0.35053297152514906, 'r42522678'),
 (0.34826153135958615, 'r84382801'),
 (0.3480049656834667

7. Implement a validation pipeline for this same data, by randomly shuffling the data, using 10,000 reviews for training, another 10,000 for validation, and another 10,000 for testing.1 Consider regularization parameters in the range {0.01, 0.1, 1, 10, 100}, and report MSEs on the test set for the model that performs best on the validation set. Using this pipeline, compare the following alternatives in terms of their performance:


• Unigrams vs. bigrams


• Removing punctuation vs. preserving it. The model that preserves punctuation should treat punc-
tuation characters as separate words, e.g. “Amazing!” would become [‘amazing’, ‘!’]


• tfidf scores vs. word counts


In total you should compare 2 × 2 × 2 = 8 models, and produce a table comparing their performance (2 marks)

In [397]:
data_df = pd.DataFrame(data[:10000])
training = data_df.sample(len(data_df), replace = True)
validation = data_df.sample(len(data_df), replace = True)
test = data_df.sample(len(data_df), replace = True)

In [408]:
training

Unnamed: 0,genre,genreID,n_votes,rating,review_id,review_text,user_id
3273,mystery_thriller_crime,3,0,0,r39993320,Stephen King recommended book. He said in Ente...,u91608908
5915,comics_graphic,1,24,4,r21002920,I almost flipped out over this series today be...,u42211735
912,fantasy_paranormal,2,4,5,r18811640,"Wonder. Maybe that's it, that's what I can say...",u40613070
4719,fantasy_paranormal,2,0,5,r46519933,Holy crap... I'VE COMPLETED THE SERIES!! \n It...,u86523806
1236,comics_graphic,1,0,1,r22197550,"Aww, I expected this to be better. I should pr...",u93592690
1180,young_adult,4,0,3,r63885229,"I enjoyed the story, but it seems like there s...",u97501558
3622,fantasy_paranormal,2,0,5,r73650224,Laugh out loud funny. I kept finding myself re...,u98680108
5948,fantasy_paranormal,2,1,4,r87578469,~ 4 Fury Stars ~ \n Fury \n is book one in the...,u76690953
6404,fantasy_paranormal,2,0,4,r16745368,I don't know since when I love Kiernan's writi...,u17875845
619,fantasy_paranormal,2,0,3,r12569830,My edition seems to be from a different editor...,u97853031


In [399]:
def word_total_counts(punc):
    wordCountTotal = defaultdict(int)
    if punc == False:
        punctuation = set(string.punctuation)
        for _,row in training.iterrows():
            r = ''.join([c for c in row['review_text'].lower() if not c in punctuation])
            for w in r.split():
                wordCountTotal[w] += 1
    else:
        for _,row in training.iterrows():
            r = row['review_text'].lower()
            text = re.split('(\W)',r)
            text = list(filter(None, text)) ##removing empty string
            for w in text:
                wordCountTotal[w] += 1
    return wordCountTotal
        

In [400]:
with_punc_tot_words = word_total_counts(True)

In [401]:
with_punc_tot_words

defaultdict(int,
            {'stephen': 99,
             ' ': 1526517,
             'king': 338,
             'recommended': 200,
             'book': 14902,
             '.': 96321,
             'he': 6841,
             'said': 501,
             'in': 21803,
             'entertainment': 63,
             'weekly': 8,
             "'": 35770,
             's': 15200,
             'column': 7,
             'my': 5642,
             'top': 233,
             '20': 118,
             'of': 32465,
             '2011': 22,
             ':': 3738,
             '"': 8241,
             'a': 39659,
             'new': 1871,
             'york': 108,
             'financial': 12,
             'whiz': 6,
             'loses': 38,
             'his': 5968,
             'wife': 197,
             'the': 73016,
             '9': 76,
             '/': 2813,
             '11': 46,
             'attacks': 24,
             'and': 44355,
             'flees': 9,
             'to': 36611,
             'arizo

In [402]:
without_punc_tot_words = word_total_counts(False)

In [403]:
without_punc_tot_words

defaultdict(int,
            {'stephen': 97,
             'king': 265,
             'recommended': 200,
             'book': 14671,
             'he': 5874,
             'said': 498,
             'in': 21629,
             'entertainment': 62,
             'weeklys': 2,
             'column': 7,
             'my': 5627,
             'top': 206,
             '20': 90,
             'of': 32393,
             '2011': 17,
             'a': 39442,
             'new': 1842,
             'york': 107,
             'financial': 11,
             'whiz': 6,
             'loses': 38,
             'his': 5947,
             'wife': 169,
             'the': 72779,
             '911': 9,
             'attacks': 23,
             'and': 44121,
             'flees': 9,
             'to': 36501,
             'arizona': 16,
             'rethink': 7,
             'life': 1917,
             'there': 4257,
             'falls': 149,
             'afoul': 4,
             'mexican': 7,
             'drug': 94,
 

In [404]:
def tfidf (word, num_docs, document, punc):
    if punc == False:
        wordcnt = without_punc_tot_words
    else:
        wordcnt = with_punc_tot_words
    
    def idf(word, num_docs):
        return math.log(num_docs/abs(wordcnt[word]), 10)

    def tf(word, document):
        return document.count(word)

    
    return idf(w, num_docs)*tf(w, document['review_text'])

In [409]:
#unigram & removing punc & tfidf scores
def uni_nopunc_top_tfidf():
    wordNTT = defaultdict(int)
    # punctuation = set(string.punctuation)

    for _,row in training.iterrows():
        r = ''.join([c for c in row['review_text'].lower() if not c in punctuation])
        
        text = r.split()
        for w in text:
            wordNTT[w] += tfidf (w, len(training), row, False)

    counts = [(wordNTT[w], w) for w in wordNTT]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [410]:
uni_nopunc_tfidf = uni_nopunc_top_tfidf()

In [411]:
uni_nopunc_tfidf

[(34184.51742268324, 'the'),
 (20497.27512374558, 'and'),
 (17454.382836581422, 'a'),
 (17376.709835805217, 'to'),
 (16713.34042377081, 'of'),
 (12457.069908265388, 'i'),
 (10745.11498575246, 'is'),
 (10137.376236436125, 'in'),
 (8348.797948292731, 'that'),
 (8025.5103234405915, 'it'),
 (7121.774463058354, 'this'),
 (5856.964112581464, 'was'),
 (5849.6166665620785, 'with'),
 (5534.726122874902, 'for'),
 (5205.140687148952, 'but'),
 (4984.71730656796, 'book'),
 (4449.403382299731, 'her'),
 (4127.165392593164, 'as'),
 (3694.7157125960907, 'his'),
 (3492.1361294906696, 'on'),
 (3436.505466772593, 'are'),
 (3364.080641724547, 'not'),
 (3303.2018032783562, 'he'),
 (3029.2470302705124, 'be'),
 (3027.1477599792593, 'have'),
 (2889.645555902528, 'you'),
 (2871.8017584269223, 'she'),
 (2588.4002691084634, 'they'),
 (2573.705377069725, 'so'),
 (2565.308295904737, 'has'),
 (2434.1039027017496, 'one'),
 (2415.210470080518, 'all'),
 (2374.2746994011864, 'at'),
 (2323.8922124112355, 'an'),
 (2280.85

In [412]:
#unigram & removing punc & counts
def uni_nopunc_top_counts():
    wordUNC = defaultdict(int)
    # punctuation = set(string.punctuation)

    for _,row in training.iterrows():
        r = ''.join([c for c in row['review_text'].lower() if not c in punctuation])
        text = r.split()

        for w in text:
            wordUNC[w] += 1

    counts = [(wordUNC[w], w) for w in wordUNC]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [413]:
uni_nopunc_cnt = uni_nopunc_top_counts()

In [414]:
uni_nopunc_cnt

[(72779, 'the'),
 (44121, 'and'),
 (39442, 'a'),
 (36567, 'i'),
 (36501, 'to'),
 (32393, 'of'),
 (21987, 'is'),
 (21629, 'in'),
 (20218, 'it'),
 (19220, 'this'),
 (17144, 'that'),
 (15518, 'was'),
 (14671, 'book'),
 (12065, 'but'),
 (11938, 'for'),
 (11384, 'with'),
 (9504, 'her'),
 (9102, 'as'),
 (7220, 'on'),
 (7138, 'not'),
 (6914, 'read'),
 (6889, 'you'),
 (6823, 'she'),
 (6699, 'so'),
 (6627, 'story'),
 (6510, 'are'),
 (6316, 'me'),
 (6298, 'be'),
 (6238, 'one'),
 (6187, 'have'),
 (5947, 'his'),
 (5874, 'he'),
 (5627, 'my'),
 (5399, 'like'),
 (5374, 'all'),
 (5354, 'about'),
 (5090, 'at'),
 (5018, 'from'),
 (4934, 'an'),
 (4932, 'just'),
 (4923, 'more'),
 (4665, 'its'),
 (4619, 'really'),
 (4504, 'has'),
 (4459, 'they'),
 (4400, 'what'),
 (4276, 'characters'),
 (4270, 'who'),
 (4257, 'there'),
 (3991, 'by'),
 (3977, 'out'),
 (3890, 'up'),
 (3775, 'when'),
 (3774, 'series'),
 (3747, 'love'),
 (3566, 'if'),
 (3525, 'or'),
 (3383, 'how'),
 (3300, 'some'),
 (3291, 'very'),
 (3285, 'wi

In [418]:
#unigram & keeping punc & tfidf scores
def uni_punc_top_tfidf():
    wordUPT = defaultdict(int)
    # punctuation = set(string.punctuation)

    for _,row in training.iterrows():
        r = row['review_text'].lower()
        text = re.split('(\W)',r)
        text = list(filter(None, text)) ##removing empty string
        
        for w in text:
            wordUPT[w] += tfidf (w, len(training), row, True)

    counts = [(wordUPT[w], w) for w in wordUPT]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [419]:
uni_punc_tfidf = uni_punc_top_tfidf()

In [420]:
uni_punc_tfidf

[(660446.6880667816, ' '),
 (38108.6435704784, '.'),
 (34132.41667536457, 'the'),
 (32369.037397747397, ','),
 (20552.201159768774, 'and'),
 (17464.458192321767, 'a'),
 (17360.975946628838, 'to'),
 (16691.99981285574, 'of'),
 (16558.204586101212, "'"),
 (14427.93371011661, 'i'),
 (10731.84057101864, 'is'),
 (10209.202966508221, 'in'),
 (9746.146048911916, 'it'),
 (8961.144366937277, '\n'),
 (8702.961390308994, 'that'),
 (7731.900721128443, '-'),
 (7645.142878779798, 's'),
 (7139.229677613598, 'this'),
 (5838.907317591292, 'was'),
 (5837.862042382229, 'with'),
 (5529.505855721012, 'for'),
 (5246.236274076254, 'but'),
 (5043.452883526186, 'book'),
 (4463.325142519433, 't'),
 (4448.691289593138, 'her'),
 (4122.565424378555, 'as'),
 (4085.9807920628136, '"'),
 (3881.106851094711, 'he'),
 (3695.0478638889854, 'his'),
 (3557.0715362982237, 'on'),
 (3431.638511215712, 'are'),
 (3404.461355781168, 'she'),
 (3361.605072211309, 'not'),
 (3314.5676878053673, 'you'),
 (3023.9811796975496, 'be'),
 

In [421]:
#unigram & keeping punc & counts

def uni_punc_top_counts():
    wordUKC = defaultdict(int)
    # punctuation = set(string.punctuation)

    for _,row in training.iterrows():
        r = row['review_text'].lower()
        text = re.split('(\W)',r)
        text = list(filter(None, text)) ##removing empty string

        for w in text:
            wordUKC[w] += 1

    counts = [(wordUKC[w], w) for w in wordUKC]
    counts.sort()
    counts.reverse()
    return counts[:1000]


In [422]:
uni_punc_cnt = uni_punc_top_counts()

In [423]:
uni_punc_cnt

[(1526517, ' '),
 (96321, '.'),
 (73016, 'the'),
 (68697, ','),
 (44355, 'and'),
 (41497, 'i'),
 (39659, 'a'),
 (36611, 'to'),
 (35770, "'"),
 (32465, 'of'),
 (24055, 'it'),
 (22024, 'is'),
 (21803, 'in'),
 (20985, '\n'),
 (19290, 'this'),
 (18021, 'that'),
 (15543, 'was'),
 (15200, 's'),
 (14902, 'book'),
 (14097, '-'),
 (12209, 'but'),
 (11964, 'for'),
 (11407, 'with'),
 (10653, 't'),
 (9539, 'her'),
 (9142, 'as'),
 (8241, '"'),
 (7905, 'she'),
 (7831, 'you'),
 (7670, '!'),
 (7308, 'on'),
 (7228, 'not'),
 (7133, 'read'),
 (6841, 'he'),
 (6770, 'so'),
 (6735, 'story'),
 (6525, 'are'),
 (6438, 'one'),
 (6372, 'me'),
 (6326, 'be'),
 (6197, 'have'),
 (5968, 'his'),
 (5642, 'my'),
 (5526, 'all'),
 (5489, 'like'),
 (5376, ')'),
 (5366, 'about'),
 (5112, 'at'),
 (5027, 'from'),
 (5021, 'there'),
 (4973, 'just'),
 (4959, 'more'),
 (4951, 'they'),
 (4949, 'an'),
 (4915, '('),
 (4636, 'what'),
 (4623, 'really'),
 (4507, 'has'),
 (4445, 'who'),
 (4236, 'characters'),
 (4132, '?'),
 (4059, 'out'

In [426]:
#bigram & removing punc & tfidf scores
def bi_nopunc_top_tfidf():
    wordBNT = defaultdict(int)
    # punctuation = set(string.punctuation)

    for _,row in training.iterrows():
        r = ''.join([c for c in row['review_text'].lower() if not c in punctuation])
        
        text = r.split()
        
        n = 2
        bigram = [''.join(text[i:i+n]) for i in range(0,len(text))]
        for w in bigram:
            wordBNT[w] += tfidf (w, len(training), row, False)

    counts = [(wordBNT[w], w) for w in wordBNT]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [427]:
bi_nopunc_tfidf = bi_nopunc_top_tfidf()

In [428]:
#bigram & removing punc & counts
def bi_nopunc_top_counts():
    wordBNC = defaultdict(int)
    # punctuation = set(string.punctuation)

    for _,row in training.iterrows():
        r = ''.join([c for c in row['review_text'].lower() if not c in punctuation])
        text = r.split()
        
        n = 2
        bigram = [''.join(text[i:i+n]) for i in range(0,len(text))]
    
        for w in bigram:
            wordBNC[w] += 1

    counts = [(wordBNC[w], w) for w in wordBNC]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [429]:
bi_nopunc_cnt = bi_nopunc_top_counts()

In [430]:
#bigram & keeping punc & tfidf scores
def bi_punc_top_tfidf():
    wordBKT = defaultdict(int)
    # punctuation = set(string.punctuation)

    for _,row in training.iterrows():
        r = row['review_text'].lower()
        text = re.split('(\W)',r)
        text = list(filter(None, text)) ##removing empty string
        
        n = 2
        bigram = [' '.join(text[i:i+n]) for i in range(0,len(text))]
        for w in bigram:
            wordBKT[w] += tfidf (w, len(training), row, True)

    counts = [(wordBKT[w], w) for w in wordBKT]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [431]:
bi_punc_tfidf = bi_punc_top_tfidf()

In [432]:
bi_punc_tfidf

[(34081.198190122566, 'the  '),
 (33678.76723464967, '  the'),
 (32158.937080734213, ',  '),
 (32037.68515648754, '.  '),
 (20355.689420472878, 'and  '),
 (20171.720983685158, '  and'),
 (17411.149156661697, 'a  '),
 (17276.30865469814, '  to'),
 (17270.037003443875, '  a'),
 (17203.139390066688, 'to  '),
 (16629.283300314477, '  of'),
 (16575.974264654425, 'of  '),
 (13642.93202814204, '  i'),
 (12455.49939069416, 'i  '),
 (10684.803186612708, '  is'),
 (10403.62415538609, 'is  '),
 (10021.05342888449, '  in'),
 (9982.378246150689, 'in  '),
 (9586.2189419317, '  it'),
 (8961.144366937277, '  \n'),
 (8961.144366937277, '\n  '),
 (8678.920060501523, '  that'),
 (8007.853376310129, 'that  '),
 (7597.060219164826, "' s"),
 (7521.800404115306, 's  '),
 (6864.322297641015, '  this'),
 (6852.824270341796, 'this  '),
 (6737.843997349472, 'it  '),
 (5827.409290292057, '  was'),
 (5797.096309230403, '  with'),
 (5766.783328168827, 'was  '),
 (5716.61011813578, 'with  '),
 (5501.283425077438, ' 

In [433]:
#bigram & keeping punc & counts
def bi_punc_top_counts():
    wordBKC = defaultdict(int)
    # punctuation = set(string.punctuation)

    for _,row in training.iterrows():
        r = row['review_text'].lower()
        text = re.split('(\W)',r)
        text = list(filter(None, text)) ##removing empty string
        
        n = 2
        bigram = [''.join(text[i:i+n]) for i in range(0,len(text))]
    
        for w in bigram:
            wordBKC[w] += 1

    counts = [(wordBKC[w], w) for w in wordBKC]
    counts.sort()
    counts.reverse()
    return counts[:1000]

In [434]:
bi_punc_cnt = bi_punc_top_counts()

In [435]:
bi_punc_cnt

[(76103, '. '),
 (72917, 'the '),
 (71857, ' the'),
 (68099, ', '),
 (44072, 'and '),
 (43826, ' and'),
 (39478, 'a '),
 (39012, ' a'),
 (38603, ' i'),
 (36743, 'i '),
 (36459, ' to'),
 (36203, 'to '),
 (32376, ' of'),
 (32276, 'of '),
 (23623, ' it'),
 (21959, ' is'),
 (21465, ' in'),
 (21461, 'is '),
 (21348, 'in '),
 (20985, ' \n'),
 (20985, '\n '),
 (18582, 'this '),
 (18014, ' this'),
 (17936, ' that'),
 (16981, 'it '),
 (16419, 'that '),
 (15510, ' was'),
 (15265, 'was '),
 (14939, "'s"),
 (14859, 's '),
 (14817, ' book'),
 (12017, ' but'),
 (11935, 'but '),
 (11864, ' for'),
 (11795, 'for '),
 (11326, ' with'),
 (11164, 'with '),
 (10562, "'t"),
 (10435, 'book '),
 (10382, 't '),
 (9492, ' her'),
 (9108, 'as '),
 (8963, ' as'),
 (8625, 'her '),
 (8179, '..'),
 (7826, ' she'),
 (7710, ' you'),
 (7387, '.'),
 (7213, ' on'),
 (6996, ' not'),
 (6958, 'not '),
 (6901, ' read'),
 (6830, 'she '),
 (6777, ' he'),
 (6750, 'on '),
 (6710, ' story'),
 (6594, ' so'),
 (6499, ' are'),
 (6414

In [436]:
def featureq8(datum, top_1000_words, uni, wordID):
    
    topwords = [x[1] for x in top_1000_words]
    feat = [0]*len(top_1000_words)
#     print(len(feat))
    r = datum.lower()
    if uni == True:
        r = ''.join([c for c in datum.lower() if not c in punctuation])
        text = r.split()
    else:
        text = re.split('(\W)',r)
        text = list(filter(None, text)) ##removing empty string
    
    for w in text:
        if w in topwords:
            feat[wordID[w]] += 1
    feat.append(1) #offset
    return feat



In [437]:
y_train = training['rating']
y_val = validation['rating']

In [458]:
#unigram & removing punc & tfidf scores
words_1 = [x[1] for x in uni_nopunc_tfidf]
wordId_1 = dict(zip(words_1, range(len(words_1))))
print('start featuring train')
X_train = training['review_text'].apply(lambda x: featureq8(x,uni_nopunc_tfidf,True, wordId_1))
print('finish featuring train')

print('start featuring val')
X_val = validation['review_text'].apply(lambda x: featureq8(x,uni_nopunc_tfidf,True, wordId_1))
print('finish featuring val')


for C in [0.01, 0.1, 1, 10, 100]:
    print(C)
    clf = Ridge(C, fit_intercept=False)
    clf.fit(X_train, list(y_train))
    theta = clf.coef_
    predictions = clf.predict(X_val)
    print(mean_squared_error(y_val, predictions))

start featuring train
finish featuring train
start featuring val
finish featuring val
0.01


ValueError: setting an array element with a sequence.

In [455]:
y

[5,
 5,
 4,
 5,
 5,
 3,
 2,
 5,
 4,
 3,
 4,
 4,
 3,
 4,
 3,
 4,
 5,
 5,
 4,
 5,
 3,
 4,
 5,
 4,
 5,
 5,
 4,
 5,
 3,
 4,
 2,
 3,
 3,
 5,
 2,
 3,
 5,
 5,
 4,
 5,
 3,
 4,
 3,
 2,
 4,
 4,
 3,
 5,
 2,
 5,
 3,
 4,
 5,
 5,
 3,
 4,
 4,
 4,
 5,
 3,
 4,
 4,
 2,
 4,
 5,
 5,
 4,
 5,
 4,
 2,
 2,
 3,
 1,
 5,
 3,
 5,
 5,
 3,
 0,
 3,
 3,
 5,
 4,
 3,
 3,
 5,
 2,
 4,
 5,
 3,
 3,
 5,
 4,
 4,
 5,
 4,
 4,
 5,
 3,
 4,
 2,
 4,
 4,
 1,
 4,
 5,
 3,
 3,
 3,
 4,
 3,
 5,
 3,
 2,
 1,
 4,
 4,
 5,
 4,
 3,
 3,
 5,
 4,
 5,
 4,
 2,
 5,
 4,
 4,
 5,
 3,
 5,
 4,
 5,
 4,
 3,
 5,
 5,
 3,
 3,
 4,
 4,
 5,
 5,
 3,
 3,
 4,
 4,
 5,
 5,
 4,
 3,
 4,
 4,
 3,
 3,
 5,
 5,
 2,
 5,
 5,
 5,
 3,
 4,
 4,
 4,
 5,
 3,
 2,
 3,
 5,
 2,
 3,
 5,
 5,
 5,
 2,
 5,
 4,
 5,
 5,
 3,
 4,
 4,
 5,
 4,
 5,
 5,
 4,
 3,
 4,
 3,
 4,
 4,
 5,
 3,
 3,
 4,
 3,
 3,
 4,
 4,
 4,
 2,
 5,
 3,
 5,
 5,
 5,
 3,
 2,
 3,
 3,
 3,
 1,
 5,
 3,
 3,
 4,
 3,
 5,
 4,
 4,
 3,
 5,
 5,
 4,
 3,
 0,
 5,
 4,
 5,
 5,
 5,
 5,
 3,
 4,
 4,
 5,
 3,
 4,
 5,
 5,
 2,
 5,
 5,
 0,
 4,
 4,
 5,


In [454]:
X_train.apply(lambda x:len(x)).unique()

array([1001])

In [457]:
list(y_train)

[0,
 4,
 5,
 5,
 1,
 3,
 5,
 4,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 0,
 3,
 3,
 4,
 5,
 2,
 5,
 5,
 3,
 4,
 4,
 2,
 4,
 5,
 4,
 5,
 5,
 3,
 4,
 3,
 2,
 3,
 5,
 4,
 5,
 4,
 4,
 5,
 3,
 5,
 3,
 4,
 5,
 2,
 5,
 5,
 4,
 5,
 0,
 4,
 3,
 2,
 3,
 3,
 3,
 3,
 5,
 3,
 3,
 5,
 5,
 5,
 3,
 4,
 2,
 4,
 4,
 4,
 4,
 5,
 4,
 3,
 3,
 1,
 4,
 2,
 5,
 1,
 3,
 3,
 5,
 5,
 5,
 4,
 3,
 3,
 4,
 3,
 4,
 5,
 5,
 0,
 4,
 2,
 5,
 5,
 5,
 4,
 5,
 3,
 4,
 5,
 4,
 5,
 4,
 3,
 4,
 3,
 5,
 4,
 4,
 5,
 5,
 3,
 5,
 4,
 3,
 5,
 5,
 5,
 4,
 4,
 4,
 3,
 3,
 5,
 4,
 4,
 5,
 4,
 4,
 4,
 5,
 5,
 3,
 5,
 3,
 3,
 4,
 3,
 5,
 4,
 5,
 5,
 3,
 3,
 5,
 5,
 4,
 4,
 5,
 5,
 5,
 0,
 4,
 3,
 4,
 2,
 4,
 2,
 4,
 3,
 3,
 0,
 5,
 3,
 5,
 3,
 5,
 4,
 3,
 5,
 4,
 3,
 3,
 3,
 5,
 3,
 4,
 4,
 5,
 4,
 4,
 5,
 5,
 4,
 5,
 4,
 5,
 1,
 3,
 4,
 4,
 5,
 4,
 5,
 4,
 3,
 5,
 5,
 3,
 3,
 4,
 2,
 5,
 3,
 3,
 3,
 3,
 3,
 5,
 3,
 4,
 3,
 4,
 5,
 4,
 5,
 3,
 4,
 4,
 3,
 5,
 4,
 3,
 5,
 5,
 4,
 2,
 4,
 3,
 4,
 1,
 4,
 4,
 5,
 4,
 3,
 4,
 0,
 4,
 3,
 4,
 0,
 4,


In [None]:
#unigram & removing punc & cnt scores
X_train = training['review_text'].apply(lambda x: featureq8(x,uni_nopunc_cnt,True))
X_val = validation['review_text'].apply(lambda x: featureq8(x,uni_nopunc_cnt,True))
# X_train = [featureq8(d,uni_nopunc_top_tfidf(),True) for d in training['review_text']]
# X_val =  [featureq8(d,uni_nopunc_top_tfidf(),True) for d in validation['review_text']]
for C in [0.01, 0.1, 1, 10, 100]:
    clf = Ridge(C, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_val)
    print(mean_squared_error(y_val, predictions))

In [None]:
#unigram & keeping punc & cnt scores
X_train = training['review_text'].apply(lambda x: featureq8(x,uni_punc_cnt,True))
X_val = validation['review_text'].apply(lambda x: featureq8(x,uni_punc_cnt,True))
# X_train = [featureq8(d,uni_nopunc_top_tfidf(),True) for d in training['review_text']]
# X_val =  [featureq8(d,uni_nopunc_top_tfidf(),True) for d in validation['review_text']]
for C in [0.01, 0.1, 1, 10, 100]:
    clf = Ridge(C, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_val)
    print(mean_squared_error(y_val, predictions))

In [None]:
#bigram & removing punc & tfidf scores
X_train = training['review_text'].apply(lambda x: featureq8(x,bi_nopunc_tfidf,True))
X_val = validation['review_text'].apply(lambda x: featureq8(x,bi_nopunc_tfidf,True))
# X_train = [featureq8(d,uni_nopunc_top_tfidf(),True) for d in training['review_text']]
# X_val =  [featureq8(d,uni_nopunc_top_tfidf(),True) for d in validation['review_text']]
for C in [0.01, 0.1, 1, 10, 100]:
    clf = Ridge(C, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_val)
    print(mean_squared_error(y_val, predictions))

In [None]:
X = [feature(d) for d in data[:10000]]
y = [d['rating'] for d in data[:10000]]

clf = Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)