In [32]:
# Imports
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
from collections import defaultdict
import string
from nltk.stem.porter import * # PorterStemmer
from sklearn import linear_model

## Loading the Dataset

In [5]:
# Function to load dataset
def load_data(file_name, head = 500):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return data

In [6]:
DIR = "/Users/milliehuang/Documents/dsc/dsu/goodreads poetry"

In [7]:
poetry_reviews = load_data(os.path.join(DIR, 'goodreads_reviews_poetry.json.gz'))

In [8]:
print(' == sample record (poetry review) ==')
display(np.random.choice(poetry_reviews))

 == sample record (poetry review) ==


{'user_id': '18a90fd306154bbad480e04b153a144b',
 'book_id': '34051',
 'review_id': 'e5f1585ce75154669a5349d9c74a93da',
 'rating': 5,
 'review_text': '"When you\'re wounded and left on Afghanistan\'s plains, \n And the women come out to cut up what remains, \n Jest roll to your rifle and blow out your brains \n An\' go to your Gawd like a soldier." \n Wow. The pathos. \n Re-reading Kipling again. The older I get them more dark and critical his poetry sounds.',
 'date_added': 'Fri Jun 17 08:56:18 -0700 2011',
 'date_updated': 'Fri Jun 17 08:58:43 -0700 2011',
 'read_at': 'Wed Jun 15 00:00:00 -0700 2011',
 'started_at': '',
 'n_votes': 1,
 'n_comments': 0}

In [19]:
word_count = defaultdict(int)
for d in poetry_reviews:
    for w in d['review_text'].split():
        word_count[w] += 1

len(word_count)

10755

In [20]:
word_count = defaultdict(int)
punctuation = set(string.punctuation)
for d in poetry_reviews:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in r.split():
        word_count[w] += 1

len(word_count)

7702

In [21]:
word_count = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in poetry_reviews:
  r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
  for w in r.split():
    w = stemmer.stem(w)
    word_count[w] += 1
    
len(word_count)

6391

In [22]:
word_count = defaultdict(int)
punctuation = set(string.punctuation)
for d in poetry_reviews:
  r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
  for w in r.split():
    word_count[w] += 1

counts = [(word_count[w], w) for w in word_count]
counts.sort()
counts.reverse()

In [25]:
words = [x[1] for x in counts[:1000]]
words[:5]

['the', 'and', 'of', 'a', 'i']

## Sentiment Analysis

In [26]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [28]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat

In [34]:
X = [feature(d) for d in poetry_reviews]
y = [d['rating'] for d in poetry_reviews]

In [35]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)