# Predicting sentiment from product reviews using Logstic Regression

In this assignment, we will use product review data from Amazon.com to predict whether the sentiments about a product (from its reviews) are positive or negative.

## Load amazon dataset

In [9]:
import pandas as pd 

In [10]:
products = pd.read_csv('amazon_baby.csv')

In [11]:
products.shape

(183531, 3)

There are 183531 reviews for this product

In [13]:
products.head(5)

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [43]:
#find rating scores
products.rating.unique()

array([3, 5, 4, 2, 1])

## Perform text cleaning

We start by removing punctuation, so that words "cake." and "cake!" are counted as the same word

In [38]:
#Remove NA rows with an empty string
products = products.fillna({'review':""})
#Write a function remove_punctuation that strips punctuation from a line of text
def remove_punctuation(text):
    import string
    translator = str.maketrans({key: None for key in string.punctuation})
    return text.translate(translator).lower

In [39]:
products['review_clean']=products['review'].apply(remove_punctuation)

## Extract Sentiments

We will remove the reviews with a rating=3, since they tend to have a neutral sentiment. We will assign reviews with a rating of 4 or higher to be$positive while the ones with a rating 2 or lower to be negative .

In [46]:
products = products[products['rating']!=3]
products['sentiment'] = products['rating'].apply(lambda rating: +1 if rating>3 else -1)

## Split into training and test sets

In [81]:
from urllib.request import urlopen

with urlopen("https://s3.amazonaws.com/static.dato.com/files/coursera/course-3/"
                   "indices-json/module-2-assignment-train-idx.json") as f:
    train_idx = f.read().decode('ascii')
    train_idx = train_idx.split(", ")
    
    
with urlopen("https://s3.amazonaws.com/static.dato.com/files/coursera/course-3/"
                   "indices-json/module-2-assignment-test-idx.json") as f:
    test_idx = f.read().decode('ascii')
    test_idx = test_idx.split(", ")

In [101]:
train_idx[0]=train_idx[0].strip("[")
train_idx[-1]=train_idx[-1].strip("]")
test_idx[0]=test_idx[0].strip("[")
test_idx[-1]=test_idx[1].strip("]")

In [104]:
train_idx = [int(i) for i in train_idx]
test_idx = [int(i) for i in test_idx]

In [105]:
train_data, test_data = products.iloc[train_idx],products.iloc[test_idx]

## Build the word count vector for each review

### Learn a vocabulary (set of all words) from the training data.

In [111]:
def create_vocabulary():
    vocabulary = []
    for text in train_data.review_clean:
        word_count = {}
        words = text.split(" ")
        for word in words:
            if word in word_count:
                word_count[word]+=1
            else:
                word_count[word]=1
        vocabulary.append(word_count)
    return vocabulary

In [112]:
create_vocabulary()

[{'and': 3,
  'bags': 1,
  'came': 1,
  'disappointed': 1,
  'does': 1,
  'early': 1,
  'highly': 1,
  'holder': 1,
  'i': 1,
  'it': 3,
  'keps': 1,
  'leak': 1,
  'love': 1,
  'moist': 1,
  'my': 2,
  'not': 2,
  'now': 1,
  'osocozy': 1,
  'planet': 1,
  'recommend': 1,
  'was': 1,
  'wipe': 1,
  'wipes': 1,
  'wise': 1},
 {'Very': 1,
  'and': 2,
  'anyone': 1,
  'bed': 1,
  'comfortable': 1,
  'for': 1,
  'full': 1,
  'it': 1,
  'looking': 1,
  'looksfit': 1,
  'of': 1,
  'perfectlywould': 1,
  'quilt': 1,
  'recommend': 1,
  'size': 1,
  'soft': 1,
  'than': 1,
  'the': 1,
  'this': 1,
  'to': 1,
  'type': 1,
  'warmer': 1},
 {'': 4,
  'I': 3,
  'She': 1,
  'This': 1,
  'What': 1,
  'a': 2,
  'about': 1,
  'and': 3,
  'anything': 1,
  'approach': 2,
  'artwork': 1,
  'back': 1,
  'binky': 2,
  'chart': 1,
  'clever': 1,
  'daughter': 1,
  'else': 1,
  'fairy': 1,
  'found': 1,
  'getting': 1,
  'has': 1,
  'have': 1,
  'her': 1,
  'herself': 1,
  'how': 1,
  'in': 2,
  'ingenious'