In [1]:
from __future__ import division
import sframe
import math
import string

In [3]:
products = sframe.SFrame('dataset.gl/')

In [4]:
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation) 

In [5]:
products['review'] = products['review'].apply(remove_punctuation)

In [6]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [7]:
train_data, test_data = products.random_split(.8, seed=1)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review'])
test_matrix = vectorizer.transform(test_data['review'])

In [13]:
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression(verbose=True)
sentiment_model.fit(train_matrix, train_data['sentiment'])

[LibLinear]



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=True, warm_start=False)

In [None]:
from sklearn.svm import SVC
support_vector = SVC(verbose=2)
support_vector.fit(train_matrix, train_data['sentiment'])

In [10]:
prediction = sentiment_model.predict(test_matrix)

In [11]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [12]:
prediction = map(float, prediction)
true_labels = map(float, test_data['sentiment'])
print accuracy_score(prediction, true_labels)
print recall_score(prediction, true_labels)
print precision_score(prediction, true_labels)

0.883684855082
0.911267847927
0.939853092509


In [12]:
#With no reviews removed
data2 = products[products['review'] != '']

In [13]:
train_data2, test_data2 = data2.random_split(.8, seed=1)

In [14]:
vectorizer2 = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix2 = vectorizer2.fit_transform(train_data2['review'])
test_matrix2 = vectorizer2.transform(test_data2['review'])

In [15]:
sentiment_model2 = LogisticRegression()
sentiment_model2.fit(train_matrix2, train_data2['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
prediction2 = sentiment_model2.predict(test_matrix2)

In [17]:
prediction2 = map(float, prediction2)
true_labels2 = map(float, test_data2['sentiment'])
print accuracy_score(prediction2, true_labels2)
print recall_score(prediction2, true_labels2)
print precision_score(prediction2, true_labels2)

0.883539735914
0.909050194646
0.942157317813


In [18]:
#Using tfidf and stemmer and stopwords
from nltk.stem.snowball import SnowballStemmer

In [19]:
stemmer = SnowballStemmer('english')

In [20]:
def stem_text(text):
    words = []
    for word in text.split():
        words.append(stemmer.stem(word))
    return string.join(words)

In [21]:
data_stemmed = data2['review'].apply(stem_text)

In [22]:
data2['review'] =  data_stemmed

In [23]:
train_data3, test_data3 = data2.random_split(.8, seed=1)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')

In [25]:
train_matrix3 = tfidf_vectorizer.fit_transform(train_data3['review'])
test_matrix3 = tfidf_vectorizer.transform(test_data3['review'])

In [26]:
sentiment_model3 = LogisticRegression()
sentiment_model3.fit(train_matrix3, train_data3['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
prediction3 = sentiment_model3.predict(test_matrix3)

In [28]:
prediction3 = map(float, prediction3)
true_labels3 = map(float, test_data3['sentiment'])
print accuracy_score(prediction3, true_labels3)
print recall_score(prediction3, true_labels3)
print precision_score(prediction3, true_labels3)

0.874873561333
0.893834465138
0.949334095048


In [29]:
from sklearn.decomposition import PCA

In [34]:
train_data3

name,review,rating,sentiment
Planetwise Flannel Wipes,these flannel wipe are ok but in my opinion not ...,3.0,-1
Planetwise Wipe Pouch,it came earli and was not disappoint i love planet ...,5.0,1
Annas Dream Full Quilt with 2 Shams ...,veri soft and comfort and warmer than it looksfit ...,5.0,1
Stop Pacifier Sucking without tears with ...,this is a product well worth the purchas i have ...,5.0,1
Stop Pacifier Sucking without tears with ...,all of my kid have cri nonstop when i tri to ...,5.0,1
Stop Pacifier Sucking without tears with ...,when the binki fairi came to our hous we didnt ...,5.0,1
A Tale of Baby's Days with Peter Rabbit ...,love book it bound tight so you may not be abl to ...,4.0,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",perfect for new parent we were abl to keep trac ...,5.0,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",i love this journal and our nanni use it ever ...,4.0,1
"Baby Tracker&reg; - Daily Childcare Journal, ...",this book is perfect im a first time new mom and ...,5.0,1


In [39]:
pca = PCA(whiten=True, n_components=250)

In [41]:
from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components=250)

In [42]:
pca.fit(train_matrix3)

TruncatedSVD(algorithm='randomized', n_components=250, n_iter=5,
       random_state=None, tol=0.0)

In [43]:
train_pca = pca.transform(train_matrix3)
test_pca = pca.transform(test_matrix3)

In [44]:
sentiment_pca = LogisticRegression()
sentiment_pca.fit(train_pca, train_data3['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [45]:
prediction_pca = sentiment_pca.predict(test_pca)

In [46]:
prediction_pca = map(float, prediction_pca)
print accuracy_score(prediction_pca, true_labels3)
print recall_score(prediction_pca, true_labels3)
print precision_score(prediction_pca, true_labels3)

0.851581508516
0.872328496042
0.944371050095
