In [1]:
from __future__ import division
import sframe
import math
import string
from sklearn.linear_model import LogisticRegression
from time import time
from sklearn.metrics import accuracy_score, recall_score, precision_score
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
products = sframe.SFrame('dataset.gl/')

In [4]:
def remove_punctuation(text):
    return text.translate(None, string.punctuation)

In [5]:
def stem_text(text):
    words = []
    for word in text.split():
        words.append(stemmer.stem(word))
    return string.join(words)

In [6]:
products['review'] = products['review'].apply(remove_punctuation)
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [7]:
train_data, test_data = products.random_split(.8, seed=1)

In [8]:
stemmer = SnowballStemmer('english')

In [9]:
products['review'] = products['review'].apply(stem_text)

In [10]:
tfidf_vectorizer = TfidfVectorizer()

In [11]:
train_matrix = tfidf_vectorizer.fit_transform(train_data['review'])
test_matrix = tfidf_vectorizer.transform(test_data['review'])

In [12]:
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
prediction = sentiment_model.predict(test_matrix)
prediction = map(float, prediction)
true_labels = map(float, test_data['sentiment'])
print "Accuracy: " + str(accuracy_score(prediction, true_labels))
print "Recall: " + str(recall_score(prediction, true_labels))
print "Precision: " + str(precision_score(prediction, true_labels))

Accuracy: 0.888011974418
Recall: 0.908258125806
Precision: 0.949930804443


In [13]:
from sklearn import metrics

In [15]:
fpr, tpr, thresholds = metrics.roc_curve(true_labels, prediction, pos_label=2)



In [14]:
from sklearn.metrics import roc_auc_score

In [15]:
roc_auc_score(true_labels, prediction)

0.81709524808777234

In [16]:
train_matrix.shape

(146786, 131277)

In [16]:
from sklearn.decomposition import TruncatedSVD

In [17]:
pca = TruncatedSVD(n_components=1000, random_state=42)

In [None]:
pca.fit(train_matrix)

In [18]:
train_matrix_pca = pca.transform(train_matrix)
test_matrix_pca = pca.transform(test_matrix)

In [19]:
sentiment_model_pca = LogisticRegression()
sentiment_model_pca.fit(train_matrix_pca, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
prediction_pca = sentiment_model_pca.predict(test_matrix_pca)
prediction_pca = map(float, prediction_pca)
true_labels = map(float, test_data['sentiment'])
print "Accuracy: " + str(accuracy_score(prediction_pca, true_labels))
print "Recall: " + str(recall_score(prediction_pca, true_labels))
print "Precision: " + str(precision_score(prediction_pca, true_labels))

Accuracy: 0.868302051268
Recall: 0.885708110984
Precision: 0.950171208446


In [21]:
roc_auc_score(true_labels, prediction_pca)

0.77691311488842474

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [1]:
rdf_classifier = RandomForestClassifier(n_estimators=50, max_features=0.1)

NameError: name 'RandomForestClassifier' is not defined

In [None]:
rdf_classifier.fit(train_matrix, train_data['sentiment'])

In [None]:
prediction_rdf = rdf_classifier.predict(test_matrix)
prediction_rdf = map(float, prediction_rdf)
true_labels = map(float, test_data['sentiment'])
print "Accuracy: " + str(accuracy_score(prediction_rdf, true_labels))
print "Recall: " + str(recall_score(prediction_rdf, true_labels))
print "Precision: " + str(precision_score(prediction_rdf, true_labels))
print "Area Under Curve: " + str(roc_acu_score(true_labels, prediction_rdf))