# Sentiment Analysis

In this demo, we are using simple NLP techniques to preprocess our data into the bag-of-words like dataset. Then, we are able to analyze sentiment based on the words used in the given review. This approach is not very robust, especially compared to RNN, but it's definitely better than guessing.

http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

In [2]:
negative_reviews = BeautifulSoup(open('../data/sorted_data_acl/electronics/negative.review').read())
negative_reviews = [tag.text for tag in negative_reviews.findAll('review_text')]
len(negative_reviews)

1000

In [3]:
positive_reviews = BeautifulSoup(open('../data/sorted_data_acl/electronics/positive.review').read())
positive_reviews = [tag.text for tag in positive_reviews.findAll('review_text')]
len(positive_reviews)

1000

In [4]:
stop_words = set(open('../data/general/english-stopwords.txt').read().split('\n'))
lemma = WordNetLemmatizer()

In [5]:
import re

def preprocess_text(t):
    words = re.split('; |, |\*|\n|/|_', t)
    words = [word.strip(',.:-_').lower() for word in words]
    words = [lemma.lemmatize(word) for word in words]
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

positive_reviews = [preprocess_text(r) for r in positive_reviews]
negative_reviews = [preprocess_text(r) for r in negative_reviews]

In [6]:
positive_reviews = np.array(positive_reviews)
negative_reviews = np.array(negative_reviews)

In [7]:
X = np.concatenate((positive_reviews, negative_reviews))
y = np.concatenate((np.ones(len(positive_reviews)), np.zeros(len(negative_reviews))))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(X)
X.shape

(2000, 1000)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [10]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train, y_train)

In [11]:
model.score(X_test, y_test)

0.7954545454545454