"""
CLASSIFICATION
Case study: Analyzing sentiment
Models:
Linear classifiers (logistic regression, SVMs, perceptron)
Kernels
Decision trees
Algorithms:
Stochastic gradient descent
Boosting
Concepts:
Decision boundaries, MLE, ensemble methods, random forests, CART, online learning
"""
import datetime
import os
import re
import time
from itertools import islice
from operator import itemgetter
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup  # bs4 supersedes the unmaintained BeautifulSoup 3 package
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
def time_diff_str(t1, t2):
    """
    Return the elapsed time between two time.time() readings as a
    human-readable string, e.g. time_diff_str(0, 125.5) gives
    "2 mins and 5.5 seconds".
    """
    diff = t2 - t1
    mins = int(diff / 60)
    secs = round(diff % 60, 2)
    return "%d mins and %s seconds" % (mins, secs)
def clean_sentence(sentence):
    """Strip HTML markup and replace every non-letter character with a space."""
    # Remove HTML
    review_text = BeautifulSoup(sentence, "html.parser").get_text()
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    return letters_only
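# A quick sanity check (hypothetical input): clean_sentence("<b>Great coffee!</b>")
# returns "Great coffee " - the tags are stripped and the "!" becomes a space.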
def convert_plain_to_csv(plain_name, csv_name):
t0 = time.time()
with open(plain_name, "r") as f1, open(csv_name, "w") as f2:
i = 0
f2.write("productId,score,summary,text\n")
while True:
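            # Each review block in foods.txt is assumed to span 9 lines:
            # 8 "field: value" lines plus one blank separator line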
next_n_lines = list(islice(f1, 9))
if not next_n_lines:
break
            # Process next_n_lines: pull out productId, score, summary and
            # text, stripping special characters from summary and text
            output_line = ""
            for line in next_n_lines:
                # Split on the first ":" only, so field values that
                # themselves contain colons are not truncated
                if "product/productId:" in line:
                    output_line += line.split(":", 1)[1].strip() + ","
                elif "review/score:" in line:
                    output_line += line.split(":", 1)[1].strip() + ","
                elif "review/summary:" in line:
                    output_line += clean_sentence(line.split(":", 1)[1].strip()) + ","
                elif "review/text:" in line:
                    output_line += clean_sentence(line.split(":", 1)[1].strip()) + "\n"
            f2.write(output_line)
# print status
i += 1
if i % 10000 == 0:
print "%d reviews converted..." % i
print " %s - Converting completed %s" % (datetime.datetime.now(), time_diff_str(t0, time.time()))
def get_reviews_data(file_name):
    """Load the reviews CSV from disk into a DataFrame."""
    if os.path.exists(file_name):
        print "-- " + file_name + " found locally"
        return pd.read_csv(file_name)
    raise IOError("%s not found - run convert_plain_to_csv() first" % file_name)
def review_to_words(review):
    """
    Convert a raw review into a single string of meaningful words.
    :param review: raw review text
    :return: space-separated string of the non-stop words
    """
# 1. Convert to lower case, split into individual words
words = review.lower().split()
    #
    # 2. In Python, searching a set is much faster than searching a
    #    list, so convert the stop words to a set (note: rebuilding the
    #    set on every call is slow; hoist it to module level for big runs)
    stops = set(stopwords.words("english"))
    #
    # 3. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
#
# 4. Join the words back into one string separated by space,
# and return the result.
return " ".join(meaningful_words)
def cleaning_data(dataset, file_name):
t0 = time.time()
# Get the number of reviews based on the dataframe column size
num_reviews = dataset["text"].size
# Initialize an empty list to hold the clean reviews
clean_train_reviews = []
# Loop over each review
    for i in xrange(0, num_reviews):
        # If the index is evenly divisible by 10000, print a message
        if (i + 1) % 10000 == 0:
            print "Review %d of %d" % (i + 1, num_reviews)
        # Call our cleaning function on each review and collect the results
        productId = str(dataset["productId"][i])
        score = str(dataset["score"][i])
        summary = str(dataset["summary"][i])
        text = review_to_words(str(dataset["text"][i]))
        # No trailing "\n" here - the writer below adds one per record
        clean_train_reviews.append(productId + "," + score + "," + summary + "," + text)
    print "Writing clean train reviews..."
    with open(file_name, "w") as f:
        f.write("productId,score,summary,text\n")
        for review in clean_train_reviews:
            f.write("%s\n" % review)
print " %s - Write file completed %s" % (datetime.datetime.now(), time_diff_str(t0, time.time()))
def print_words_frequency(vectorizer, train_data_features):
    # Take a look at the words in the vocabulary (the vectorizer is now
    # passed in explicitly instead of being read from a global)
    vocab = vectorizer.get_feature_names()
    print "Words in vocabulary:", vocab
    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)
    # For each word, print it alongside the number of times it
    # appears in the training set
    print "Words frequency..."
    for tag, count in zip(vocab, dist):
        print count, tag
if __name__ == "__main__":
"""
Pre-processing
"""
    # Convert the plain-text reviews to CSV for the steps below
    convert_plain_to_csv("foods.txt", "foods.csv")
# Reading the Data
train = get_reviews_data("foods.csv")
print "Data dimensions:", train.shape
print "List features:", train.columns.values
print "First review:", train["summary"][0], "|", train["text"][0]
cleaning_data(train, "clean_train_reviews.csv")
"""
Bag of Words features
"""
    # Load a small sample of the cleaned reviews to keep training fast
    clean_train_reviews = pd.read_csv("clean_train_reviews.csv", nrows=1000)
    # Ignore all neutral (3-star) reviews
    clean_train_reviews = clean_train_reviews[clean_train_reviews["score"] != 3]
    # Positive sentiment = 4- or 5-star reviews
    clean_train_reviews["sentiment"] = clean_train_reviews["score"] >= 4
train, test = train_test_split(clean_train_reviews, test_size=0.2)
print "Creating the bag of words...\n"
vectorizer = CountVectorizer(analyzer="word",
tokenizer=None,
preprocessor=None,
stop_words=None,
max_features=10)
train_text = train["text"].values.astype('U')
test_text = test["text"].values.astype('U')
    # Convert the data set to a term-document matrix: learn the vocabulary
    # on the training text only, then reuse it to encode the test text -
    # refitting on test data would leak and produce a different vocabulary
    X_train = vectorizer.fit_transform(train_text).toarray()
    y_train = train["sentiment"]
    X_test = vectorizer.transform(test_text).toarray()
y_test = test["sentiment"]
    print_words_frequency(vectorizer, X_train)
"""
Training
"""
print "---------------------------"
print "Training"
print "---------------------------"
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
"Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
"Naive Bayes", "QDA"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
MLPClassifier(alpha=1),
AdaBoostClassifier(),
GaussianNB(),
QuadraticDiscriminantAnalysis()]
# iterate over classifiers
results = {}
for name, clf in zip(names, classifiers):
print "Training " + name + " classifier..."
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
results[name] = score
print "---------------------------"
print "Evaluation results"
print "---------------------------"
    # Sort the results by accuracy (best first) and print them out
    for name, score in sorted(results.items(), key=itemgetter(1), reverse=True):
        print name + " accuracy: %0.3f" % score