# Pattern: A Python package for unsupervised (lexicon-based) sentiment analysis

More advanced than simply counting the number of positive and negative words to determine the overall sentiment (or opinion).

Written text can be broadly categorized into two types: facts and opinions. Opinions carry people's sentiments, appraisals and feelings toward the world. The pattern.en module bundles a lexicon of adjectives (e.g., good, bad, amazing, irritating, ...) that occur frequently in product reviews, annotated with scores for sentiment polarity (positive ↔ negative) and subjectivity (objective ↔ subjective). 

The sentiment() function returns a (polarity, subjectivity)-tuple for the given sentence, based on the adjectives it contains, where polarity is a value between -1.0 and +1.0 and subjectivity between 0.0 and 1.0. The sentence can be a string, Text, Sentence, Chunk, Word or a Synset (see below). 

The positive() function returns True if the given sentence's polarity is above the threshold. The threshold can be lowered or raised, but overall +0.1 gives the best results for product reviews. Accuracy is about 75% for movie reviews.

Source: http://www.clips.ua.ac.be/pages/pattern-en#sentiment

## The simplest way to install Pattern is using pip:
**pip install pattern**

In [None]:
from pattern.en import sentiment
import csv
import pandas as pd

import re

import nltk
from nltk.corpus import stopwords

In [None]:
# Score a single sentence; the cell output is the value returned by
# sentiment(), described above as a (polarity, subjectivity) tuple.
sentiment('iPhone 5 is best smartphone in the world')

In [None]:
# Score a second sentence built from negative adjectives; the cell output is
# the value returned by sentiment().
sentiment('dirty place and poor service')

In [None]:
reviews = [
    "The server was rude, bad location, poor service overall",
    "This bike is amazing, but the brake is very poor",
    "This ice maker works great, the price is very reasonable, some bad smell from the ice maker",
    "The food was awesome, but the water was very rude"
    ]

for row in reviews:
    print sentiment(row)    

<img src="images/pattern.gif">

In [None]:
reviews = [
    "The server was rude, bad location, poor service overall",
    "This bike is amazing, but the brake is very poor",
    "This ice maker works great, the price is very reasonable, some bad smell from the ice maker",
    "The food was awesome, but the water was very rude"
    ]

for row in reviews:
    score = sentiment(row)
    print score[0], score[1]    

In [None]:
reviews = [
    "The server was rude, bad location, poor service overall",
    "This bike is amazing, but the brake is very poor",
    "This ice maker works great, the price is very reasonable, some bad smell from the ice maker",
    "The food was awesome, but the water was very rude"
    ]

# Remove useless numbers and alphanumerical words
documents = [re.sub("[^a-zA-Z]+", " ", document) for document in reviews]
# tokenize
texts = [[word for word in document.lower().split() ] for document in documents]
# remove common words 
stoplist = stopwords.words('english')
texts = [[word for word in text if word not in stoplist] for text in texts]
#remove short words
texts = [[ word for word in tokens if len(word) >= 3 ] for tokens in texts]

for row in texts:
    score = sentiment(row)
    print score[0], score[1]  

# Sentiment Analysis of Actual Twitter Data

In [None]:
# Load every row of the sample tweet CSV into `reviews`: a list of rows, each
# row being the list of column strings produced by csv.reader.
# The with-block closes the file even if reading raises part-way through.
# 'rb' is kept because this notebook targets the Python 2 csv module.
with open('data/sample_tweet_from_azuredatafactory.csv', 'rb') as openfile:
    reviews = list(csv.reader(openfile))

# Display the sixth row as a quick look at the loaded data.
reviews[5]

In [None]:
# Display how many rows `reviews` currently holds.
len(reviews)

In [None]:
# without text preprocessing ... this is good enough 
for row in reviews:
    tweet = row[0]
    score = sentiment(tweet)
    print score[0], score[1] 

In [None]:
#remove extra brackets
reviews = [x for y in reviews for x in y]
# Remove useless numbers and alphanumerical words
documents = [re.sub("[^a-zA-Z]+", " ", document) for document in reviews]
# tokenize
texts = [[word for word in document.lower().split() ] for document in documents]
# remove common words 
stoplist = stopwords.words('english')
texts = [[word for word in text if word not in stoplist] for text in texts]
#remove short words
texts = [[ word for word in tokens if len(word) >= 3 ] for tokens in texts]

for row in texts:
    data = row[0]
    score = sentiment(data)
    print score[0], score[1]  

In [None]:
# saving results in csv

# Re-read the tweets so `reviews` is a list of CSV rows again.
# The with-block closes the input file even if reading raises.
with open('data/sample_tweet_from_azuredatafactory.csv', 'rb') as openfile:
    reviews = list(csv.reader(openfile))

# Write one "polarity,subjectivity" row per tweet. The with-block closes (and
# therefore flushes) the output file even if scoring a tweet raises.
with open('data/output_sentiscore_tweets.csv', 'wb') as writefile:
    w = csv.writer(writefile)
    for row in reviews:
        tweet = row[0]   # first column of the row is the text that is scored
        score = sentiment(tweet)
        w.writerow([score[0], score[1]])

In [None]:
# NOTE(review): `positive_review` is first assigned in the later
# "Separating Positive Tweets and Negative Tweets" cell; evaluating it here
# before that cell has run raises NameError.
positive_review

# Appendix

In [None]:
# saving results in csv - another approach

# Forward slashes keep the paths portable and consistent with the other cells
# (the original backslash paths only resolved on Windows).
with open('data/sample_tweet_from_azuredatafactory.csv', 'rb') as openfile:
    reviews = list(csv.reader(openfile))

# Collect the sentiment() result for the first column of every row.
score = [sentiment(row[0]) for row in reviews]

# Write polarity and subjectivity as two separate columns. zip(score) wrapped
# each result in a 1-tuple, so every row held a single cell with the tuple's
# text. The with-block also closes the output file, which the original never
# did, so buffered rows could be lost.
with open('data/output_sentiscore3.csv', 'wb') as outfile:
    writer = csv.writer(outfile)
    writer.writerows([item[0], item[1]] for item in score)

In [None]:
for item in score:
    print item[0], ",", item[1]

# Separating Positive Tweets and Negative Tweets & Word Frequency

In [None]:
# Three buckets chosen by the sign of the polarity score.
positive_review, negative_review, neutral_review = [], [], []

for record in reviews:
    text = record[0]
    score = sentiment(text)
    polarity = score[0]
    if polarity == 0:
        neutral_review.append(text)
    elif polarity > 0:
        positive_review.append(text)
    else:
        # everything that is neither zero nor above zero
        negative_review.append(text)

In [None]:
print len(positive_review)
print len(negative_review)
print len(neutral_review)

In [None]:
# Display the tweets that were placed in the positive bucket.
positive_review

### Word Frequency of Positive Reviews Only

In [None]:
# Strip URLs first, then replace every run of non-letter characters with a space.
documents = [
    re.sub("[^a-zA-Z]+", " ", re.sub(r"http\S+", '', review))
    for review in positive_review
]

# Tokenize each cleaned tweet, dropping English stop words and words shorter
# than three characters.
stoplist = stopwords.words('english')
texts = []
for cleaned in documents:
    kept = [
        token for token in cleaned.lower().split()
        if token not in stoplist and len(token) >= 3
    ]
    texts.append(kept)

In [None]:
# Flatten the per-tweet token lists into one long list (remove extra brackets).
cleaned_positive_review_tokens = []
for tokens in texts:
    cleaned_positive_review_tokens.extend(tokens)

# Display the first ten tokens.
cleaned_positive_review_tokens[:10]

In [None]:
# word frequency
from collections import Counter

# Tally every token, then display (word, count) pairs from most to least frequent.
positive_review_wordcounts = Counter()
positive_review_wordcounts.update(cleaned_positive_review_tokens)
positive_review_wordcounts.most_common()

In [None]:
# Put the (word, count) pairs into a DataFrame (an Excel-like table) and display it.
ranked_pairs = positive_review_wordcounts.most_common()
positivereview_wordfreq = pd.DataFrame(ranked_pairs)
positivereview_wordfreq

In [None]:
# This process could be very slow for a large corpus

from os import path
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

text = str(cleaned_positive_review_tokens)

# Generate a word cloud image
wc = WordCloud(background_color="white", max_words=2000).generate(text)
wc.generate(text)

# Display the generated image:
# the matplotlib way:
plt.imshow(wc)
plt.axis("off")

# take relative word frequencies into account, lower max_font_size
#wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
plt.figure(figsize=(16,16))
plt.imshow(wc)
plt.axis("off")
plt.savefig("data/pos.png")
plt.savefig("data/pos.pdf")

# Now, you can perform the same analysis for negative reviews
- First, perform the word frequency analysis on the negative reviews
- Then, combine the results of positive and negative reviews for comparison
- Also, you can compare the word clouds