# Scoring Opinions and Sentiments

## Understanding How Machines Read

In [1]:
# Three short example sentences that make up the toy corpus.
corpus = [
    'The quick brown fox jumps over the lazy dog.',
    'My dog is quick and can jump over fences.',
    'Your dog is so lazy that it sleeps all the day.',
]
# Keep the individual sentences available under their own names as well.
text_1, text_2, text_3 = corpus

In [2]:
from sklearn.feature_extraction import text
# Build a binary (presence/absence) bag-of-words encoder on the corpus.
vectorizer = text.CountVectorizer(binary=True)
vectorizer.fit(corpus)
# Encode the same sentences and show the resulting matrix in dense form.
vectorized_text = vectorizer.transform(corpus)
print(vectorized_text.todense())

[[0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 0]
 [0 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0]
 [1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1]]


In [3]:
# Show the word -> column index mapping learned by the fitted vectorizer.
print(vectorizer.vocabulary_)

{'the': 19, 'quick': 15, 'brown': 2, 'fox': 7, 'jumps': 11, 'over': 14, 'lazy': 12, 'dog': 5, 'my': 13, 'is': 8, 'and': 1, 'can': 3, 'jump': 10, 'fences': 6, 'your': 20, 'so': 17, 'that': 18, 'it': 9, 'sleeps': 16, 'all': 0, 'day': 4}


## Processing and Enhancing Text

In [4]:
text_4 = 'A black dog just passed by but my dog is brown.'
# Rebuild the corpus instead of appending in place: re-running this cell
# would otherwise add text_4 again and alter every count computed below.
corpus = [text_1, text_2, text_3, text_4]
# Plain (non-binary) counts this time, so repeated words are tallied.
vectorizer = text.CountVectorizer().fit(corpus)
vectorized_text = vectorizer.transform(corpus)
# Show only the encoding of the last sentence (text_4).
print(vectorized_text.todense()[-1])

[[0 0 1 1 1 1 0 0 2 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0]]


In [5]:
# Re-weight the raw counts with TF-IDF, L1-normalizing every row.
TfidF = text.TfidfTransformer(norm='l1')
tfidf = TfidF.fit_transform(vectorized_text)

phrase = 3 # choose a number from 0 to 3
total = 0
# Densify the matrix once and keep only the selected row; doing this
# inside the loop would rebuild the whole dense array for every word.
phrase_weights = tfidf.toarray()[phrase]
for word, pos in vectorizer.vocabulary_.items():
    value = phrase_weights[pos]
    if value != 0:
        # Only words that actually occur in the chosen phrase are listed.
        print ("%10s: %0.3f" % (word, value))
        total += value
print ('\nSummed values of a phrase: %0.1f' % total)

     brown: 0.095
       dog: 0.126
        my: 0.095
        is: 0.077
     black: 0.121
      just: 0.121
    passed: 0.121
        by: 0.121
       but: 0.121

Summed values of a phrase: 1.0


In [6]:
# Vectorizer restricted to two-word sequences (ngram_range of exactly 2).
bigrams = text.CountVectorizer(ngram_range=(2,2))
bigrams.fit(corpus)
# Show the bigram -> column index mapping learned from the corpus.
print (bigrams.vocabulary_)

{'the quick': 30, 'quick brown': 24, 'brown fox': 3, 'fox jumps': 9, 'jumps over': 15, 'over the': 21, 'the lazy': 29, 'lazy dog': 17, 'my dog': 19, 'dog is': 7, 'is quick': 11, 'quick and': 23, 'and can': 1, 'can jump': 6, 'jump over': 14, 'over fences': 20, 'your dog': 31, 'is so': 12, 'so lazy': 26, 'lazy that': 18, 'that it': 27, 'it sleeps': 13, 'sleeps all': 25, 'all the': 0, 'the day': 28, 'black dog': 2, 'dog just': 8, 'just passed': 16, 'passed by': 22, 'by but': 5, 'but my': 4, 'is brown': 10}


## Stemming and Removing Stop Words

In [7]:
from sklearn.feature_extraction import text

import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
# Download the 'punkt' NLTK package (presumably the tokenizer data that
# word_tokenize relies on -- confirm against the NLTK documentation).
nltk.download('punkt')

# Shared stemmer instance used by tokenize() below.
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    """Return a list holding ``stemmer.stem(token)`` for every token, in order.

    Parameters:
        tokens: iterable of items to be stemmed.
        stemmer: object exposing a ``stem(item)`` method.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split ``text`` into word tokens and return the list of their stems.

    Tokenization is delegated to ``word_tokenize``; stemming is done by
    ``stem_tokens`` with the module-level ``stemmer``.
    """
    return stem_tokens(word_tokenize(text), stemmer)

# Single training sentence from which the (stemmed) vocabulary is learned.
vocab = ['Sam loves swimming so he swims all the time']
# Plug the stemming tokenizer into the vectorizer and drop English stop words.
vect = text.CountVectorizer(tokenizer=tokenize, 
                           stop_words='english')
vec = vect.fit(vocab)

# Encode an unseen sentence using the vocabulary learned above.
sentence1 = vec.transform(['George loves swimming too!'])

# get_feature_names() was removed from recent scikit-learn releases in
# favour of get_feature_names_out(); support both so the cell runs on
# either. tolist() keeps the printed output a plain Python list.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()

print (feature_names)
print (sentence1.toarray())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['love', 'sam', 'swim', 'time']
[[1 0 1 0]]


## Analyzing Reviews from E-commerce

In [8]:
try:
    import urllib2 # Python 2.7.x
except ImportError:
    # Only a missing module should trigger the fallback; a bare except
    # would also hide unrelated errors (e.g. KeyboardInterrupt).
    import urllib.request as urllib2 # Python 3.x
import requests, io, os, zipfile

UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip'
response = requests.get(UCI_url)
# Stop here on an HTTP error instead of trying to unzip an error page.
response.raise_for_status()
compressed_file = io.BytesIO(response.content)
print ('Extracting in %s' %  os.getcwd())
# The context manager closes the archive once extraction is finished.
with zipfile.ZipFile(compressed_file) as z:
    for name in z.namelist():
        # Directory entries end with '/', which yields an empty filename.
        filename = name.split('/')[-1]
        # Skip entries whose path contains 'MACOSX' or '.DS'.
        nameOK = ('MACOSX' not in name and '.DS' not in name)
        if filename and nameOK:
            # Write every file flat into the current working directory.
            newfile = os.path.join(os.getcwd(),
                                   os.path.basename(filename))
            with open(newfile, 'wb') as f:
                f.write(z.read(name))
            print ('\tunzipping %s' % newfile)

Extracting in C:\Users\Luca\Dropbox\ASSIRM\assirm_20170523\algorithms
	unzipping C:\Users\Luca\Dropbox\ASSIRM\assirm_20170523\algorithms\amazon_cells_labelled.txt
	unzipping C:\Users\Luca\Dropbox\ASSIRM\assirm_20170523\algorithms\imdb_labelled.txt
	unzipping C:\Users\Luca\Dropbox\ASSIRM\assirm_20170523\algorithms\readme.txt
	unzipping C:\Users\Luca\Dropbox\ASSIRM\assirm_20170523\algorithms\yelp_labelled.txt


In [9]:
import numpy as np
import pandas as pd
# File expected in the current working directory.
dataset = 'imdb_labelled.txt'
# The file has no header row; fields are split on the r"\t" pattern
# using the Python parsing engine.
data = pd.read_csv(
    dataset,
    sep=r"\t",
    engine='python',
    header=None,
)
# Name the two columns: review text first, sentiment label second.
data.columns = ['review', 'sentiment']

In [10]:
# Preview the first rows to check the review / sentiment columns.
data.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [11]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases shipped the splitter in cross_validation.
    from sklearn.cross_validation import train_test_split
# Select columns by position with .iloc (.ix no longer exists in current
# pandas): column 0 holds the review text, column 1 the sentiment label.
# 25% of the rows are held out for testing; the seed fixes the split.
corpus, test_corpus, y, yt = train_test_split(
    data.iloc[:, 0], data.iloc[:, 1], test_size=0.25, random_state=101)

In [12]:
from sklearn.feature_extraction import text
# Count unigrams and bigrams, ignoring English stop words; the vocabulary
# is learned from the training reviews only.
vectorizer = text.CountVectorizer(ngram_range=(1, 2), stop_words='english')
vectorizer.fit(corpus)
TfidF = text.TfidfTransformer()
# Fit the TF-IDF weighting on the training counts ...
train_counts = vectorizer.transform(corpus)
X = TfidF.fit_transform(train_counts)
# ... and reuse the fitted weighting, unchanged, on the test counts.
test_counts = vectorizer.transform(test_corpus)
Xt = TfidF.transform(test_counts)

In [13]:
# Report the size of the training matrix as (rows, columns).
print ("cases by features: %s" % str(X.shape))

cases by features: (750, 6466)


In [14]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older scikit-learn releases exposed GridSearchCV from grid_search.
    from sklearn.grid_search import GridSearchCV

# Candidate regularization strengths and penalty types to search over.
param_grid = {'C': [0.01, 0.1, 1.0, 10.0, 100.0],
              'penalty' : ['l1', 'l2']}
# Keep the search under its own name so that `clf` refers to exactly one
# kind of object (the final calibrated classifier) when the cell is done.
grid_search = GridSearchCV(LinearSVC(loss='squared_hinge',
                    dual=False, random_state=101), param_grid)

grid_search.fit(X, y)
print ("Best parameters: %s" % grid_search.best_params_)

# Wrap the best SVC in a calibrator so that predict_proba is available
# to the evaluation code that follows.
clf = CalibratedClassifierCV(grid_search.best_estimator_).fit(X, y)


Best parameters: {'C': 0.1, 'penalty': 'l2'}


In [15]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
# Hard class predictions and the second predict_proba column (used as the
# positive-class score) for the held-out reviews.
solution = clf.predict(Xt)
probabilities = clf.predict_proba(Xt)[:,1]

accuracy = accuracy_score(yt, solution)
print("Achieved accuracy: %0.3f" % accuracy)

roc_auc = roc_auc_score(yt, probabilities)
print("Achieved roc auc score: %0.3f" % roc_auc)

# Per-class precision / recall / F1 table.
report = classification_report(yt, solution,
                               target_names=['Negative', 'Positive'])
print("\n", report)

Achieved accuracy: 0.812
Achieved roc auc score: 0.872

              precision    recall  f1-score   support

   Negative       0.81      0.80      0.81       122
   Positive       0.81      0.82      0.82       128

avg / total       0.81      0.81      0.81       250



In [16]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# Train both Naive Bayes variants on the same TF-IDF training matrix.
NB_multi = MultinomialNB().fit(X, y)
NB_bern = BernoulliNB().fit(X, y)

In [17]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
# Hard class predictions and the second predict_proba column (used as the
# positive-class score) from the Bernoulli Naive Bayes model.
solution = NB_bern.predict(Xt)
probabilities = NB_bern.predict_proba(Xt)[:,1]

accuracy = accuracy_score(yt, solution)
print("Achieved accuracy: %0.3f" % accuracy)

roc_auc = roc_auc_score(yt, probabilities)
print("Achieved roc auc score: %0.3f" % roc_auc)

# Per-class precision / recall / F1 table.
report = classification_report(yt, solution,
                               target_names=['Negative', 'Positive'])
print("\n", report)

Achieved accuracy: 0.760
Achieved roc auc score: 0.916

              precision    recall  f1-score   support

   Negative       0.67      0.98      0.80       122
   Positive       0.97      0.55      0.70       128

avg / total       0.83      0.76      0.75       250



In [18]:
# Label a review as positive whenever its score in `probabilities`
# exceeds 0.2, then report per-class metrics for that cut-off.
print("\n", classification_report(yt, probabilities>0.2, target_names=['Negative', 'Positive']))
# 0.2 is only an example cut-off; other values can be tried here.
# remember to tune on a validation set, not on the test set


              precision    recall  f1-score   support

   Negative       0.81      0.86      0.84       122
   Positive       0.86      0.81      0.84       128

avg / total       0.84      0.84      0.84       250



In [19]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
# Hard class predictions and the second predict_proba column (used as the
# positive-class score) from the Multinomial Naive Bayes model.
solution = NB_multi.predict(Xt)
probabilities = NB_multi.predict_proba(Xt)[:,1]

accuracy = accuracy_score(yt, solution)
print("Achieved accuracy: %0.3f" % accuracy)

roc_auc = roc_auc_score(yt, probabilities)
print("Achieved roc auc score: %0.3f" % roc_auc)

# Per-class precision / recall / F1 table.
report = classification_report(yt, solution,
                               target_names=['Negative', 'Positive'])
print("\n", report)

Achieved accuracy: 0.824
Achieved roc auc score: 0.903

              precision    recall  f1-score   support

   Negative       0.81      0.83      0.82       122
   Positive       0.83      0.82      0.83       128

avg / total       0.82      0.82      0.82       250



In [20]:
# Select the test reviews whose predicted label differs from the true one.
# The mask is computed once and reused for all three sequences.
misclassified = yt != solution
# The loop variable is deliberately not called `text`: that name is bound
# to the sklearn.feature_extraction.text module at notebook scope, and
# rebinding it here would break any cell using text.* that is run afterwards.
for review, right_answer, prob_pos in zip(test_corpus[misclassified],
                                          yt[misclassified],
                                          probabilities[misclassified]):
    print ("'%s' answer=%s prob_pos=%0.3f\n" % (review.strip(), right_answer, prob_pos))

'But in terms of the writing it's very fresh and bold.' answer=1 prob_pos=0.480

'This is the kind of money that is wasted properly.' answer=1 prob_pos=0.302

'At any rate this film stinks, its not funny, and Fulci should have stayed with giallo and supernatural zombie movies.' answer=0 prob_pos=0.611

'Speaking of the music, it is unbearably predictably and kitchy.' answer=0 prob_pos=0.539

'It really created a unique feeling though.' answer=1 prob_pos=0.385

'It's one of the movies I need to see whenever it comes on TV...never mind the fact that I already have it memorized!' answer=1 prob_pos=0.477

'The camera really likes her in this movie.' answer=1 prob_pos=0.500

'I saw "Mirrormask" last night and it was an unsatisfactory experience.' answer=0 prob_pos=0.582

'Rating: 1 out of 10.' answer=0 prob_pos=0.560

'I'm so sorry but I really can't recommend it to anyone.' answer=0 prob_pos=0.503

'A world better than 95% of the garbage in the theatres today.' answer=1 prob_pos=0.430

'Bu