## Scoring Opinions and Sentiments

### Understanding How Machines Read

In [1]:
text_1 = 'The quick brown fox jumps over the lazy dog.'
text_2 = 'My dog is quick and can jump over fences.'
text_3 = 'Your dog is so lazy that it sleeps all the day.'
corpus = [text_1, text_2, text_3]

In [2]:
from sklearn.feature_extraction import text
vectorizer = text.CountVectorizer(binary=True)
vectorizer.fit(corpus)
vectorized_text = vectorizer.transform(corpus)
print(vectorized_text.todense())

[[0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 0]
 [0 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0]
 [1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1]]


In [3]:
print(vectorizer.vocabulary_)

{'the': 19, 'quick': 15, 'brown': 2, 'fox': 7, 'jumps': 11, 'over': 14, 'lazy': 12, 'dog': 5, 'my': 13, 'is': 8, 'and': 1, 'can': 3, 'jump': 10, 'fences': 6, 'your': 20, 'so': 17, 'that': 18, 'it': 9, 'sleeps': 16, 'all': 0, 'day': 4}


### Processing and Enhancing Text

In [4]:
text_4 = 'A black dog just passed by but my dog is brown.'
corpus.append(text_4)
vectorizer = text.CountVectorizer()
vectorizer.fit(corpus)
vectorized_text = vectorizer.transform(corpus)
print(vectorized_text.todense()[-1])

[[0 0 1 1 1 1 0 0 2 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0]]


In [5]:
TfidF = text.TfidfTransformer(norm='l1')
tfidf_mtx = TfidF.fit_transform(vectorized_text)

phrase = 3 # choose a number from 0 to 3

total = 0
for word in vectorizer.vocabulary_:
    pos = vectorizer.vocabulary_[word]
    value = list(tfidf_mtx.toarray()[phrase])[pos]
    if value !=0:
        print ("%10s: %0.3f" % (word, value))
        total += value
print('\nSummed values of a phrase: %0.1f' % total)

     brown: 0.095
       dog: 0.126
        my: 0.095
        is: 0.077
     black: 0.121
      just: 0.121
    passed: 0.121
        by: 0.121
       but: 0.121

Summed values of a phrase: 1.0


In [6]:
bigrams = text.CountVectorizer(ngram_range=(2, 2))
print(bigrams.fit(corpus).vocabulary_)

{'the quick': 30, 'quick brown': 24, 'brown fox': 3, 'fox jumps': 9, 'jumps over': 15, 'over the': 21, 'the lazy': 29, 'lazy dog': 17, 'my dog': 19, 'dog is': 7, 'is quick': 11, 'quick and': 23, 'and can': 1, 'can jump': 6, 'jump over': 14, 'over fences': 20, 'your dog': 31, 'is so': 12, 'so lazy': 26, 'lazy that': 18, 'that it': 27, 'it sleeps': 13, 'sleeps all': 25, 'all the': 0, 'the day': 28, 'black dog': 2, 'dog just': 8, 'just passed': 16, 'passed by': 22, 'by but': 5, 'but my': 4, 'is brown': 10}


### Stemming and removing stop words

In [7]:
from sklearn.feature_extraction import text

import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

stemmer = PorterStemmer()
stop_words = stopwords.words('english')

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # Tokenizing
    tokens = word_tokenize(text)
    # Removing stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Stemming
    stems = stem_tokens(tokens, stemmer)
    return stems

vocab = ['Sam loves swimming so he swims all the time']
vect = text.CountVectorizer(tokenizer=tokenize)
vec = vect.fit(vocab)

sentence1 = vec.transform(['George loves swimming too!'])

print(vec.get_feature_names())
print(sentence1.toarray())

['love', 'sam', 'swim', 'time']
[[1 0 1 0]]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Scraping Textual Datasets from the Web

In [8]:
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request as urllib2

wiki = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
header = {'User-Agent': 'Mozilla/5.0'} 
query = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(query)
soup = BeautifulSoup(page, "lxml")

In [9]:
table = soup.find("table", { "class" : "wikitable sortable" })
final_table = list()

def extract_txt(cell):
    """Extracting only text"""
    cells = [c.strip() for c in cell.findAll(text=True) if '[' not in c]
    return ' '.join(cells).strip()

def filter_sq(txt):
    """Extracting squared meter values"""
    return txt.split('sq')[0].strip()

cols = [extract_txt(cell) for cell in table.findAll("th")]
columns = [cols[1], cols[2], cols[3], cols[4], cols[6]]

for row in table.findAll('tr'):
    cells = row.findAll("td")
    if len(cells)>0:
        final_table.append([extract_txt(cells[1]), 
                            extract_txt(cells[2]), 
                            extract_txt(cells[3]), 
                            extract_txt(cells[4]), 
                            filter_sq(extract_txt(cells[6]))
                           ])
        
df = pd.DataFrame(final_table, columns=columns)

In [10]:
print(df.head(5))

          City       State 2019 estimate 2010 Census 2016 land area
0     New York    New York     8,336,817   8,175,133          301.5
1  Los Angeles  California     3,979,576   3,792,621          468.7
2      Chicago    Illinois     2,693,976   2,695,598          227.3
3      Houston       Texas     2,320,268   2,100,263          637.5
4      Phoenix     Arizona     1,680,992   1,445,632          517.6


### Using Scoring and Classification

In [11]:
import pandas as pd

filename = 'https://github.com/lmassaron/datasets/releases/'
filename += 'download/1.0/shakespeare_lines_in_plays.feather'
shakespeare = pd.read_feather(filename)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=1.0, min_df=3, stop_words='english')
tfidf = vectorizer.fit_transform(shakespeare.lines)

In [13]:
n_topics = 10

In [14]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=n_topics, max_iter=999, random_state=101)
nmf.fit(tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=999,
    n_components=10, random_state=101, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [15]:
def print_topic_words(features, topics, top=5):
    for idx, topic in enumerate(topics):
        words = " ".join([features[i] 
                for i in topic.argsort()[:(-top-1):-1]])
        print(f"Topic #{idx:2.0f}: {words}")

print_topic_words(vectorizer.get_feature_names(), nmf.components_)

Topic # 0: thou thy thee love ll
Topic # 1: enter exeunt exit scene ii
Topic # 2: king thy lord warwick henry
Topic # 3: caesar antony brutus cassius shall
Topic # 4: antipholus dromio chain sir syracuse
Topic # 5: page ford sir master mistress
Topic # 6: rome marcius titus lavinia andronicus
Topic # 7: lord good shall sir know
Topic # 8: cassio iago desdemona othello moor
Topic # 9: hector troilus achilles ajax troy


In [16]:
import numpy as np
index = shakespeare.play + ' act:' + shakespeare.act

def find_top_match(model, data, index, topic_no):
    topic_scores = model.transform(data)[:, topic_no]
    best_score = np.argmax(topic_scores)
    print(index[best_score])

find_top_match(nmf, tfidf, index, topic_no=8)

Othello act:5


### Analyzing reviews from e-commerce

In [17]:
import pandas as pd

filename = 'https://github.com/lmassaron/datasets/'
filename += 'releases/download/1.0/imdb_50k.feather'
reviews = pd.read_feather(filename)

In [18]:
print(reviews.review.sample(1).values[0])

Conquerer of Shamballa shows what happens when creators of an Anime fail to understand what their fans want. I as a fan did not want a 1920's Evil Nazi movie. What I would have liked to see is a real final showdown between Ed and Dante, as we don't REALLY know what became of her. I also would have liked to get Ed back to his world much sooner and have him stay there, to finally get a chance to be normal. You know, raise a family with a certain blonde mechanic, that sort of thing. No, instead I got a convoluted plot involving Nazi mystics, Fritz Lang and about ten minutes of Al, a joke of a Cameo by Roy Mustang and only one Armstrong joke, one short joke and no Winry hitting Ed with a wrench. Above all, it just didn't feel like Fullmetal Alchemist to me.


In [19]:
train = reviews.sample(30000, random_state=42)
sampled_idx = train.index
valid = (reviews[~reviews.index.isin(train.index)]
         .sample(10000, random_state=42))
sampled_idx.append(valid.index)
test = reviews[~reviews.index.isin(sampled_idx)]

In [20]:
import tensorflow.keras as keras

tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train.review)

def tokenize_and_pad(df, tokenizer, maxlen):
    sequences = tokenizer.texts_to_sequences(df.review)
    pad = keras.preprocessing.sequence.pad_sequences
    padded_seqs = pad(sequences, maxlen)
    return padded_seqs, df.sentiment.values

X, y = tokenize_and_pad(train, tokenizer, maxlen=256)
Xv, yv = tokenize_and_pad(valid, tokenizer, maxlen=256)
Xt, yt = tokenize_and_pad(test, tokenizer, maxlen=256)

In [21]:
model = keras.models.Sequential()
voc = len(tokenizer.index_word) + 1
feats = 8
seq_len = 256
model.add(keras.layers.Embedding(voc, feats, 
                          input_length=seq_len))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['acc'])
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 8)            790040    
_________________________________________________________________
flatten (Flatten)            (None, 2048)              0         
_________________________________________________________________
dropout (Dropout)            (None, 2048)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 2049      
Total params: 792,089
Trainable params: 792,089
Non-trainable params: 0
_____________________

In [22]:
history = model.fit(X, y, epochs=2, batch_size=16, 
                    validation_data=(Xv, yv))

Train on 30000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2


In [23]:
from sklearn.metrics import accuracy_score
predictions = (model.predict(Xt)>=0.5).astype(int)
test_accuracy = accuracy_score(yt, predictions)
print(f"Accuracy on test set: {test_accuracy}")

Accuracy on test set: 0.89455
