## Scoring Opinions and Sentiments

### Understanding How Machines Read

In [7]:
# Three short example sentences; `corpus` keeps them in order and the
# individual names text_1..text_3 stay available for later cells.
corpus = [
    'The quick brown fox jumps over the lazy dog.',
    'My dog is quick and can jump over fences.',
    'Your dog is so lazy that it sleeps all the day.',
]
text_1, text_2, text_3 = corpus

In [8]:
from sklearn.feature_extraction import text
# binary=True records presence/absence of each word (the printed matrix
# below contains only 0s and 1s) rather than occurrence counts.
vectorizer = text.CountVectorizer(binary=True)
vectorizer.fit(corpus)
vectorized_text = vectorizer.transform(corpus)
# Densify only for display; the transform result itself stays sparse.
print(vectorized_text.todense())

[[0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 0]
 [0 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0]
 [1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1]]


In [9]:
# Show the word -> column-index mapping learned by the fitted vectorizer.
print(vectorizer.vocabulary_)

{'the': 19, 'quick': 15, 'brown': 2, 'fox': 7, 'jumps': 11, 'over': 14, 'lazy': 12, 'dog': 5, 'my': 13, 'is': 8, 'and': 1, 'can': 3, 'jump': 10, 'fences': 6, 'your': 20, 'so': 17, 'that': 18, 'it': 9, 'sleeps': 16, 'all': 0, 'day': 4}


### Processing and Enhancing Text

In [10]:
text_4 = 'A black dog just passed by but my dog is brown.'
# Guard the append so that re-running this cell does not add the same
# sentence to the corpus twice (which would change every later result).
if text_4 not in corpus:
    corpus.append(text_4)
# No binary flag this time: repeated words are counted (the row printed
# below contains a 2).
vectorizer = text.CountVectorizer().fit(corpus)
vectorized_text = vectorizer.transform(corpus)
# Show only the row of the last sentence in the corpus.
print(vectorized_text.todense()[-1])

[[0 0 1 1 1 1 0 0 2 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0]]


In [11]:
# Re-weight the raw counts with TF-IDF using L1 row normalisation.
TfidF = text.TfidfTransformer(norm='l1')
tfidf_mtx = TfidF.fit_transform(vectorized_text)

phrase = 3 # choose a number from 0 to 3

# Densify the selected row once, instead of converting the whole matrix
# again for every word in the vocabulary.
phrase_row = tfidf_mtx.toarray()[phrase]

total = 0
# vocabulary_ maps each word to its column position in the matrix.
for word, pos in vectorizer.vocabulary_.items():
    value = phrase_row[pos]
    # Only report words that actually occur in the chosen phrase.
    if value != 0:
        print ("%10s: %0.3f" % (word, value))
        total += value
print('\nSummed values of a phrase: %0.1f' % total)

     brown: 0.095
       dog: 0.126
        my: 0.095
        is: 0.077
     black: 0.121
      just: 0.121
    passed: 0.121
        by: 0.121
       but: 0.121

Summed values of a phrase: 1.0


In [12]:
# ngram_range=(2,2) restricts the vocabulary to two-word sequences only.
bigrams = text.CountVectorizer(ngram_range=(2,2))
bigrams.fit(corpus)
print(bigrams.vocabulary_)

{'the quick': 30, 'quick brown': 24, 'brown fox': 3, 'fox jumps': 9, 'jumps over': 15, 'over the': 21, 'the lazy': 29, 'lazy dog': 17, 'my dog': 19, 'dog is': 7, 'is quick': 11, 'quick and': 23, 'and can': 1, 'can jump': 6, 'jump over': 14, 'over fences': 20, 'your dog': 31, 'is so': 12, 'so lazy': 26, 'lazy that': 18, 'that it': 27, 'it sleeps': 13, 'sleeps all': 25, 'all the': 0, 'the day': 28, 'black dog': 2, 'dog just': 8, 'just passed': 16, 'passed by': 22, 'by but': 5, 'but my': 4, 'is brown': 10}


### Stemming and Removing Stop Words

In [13]:
from sklearn.feature_extraction import text

import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: 'punkt' (tokenizer data) and
# 'stopwords' (corpus), downloaded in that order.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop-word list and the Porter stemmer shared by tokenize().
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    """Return a new list with ``stemmer.stem`` applied to every token.

    tokens  -- iterable of token strings
    stemmer -- any object exposing a ``stem(token)`` method
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split ``text`` into tokens, drop stop words, and stem what is left.

    Uses the module-level ``stop_words`` list and ``stemmer`` instance.
    Returns the list of stems in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        # Skip anything found in the stop-word list.
        if token in stop_words:
            continue
        kept.append(token)
    return stem_tokens(kept, stemmer)

vocab = ['Sam loves swimming so he swims all the time']
# Plug the custom tokenizer (stop-word removal + stemming) into the vectorizer.
vect = text.CountVectorizer(tokenizer=tokenize)
vec = vect.fit(vocab)

sentence1 = vec.transform(['George loves swimming too!'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
# .tolist() keeps the printed form a plain Python list, as before.
if hasattr(vec, 'get_feature_names_out'):
    learned_terms = vec.get_feature_names_out().tolist()
else:
    learned_terms = vec.get_feature_names()

print(learned_terms)
print(sentence1.toarray())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['love', 'sam', 'swim', 'time']
[[1 0 1 0]]




### Scraping Textual Datasets from the Web

In [16]:
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request as urllib2 # Python 3.x

wiki = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
# A browser-like User-Agent is sent with the request
# (presumably because the default urllib agent may be rejected -- confirm).
header = {'User-Agent': 'Mozilla/5.0'} 
query = urllib2.Request(wiki, headers=header)
# Parse inside a `with` block so the HTTP response is always closed, even
# if parsing fails; only `soup` is needed by the following cells.
with urllib2.urlopen(query) as page:
    soup = BeautifulSoup(page, "lxml")

In [19]:
# First table whose class attribute is "wikitable sortable".
table = soup.find("table", attrs={"class": "wikitable sortable"})
# Accumulates one list of cell texts per table row.
final_table = []

def extract_txt(cell):
    """Return the text content of ``cell`` as one space-joined string.

    Text fragments containing '[' are skipped; every remaining fragment
    is stripped, and the joined result is stripped once more.
    """
    fragments = []
    for node in cell.findAll(text=True):
        if '[' in node:
            continue
        fragments.append(node.strip())
    return ' '.join(fragments).strip()

def filter_sq(txt):
    """Return the stripped part of ``txt`` that precedes the first 'sq'.

    If 'sq' does not occur, the whole stripped string is returned.
    """
    leading, _, _ = txt.partition('sq')
    return leading.strip()

# Header texts of the table; only positions 1, 2, 3, 4 and 6 are kept,
# matching the data cells picked from each row below.
cols = [extract_txt(cell) for cell in table.findAll("th")]
columns = [cols[1], cols[2], cols[3], cols[4], cols[6]]

for row in table.findAll('tr'):
    cells = row.findAll("td")
    # Index 6 is read below, so a row needs at least 7 <td> cells.
    # Header rows (no <td>) and any shorter rows are skipped instead of
    # raising IndexError.
    if len(cells) > 6:
        final_table.append([extract_txt(cells[1]), 
                            extract_txt(cells[2]), 
                            extract_txt(cells[3]), 
                            extract_txt(cells[4]), 
                            # keep only the text before the 'sq' unit marker
                            filter_sq(extract_txt(cells[6]))
                           ])

df = pd.DataFrame(final_table, columns=columns)

In [20]:
# Preview the first five scraped rows (head() defaults to 5).
df.head()

Unnamed: 0,City,State,2019 estimate,2010 Census,2016 land area
0,New York,New York,8336817,8175133,301.5
1,Los Angeles,California,3979576,3792621,468.7
2,Chicago,Illinois,2693976,2695598,227.3
3,Houston,Texas,2320268,2100263,637.5
4,Phoenix,Arizona,1680992,1445632,517.6


### Using Scoring and Classification


In [30]:
from sklearn.datasets import fetch_20newsgroups
# Load only the 'misc.forsale' newsgroup, with headers, footers and quoted
# replies removed from each post. shuffle=True with a fixed random_state
# is passed so the order is repeatable.
dataset = fetch_20newsgroups(shuffle=True, 
                             categories = ['misc.forsale'],
                             remove=('headers', 'footers', 'quotes'),
                             random_state=101)
# dataset.data holds the post texts; report how many were loaded.
print('Posts: %i' % len(dataset.data))

Posts: 585


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: this rebinds `vectorizer`, which earlier cells used for a
# CountVectorizer.
# Per the scikit-learn docs, max_df=0.95 ignores terms found in more than
# 95% of the posts and min_df=2 ignores terms found in fewer than 2 posts;
# the built-in English stop-word list is applied as well.
vectorizer = TfidfVectorizer(max_df=0.95, 
                             min_df=2, 
                             stop_words='english')
# Learn the vocabulary and produce the TF-IDF matrix in one step.
tfidf = vectorizer.fit_transform(dataset.data)

In [32]:
from sklearn.decomposition import NMF
# Number of topics to extract; reused by the LDA cell further down.
n_topics = 5
# Fixed random_state keeps the factorisation repeatable.
nmf = NMF(n_components=n_topics, random_state=101)
# As the last expression of the cell, the fitted estimator is also displayed.
nmf.fit(tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=101, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [33]:
from sklearn.decomposition import LatentDirichletAllocation
# LDA with the same number of topics as the NMF model, trained with the
# 'online' learning method for 5 iterations and a fixed random_state.
# NOTE(review): the model is fitted on the TF-IDF matrix; LDA is usually
# described in terms of raw term counts -- confirm this input is intended.
lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=101)
# As the last expression of the cell, the fitted estimator is also displayed.
lda.fit(tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=5,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=101, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [36]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

n_top_words = 15
# Each row of lda.components_ holds one topic's weight for every term.
for topic_idx, topic in enumerate(lda.components_):
    print("Topic #%d:" % (topic_idx + 1))
    # argsort() is ascending, so the reversed tail slice yields the
    # indices of the n_top_words largest weights, biggest first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join([feature_names[i] for i in top_indices]))

Topic #1:
promo sleeve picture garth u2 brooks chuvashia inguiry eridan er1 cd su roger 10 waters
Topic #2:
ticket tickets ios hell life junk airline people interested chicago 21 round return offer trip
Topic #3:
refrigerator video wanted jump improper tx actually watch dance etiquette used pairs postage add wrapped
Topic #4:
amd regards steve intel gatech prism dial real junk chips 486 improper etiquette mailings mark
Topic #5:
00 sale new offer shipping drive condition price email like sell used edu card mail
