In [1]:
# mostly following this video example of Latent Semantic Analysis and applying it to Darwin Corpus: https://www.youtube.com/watch?v=BJ0MnawUpaU




# 'On the Origin of Species (sixth edition)' html: http://www.gutenberg.org/files/2009/2009-h/2009-h.htm

# 'The Descent of Man': http://www.gutenberg.org/cache/epub/2300/pg2300.html

# 'The Expression of the Emotions in Man and Animals': http://www.gutenberg.org/files/1227/1227-h/1227-h.htm


# Importing requests, BeautifulSoup and nltk
import requests
from bs4 import BeautifulSoup
import nltk

In [2]:
# get the html for all three books from Project Gutenberg website

origin = requests.get('http://www.gutenberg.org/files/2009/2009-h/2009-h.htm')
descent = requests.get('http://www.gutenberg.org/cache/epub/2300/pg2300.html')
emotions = requests.get('http://www.gutenberg.org/files/1227/1227-h/1227-h.htm')

In [9]:
# Setting the correct text encoding of the HTML page for origin 
origin.encoding = 'utf-8'

# Extracting the HTML from the request object
origin_html = origin.text

# Creating a BeautifulSoup object from the origin HTML
origin_soup = BeautifulSoup(origin_html, 'html.parser')

# Getting the text out of the origin soup
origin_text = origin_soup.get_text()



# TOKENIZATION WITH NLTK COMMENTED OUT BECAUSE IT IS NOT NECESSARY IF USING sci-kit learn 

# Creating a tokenizer
# tokenizer = nltk.tokenize.RegexpTokenizer('\w+')

# Tokenizing the origin text
# origin_tokens = tokenizer.tokenize(origin_text)

In [35]:
[type(origin.encoding), type(origin_html), type(origin_soup), type(origin_text)]

[str, str, bs4.BeautifulSoup, str]

In [10]:
# Setting the correct text encoding of the HTML page for descent
descent.encoding = 'utf-8'

# Extracting the HTML from the request object
descent_html = descent.text

# Creating a BeautifulSoup object from the descent HTML
descent_soup = BeautifulSoup(descent_html, 'html.parser')

# Getting the text out of the descent soup
descent_text = descent_soup.get_text()

# Tokenizing the descent text
# descent_tokens = tokenizer.tokenize(descent_text)

In [11]:
# Setting the correct text encoding of the HTML page for emotions
emotions.encoding = 'utf-8'

# Extracting the HTML from the request object
emotions_html = emotions.text

# Creating a BeautifulSoup object from the emotions HTML
emotions_soup = BeautifulSoup(emotions_html, 'html.parser')

# Getting the text out of the emotions soup
emotions_text = emotions_soup.get_text()

# Tokenizing the emotions text
# emotions_tokens = tokenizer.tokenize(emotions_text)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords

stopwords = nltk.corpus.stopwords.words('english')

In [16]:
# make a single list containing all three texts (untokenized) books that will be treated as a corpus for tf-idf

darwin_corpus = [origin_text, descent_text, emotions_text]

In [17]:
# instantiate the TfidfVectorizer allowing for ngrams of 1, 2 and 3 words in length to be used as features

vectorizer = TfidfVectorizer(stop_words=stopwords, use_idf=True, ngram_range=(1,3))


# fit the TfidfVectorizer to darwin_corpus and then generate document-term sparse matrix ('X') with Tf-idf scores

X = vectorizer.fit_transform(darwin_corpus)

In [47]:
# X is 3 x 552738 matrix (3 books --the rows-- in corpus and 552738 tokens/features/words --the columns-- extracted from corpus)

X.shape

(3, 552738)

In [18]:
X[0]

<1x552738 sparse matrix of type '<class 'numpy.float64'>'
	with 189860 stored elements in Compressed Sparse Row format>

In [19]:
print(X[0])

  (0, 361095)	0.0191015987011
  (0, 466036)	0.460826068665
  (0, 456387)	0.00238769983764
  (0, 156569)	0.00429785970776
  (0, 93249)	0.00358154975646
  (0, 126948)	0.00191015987011
  (0, 69202)	0.0157588189284
  (0, 308544)	0.00716309951293
  (0, 53776)	0.000614919655066
  (0, 183970)	0.000404272724813
  (0, 496122)	0.00214892985388
  (0, 27345)	0.00215221879273
  (0, 269083)	0.000716309951293
  (0, 252614)	0.000614919655066
  (0, 6223)	0.000307459827533
  (0, 506312)	0.00143261990259
  (0, 7310)	0.000614919655066
  (0, 70965)	0.00310400978894
  (0, 228837)	0.000307459827533
  (0, 228840)	0.000307459827533
  (0, 228843)	0.000307459827533
  (0, 228846)	0.000307459827533
  (0, 228849)	0.000307459827533
  (0, 228852)	0.000307459827533
  (0, 87206)	0.000922379482599
  :	:
  (0, 533792)	0.000238769983764
  (0, 456063)	0.000238769983764
  (0, 296133)	0.000238769983764
  (0, 377157)	0.000238769983764
  (0, 433768)	0.000404272724813
  (0, 182509)	0.000404272724813
  (0, 228752)	0.000238769983

In [53]:
# feature names of tokens are stored in TfidfVectorizer object 

vectorizer.get_feature_names()

['000',
 '000 000',
 '000 000 soldiers',
 '000 15',
 '000 15 000',
 '000 according',
 '000 according loose',
 '000 feet',
 '000 feet deeply',
 '000 feet informs',
 '000 feet professor',
 '000 feet solid',
 '000 feet valleys',
 '000 feet yet',
 '000 number',
 '000 number soon',
 '000 padding',
 '000 padding 5em',
 '000 particularly',
 '000 particularly important',
 '000 pieces',
 '000 pieces shell',
 '000 pigeons',
 '000 pigeons taken',
 '000 soldiers',
 '000 soldiers served',
 '000 specimens',
 '000 specimens apus',
 '000 years',
 '000 years ago',
 '000 years respect',
 '019',
 '019 81',
 '019 81 1860',
 '050',
 '050 1832',
 '050 1832 several',
 '071',
 '071 males',
 '071 males 25',
 '08',
 '08 100',
 '08 100 time',
 '084',
 '084 18',
 '084 18 1866',
 '09',
 '09 inch',
 '09 inch therefore',
 '10',
 '10 000',
 '10 000 15',
 '10 100',
 '10 100 plants',
 '10 12',
 '10 12 35',
 '10 1857',
 '10 1857 regard',
 '10 1868',
 '10 1868 724',
 '10 1869',
 '10 1869 lately',
 '10 773',
 '10 773 fema

In [20]:
X.shape

(3, 552738)

In [49]:
# need to convert sparse document-term matrix 'X' to an numpy array to allow it to be put into pandas df
# this can be done on-the-fly below when making the dataframe.  Just showing it explicitly here with type()

X_arr = X.toarray()

type(X_arr)

numpy.ndarray

In [54]:
# generate a pandas dataframe from the doc-term matrix containing the docs and features with the Tf-idf scores are the entries 

import pandas as pd

darwin_corpus_tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [55]:
darwin_corpus_tfidf_df.head()

Unnamed: 0,000,000 000,000 000 soldiers,000 15,000 15 000,000 according,000 according loose,000 feet,000 feet deeply,000 feet informs,...,über das darwin,über das langenwachsthum,über den,über den vogelschwanz,über die,über die darwin,über die entstehung,über die heuschrecken,über die knochernen,über die richtung
0,0.002388,0.0,0.0,0.0,0.0,0.0,0.0,0.001537,0.000404,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001552,0.000292,0.000292,0.000292,0.000292,0.000292,0.000292,0.000222,0.0,0.000292,...,0.000292,0.000292,0.000292,0.000292,0.001752,0.000292,0.000292,0.000292,0.000292,0.000584
2,0.000995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# transposed dataframe is now the term-document matrix cast as a df

darwin_corpus_tfidf_df.transpose().head(20)

Unnamed: 0,0,1,2
000,0.002388,0.001552,0.000995
000 000,0.0,0.000292,0.0
000 000 soldiers,0.0,0.000292,0.0
000 15,0.0,0.000292,0.0
000 15 000,0.0,0.000292,0.0
000 according,0.0,0.000292,0.0
000 according loose,0.0,0.000292,0.0
000 feet,0.001537,0.000222,0.0
000 feet deeply,0.000404,0.0,0.0
000 feet informs,0.0,0.000292,0.0


In [58]:
darwin_corpus_tfidf_df.transpose().tail(20)

Unnamed: 0,0,1,2
étalons qui,0.0,0.000292,0.0
étalons qui eprennent,0.0,0.000292,0.0
étant,0.0,0.000292,0.0
étant plus,0.0,0.000292,0.0
étant plus terne,0.0,0.000292,0.0
études,0.0,0.000876,0.0
études sur,0.0,0.000876,0.0
études sur les,0.0,0.000876,0.0
über,0.0,0.002628,0.0
über das,0.0,0.000584,0.0


In [61]:
# top 20 tf-idf scores for 'On the Origin of Species' relative to the three book darwin_corpus

darwin_corpus_tfidf_df.transpose()[0].sort_values(ascending=False)[:20]

species              0.460826
one                  0.193642
would                0.167855
may                  0.161886
many                 0.147321
forms                0.134905
selection            0.133950
natural              0.127742
varieties            0.116042
two                  0.112699
plants               0.112461
animals              0.104104
natural selection    0.097418
thus                 0.093359
several              0.087867
different            0.087390
great                0.086435
distinct             0.085718
life                 0.083808
case                 0.081898
Name: 0, dtype: float64

In [62]:
# top 20 tf-idf scores for 'The Descent of Man' relative to the three book darwin_corpus

darwin_corpus_tfidf_df.transpose()[1].sort_values(ascending=False)[:20]

male         0.274739
males        0.219722
man          0.210927
female       0.197992
species      0.189713
sexes        0.180400
females      0.170225
one          0.163843
birds        0.160739
mr           0.155392
would        0.150046
many         0.130902
sexual       0.126590
may          0.122624
animals      0.115380
selection    0.107102
young        0.096926
two          0.096754
vol          0.094339
much         0.093994
Name: 1, dtype: float64

In [63]:
# top 20 tf-idf scores for 'The Expression of the Emotions in Man and Animal' relative to the three book darwin_corpus

darwin_corpus_tfidf_df.transpose()[2].sort_values(ascending=False)[:20]

may           0.173657
one           0.173160
man           0.169179
muscles       0.167686
return        0.160223
expression    0.154251
eyes          0.134846
mr            0.128377
much          0.120913
often         0.117430
movements     0.112454
tears         0.106996
mouth         0.103995
blush         0.096043
eyebrows      0.095469
action        0.094541
would         0.094044
seen          0.094044
thus          0.092053
manner        0.090561
Name: 2, dtype: float64

In [24]:
# Do a matrix decomposition to do latent semantic analysis (create three concepts-- single "concept" for each book)

from sklearn.decomposition import TruncatedSVD

lsa = TruncatedSVD(n_components=3, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=3, n_iter=100,
       random_state=None, tol=0.0)

In [25]:
# concept (rows) by terms (three columns) matrix that is part of decomposition can be
# accessed by lsa.components_[]

lsa.components_[0]

array([ 0.00200425,  0.00012308,  0.00012308, ...,  0.00012308,
        0.00012308,  0.00024616])

In [27]:
lsa.components_.shape

(3, 552738)

In [29]:
# print top 20 terms for each concept

terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip(terms,comp)
    sortedTerms= sorted(termsInComp, key=lambda x: x[1], reverse=True)[:20]
    print('Concept %d:' % i)
    for term in sortedTerms:
        print(term[0])
    print(' ')

Concept 0:
species
one
may
would
man
many
mr
male
animals
much
thus
two
see
selection
males
often
birds
case
female
certain
 
Concept 1:
muscles
return
expression
eyes
movements
tears
mouth
blush
eyebrows
man
face
mind
action
habit
seen
body
laughter
contraction
often
blushing
 
Concept 2:
male
males
female
sexes
females
man
birds
sexual
vol
colour
mr
colours
coloured
young
plumage
sexual selection
men
horns
sex
shewn
 


In [None]:
# Following Data School (Kevin Markham) tutorial on using scikit learn with text

# https://www.youtube.com/watch?v=8QmkFAthuPU

# Basic workflow of scikit learn ML model building:

# i.) Import Python libraries 
# ii.) Instantiate model
# iii.) Fit model to training data
# iv.)  Predict on test data and other out-of-sample data (or transform in case of working with text)

In [38]:
# import and instantiate CountVectorizer (with default parameters)

from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [39]:
# fitting CountVectorizer() to darwin_corpus (no preprocessing on text besides just extracting text from html).  
# Note that CountVectorizer() expects a list or iterable argument
# Fitting learns the 'vocabulary' of the training data (occurs in-place)

# NOTICE THERE ARE NO STOPWORDS REMOVED AS DEFAULT SETTING

# ALSO, fitting happens in place (no need to store output with variable)


vect.fit(darwin_corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [40]:
# examine the fitted vocabulary
# returns vocabulary in numerical and then alphabetical order

vect.get_feature_names()

['000',
 '019',
 '050',
 '071',
 '08',
 '084',
 '09',
 '10',
 '100',
 '1000',
 '1001',
 '1002',
 '1003',
 '1004',
 '1005',
 '1006',
 '1007',
 '1008',
 '1009',
 '101',
 '1010',
 '1011',
 '1012',
 '1013',
 '1014',
 '1015',
 '1016',
 '1017',
 '102',
 '103',
 '104',
 '1040',
 '105',
 '106',
 '1068',
 '107',
 '108',
 '1085',
 '109',
 '10th',
 '11',
 '110',
 '1101',
 '1102',
 '1103',
 '1104',
 '1105',
 '1106',
 '1107',
 '1108',
 '1109',
 '111',
 '1110',
 '1111',
 '1112',
 '1113',
 '1114',
 '1115',
 '1116',
 '1117',
 '1118',
 '1119',
 '112',
 '1120',
 '1121',
 '1122',
 '1123',
 '1124',
 '1125',
 '1126',
 '1129',
 '113',
 '1134',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '12',
 '120',
 '1201',
 '1202',
 '1203',
 '1204',
 '1205',
 '1206',
 '1207',
 '1208',
 '1209',
 '121',
 '1210',
 '1211',
 '1212',
 '1213',
 '1214',
 '1215',
 '1216',
 '1217',
 '1218',
 '1219',
 '122',
 '1220',
 '1221',
 '1222',
 '1223',
 '1224',
 '1225',
 '1226',
 '1227',
 '1228',
 '123',
 '124',
 '125',
 '125th',
 '126

In [42]:
# transform training data into a 'document-term matrix'
# this will be a sparse array where documents (each of the three books) are rows and terms (i.e. vocabulary found) are columns

darwin_corpus_dtm = vect.transform(darwin_corpus)
darwin_corpus_dtm

<3x19828 sparse matrix of type '<class 'numpy.int64'>'
	with 32176 stored elements in Compressed Sparse Row format>

In [43]:
# convert sparse matrix to a dense matrix with toarray() method

darwin_corpus_dtm.toarray()

array([[10,  0,  0, ...,  0,  0,  0],
       [ 9,  1,  1, ...,  1,  3,  9],
       [ 2,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [46]:
# examine the vocabulary and document-term matrix together by putting documents and terms (with unnormalized frequency counts) 
# into pandas dataframe 
# each document is now a "bag-of-words" (bag-of-words is result of basic tokenization, counting and often normalized frequency)

import pandas as pd

pd.DataFrame(darwin_corpus_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,000,019,050,071,08,084,09,10,100,1000,...,égard,élégance,émouvoir,époque,éprouve,étalage,étalons,étant,études,über
0,10,0,0,0,0,0,0,7,8,0,...,0,0,0,0,0,0,0,0,0,0
1,9,1,1,1,1,1,1,34,57,3,...,1,1,1,2,1,1,1,1,3,9
2,2,0,0,0,0,0,0,12,1,0,...,0,0,0,0,0,0,0,0,0,0
