In [1]:
import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg')
nltk.download('punkt')
import re
import pandas as pd
from sklearn.model_selection import train_test_split

#reading in the data, this time in the form of paragraphs
emma=gutenberg.paras('austen-emma.txt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\fergu\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fergu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
words = ['a','very','few','minutes','more','however','completed','the','present','trial']

In [4]:
# Where does the '--' appear?
indices = []
for i in range(0, len(emma)):
    for j in range(0, len(emma[i])):
        if '--' in (emma[i][j]):
            indices.append((i,j))
len(indices)

963

In [45]:
emma[469200:470900]

'st natural to pay your visit, then"--\n\nHe was silent.  She believed he was looking at her; probably reflecting\non what she had said, and trying to understand the manner.\nShe heard him sigh.  It was natural for him to feel that he had\n_cause_ to sigh.  He could not believe her to be encouraging him.\nA few awkward moments passed, and he sat down again; and in a more\ndetermined manner said,\n\n"It was something to feel that all the rest of my time might be\ngiven to Hartfield.  My regard for Hartfield is most warm"--\n\nHe stopt again, rose again, and seemed quite embarrassed.--\nHe was more in love with her than Emma had supposed; and who can say\nhow it might have ended, if his father had not made his appearance?\nMr. Woodhouse soon followed; and the necessity of exertion made\nhim composed.\n\nA very few minutes more, however, completed the present trial.\nMr. Weston, always alert when business was to be done, and as\nincapable of procrastinating any evil that was inevitable,\n

In [5]:
#processing
emma_paras=[]
for paragraph in emma:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    emma_paras.append(' '.join(para))

print(emma_paras[0:4])

['[ Emma by Jane Austen 1816 ]', 'VOLUME I', 'CHAPTER I', 'Emma Woodhouse , handsome , clever , and rich , with a comfortable home and happy disposition , seemed to unite some of the best blessings of existence ; and had lived nearly twenty - one years in the world with very little to distress or vex her .']


In [6]:
indices = []
for i in range(0, len(emma)):
    for j in range(0, len(emma[i])):
        if '--' in (emma[i][j]):
            indices.append((i,j))
len(indices)

963

In [7]:
print(emma_paras[10])

The evil of the actual disparity in their ages ( and Mr . Woodhouse had not married early ) was much increased by his constitution and habits ; for having been a valetudinarian all his life , without activity of mind or body , he was a much older man in ways than in years ; and though everywhere beloved for the friendliness of his heart and his amiable temper , his talents could not have recommended him at any time .


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(emma_paras, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])


Number of features: 1948


In [10]:
emma_paras_tfidf

<2371x1948 sparse matrix of type '<class 'numpy.float64'>'
	with 16742 stored elements in Compressed Sparse Row format>

In [91]:
emma_paras_tfidf.shape

(2371, 1948)

In [13]:
emma_paras_tfidf[3]

<1x1948 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [14]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)

#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j ]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[9])
print('Tf_idf vector:', tfidf_bypara[9])

Original sentence: " There appeared such a perfectly good understanding among them all " he began rather quickly , but checking himself , added , " however , it is impossible for me to say on what terms they really were  how it might all be behind the scenes .
Tf_idf vector: {'quickly': 0.38070256670926012, 'appeared': 0.31256680566382117, 'impossible': 0.31612334890976324, 'perfectly': 0.30038109873611568, 'added': 0.30316823321090014, 'terms': 0.36919964367970953, 'understanding': 0.31612334890976324, 'good': 0.2174154906397317, 'began': 0.28226673772842847, 'say': 0.21851232841598259, 'really': 0.25369233488450216}


In [81]:
stop = list(stopwords.words('english'))

In [19]:
len(terms)

1948

In [102]:
terms

['_has_',
 '_her_',
 '_him_',
 '_home_',
 '_i_',
 '_is_',
 '_me_',
 '_my_',
 '_not_',
 '_one_',
 '_she_',
 '_that_',
 '_then_',
 '_very_',
 '_we_',
 '_you_',
 'abbey',
 'able',
 'abroad',
 'absence',
 'absent',
 'absolutely',
 'accept',
 'acceptable',
 'accepted',
 'accepting',
 'accompanied',
 'accomplished',
 'account',
 'acknowledge',
 'acknowledged',
 'acquaintance',
 'acquainted',
 'acquiescence',
 'acquirements',
 'act',
 'acting',
 'activity',
 'actually',
 'added',
 'address',
 'addressed',
 'addressing',
 'admiration',
 'admire',
 'admired',
 'admit',
 'admitted',
 'advanced',
 'advantage',
 'advantages',
 'adventure',
 'advice',
 'advise',
 'advised',
 'affair',
 'affected',
 'affection',
 'affectionate',
 'afford',
 'afforded',
 'affronted',
 'afraid',
 'afternoon',
 'age',
 'agitated',
 'agitation',
 'ago',
 'agree',
 'agreeable',
 'agreed',
 'ah',
 'aimed',
 'air',
 'alarm',
 'alarming',
 'alertness',
 'alike',
 'alleviation',
 'alliance',
 'allow',
 'allowed',
 'allowing'

In [32]:
X_train_tfidf.shape

(1422, 1948)

In [30]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import pandas as pd

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 45.2237796232
Component 0:
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
Name: 0, dtype: float64
Component 1:
" You have made her too tall , Emma ," said Mr . Knightley .                                                                                                                0.634392
" You get upon delicate subjects , Emma ," said Mrs . Weston smiling ; " remember that I am here . Mr .                                                                     0.589447
" You are right , Mrs . Weston ," said Mr . Knightley warmly , " Miss Fairfax is as capable as any of us of forming a just opinion of Mrs . Elton .                         0.567127
" I do not know what your opinion may be , Mrs . Weston ," said Mr . Knightley , " of this great intimacy between Emma and Harriet Smith , but I think it a ba

In [34]:
paras_by_component.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,128,129
That is _court_ .,0.391944,-0.509545,0.0305358,-0.011556,0.254937,0.024881,-0.088924,-0.206576,0.244483,-0.258231,...,-0.057597,0.003398,0.042977,0.024329,-0.013637,-0.031724,0.003121,0.092914,0.0562,-0.046291
""" Yes , sir , I did indeed ; and I am very much obliged by your kind solicitude about me .""",0.01407,0.125261,4.132881e-07,0.003231,0.041845,0.039685,-0.034034,0.127426,0.190266,-0.006022,...,-0.035948,0.051358,-0.07562,0.010487,0.0289,0.019664,0.007657,-0.018987,-0.036149,-0.039627


### Applying the Model to the Test Set
Here we see the following: <BR/><BR/>
Component 0: Similar to training set<BR/>
Component 1: Similar to training set<BR/>
Component 2: Similar to Component 3 of training set<BR/>
Component 3: Similar to Component 1<BR/>
Component 4: A collection of responses in the affirmative. Not similar to any of  Components 0-4 of training set. Possibly similar to another component of training set.

In [35]:
# Run SVD on the training data, then project the training data.
X_test_lsa = lsa.fit_transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_test_lsa,index=X_test)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 49.3509674668
Component 0:
" Oh !       0.99992
" Oh !       0.99992
" Oh no !    0.99992
" Oh !       0.99992
" Oh !       0.99992
" Oh !       0.99992
" Oh !       0.99992
" Oh !       0.99992
" Oh !"      0.99992
" Oh !       0.99992
Name: 0, dtype: float64
Component 1:
" Well , Mrs . Weston ," said Emma triumphantly when he left them , " what do you say now to Mr . Knightley ' s marrying Jane Fairfax ?"                                                                                                                                                                                                                                                                                                             0.617004
After tea , Mr . and Mrs . Weston , and Mr . Elton sat down with Mr . Woodhouse to cards .                                                                                                                                                 

### Tweaking tf-idf

#### 1. Change norm to None<BR/>
Comparing to the training set in the first example (norm=u'l2'). i.e. longer and shorter paragraphs do NOT get treated equally: <BR/><BR/>
This results in descending sentence lengths as you go through the component.

In [37]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=None, #if used, would apply a correction factor so longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])


Number of features: 1948


In [38]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)

#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j ]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[9])
print('Tf_idf vector:', tfidf_bypara[9])

Original sentence: " There appeared such a perfectly good understanding among them all " he began rather quickly , but checking himself , added , " however , it is impossible for me to say on what terms they really were  how it might all be behind the scenes .
Tf_idf vector: {'quickly': 7.3851943989977258, 'appeared': 6.0634385590154061, 'impossible': 6.1324314305023577, 'perfectly': 5.8270497809511754, 'added': 5.8811170022214512, 'terms': 7.1620508476835161, 'understanding': 6.1324314305023577, 'good': 4.2176118685170749, 'began': 5.4756518941132866, 'say': 4.2388892669643603, 'really': 4.9213411584075573}


In [39]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 47.9970767757
Component 0:
" I do not know what your opinion may be , Mrs . Weston ," said Mr . Knightley , " of this great intimacy between Emma and Harriet Smith , but I think it a bad thing ."    0.529928
" You are right , Mrs . Weston ," said Mr . Knightley warmly , " Miss Fairfax is as capable as any of us of forming a just opinion of Mrs . Elton .                         0.526824
" In one respect , perhaps , Mr . Elton ' s manners are superior to Mr . Knightley ' s or Mr . Weston ' s .                                                                 0.505335
" You have made her too tall , Emma ," said Mr . Knightley .                                                                                                                0.488342
" You get upon delicate subjects , Emma ," said Mrs . Weston smiling ; " remember that I am here . Mr .                                                                     0.476461
" Emma ," said Mr . Kni

#### 2. Change max_df to 0.1<BR/>
max_df = 0.1 means that we drop words that occur in more than 10% of paragraphs. Setting df at 0.2 or higher did not seem to make much difference to the results.<BR/><BR/>
This resulted in changes to Components 1 and 4, which now consist of phrases that are different but similar to their analogues in the base case (max_df = 0.5). Components 0, 2 and 3 remain unchanged which is logical when you note that they consist of mostly a single, relatively uncommon, word ('Oh', 'CHAPTER' and 'Ah' respectively). 

In [43]:
vectorizer = TfidfVectorizer(max_df=0.1, # drop words that occur in more than 20% of the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])

Number of features: 1945


In [44]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)

#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j ]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[9])
print('Tf_idf vector:', tfidf_bypara[9])

Original sentence: " There appeared such a perfectly good understanding among them all " he began rather quickly , but checking himself , added , " however , it is impossible for me to say on what terms they really were  how it might all be behind the scenes .
Tf_idf vector: {'quickly': 0.38070256670926012, 'appeared': 0.31256680566382117, 'impossible': 0.31612334890976324, 'perfectly': 0.30038109873611568, 'added': 0.30316823321090014, 'terms': 0.36919964367970953, 'understanding': 0.31612334890976324, 'good': 0.2174154906397317, 'began': 0.28226673772842847, 'say': 0.21851232841598259, 'really': 0.25369233488450216}


In [45]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 44.2762000554
Component 0:
" Oh !     0.999307
" Oh !     0.999307
" Oh !     0.999307
" Oh !     0.999307
" Oh !     0.999307
" Oh !"    0.999307
" Oh !     0.999307
" Oh !"    0.999307
" Oh !     0.999307
" Oh !     0.999307
Name: 0, dtype: float64
Component 1:
Miss Bates and Miss Fairfax , escorted by the two gentlemen , walked into the room ; and Mrs . Elton seemed to think it as much her duty as Mrs . Weston ' s to receive them .                                                                                                                                                                                                                                           0.662046
" Such attentions as Mrs . Elton ' s , I should have imagined , would rather disgust than gratify Miss Fairfax .                                                                                                                                                                 

#### 3. Change the Regularization Coefficient to l1<BR/>
Changing the regularization coefficient to l1 has not made much change to the 'Oh', 'Ah' and 'CHAPTER' components. Component 3 contains several references to Emma (with higher tf-idf scores than when Norm=u'l2', while Component 4 contains several uses of the words 'Thank you' with high tf-idf scores. This component is the only one that does not appear among the top 5 components when l2 regularization is used.

In [59]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half of the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l1', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])

Number of features: 1948


In [60]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)

#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j ]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[9])
print('Tf_idf vector:', tfidf_bypara[9])

Original sentence: " There appeared such a perfectly good understanding among them all " he began rather quickly , but checking himself , added , " however , it is impossible for me to say on what terms they really were  how it might all be behind the scenes .
Tf_idf vector: {'quickly': 0.1164173940498016, 'appeared': 0.095581737986133022, 'impossible': 0.096669315356826713, 'perfectly': 0.091855395247126798, 'added': 0.092707690347801963, 'terms': 0.11289984402477532, 'understanding': 0.096669315356826713, 'good': 0.066484828471530602, 'began': 0.086316092684444623, 'say': 0.066820237283481229, 'really': 0.077578149191251389}


In [61]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(20)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 33.1869559097
Component 0:
" Oh !     0.999947
" Oh !     0.999947
" Oh !     0.999947
" Oh !     0.999947
" Oh !     0.999947
" Oh !"    0.999947
" Oh !     0.999947
" Oh !     0.999947
" Oh !     0.999947
" Oh !     0.999947
Name: 0, dtype: float64
Component 1:
" Ah !      0.999994
" Ah !      0.999994
" Ah !      0.999994
" Ah !      0.999994
" Ah !"     0.999994
" Ah !      0.999994
" Ah !      0.999994
" Ah !      0.999994
" Ah !      0.999994
But ah !    0.999994
Name: 1, dtype: float64
Component 2:
CHAPTER I       1.0
CHAPTER X       1.0
CHAPTER V       1.0
CHAPTER V       1.0
CHAPTER X       1.0
CHAPTER I       1.0
CHAPTER V       1.0
CHAPTER X       1.0
CHAPTER I       1.0
CHAPTER XVII    1.0
Name: 2, dtype: float64
Component 3:
" I am afraid ," said he , composing his features , " I am very much afraid , my dear Emma , that you will not smile when you hear it ."                                                                       