## Goals:

## 1. Use named entity recognition to create a data frame of all people

## 2. Conduct named entity recognition (NER) and sentiment analysis on 'The Adventures of Sherlock Holmes', by Arthur Conan Doyle, collected from Project Gutenberg (https://www.gutenberg.org/ebooks/1661).

In [2]:
# acquire text (as string) using 'requests' module

import requests

sherlock_text = (requests.get('https://www.gutenberg.org/files/1661/1661-0.txt')).text
#sherlock_text

In [3]:
# regular expressions module (a tool for searching for patterns in texts)
# ie remove all punctuation
# ie identify all words that begin with a capitalized letter
# ie identify phone numbers by searching for the pattern ###-###-####) 

import re

start_phrase = "\*\*\* START OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES \*\*\*"
start_range = re.search(start_phrase, sherlock_text) # for re, * means any letter in the alphabet so we need to use \
start_index = start_range.end()
start_index

915

In [4]:
end_phrase = "\*\*\* END OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES \*\*\*"
end_range = re.search(end_phrase, sherlock_text) # for re, * means any letter in the alphabet so we need to use \
end_index = end_range.start()
end_index

588654

In [5]:
# select the part of text that we are interested in (just the content of the book itself)

text = sherlock_text[start_index:end_index]
#text

In [6]:
splittext = text.split()
splittext

['cover',
 'The',
 'Adventures',
 'of',
 'Sherlock',
 'Holmes',
 'by',
 'Arthur',
 'Conan',
 'Doyle',
 'Contents',
 'I.',
 'A',
 'Scandal',
 'in',
 'Bohemia',
 'II.',
 'The',
 'Red-Headed',
 'League',
 'III.',
 'A',
 'Case',
 'of',
 'Identity',
 'IV.',
 'The',
 'Boscombe',
 'Valley',
 'Mystery',
 'V.',
 'The',
 'Five',
 'Orange',
 'Pips',
 'VI.',
 'The',
 'Man',
 'with',
 'the',
 'Twisted',
 'Lip',
 'VII.',
 'The',
 'Adventure',
 'of',
 'the',
 'Blue',
 'Carbuncle',
 'VIII.',
 'The',
 'Adventure',
 'of',
 'the',
 'Speckled',
 'Band',
 'IX.',
 'The',
 'Adventure',
 'of',
 'the',
 'Engineerâ\x80\x99s',
 'Thumb',
 'X.',
 'The',
 'Adventure',
 'of',
 'the',
 'Noble',
 'Bachelor',
 'XI.',
 'The',
 'Adventure',
 'of',
 'the',
 'Beryl',
 'Coronet',
 'XII.',
 'The',
 'Adventure',
 'of',
 'the',
 'Copper',
 'Beeches',
 'I.',
 'A',
 'SCANDAL',
 'IN',
 'BOHEMIA',
 'I.',
 'To',
 'Sherlock',
 'Holmes',
 'she',
 'is',
 'always',
 '_the_',
 'woman.',
 'I',
 'have',
 'seldom',
 'heard',
 'him',
 'ment

In [7]:
import collections
from collections import Counter
tokfreqs = collections.Counter(splittext)
tokfreqs.most_common()

[('the', 5244),
 ('and', 2729),
 ('to', 2630),
 ('of', 2611),
 ('I', 2533),
 ('a', 2519),
 ('in', 1648),
 ('that', 1545),
 ('was', 1359),
 ('his', 1096),
 ('is', 1051),
 ('he', 1012),
 ('you', 979),
 ('it', 967),
 ('my', 901),
 ('have', 889),
 ('had', 807),
 ('with', 797),
 ('as', 767),
 ('which', 746),
 ('at', 724),
 ('for', 676),
 ('be', 594),
 ('not', 579),
 ('from', 471),
 ('upon', 458),
 ('said', 447),
 ('but', 436),
 ('me', 414),
 ('we', 405),
 ('been', 385),
 ('this', 370),
 ('very', 369),
 ('her', 366),
 ('your', 350),
 ('â\x80\x9cI', 349),
 ('were', 336),
 ('on', 324),
 ('an', 323),
 ('so', 315),
 ('by', 313),
 ('would', 313),
 ('she', 305),
 ('all', 303),
 ('are', 296),
 ('It', 288),
 ('one', 282),
 ('could', 279),
 ('no', 278),
 ('has', 275),
 ('there', 275),
 ('into', 272),
 ('out', 271),
 ('He', 264),
 ('what', 263),
 ('Mr.', 259),
 ('The', 257),
 ('little', 257),
 ('when', 256),
 ('him', 253),
 ('who', 250),
 ('up', 247),
 ('will', 244),
 ('some', 227),
 ('should', 205),


In [8]:
# remove stop words with nltk

import nltk
from nltk.corpus import stopwords
sw_nltk = stopwords.words('english')
words = [word for word in text.split() if word.lower() not in sw_nltk]
text = " ".join(words)
print(text)



In [9]:
noswlist = text.split()
noswlist

['cover',
 'Adventures',
 'Sherlock',
 'Holmes',
 'Arthur',
 'Conan',
 'Doyle',
 'Contents',
 'I.',
 'Scandal',
 'Bohemia',
 'II.',
 'Red-Headed',
 'League',
 'III.',
 'Case',
 'Identity',
 'IV.',
 'Boscombe',
 'Valley',
 'Mystery',
 'V.',
 'Five',
 'Orange',
 'Pips',
 'VI.',
 'Man',
 'Twisted',
 'Lip',
 'VII.',
 'Adventure',
 'Blue',
 'Carbuncle',
 'VIII.',
 'Adventure',
 'Speckled',
 'Band',
 'IX.',
 'Adventure',
 'Engineerâ\x80\x99s',
 'Thumb',
 'X.',
 'Adventure',
 'Noble',
 'Bachelor',
 'XI.',
 'Adventure',
 'Beryl',
 'Coronet',
 'XII.',
 'Adventure',
 'Copper',
 'Beeches',
 'I.',
 'SCANDAL',
 'BOHEMIA',
 'I.',
 'Sherlock',
 'Holmes',
 'always',
 '_the_',
 'woman.',
 'seldom',
 'heard',
 'mention',
 'name.',
 'eyes',
 'eclipses',
 'predominates',
 'whole',
 'sex.',
 'felt',
 'emotion',
 'akin',
 'love',
 'Irene',
 'Adler.',
 'emotions,',
 'one',
 'particularly,',
 'abhorrent',
 'cold,',
 'precise',
 'admirably',
 'balanced',
 'mind.',
 'was,',
 'take',
 'it,',
 'perfect',
 'reason

In [10]:
tokfreqs = collections.Counter(noswlist)
pairs = tokfreqs.most_common()
pairs

[('upon', 458),
 ('said', 447),
 ('â\x80\x9cI', 349),
 ('would', 313),
 ('one', 282),
 ('could', 279),
 ('Mr.', 259),
 ('little', 257),
 ('Holmes', 197),
 ('man', 193),
 ('may', 186),
 ('see', 183),
 ('shall', 168),
 ('must', 159),
 ('came', 142),
 ('know', 136),
 ('think', 132),
 ('two', 128),
 ('us', 125),
 ('Holmes,', 125),
 ('â\x80\x9cIt', 124),
 ('might', 118),
 ('come', 117),
 ('â\x80\x9cYou', 112),
 ('it.', 110),
 ('much', 106),
 ('back', 106),
 ('heard', 103),
 ('time', 102),
 ('made', 102),
 ('â\x80\x9cAnd', 99),
 ('found', 97),
 ('Sherlock', 94),
 ('never', 92),
 ('it,', 90),
 ('like', 90),
 ('however,', 89),
 ('quite', 89),
 ('good', 87),
 ('saw', 84),
 ('tell', 84),
 ('took', 84),
 ('me,', 83),
 ('away', 82),
 ('him.', 82),
 ('go', 80),
 ('face', 79),
 ('St.', 79),
 ('Holmes.', 78),
 ('nothing', 78),
 ('way', 77),
 ('Miss', 77),
 ('left', 76),
 ('every', 75),
 ('matter', 75),
 ('door', 75),
 ('take', 74),
 ('me.', 74),
 ('last', 74),
 ('you,', 74),
 ('find', 74),
 ('â\x80\x

In [11]:
justwordsofpairs = []
for atuple in pairs:
    justwordsofpairs.append(atuple[0])
justwordsofpairs # the type is STILL DOC, u just took the first item and its type stayed the same, did not turn to string

['upon',
 'said',
 'â\x80\x9cI',
 'would',
 'one',
 'could',
 'Mr.',
 'little',
 'Holmes',
 'man',
 'may',
 'see',
 'shall',
 'must',
 'came',
 'know',
 'think',
 'two',
 'us',
 'Holmes,',
 'â\x80\x9cIt',
 'might',
 'come',
 'â\x80\x9cYou',
 'it.',
 'much',
 'back',
 'heard',
 'time',
 'made',
 'â\x80\x9cAnd',
 'found',
 'Sherlock',
 'never',
 'it,',
 'like',
 'however,',
 'quite',
 'good',
 'saw',
 'tell',
 'took',
 'me,',
 'away',
 'him.',
 'go',
 'face',
 'St.',
 'Holmes.',
 'nothing',
 'way',
 'Miss',
 'left',
 'every',
 'matter',
 'door',
 'take',
 'me.',
 'last',
 'you,',
 'find',
 'â\x80\x9cThe',
 'small',
 'make',
 'young',
 'case',
 'â\x80\x9cBut',
 'say',
 'long',
 'without',
 'he,',
 'first',
 'put',
 'â\x80\x9cWell,',
 'then,',
 'round',
 'seemed',
 'him,',
 'thought',
 'even',
 'went',
 'seen',
 'right',
 'old',
 'he.',
 'hand',
 'still',
 'three',
 'ever',
 'though',
 'something',
 'rather',
 'eyes',
 'look',
 'â\x80\x9cOh,',
 'â\x80\x9cYes,',
 'room',
 'you.',
 'get',
 '

In [27]:
import spacy

needstring = ' '.join(justwordsofpairs)

nlp = spacy.load("en_core_web_sm")
stuff = nlp(needstring) # spacy tokenizer makes a spacy.tokens.doc.Doc type

people_list = []        
        
for ent in stuff.ents:
    if ent.label_ == "PERSON":
        people_list.append(ent.text) # NOT ent, it needs to be ent.text

people_list



['Holmes',
 'â\x80\x9cIt',
 'Watson',
 'â\x80\x9cNo',
 'Baker',
 'McCarthy Lestrade',
 'Hosmer',
 'Watson',
 'John',
 'Neville',
 'James',
 'Mary',
 'Frank Irene',
 'Arthur',
 'Briony',
 'Watson',
 'George',
 'Henry',
 'Stoke Roylott',
 'Majesty',
 'Stoner Simon',
 'Lestrade',
 'Swandam',
 'Lascar',
 'Grimesby Flora League',
 'â\x80\x9cNo',
 'Duncan Holmes',
 'Lysander Stark',
 'Ross',
 'Merryweather',
 'Merryweather',
 'Lone',
 'ADVENTURE Peterson',
 'Hunter',
 'Adler',
 'Duke',
 'Godfrey',
 'now?â\x80\x9d Adler',
 'mean?â\x80\x9d',
 'Vincent mind',
 'William minute',
 'matter.â\x80\x9d conclusion',
 'Jones',
 'not.â\x80\x9d â\x80\x9cNo',
 'McCarthy',
 'Waterloo',
 'Ryder Baker',
 'Eyford fullerâ\x80\x99s-earth',
 'Winchester Bohemia Valley',
 'rang',
 'Bohemia',
 'Watson',
 'Ross',
 'way.â\x80\x9d fond',
 'Reading',
 'Hudson you!â\x80\x9d',
 'Whitney',
 'Hugh',
 'do.â\x80\x9d',
 'Bradstreet',
 'gasped',
 'Iâ\x80\x99d',
 'be.â\x80\x9d',
 'Robert ceremony',
 'Hunter',
 'Hunter',
 'Ruca

In [31]:
tokfreqs = collections.Counter(people_list)
common = tokfreqs.most_common()
common

[('Watson', 4),
 ('Lee', 4),
 ('Baker', 3),
 ('John', 3),
 ('Ross', 3),
 ('Hunter', 3),
 ('Jack', 3),
 ('Monica', 3),
 ('Holmes', 2),
 ('â\x80\x9cNo', 2),
 ('James', 2),
 ('Mary', 2),
 ('Lestrade', 2),
 ('Merryweather', 2),
 ('Adler', 2),
 ('McCarthy', 2),
 ('Waterloo', 2),
 ('Reading', 2),
 ('landau', 2),
 ('Surrey', 2),
 ('Roylott', 2),
 ('Ferguson', 2),
 ('Streatham', 2),
 ('Lane', 2),
 ('Kilburn', 2),
 ('Englishman', 2),
 ('Frank', 2),
 ('Morris', 2),
 ('Yard', 2),
 ('Herefordshire', 2),
 ('Kate', 2),
 ('â\x80\x94I', 2),
 ('â\x80\x9cIt', 1),
 ('McCarthy Lestrade', 1),
 ('Hosmer', 1),
 ('Neville', 1),
 ('Frank Irene', 1),
 ('Arthur', 1),
 ('Briony', 1),
 ('George', 1),
 ('Henry', 1),
 ('Stoke Roylott', 1),
 ('Majesty', 1),
 ('Stoner Simon', 1),
 ('Swandam', 1),
 ('Lascar', 1),
 ('Grimesby Flora League', 1),
 ('Duncan Holmes', 1),
 ('Lysander Stark', 1),
 ('Lone', 1),
 ('ADVENTURE Peterson', 1),
 ('Duke', 1),
 ('Godfrey', 1),
 ('now?â\x80\x9d Adler', 1),
 ('mean?â\x80\x9d', 1),
 ('Vi

In [32]:
import pandas as pd

df = pd.DataFrame(common, columns=['person', 'frequency'])
df

Unnamed: 0,person,frequency
0,Watson,4
1,Lee,4
2,Baker,3
3,John,3
4,Ross,3
...,...,...
392,fed Quick,1
393,miss,1
394,wasted.â pray,1
395,â Toller,1


Sentiment Analysis:

In [33]:
# functions that I will use

def emo_score(lemma, freq, emolex_words):
    '''
    INPUT: lemmatized, lower-cased word (lemma) + frequency this lemma appears in a text
    OUTPUT: list of 10 NRC emotions scores (8 emotions + 2 sentiments [positive / negative])

    reads in a lemmatized word, looks it up in NRC lexicon dictionary (emolex_words) and returns the score of the word
    for the 8 emotions and 2 sentiments in the lexicon

    then takes the resulting list of 10 scores and multiples each by the frequency of the word
    returning the new list

    '''
    if (emolex_words[ 'word'].eq(lemma)).any():   #if any word in the "word" column in emolex_words equals the inputted lemma:
        #print(lemma)
        emolex_row = emolex_words[emolex_words['word'] == lemma].loc[:,'anger':'trust']
        #print(emolex_row)
        emolex_list = [item for item in emolex_row.iloc[0]]
        new_emo_scores = [item * freq for item in emolex_list]
        #print(new_emo_scores, "\n")
        #emo_scores = [x + y for x,y in zip(emo_scores, new_emo_scores)]
        return(new_emo_scores)
    else:                                          #else: lemma not in emolex_words dataframe
        return(None)

def emo_score_by_text (lemmadict, emolex_words):
    '''
    INPUT: reads in a frequency dictionary of lemmatized words (lemmadict)
    OUTPUT: returns a list of 10 cumulative scores for each of the 8 emotions and 2 sentiments in the NRC lexicon


    '''
    cum_emo_score = [0,0,0,0,0,0,0,0,0,0]
    for key, val in lemmadict.items():
        temp_emo_score = emo_score(key, val, emolex_words)
        if temp_emo_score is not None:
            cum_emo_score = [x + y for x, y in zip(cum_emo_score, temp_emo_score)]
    return(cum_emo_score)

def corpus_emo_scores(df, freq_lemmas_col):
    '''
    INPUT: a dataframe of texts (df), with a column name (freq_lemmas_col) indicating a column
    with a frequency dictionary of lemmatized words for each text
    OUTPUT: returns df with a new column recording the total 10 NRC scores for each text
    '''     
    new_df = df.copy()
    filepath = Path('NRC-Emotion-Lexicon','NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')
    
    emolex_df = pd.read_csv(filepath,
                            names=["word", "emotion", "association"],
                            sep='\t')
    emolex_words = emolex_df.pivot(index='word',
                                   columns='emotion',
                                   values='association').reset_index()
    #emotions = emolex_words.columns.drop('word')
    #emo_df = pd.DataFrame(0, index = sotudf.index, columns = emotions)    ## Let's replace that with the name of our dataframe ("sotudf")

    #new_df['emo_score'] = new_df[freq_lemmas_col].apply(emo_score_by_text)
    new_df['emo_score'] = new_df.apply(lambda x: emo_score_by_text(x[freq_lemmas_col], emolex_words), axis = 1)
    return(new_df)


In [38]:
# make df for sentiment analysis

K = 30 # divide text into 30 "chapters"
 
# compute chunk length
chnk_len = len(text) // K
 
res = []
for idx in range(0, len(text), chnk_len):
     
    # appending sliced string
    res.append(text[idx : idx + chnk_len])
    
type(res)

list

In [43]:
df = pd.DataFrame(res, columns = ["text_chunk"])
df

Unnamed: 0,text_chunk
0,cover Adventures Sherlock Holmes Arthur Conan ...
1,"hotograph. it. know it. know her, soul steel. ..."
2,"et ulster. all, thought, injuring her. prevent..."
3,hat name obliging youth?â asked Sherlock Hol...
4,derground far Aldersgate; short walk took us S...
5,"I small expense matter, shall expect bank ref..."
6,"ite like that, Mr. Holmes. seemed funny ask le..."
7,ever instant entered mind. flattered gentleman...
8,? âWitness: conveyed meaning me. thought del...
9,s dwelling. Hatherley side pool woods grew thi...


In [51]:
from nltk.stem import WordNetLemmatizer
nltk_lemmatizer = WordNetLemmatizer()

lemmas = df['text_chunk'].apply(lambda x: [nltk_lemmatizer.lemmatize(x)])
lemmas

0     [cover Adventures Sherlock Holmes Arthur Conan...
1     [hotograph. it. know it. know her, soul steel....
2     [et ulster. all, thought, injuring her. preven...
3     [hat name obliging youth?â asked Sherlock Ho...
4     [derground far Aldersgate; short walk took us ...
5     [I small expense matter, shall expect bank re...
6     [ite like that, Mr. Holmes. seemed funny ask l...
7     [ever instant entered mind. flattered gentlema...
8     [? âWitness: conveyed meaning me. thought de...
9     [s dwelling. Hatherley side pool woods grew th...
10    [n records. Among headings one twelve months f...
11    [pocket, and, drawing piece discoloured, blue-...
12    [police Savannah three gentlemen badly wanted ...
13    [ tide least four half feet water. bedroom win...
14    [t lifeless city dream. âIt points singular ...
15    [ suggestive might been,â remarked, âand y...
16    [ follow bitter end. Faces south, then, quick ...
17    [promise secrecy made time, freed last mon

In [55]:
df["text_chunk"] = lemmas
df

Unnamed: 0,text_chunk
0,[cover Adventures Sherlock Holmes Arthur Conan...
1,"[hotograph. it. know it. know her, soul steel...."
2,"[et ulster. all, thought, injuring her. preven..."
3,[hat name obliging youth?â asked Sherlock Ho...
4,[derground far Aldersgate; short walk took us ...
5,"[I small expense matter, shall expect bank re..."
6,"[ite like that, Mr. Holmes. seemed funny ask l..."
7,[ever instant entered mind. flattered gentlema...
8,[? âWitness: conveyed meaning me. thought de...
9,[s dwelling. Hatherley side pool woods grew th...


In [None]:
df['freqlemmas'] = df['lemmas'].apply(lambda x: collections.Counter(x))

In [None]:
emo_scores = corpus_emo_scores(df, "freqlemmas") 