## Goals:

## 1. Use named entity recognition to create a data frame of all people

## 2. Conduct named entity recognition (NER) and sentiment analysis on 'The Adventures of Sherlock Holmes', by Arthur Conan Doyle, collected from Project Gutenberg (https://www.gutenberg.org/ebooks/1661).

In [1]:
# acquire text (as string) using 'requests' module

import requests

sherlock_text = (requests.get('https://www.gutenberg.org/files/1661/1661-0.txt')).text
sherlock_text



In [2]:
# regular expressions module (a tool for searching for patterns in texts)
# ie remove all punctuation
# ie identify all words that begin with a capitalized letter
# ie identify phone numbers by searching for the pattern ###-###-####) 

import re

start_phrase = "\*\*\* START OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES \*\*\*"
start_range = re.search(start_phrase, sherlock_text) # for re, * means any letter in the alphabet so we need to use \
start_index = start_range.end()
start_index

915

In [3]:
end_phrase = "\*\*\* END OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES \*\*\*"
end_range = re.search(end_phrase, sherlock_text) # for re, * means any letter in the alphabet so we need to use \
end_index = end_range.start()
end_index

588654

In [4]:
# select the part of text that we are interested in (just the content of the book itself)

text = sherlock_text[start_index:end_index]
text



In [5]:
splittext = text.split()
splittext

['cover',
 'The',
 'Adventures',
 'of',
 'Sherlock',
 'Holmes',
 'by',
 'Arthur',
 'Conan',
 'Doyle',
 'Contents',
 'I.',
 'A',
 'Scandal',
 'in',
 'Bohemia',
 'II.',
 'The',
 'Red-Headed',
 'League',
 'III.',
 'A',
 'Case',
 'of',
 'Identity',
 'IV.',
 'The',
 'Boscombe',
 'Valley',
 'Mystery',
 'V.',
 'The',
 'Five',
 'Orange',
 'Pips',
 'VI.',
 'The',
 'Man',
 'with',
 'the',
 'Twisted',
 'Lip',
 'VII.',
 'The',
 'Adventure',
 'of',
 'the',
 'Blue',
 'Carbuncle',
 'VIII.',
 'The',
 'Adventure',
 'of',
 'the',
 'Speckled',
 'Band',
 'IX.',
 'The',
 'Adventure',
 'of',
 'the',
 'Engineerâ\x80\x99s',
 'Thumb',
 'X.',
 'The',
 'Adventure',
 'of',
 'the',
 'Noble',
 'Bachelor',
 'XI.',
 'The',
 'Adventure',
 'of',
 'the',
 'Beryl',
 'Coronet',
 'XII.',
 'The',
 'Adventure',
 'of',
 'the',
 'Copper',
 'Beeches',
 'I.',
 'A',
 'SCANDAL',
 'IN',
 'BOHEMIA',
 'I.',
 'To',
 'Sherlock',
 'Holmes',
 'she',
 'is',
 'always',
 '_the_',
 'woman.',
 'I',
 'have',
 'seldom',
 'heard',
 'him',
 'ment

In [6]:
import collections
from collections import Counter
tokfreqs = collections.Counter(splittext)
tokfreqs.most_common()

[('the', 5244),
 ('and', 2729),
 ('to', 2630),
 ('of', 2611),
 ('I', 2533),
 ('a', 2519),
 ('in', 1648),
 ('that', 1545),
 ('was', 1359),
 ('his', 1096),
 ('is', 1051),
 ('he', 1012),
 ('you', 979),
 ('it', 967),
 ('my', 901),
 ('have', 889),
 ('had', 807),
 ('with', 797),
 ('as', 767),
 ('which', 746),
 ('at', 724),
 ('for', 676),
 ('be', 594),
 ('not', 579),
 ('from', 471),
 ('upon', 458),
 ('said', 447),
 ('but', 436),
 ('me', 414),
 ('we', 405),
 ('been', 385),
 ('this', 370),
 ('very', 369),
 ('her', 366),
 ('your', 350),
 ('â\x80\x9cI', 349),
 ('were', 336),
 ('on', 324),
 ('an', 323),
 ('so', 315),
 ('by', 313),
 ('would', 313),
 ('she', 305),
 ('all', 303),
 ('are', 296),
 ('It', 288),
 ('one', 282),
 ('could', 279),
 ('no', 278),
 ('has', 275),
 ('there', 275),
 ('into', 272),
 ('out', 271),
 ('He', 264),
 ('what', 263),
 ('Mr.', 259),
 ('The', 257),
 ('little', 257),
 ('when', 256),
 ('him', 253),
 ('who', 250),
 ('up', 247),
 ('will', 244),
 ('some', 227),
 ('should', 205),


In [7]:
# remove stop words with nltk

import nltk
from nltk.corpus import stopwords
sw_nltk = stopwords.words('english')
words = [word for word in text.split() if word.lower() not in sw_nltk]
text = " ".join(words)
print(text)



In [8]:
noswlist = text.split()
noswlist

['cover',
 'Adventures',
 'Sherlock',
 'Holmes',
 'Arthur',
 'Conan',
 'Doyle',
 'Contents',
 'I.',
 'Scandal',
 'Bohemia',
 'II.',
 'Red-Headed',
 'League',
 'III.',
 'Case',
 'Identity',
 'IV.',
 'Boscombe',
 'Valley',
 'Mystery',
 'V.',
 'Five',
 'Orange',
 'Pips',
 'VI.',
 'Man',
 'Twisted',
 'Lip',
 'VII.',
 'Adventure',
 'Blue',
 'Carbuncle',
 'VIII.',
 'Adventure',
 'Speckled',
 'Band',
 'IX.',
 'Adventure',
 'Engineerâ\x80\x99s',
 'Thumb',
 'X.',
 'Adventure',
 'Noble',
 'Bachelor',
 'XI.',
 'Adventure',
 'Beryl',
 'Coronet',
 'XII.',
 'Adventure',
 'Copper',
 'Beeches',
 'I.',
 'SCANDAL',
 'BOHEMIA',
 'I.',
 'Sherlock',
 'Holmes',
 'always',
 '_the_',
 'woman.',
 'seldom',
 'heard',
 'mention',
 'name.',
 'eyes',
 'eclipses',
 'predominates',
 'whole',
 'sex.',
 'felt',
 'emotion',
 'akin',
 'love',
 'Irene',
 'Adler.',
 'emotions,',
 'one',
 'particularly,',
 'abhorrent',
 'cold,',
 'precise',
 'admirably',
 'balanced',
 'mind.',
 'was,',
 'take',
 'it,',
 'perfect',
 'reason

In [9]:
tokfreqs = collections.Counter(noswlist)
pairs = tokfreqs.most_common()
pairs

[('upon', 458),
 ('said', 447),
 ('â\x80\x9cI', 349),
 ('would', 313),
 ('one', 282),
 ('could', 279),
 ('Mr.', 259),
 ('little', 257),
 ('Holmes', 197),
 ('man', 193),
 ('may', 186),
 ('see', 183),
 ('shall', 168),
 ('must', 159),
 ('came', 142),
 ('know', 136),
 ('think', 132),
 ('two', 128),
 ('us', 125),
 ('Holmes,', 125),
 ('â\x80\x9cIt', 124),
 ('might', 118),
 ('come', 117),
 ('â\x80\x9cYou', 112),
 ('it.', 110),
 ('much', 106),
 ('back', 106),
 ('heard', 103),
 ('time', 102),
 ('made', 102),
 ('â\x80\x9cAnd', 99),
 ('found', 97),
 ('Sherlock', 94),
 ('never', 92),
 ('it,', 90),
 ('like', 90),
 ('however,', 89),
 ('quite', 89),
 ('good', 87),
 ('saw', 84),
 ('tell', 84),
 ('took', 84),
 ('me,', 83),
 ('away', 82),
 ('him.', 82),
 ('go', 80),
 ('face', 79),
 ('St.', 79),
 ('Holmes.', 78),
 ('nothing', 78),
 ('way', 77),
 ('Miss', 77),
 ('left', 76),
 ('every', 75),
 ('matter', 75),
 ('door', 75),
 ('take', 74),
 ('me.', 74),
 ('last', 74),
 ('you,', 74),
 ('find', 74),
 ('â\x80\x

In [10]:
justwordsofpairs = []
for atuple in pairs:
    justwordsofpairs.append(atuple[0])
justwordsofpairs

['upon',
 'said',
 'â\x80\x9cI',
 'would',
 'one',
 'could',
 'Mr.',
 'little',
 'Holmes',
 'man',
 'may',
 'see',
 'shall',
 'must',
 'came',
 'know',
 'think',
 'two',
 'us',
 'Holmes,',
 'â\x80\x9cIt',
 'might',
 'come',
 'â\x80\x9cYou',
 'it.',
 'much',
 'back',
 'heard',
 'time',
 'made',
 'â\x80\x9cAnd',
 'found',
 'Sherlock',
 'never',
 'it,',
 'like',
 'however,',
 'quite',
 'good',
 'saw',
 'tell',
 'took',
 'me,',
 'away',
 'him.',
 'go',
 'face',
 'St.',
 'Holmes.',
 'nothing',
 'way',
 'Miss',
 'left',
 'every',
 'matter',
 'door',
 'take',
 'me.',
 'last',
 'you,',
 'find',
 'â\x80\x9cThe',
 'small',
 'make',
 'young',
 'case',
 'â\x80\x9cBut',
 'say',
 'long',
 'without',
 'he,',
 'first',
 'put',
 'â\x80\x9cWell,',
 'then,',
 'round',
 'seemed',
 'him,',
 'thought',
 'even',
 'went',
 'seen',
 'right',
 'old',
 'he.',
 'hand',
 'still',
 'three',
 'ever',
 'though',
 'something',
 'rather',
 'eyes',
 'look',
 'â\x80\x9cOh,',
 'â\x80\x9cYes,',
 'room',
 'you.',
 'get',
 '

In [28]:
import spacy

needstring = ' '.join(justwordsofpairs)

nlp = spacy.load("en_core_web_sm")
stuff = nlp(needstring) # spacy tokenizer makes a spacy.tokens.doc.Doc type

people_list = []

for ent in stuff.ents:
    if ent.label_ == "PERSON":
        people_list.append(ent)
people_list

[Holmes,
 âIt,
 Watson,
 âNo,
 Baker,
 McCarthy Lestrade,
 Hosmer,
 Watson,
 John,
 Neville,
 James,
 Mary,
 Frank Irene,
 Arthur,
 Briony,
 Watson,
 George,
 Henry,
 Stoke Roylott,
 Majesty,
 Stoner Simon,
 Lestrade,
 Swandam,
 Lascar,
 Grimesby Flora League,
 âNo,
 Duncan Holmes,
 Lysander Stark,
 Ross,
 Merryweather,
 Merryweather,
 Lone,
 ADVENTURE Peterson,
 Hunter,
 Adler,
 Duke,
 Godfrey,
 now?â Adler,
 mean?â,
 Vincent mind,
 William minute,
 matter.â conclusion,
 Jones,
 not.â âNo,
 McCarthy,
 Waterloo,
 Ryder Baker,
 Eyford fullerâs-earth,
 Winchester Bohemia Valley,
 rang,
 Bohemia,
 Watson,
 Ross,
 way.â fond,
 Reading,
 Hudson you!â,
 Whitney,
 Hugh,
 do.â,
 Bradstreet,
 gasped,
 Iâd,
 be.â,
 Robert ceremony,
 Hunter,
 Hunter,
 Rucastles Fowler II,
 akin,
 Beyond Study Scarlet,
 armchair,
 in!â,
 New Majesty,
 landau,
 dark.â much.â,
 yet.â,
 merry RED-HEADED,
 Albert,
 hereâs,
 â ââWell,
 cared north,
 labyrinth,
 satisfy escape

In [31]:
#HELP!!!

tokfreqs = collections.Counter(people_list)
tokfreqs.most_common()

[(Holmes, 1),
 (âIt, 1),
 (Watson, 1),
 (âNo, 1),
 (Baker, 1),
 (McCarthy Lestrade, 1),
 (Hosmer, 1),
 (Watson, 1),
 (John, 1),
 (Neville, 1),
 (James, 1),
 (Mary, 1),
 (Frank Irene, 1),
 (Arthur, 1),
 (Briony, 1),
 (Watson, 1),
 (George, 1),
 (Henry, 1),
 (Stoke Roylott, 1),
 (Majesty, 1),
 (Stoner Simon, 1),
 (Lestrade, 1),
 (Swandam, 1),
 (Lascar, 1),
 (Grimesby Flora League, 1),
 (âNo, 1),
 (Duncan Holmes, 1),
 (Lysander Stark, 1),
 (Ross, 1),
 (Merryweather, 1),
 (Merryweather, 1),
 (Lone, 1),
 (ADVENTURE Peterson, 1),
 (Hunter, 1),
 (Adler, 1),
 (Duke, 1),
 (Godfrey, 1),
 (now?â Adler, 1),
 (mean?â, 1),
 (Vincent mind, 1),
 (William minute, 1),
 (matter.â conclusion, 1),
 (Jones, 1),
 (not.â âNo, 1),
 (McCarthy, 1),
 (Waterloo, 1),
 (Ryder Baker, 1),
 (Eyford fullerâs-earth, 1),
 (Winchester Bohemia Valley, 1),
 (rang, 1),
 (Bohemia, 1),
 (Watson, 1),
 (Ross, 1),
 (way.â fond, 1),
 (Reading, 1),
 (Hudson you!â, 1),
 (Whitney, 1),
 (Hugh, 1),
 (do.â, 1),
 (

In [14]:
import pandas as pd

df = pd.DataFrame(people_list, columns=['person'])
df

ValueError: 1 columns passed, passed data had 7 columns

Sentiment Analysis:

In [17]:
# functions that I will use

def emo_score(lemma, freq, emolex_words):
    '''
    INPUT: lemmatized, lower-cased word (lemma) + frequency this lemma appears in a text
    OUTPUT: list of 10 NRC emotions scores (8 emotions + 2 sentiments [positive / negative])

    reads in a lemmatized word, looks it up in NRC lexicon dictionary (emolex_words) and returns the score of the word
    for the 8 emotions and 2 sentiments in the lexicon

    then takes the resulting list of 10 scores and multiples each by the frequency of the word
    returning the new list

    '''
    if (emolex_words[ 'word'].eq(lemma)).any():   #if any word in the "word" column in emolex_words equals the inputted lemma:
        #print(lemma)
        emolex_row = emolex_words[emolex_words['word'] == lemma].loc[:,'anger':'trust']
        #print(emolex_row)
        emolex_list = [item for item in emolex_row.iloc[0]]
        new_emo_scores = [item * freq for item in emolex_list]
        #print(new_emo_scores, "\n")
        #emo_scores = [x + y for x,y in zip(emo_scores, new_emo_scores)]
        return(new_emo_scores)
    else:                                          #else: lemma not in emolex_words dataframe
        return(None)

def emo_score_by_text (lemmadict, emolex_words):
    '''
    INPUT: reads in a frequency dictionary of lemmatized words (lemmadict)
    OUTPUT: returns a list of 10 cumulative scores for each of the 8 emotions and 2 sentiments in the NRC lexicon


    '''
    cum_emo_score = [0,0,0,0,0,0,0,0,0,0]
    for key, val in lemmadict.items():
        temp_emo_score = emo_score(key, val, emolex_words)
        if temp_emo_score is not None:
            cum_emo_score = [x + y for x, y in zip(cum_emo_score, temp_emo_score)]
    return(cum_emo_score)

def corpus_emo_scores(df, freq_lemmas_col):
    '''
    INPUT: a dataframe of texts (df), with a column name (freq_lemmas_col) indicating a column
    with a frequency dictionary of lemmatized words for each text
    OUTPUT: returns df with a new column recording the total 10 NRC scores for each text
    '''     
    new_df = df.copy()
    filepath = Path('NRC-Emotion-Lexicon','NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')
    
    emolex_df = pd.read_csv(filepath,
                            names=["word", "emotion", "association"],
                            sep='\t')
    emolex_words = emolex_df.pivot(index='word',
                                   columns='emotion',
                                   values='association').reset_index()
    #emotions = emolex_words.columns.drop('word')
    #emo_df = pd.DataFrame(0, index = sotudf.index, columns = emotions)    ## Let's replace that with the name of our dataframe ("sotudf")

    #new_df['emo_score'] = new_df[freq_lemmas_col].apply(emo_score_by_text)
    new_df['emo_score'] = new_df.apply(lambda x: emo_score_by_text(x[freq_lemmas_col], emolex_words), axis = 1)
    return(new_df)
