## NER Letter Spacy

## Resources

In [1]:
import spacy
import pandas as pd
from collections import Counter
from spacy import displacy

## Get Data

In [2]:
# Load the input data; the first CSV column becomes the row index.
df = pd.read_csv(
    "20240611_PhD_Data4NER-Letter.csv",
    index_col=0,
)
# Summarise the columns, their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 676 entries, 0 to 675
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          676 non-null    int64  
 1   docid             676 non-null    object 
 2   docyear           676 non-null    int64  
 3   docmonth          669 non-null    float64
 4   authorName        623 non-null    object 
 5   docauthorid       676 non-null    object 
 6   authorLocation    676 non-null    object 
 7   authorGender      676 non-null    object 
 8   nationalOrigin    676 non-null    object 
 9   irish             676 non-null    bool   
 10  otherUK           676 non-null    bool   
 11  relMin            339 non-null    object 
 12  catholic          339 non-null    object 
 13  otherChristian    339 non-null    object 
 14  U                 378 non-null    object 
 15  M                 387 non-null    object 
 16  S                 376 non-null    object 
 1

## Test the NER of various models on the texts

In [3]:
# The next few cells are run multiple times to check the performance of various pre-trained models.
# Do not run this cell until after the first pass.
# Loads the spaCy pipeline "en_core_web_md" into `nlp`; the cells below call `nlp` on each text.
nlp = spacy.load("en_core_web_md")

In [4]:
# Index into `texts` of the single text inspected by the test cells below (started with 0).
chunk = 0

In [5]:
# Collect the `text` column into a plain Python list that serves as the corpus
texts = df["text"].tolist()
# Display the text selected by `chunk`
texts[chunk]

'July 18 1891 Dear Sister I have waited until I could send you a long letter I am married The Cermony Ceremony was performed in San Francisco by a Justice of the peace I will in my next letter send you the proof not one but the County Clerk was witness. So your Sister name is Isabella Moore now with very High Honour I expect to hear from Willie very soon and will tell him to be sure and come by here I will send all the aprons and the nightdresses I do not know of any thing else To day is so much like Summer we have ripe strawberries in the market I have had new potatoes twice There is so many flowers in bloom this place looks like a little Eden The air is fragrant with the perfume I am going to Church to night alone as usual Write soon and tell me when you intend to start If I can get to send some other things I will. I will close now with a long good Bye Isabella Weir \n'

In [6]:
# Test on a single item: the text selected by `chunk`
item = texts[chunk]

# Run the language model on the selected narrative
narrative = nlp(item)

# Find the mentions of people in the narrative: every entity labelled PERSON.
# This is computed once, outside any loop over the entities, so the variables
# below are always (re)defined -- including when the model finds no entities.
mentions = [ent.text for ent in narrative.ents if ent.label_ == 'PERSON']

# Number of mentions per distinct name, in order of first appearance
counts = dict(Counter(mentions))

# Distinct names mentioned
individuals = set(mentions)

print("Number of mentions:", len(mentions), "\n")
print(counts, "\n")
print("Number of individuals:", len(individuals), "\n")
print("Individuals:", individuals, "\n")
    

Number of mentions: 3 

{'Isabella Moore': 1, 'Willie': 1, 'Bye Isabella': 1} 

Number of individuals: 3 

Individuals: {'Isabella Moore', 'Willie', 'Bye Isabella'} 



## Named entity extraction for the texts

In [7]:
# Per-text results, aligned with `texts` (one entry per text)
mentsTot = []  # total number of PERSON mentions
mentsDis = []  # {name: number of mentions} for each distinct name
indsTot = []   # number of distinct names

for item in texts:

    # Run the language model on the current narrative
    narrative = nlp(item)

    # Find the mentions of people in this narrative. The three values are
    # rebuilt from scratch for every text, so a text in which the model finds
    # no entities contributes 0 / {} / 0 instead of repeating the values left
    # over from the previous text.
    mentions = [ent.text for ent in narrative.ents if ent.label_ == 'PERSON']
    counts = dict(Counter(mentions))
    individuals = set(mentions)

    mentsTot.append(len(mentions))
    mentsDis.append(counts)
    indsTot.append(len(individuals))


# Each list should have one entry per text
print(len(mentsTot))
print(len(indsTot))
print(len(mentsDis))

# Spot-check the first text against the single-text test above
print(mentsTot[0])
print(indsTot[0])
print(mentsDis[0])


676
676
676
3
3
{'Isabella Moore': 1, 'Willie': 1, 'Bye Isabella': 1}


## Self-references

Next, count first-person singular pronouns (subjective and objective forms only), following Tackman, A. M., Sbarra, D. A., Carey, A. L., Donnellan, M. B., Horn, A. B., Holtzman, N. S., Edwards, T. S., Pennebaker, J. W., & Mehl, M. R. (2019). Depression, Negative Emotionality, and Self-Referential Language: A Multi-Lab, Multi-Measure, and Multi-Language-Task Research Synthesis. Journal of Personality and Social Psychology, 116(5), 817–834. https://doi.org/10.1037/pspp0000187.


In [8]:
# Every first-person singular search pattern used below.
# The leading/trailing spaces are part of each pattern.
pronounAll = [
    "I ", "I'm ", "I've ", "I'll ", "I'd ",
    " me ", "Me ",
    " myself ", "Myself ",
]
pronounAll

['I ', "I'm ", "I've ", "I'll ", "I'd ", ' me ', 'Me ', ' myself ', 'Myself ']

In [9]:
# Subjective patterns: "I" and its contractions, each followed by a space
pronounSub = [
    "I ",
    "I'm ",
    "I've ",
    "I'll ",
    "I'd ",
]
pronounSub

['I ', "I'm ", "I've ", "I'll ", "I'd "]

In [10]:
# Objective patterns: lower-case forms are padded with a space on both sides,
# capitalised forms only carry a trailing space
pronounObj = [" me ", "Me ", " myself ", "Myself "]
pronounObj

[' me ', 'Me ', ' myself ', 'Myself ']

## Now test

In [11]:
# Index into `texts` of the text used by the pronoun-count test cells below
chunk = 0

In [12]:
# Lower-casing is left disabled. NOTE(review): the search patterns in
# pronounSub / pronounObj are case-sensitive (e.g. "I ", "Me "), so enabling
# this line would stop the capitalised patterns from matching.
#texts = [x.lower() for x in texts]

In [13]:
# Show the text that the test counts below are computed on
texts[chunk]

'July 18 1891 Dear Sister I have waited until I could send you a long letter I am married The Cermony Ceremony was performed in San Francisco by a Justice of the peace I will in my next letter send you the proof not one but the County Clerk was witness. So your Sister name is Isabella Moore now with very High Honour I expect to hear from Willie very soon and will tell him to be sure and come by here I will send all the aprons and the nightdresses I do not know of any thing else To day is so much like Summer we have ripe strawberries in the market I have had new potatoes twice There is so many flowers in bloom this place looks like a little Eden The air is fragrant with the perfume I am going to Church to night alone as usual Write soon and tell me when you intend to start If I can get to send some other things I will. I will close now with a long good Bye Isabella Weir \n'

In [14]:
# Subjective: total occurrences of the subjective patterns in the selected text

Count = sum(texts[chunk].count(pattern) for pattern in pronounSub)

print(Count)

12


In [15]:
# Objective: total occurrences of the objective patterns in the selected text

Count = sum(texts[chunk].count(pattern) for pattern in pronounObj)

print(Count)

1


In [16]:
# All pronouns: total occurrences of every pattern in the selected text

Count = sum(texts[chunk].count(pattern) for pattern in pronounAll)

print(Count)

13


## Now run on all

In [17]:
# Now the whole corpus: for each text, the summed occurrences of every
# first-person pattern.
# NOTE(review): str.count matches raw substrings, so a pronoun is only counted
# when followed (and, for " me " / " myself ", also preceded) by a space; forms
# next to punctuation or a line break are not counted.

fppAll_Ct = [
    sum(item.count(pattern) for pattern in pronounAll)
    for item in texts
]

# One count per text, plus a peek at the first nine
print(len(fppAll_Ct))
print(fppAll_Ct[0:9])

676
[13, 11, 13, 8, 23, 11, 68, 38, 67]


In [18]:
# Subjective pronouns only: per-text summed occurrences of the pronounSub patterns

fppSub_Ct = [
    sum(item.count(pattern) for pattern in pronounSub)
    for item in texts
]

# One count per text, plus a peek at the first nine
print(len(fppSub_Ct))
print(fppSub_Ct[0:9])

676
[12, 9, 9, 8, 22, 11, 55, 34, 49]


In [19]:
# Objective pronouns only: per-text summed occurrences of the pronounObj patterns

fppObj_Ct = [
    sum(item.count(pattern) for pattern in pronounObj)
    for item in texts
]

# One count per text, plus a peek at the first nine
print(len(fppObj_Ct))
print(fppObj_Ct[0:9])

676
[1, 2, 4, 0, 1, 0, 13, 4, 18]


## Add new variables to metadata

In [20]:
# Attach the per-text results to the metadata table, one new column each.
# `mentsDis` holds one dict per text; joining a dict iterates its keys, so the
# stored string is the comma-separated list of names (the counts are not kept).
derived_columns = (
    ('mentsDis', [', '.join(names) for names in mentsDis]),
    ('mentsTot', mentsTot),
    ('indsTot', indsTot),
    ('fppAll_Ct', fppAll_Ct),
    ('fppSub_Ct', fppSub_Ct),
    ('fppObj_Ct', fppObj_Ct),
)
for column_name, column_values in derived_columns:
    df[column_name] = column_values

# Confirm the new columns are present
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 676 entries, 0 to 675
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          676 non-null    int64  
 1   docid             676 non-null    object 
 2   docyear           676 non-null    int64  
 3   docmonth          669 non-null    float64
 4   authorName        623 non-null    object 
 5   docauthorid       676 non-null    object 
 6   authorLocation    676 non-null    object 
 7   authorGender      676 non-null    object 
 8   nationalOrigin    676 non-null    object 
 9   irish             676 non-null    bool   
 10  otherUK           676 non-null    bool   
 11  relMin            339 non-null    object 
 12  catholic          339 non-null    object 
 13  otherChristian    339 non-null    object 
 14  U                 378 non-null    object 
 15  M                 387 non-null    object 
 16  S                 376 non-null    object 
 1

In [21]:
# Preview the first rows of the table, including the newly added columns
df.head()

Unnamed: 0,docID-AT,docid,docyear,docmonth,authorName,docauthorid,authorLocation,authorGender,nationalOrigin,irish,...,scoreNeu,scorePos,scoreCom,topicNumber,mentsDis,mentsTot,indsTot,fppAll_Ct,fppSub_Ct,fppObj_Ct
0,1,20910,1891,7.0,Isabella Weir Moore,IED0107,USA,F,Irish,True,...,0.855,0.145,0.5151,0,"Isabella Moore, Willie, Bye Isabella",3,3,13,12,1
1,2,21062,1871,11.0,E. Rothwell,IED0179,Canada,F,Irish,True,...,0.710125,0.25475,0.25835,0,"Kate, Lydia, Maria, Bissin, Garnetts, Tom Fitz...",13,11,11,9,2
2,3,21324,1892,5.0,Isabella Weir Moore,IED0107,USA,F,Irish,True,...,0.799,0.155,0.9423,0,"Anna, Brotherinlaw, Husband",4,3,13,9,4
3,4,21334,1891,10.0,Mary Savage,IED0621,USA,F,Irish,True,...,0.812,0.142889,0.1452,2,"Lizzie, Johns, James Wm, William N, Nick John,...",15,14,8,8,0
4,5,21354,1890,2.0,William J. Weir,IED0958,USA,M,Irish,True,...,0.846,0.1255,0.8871,4,"bella, W. William J. Weir",2,2,23,22,1


In [22]:
# Save the table, including the added NER and pronoun-count columns, to a new CSV file
df.to_csv("20240611_PhD_FinalData-Ltr.csv")