## NER Letter Spacy

## Resources

In [1]:
import spacy
import pandas as pd
from collections import Counter
from spacy import displacy

## Get Data

In [2]:
# Sentence Data
# Read the input CSV, using its first column as the DataFrame index,
# then summarise the columns, non-null counts and dtypes.
DATA_PATH = "20240411_PhD_Data4NER-Letter.csv"
df = pd.read_csv(DATA_PATH, index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 492 entries, 0 to 491
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          492 non-null    int64  
 1   docauthorid       492 non-null    object 
 2   docauthorname     492 non-null    object 
 3   docid             492 non-null    object 
 4   docyear           489 non-null    float64
 5   docmonth          477 non-null    float64
 6   authorgender      492 non-null    object 
 7   agewriting        380 non-null    float64
 8   agedeath          365 non-null    float64
 9   relMin            396 non-null    object 
 10  nationalOrigin    491 non-null    object 
 11  authorLocation    492 non-null    object 
 12  U                 442 non-null    object 
 13  M                 442 non-null    object 
 14  S                 442 non-null    object 
 15  F                 442 non-null    object 
 16  L                 442 non-null    object 
 1

## Test the NER of various models on the texts

In [3]:
# The next few cells are run multiple times to check the performance of
# various pre-trained models; change MODEL_NAME to try a different one.
# Do not run this cell until after the first pass
MODEL_NAME = "en_core_web_md"
nlp = spacy.load(MODEL_NAME)

In [4]:
# Index into `texts` of the letter used for the single-item checks below.
# I started with 0 and checked
chunk = 0

In [5]:
# Gather the `text` column into a plain Python list (the corpus), one entry
# per DataFrame row, then display the entry selected by `chunk`.
texts = df["text"].tolist()
texts[chunk]

' TRINIDAD On Train from Steubenville Ohio to Cincinnati. Nov 30 1872. My Darling Sister Justina: How interestedly you Sister M Louis and myself read Eugénie de Guérin\'s Journal and her daily anxieties to save her brother from being a spiritual outcast! This Journal which I propose keeping for you will deal with incidents occurring on my journey to Trinidad and happenings in that far-off land to which I am consigned. The Journal will begin with the first act. Here is Mother Josephine\'s letter: Mt St Vincent O Nov 27 1872. Sister Blandina Steubenville O My Dear Child: You are missioned to Trinidad. You will leave Cincinnati Wednesday and alone. Mother Regina will attend to your needs. Devotedly Mother Josephine. This letter thrilled us both. I was delighted to make the sacrifice and you were hiding your feelings that I might not lose any merit. Neither of us could find Trinidad on the map except in the island of Cuba. So we concluded that Cuba was my destination. I was to leave Steube

In [6]:
# Spot-check the model on the single letter selected by `chunk`
item = texts[chunk]

# Run the language model on the selected narrative
narrative = nlp(item)

# Every entity labelled PERSON, in document order (repeats included).
# Built once, directly from the doc and outside any per-entity loop, so the
# three results below are always defined -- they are simply empty when the
# model finds no entities in the text.
mentions = [ent.text for ent in narrative.ents if ent.label_ == 'PERSON']

# Occurrences per distinct entity string. dict(...) keeps first-seen order
# and makes the value print as a plain dict.
counts = dict(Counter(mentions))

# Distinct entity strings (exact string match only, so e.g. "John" and
# "John Leverone's" are counted as different individuals).
individuals = set(mentions)

print("Number of mentions:", len(mentions), "\n")
print(counts, "\n")
print("Number of individuals:", len(individuals), "\n")
print("Individuals:", individuals, "\n")
    

Number of mentions: 37 

{'Darling Sister': 1, 'Justina': 1, 'Sister M Louis': 1, "Mother Josephine's": 1, 'Mother Regina': 3, 'Tait': 1, 'McCann': 1, 'Sister Anthony': 2, 'Bigelow': 1, 'McCabe': 1, 'Segale': 1, 'Henry': 1, 'Seminary': 1, 'Sisters Gabriella': 1, 'Gabriella': 1, 'Benedicta': 1, 'Leverone': 1, 'Garibaldi': 2, 'the Misses Gardelli': 1, 'Mary': 2, 'John': 1, "John Leverone's": 1, 'Genoese': 1, 'St Francis Xavier': 1, 'Grace': 2, 'Rev Dr Callaghan': 1, 'monica': 1, 'Sister Benedicta': 1, 'Sisters Antonia': 1, 'Gonzaga': 1, 'Sister Antonia': 1} 

Number of individuals: 31 

Individuals: {"John Leverone's", 'Segale', 'Sisters Gabriella', 'Darling Sister', 'Rev Dr Callaghan', 'Garibaldi', 'Grace', 'the Misses Gardelli', 'Bigelow', 'Sister Benedicta', 'Mary', 'Sisters Antonia', 'Sister Antonia', 'Mother Regina', 'Gabriella', 'Benedicta', 'monica', 'Genoese', 'Gonzaga', 'McCabe', 'Henry', 'McCann', "Mother Josephine's", 'Leverone', 'John', 'Tait', 'St Francis Xavier', 'Justina',

## Named entity extraction for the texts

In [7]:
# Per-letter PERSON statistics; each list gets exactly one entry per item in
# `texts`, in the same order.
mentsTot = []   # total number of PERSON mentions in the letter
mentsDis = []   # {entity string: number of mentions} for the letter
indsTot = []    # number of distinct PERSON entity strings in the letter

for item in texts:

    # Run the language model on the current narrative
    narrative = nlp(item)

    # Find the mentions of people in the narrative.
    # These are computed once per letter, directly from the doc and outside
    # any per-entity loop: a letter in which the model finds no entities must
    # contribute 0 / {} / 0 rather than re-using the previous letter's values.
    mentions = [ent.text for ent in narrative.ents if ent.label_ == 'PERSON']

    # dict(...) keeps first-seen order and stores a plain dict per letter
    counts = dict(Counter(mentions))

    individuals = set(mentions)

    mentsTot.append(len(mentions))
    mentsDis.append(counts)
    indsTot.append(len(individuals))


# Sanity check: one entry per letter in each list
print(len(mentsTot))
print(len(indsTot))
print(len(mentsDis))

# Results for the first letter
print(mentsTot[0])
print(indsTot[0])
print(mentsDis[0])


492
492
492
37
31
{'Darling Sister': 1, 'Justina': 1, 'Sister M Louis': 1, "Mother Josephine's": 1, 'Mother Regina': 3, 'Tait': 1, 'McCann': 1, 'Sister Anthony': 2, 'Bigelow': 1, 'McCabe': 1, 'Segale': 1, 'Henry': 1, 'Seminary': 1, 'Sisters Gabriella': 1, 'Gabriella': 1, 'Benedicta': 1, 'Leverone': 1, 'Garibaldi': 2, 'the Misses Gardelli': 1, 'Mary': 2, 'John': 1, "John Leverone's": 1, 'Genoese': 1, 'St Francis Xavier': 1, 'Grace': 2, 'Rev Dr Callaghan': 1, 'monica': 1, 'Sister Benedicta': 1, 'Sisters Antonia': 1, 'Gonzaga': 1, 'Sister Antonia': 1}


## Self-references

Next, count first-person singular pronouns (subjective and objective forms only), following Tackman, A. M., Sbarra, D. A., Carey, A. L., Donnellan, M. B., Horn, A. B., Holtzman, N. S., Edwards, T. S., Pennebaker, J. W., & Mehl, M. R. (2019). Depression, Negative Emotionality, and Self-Referential Language: A Multi-Lab, Multi-Measure, and Multi-Language-Task Research Synthesis. Journal of Personality and Social Psychology, 116(5), 817–834. https://doi.org/10.1037/pspp0000187.


In [8]:
# All first-person singular patterns: subjective forms followed by objective
# forms. The leading/trailing spaces are part of each pattern.
_subjective_forms = ["I ", "I'm ", "I've ", "I'll ", "I'd "]
_objective_forms = [" me ", "Me ", " myself ", "Myself "]
pronounAll = _subjective_forms + _objective_forms
pronounAll

['I ', "I'm ", "I've ", "I'll ", "I'd ", ' me ', 'Me ', ' myself ', 'Myself ']

In [9]:
# Subjective first-person singular patterns; the trailing space is part of
# each pattern.
pronounSub = [
    "I ",
    "I'm ",
    "I've ",
    "I'll ",
    "I'd ",
]
pronounSub

['I ', "I'm ", "I've ", "I'll ", "I'd "]

In [10]:
# Objective first-person singular patterns. Lower-case forms carry a leading
# and trailing space; capitalised forms carry only a trailing space.
# NOTE(review): these are used as raw substrings with str.count further down,
# so a pronoun directly followed by punctuation ("me.", "me!") is not
# counted -- confirm this is acceptable for the analysis.
pronounObj = [" me ", "Me ", " myself ", "Myself "]
pronounObj

[' me ', 'Me ', ' myself ', 'Myself ']

## Now test

In [11]:
# Index into `texts` of the letter used for the pronoun spot checks below.
chunk = 0

In [12]:
# NOTE(review): keep this disabled. The pronoun patterns are case-sensitive
# ("I ", "Me ", "Myself "), so lower-casing the texts would stop them matching.
#texts = [x.lower() for x in texts]

In [13]:
# Display the letter selected by `chunk`; the counts below are computed on it.
texts[chunk]

' TRINIDAD On Train from Steubenville Ohio to Cincinnati. Nov 30 1872. My Darling Sister Justina: How interestedly you Sister M Louis and myself read Eugénie de Guérin\'s Journal and her daily anxieties to save her brother from being a spiritual outcast! This Journal which I propose keeping for you will deal with incidents occurring on my journey to Trinidad and happenings in that far-off land to which I am consigned. The Journal will begin with the first act. Here is Mother Josephine\'s letter: Mt St Vincent O Nov 27 1872. Sister Blandina Steubenville O My Dear Child: You are missioned to Trinidad. You will leave Cincinnati Wednesday and alone. Mother Regina will attend to your needs. Devotedly Mother Josephine. This letter thrilled us both. I was delighted to make the sacrifice and you were hiding your feelings that I might not lose any merit. Neither of us could find Trinidad on the map except in the island of Cuba. So we concluded that Cuba was my destination. I was to leave Steube

In [14]:
# Subjective
# Total occurrences of every subjective pattern in the selected letter
# (str.count: non-overlapping substring matches).
sample_text = texts[chunk]
Count = sum(sample_text.count(pattern) for pattern in pronounSub)

print(Count)

104


In [15]:
# Objective
# Total occurrences of every objective pattern in the selected letter
# (str.count: non-overlapping substring matches).
sample_text = texts[chunk]
Count = sum(sample_text.count(pattern) for pattern in pronounObj)

print(Count)

30


In [16]:
# All pronouns
# Total occurrences of every first-person pattern (subjective + objective)
# in the selected letter.
sample_text = texts[chunk]
Count = sum(sample_text.count(pattern) for pattern in pronounAll)

print(Count)

134


## Now run on all

In [17]:
# Now the rest
# One total per letter, summed over every first-person pattern
# (subjective + objective), in the same order as `texts`.
fppAll_Ct = [
    sum(letter.count(pattern) for pattern in pronounAll)
    for letter in texts
]

print(len(fppAll_Ct))
print(fppAll_Ct[0:9])

492
[134, 135, 97, 78, 89, 75, 41, 21, 32]


In [18]:
# Now just subjective pronouns
# One total per letter, summed over the subjective patterns only, in the
# same order as `texts`.
fppSub_Ct = [
    sum(letter.count(pattern) for pattern in pronounSub)
    for letter in texts
]

print(len(fppSub_Ct))
print(fppSub_Ct[0:9])

492
[104, 99, 76, 60, 70, 59, 34, 17, 26]


In [19]:
# Now just objective pronouns
# One total per letter, summed over the objective patterns only, in the
# same order as `texts`.
# NOTE(review): str.count matches raw substrings, so " me " does not count a
# pronoun directly followed by punctuation ("me.", "me!") -- confirm this is
# acceptable for the analysis.
fppObj_Ct = [
    sum(letter.count(pattern) for pattern in pronounObj)
    for letter in texts
]

print(len(fppObj_Ct))
print(fppObj_Ct[0:9])

492
[30, 36, 21, 18, 19, 16, 7, 4, 6]


## Add new variables to metadata

In [20]:
# New per-letter variables, keyed by the column name they are stored under.
# Joining a dict iterates over its keys, so `mentsDis` keeps only the entity
# strings (comma-separated), not their counts.
new_columns = {
    'mentsDis': [', '.join(names) for names in mentsDis],
    'mentsTot': mentsTot,
    'indsTot': indsTot,
    'fppAll_Ct': fppAll_Ct,
    'fppSub_Ct': fppSub_Ct,
    'fppObj_Ct': fppObj_Ct,
}

# Attach them to the metadata frame in the order listed above
for column_name, values in new_columns.items():
    df[column_name] = values

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 492 entries, 0 to 491
Data columns (total 32 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          492 non-null    int64  
 1   docauthorid       492 non-null    object 
 2   docauthorname     492 non-null    object 
 3   docid             492 non-null    object 
 4   docyear           489 non-null    float64
 5   docmonth          477 non-null    float64
 6   authorgender      492 non-null    object 
 7   agewriting        380 non-null    float64
 8   agedeath          365 non-null    float64
 9   relMin            396 non-null    object 
 10  nationalOrigin    491 non-null    object 
 11  authorLocation    492 non-null    object 
 12  U                 442 non-null    object 
 13  M                 442 non-null    object 
 14  S                 442 non-null    object 
 15  F                 442 non-null    object 
 16  L                 442 non-null    object 
 1

In [21]:
# Preview the first rows, including the newly added columns.
df.head()

Unnamed: 0,docID-AT,docauthorid,docauthorname,docid,docyear,docmonth,authorgender,agewriting,agedeath,relMin,...,totalTokens,uniqueTokens,lexicalDiversity,topicNumber,mentsDis,mentsTot,indsTot,fppAll_Ct,fppSub_Ct,fppObj_Ct
0,1,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D002,1872.0,11.0,F,22.0,91.0,True,...,1998,773,0.386887,4,"Darling Sister, Justina, Sister M Louis, Mothe...",37,31,134,104,30
1,2,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D004,1872.0,12.0,F,22.0,91.0,True,...,2474,841,0.339935,5,"Sisters, Mass, Sister Gabriella, Martha, Siste...",30,17,135,99,36
2,3,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D005,1872.0,12.0,F,22.0,91.0,True,...,2281,828,0.362999,5,"Kit Carson, Mrs Mullen, Seller, Otero, leastwi...",24,22,97,76,21
3,4,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D006,1872.0,12.0,F,22.0,91.0,True,...,2272,863,0.379842,5,"Ida Chené, Mrs Chené, Mass, Et introibo, Kyrie...",35,17,78,60,18
4,5,per0001043,"Segale, Sister Blandina, 1850-1941",S1019-D007,1873.0,3.0,F,23.0,91.0,True,...,2117,805,0.380255,5,"Sister Marcella, Bishop Salpointe, Sisters, Si...",17,12,89,70,19


In [22]:
# Write the frame, including the new NER and pronoun-count columns, to CSV.
OUTPUT_PATH = "20240414_PhD_FinalData-Ltr.csv"
df.to_csv(OUTPUT_PATH)