## NER Letter Chunk Spacy

## Resources

In [1]:
import spacy
import pandas as pd
from collections import Counter
from spacy import displacy

## Get Data

In [2]:
# Sentence data: read the chunk-level CSV, using its first column as the row index
df = pd.read_csv(filepath_or_buffer="20240701_PhD_Data4NER-DiaChk.csv", index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1023 entries, 0 to 1022
Data columns (total 31 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          1023 non-null   int64  
 1   docid             1023 non-null   object 
 2   docyear           1023 non-null   int64  
 3   docmonth          0 non-null      float64
 4   authorName        1023 non-null   object 
 5   docauthorid       1023 non-null   object 
 6   authorLocation    1023 non-null   object 
 7   authorGender      1023 non-null   object 
 8   nationalOrigin    921 non-null    object 
 9   irish             921 non-null    object 
 10  otherUK           921 non-null    object 
 11  relMin            1023 non-null   bool   
 12  catholic          1023 non-null   bool   
 13  otherChristian    1023 non-null   bool   
 14  U                 1023 non-null   bool   
 15  M                 1023 non-null   bool   
 16  S                 1023 non-null   bool   


## Test the NER of various models on the texts

In [3]:
# The next few cells are run multiple times to check the performance of various pre-trained models.
# Do not run this cell until after the first pass.
nlp = spacy.load("en_core_web_md")

In [4]:
# Index into `texts` of the item to inspect (I started with 0 and then checked others)
chunk = 1000

In [5]:
# Gather the narratives from the "text" column into a plain list (the corpus),
# then display the item selected by `chunk`
texts = df["text"].values.tolist()
texts[chunk]

'disappointed ie our trip for the time present. Go for a walky met Mrs Davey Dicks school master he is most polide. Dick has to be correcded for not taking off his hat. Papan wlo boasts that he has not eaten food for three day. he has eved by sctioon think he brandy is better this evening Jan 17 I surns out a nice cool morning 0 Pap who finds he is munch wanted at Glynde Starts. Think he Stirling stow make the mtdings bing Custard and Blinbarl stewed which Polly and I pad walked to a garden not for from Dicks school. We came nom i different wy and loager I make myself a newt hair bow and follow me boy of Bink alon Polly had given me. It is co cold that I am able to put on my winter dress We go for a nise walk little life bacd Hanson trap Orinen Hansone Beil and sun dades Mr Mrs Harvey in a good buggy and pair. The looks so frest and prett go to Mrs Games to lee her get honer Aunstand taking the hony He has a get Green beil and red woollst gloves the bees are all on his back. The home i

In [6]:
# Test on a single item
item = texts[chunk]

# Run the language model on the selected narrative
narrative = nlp(item)

# Find the mentions of people in the narrative.
# This is computed once, directly from narrative.ents, so the results are
# always defined (and simply empty) when the model finds no entities at all.
mentions = [ent.text for ent in narrative.ents if ent.label_ == "PERSON"]

# Occurrences per distinct name, in order of first appearance. Converted to a
# plain dict so it prints as a dict rather than as Counter({...}).
counts = dict(Counter(mentions))

# Distinct names mentioned in the narrative
individuals = set(mentions)

print("Number of mentions:", len(mentions), "\n")
print(counts, "\n")
print("Number of individuals:", len(individuals), "\n")
print("Individuals:", individuals, "\n")
    

Number of mentions: 14 

{'Davey Dicks': 1, 'Dick': 1, 'surns': 1, 'Polly': 1, 'Hanson': 1, 'Hansone Beil': 1, 'Mrs Harvey': 1, 'prett': 1, 'Mrs Games': 1, 'lee': 1, 'red woollst': 1, 'sil': 1, 'Mamas converiato': 1, 'getteng': 1} 

Number of individuals: 14 

Individuals: {'Mrs Games', 'Polly', 'lee', 'Dick', 'surns', 'Davey Dicks', 'prett', 'sil', 'getteng', 'Hansone Beil', 'Hanson', 'Mamas converiato', 'red woollst', 'Mrs Harvey'} 



In [7]:
# Run the model on the selected text and render only its PERSON entities
text = texts[chunk]
doc = nlp(text)
displacy.render(doc, style="ent", options = {"ents": ["PERSON"]})

## Named entity extraction for the texts

In [8]:
# Load the pre-trained model used for the extraction over all texts
nlp = spacy.load("en_core_web_md")

In [9]:
mentsTot = []  # total number of PERSON mentions, one entry per text
mentsDis = []  # {name: number of mentions}, one dict per text
indsTot = []   # number of distinct names, one entry per text

for item in texts:

    # Run the language model on this narrative
    narrative = nlp(item)

    # Find the mentions of people in the narrative.
    # These values are recomputed unconditionally for every text: a text in
    # which the model finds no entities must contribute empty results, not
    # carry over the values left behind by the previous text.
    mentions = [ent.text for ent in narrative.ents if ent.label_ == "PERSON"]

    # Occurrences per distinct name, in order of first appearance
    counts = dict(Counter(mentions))

    individuals = set(mentions)

    mentsTot.append(len(mentions))
    mentsDis.append(counts)
    indsTot.append(len(individuals))


# Each list should have one entry per text
print(len(mentsTot))
print(len(indsTot))
print(len(mentsDis))

# Results for the first text
print(mentsTot[0])
print(indsTot[0])
print(mentsDis[0])


1023
1023
1023
8
6
{'Mrs Roberts': 3, 'Tom': 1, 'Hiern': 1, 'Worrier': 1, 'Jeff Miller': 1, 'Jimmy Roberts': 1}


## Self-references

Next, count first-person singular pronouns (subjective and objective forms only), following Tackman, A. M., Sbarra, D. A., Carey, A. L., Donnellan, M. B., Horn, A. B., Holtzman, N. S., Edwards, T. S., Pennebaker, J. W., & Mehl, M. R. (2019). Depression, Negative Emotionality, and Self-Referential Language: A Multi-Lab, Multi-Measure, and Multi-Language-Task Research Synthesis. Journal of Personality and Social Psychology, 116(5), 817–834. https://doi.org/10.1037/pspp0000187.


In [10]:
# Search strings for all first-person singular pronouns: subjective forms
# first, then objective. The leading/trailing spaces are part of each string.
pronounAll = [
    "I ", "I'm ", "I've ", "I'll ", "I'd ",
    " me ", "Me ", " myself ", "Myself ",
]
pronounAll

['I ', "I'm ", "I've ", "I'll ", "I'd ", ' me ', 'Me ', ' myself ', 'Myself ']

In [11]:
# Search strings for the subjective forms only; the trailing space is part
# of each string.
pronounSub = [
    "I ",
    "I'm ",
    "I've ",
    "I'll ",
    "I'd ",
]
pronounSub

['I ', "I'm ", "I've ", "I'll ", "I'd "]

In [12]:
# Search strings for the objective forms only; the surrounding spaces are
# part of each string.
pronounObj = [" me ", "Me ", " myself ", "Myself "]
pronounObj

[' me ', 'Me ', ' myself ', 'Myself ']

## Now test

In [19]:
# Index into `texts` of the item used to test the pronoun counts
chunk = 1000

In [20]:
# Lowercasing is left disabled: the pronoun search strings are case-sensitive (e.g. "I ", "Me ").
#texts = [x.lower() for x in texts]

In [21]:
# Display the text that the pronoun counts below are tested on
texts[chunk]

'disappointed ie our trip for the time present. Go for a walky met Mrs Davey Dicks school master he is most polide. Dick has to be correcded for not taking off his hat. Papan wlo boasts that he has not eaten food for three day. he has eved by sctioon think he brandy is better this evening Jan 17 I surns out a nice cool morning 0 Pap who finds he is munch wanted at Glynde Starts. Think he Stirling stow make the mtdings bing Custard and Blinbarl stewed which Polly and I pad walked to a garden not for from Dicks school. We came nom i different wy and loager I make myself a newt hair bow and follow me boy of Bink alon Polly had given me. It is co cold that I am able to put on my winter dress We go for a nise walk little life bacd Hanson trap Orinen Hansone Beil and sun dades Mr Mrs Harvey in a good buggy and pair. The looks so frest and prett go to Mrs Games to lee her get honer Aunstand taking the hony He has a get Green beil and red woollst gloves the bees are all on his back. The home i

In [22]:
# Subjective: total occurrences of the pronounSub search strings in the
# selected text (str.count counts non-overlapping literal substring matches)
Count = sum(texts[chunk].count(pattern) for pattern in pronounSub)

print(Count)

6


In [23]:
# Objective: total occurrences of the pronounObj search strings in the
# selected text (str.count counts non-overlapping literal substring matches)
Count = sum(texts[chunk].count(pattern) for pattern in pronounObj)

print(Count)

2


In [24]:
# All pronouns: total occurrences of the pronounAll search strings in the
# selected text (str.count counts non-overlapping literal substring matches)
Count = sum(texts[chunk].count(pattern) for pattern in pronounAll)

print(Count)

8


## Now run on all

In [25]:
# Now the rest: count all first-person pronoun forms in every text.
# NOTE(review): str.count matches literal substrings, so with these
# space-delimited search strings a pronoun directly followed by punctuation
# (e.g. "me.") is not counted.
fppAll_Ct = [
    sum(item.count(pattern) for pattern in pronounAll)
    for item in texts
]

print(len(fppAll_Ct))
print(fppAll_Ct[:9])

1023
[10, 8, 8, 8, 10, 10, 4, 7, 10]


In [26]:
# Now just the subjective pronouns, counted in every text.
# NOTE(review): str.count matches literal substrings, so "I " also matches
# any other token that ends in a capital I followed by a space.
fppSub_Ct = [
    sum(item.count(pattern) for pattern in pronounSub)
    for item in texts
]

print(len(fppSub_Ct))
print(fppSub_Ct[:9])

1023
[7, 6, 7, 6, 10, 8, 1, 4, 6]


In [27]:
# Now just the objective pronouns, counted in every text.
# NOTE(review): str.count matches literal substrings, so with these
# space-delimited search strings a pronoun directly followed by punctuation
# (e.g. "me.") is not counted.
fppObj_Ct = [
    sum(item.count(pattern) for pattern in pronounObj)
    for item in texts
]

print(len(fppObj_Ct))
print(fppObj_Ct[:9])

1023
[3, 2, 1, 2, 0, 2, 3, 3, 4]


## Add new variables to metadata

In [28]:
# Attach the per-text NER and pronoun results to the metadata frame.
# Each mentsDis entry is a dict; joining a dict iterates over its keys, so the
# stored string lists the distinct names only (the per-name counts are not kept).
new_columns = {
    "mentsDis": [", ".join(names) for names in mentsDis],
    "mentsTot": mentsTot,
    "indsTot": indsTot,
    "fppAll_Ct": fppAll_Ct,
    "fppSub_Ct": fppSub_Ct,
    "fppObj_Ct": fppObj_Ct,
}
for column_name, column_values in new_columns.items():
    df[column_name] = column_values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1023 entries, 0 to 1022
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docID-AT          1023 non-null   int64  
 1   docid             1023 non-null   object 
 2   docyear           1023 non-null   int64  
 3   docmonth          0 non-null      float64
 4   authorName        1023 non-null   object 
 5   docauthorid       1023 non-null   object 
 6   authorLocation    1023 non-null   object 
 7   authorGender      1023 non-null   object 
 8   nationalOrigin    921 non-null    object 
 9   irish             921 non-null    object 
 10  otherUK           921 non-null    object 
 11  relMin            1023 non-null   bool   
 12  catholic          1023 non-null   bool   
 13  otherChristian    1023 non-null   bool   
 14  U                 1023 non-null   bool   
 15  M                 1023 non-null   bool   
 16  S                 1023 non-null   bool   


In [29]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,docID-AT,docid,docyear,docmonth,authorName,docauthorid,authorLocation,authorGender,nationalOrigin,irish,...,scoreCom,chunks,position,topicNumber,mentsDis,mentsTot,indsTot,fppAll_Ct,fppSub_Ct,fppObj_Ct
0,1,D0002,1883,,Anne F. Richards,D0002,Australia,F,English,False,...,-0.379767,447,0.002237,8,"Mrs Roberts, Tom, Hiern, Worrier, Jeff Miller,...",8,6,10,7,3
1,2,D0002,1883,,Anne F. Richards,D0002,Australia,F,English,False,...,-0.058833,447,0.004474,8,"Jeff Miller, Jimmy Roberts, Mrs Roberts, Rober...",10,7,8,6,2
2,3,D0002,1883,,Anne F. Richards,D0002,Australia,F,English,False,...,-0.137725,447,0.006711,8,"Frank, Tom, Charra, Brandy, Mrs Roberts, Weath...",9,8,8,7,1
3,4,D0002,1883,,Anne F. Richards,D0002,Australia,F,English,False,...,-0.21692,447,0.008949,8,"Mrs Roberts, Weather, Albert Campbell, William...",14,11,8,6,2
4,5,D0002,1883,,Anne F. Richards,D0002,Australia,F,English,False,...,-0.371971,447,0.011186,8,"Hoskin, Tom, Worrier, Frank, Hiern, Jeff Mille...",15,9,10,10,0


In [30]:
# Write the frame to a new CSV file
df.to_csv("20240701_PhD_FinalData-DiaryChk.csv")