## NER Letter Spacy

## Resources

In [1]:
import spacy
import pandas as pd
from collections import Counter
from spacy import displacy

## Get Data

In [2]:
# Load the diary data set; the first CSV column is used as the row index.
df = pd.read_csv(
    "20240628_PhD_Data4TopicModel-Diary.csv",
    index_col=0,
)

# Print a summary of the columns, their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 1 to 4
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docid             4 non-null      object 
 1   docyear           4 non-null      int64  
 2   docmonth          0 non-null      float64
 3   authorName        4 non-null      object 
 4   docauthorid       4 non-null      object 
 5   authorLocation    4 non-null      object 
 6   authorGender      4 non-null      object 
 7   nationalOrigin    3 non-null      object 
 8   irish             3 non-null      object 
 9   otherUK           3 non-null      object 
 10  relMin            4 non-null      bool   
 11  catholic          4 non-null      bool   
 12  otherChristian    4 non-null      bool   
 13  U                 4 non-null      bool   
 14  M                 4 non-null      bool   
 15  S                 4 non-null      bool   
 16  F                 4 non-null      bool   
 17  L

## Test the NER of various models on the texts

In [3]:
# The next few cells are re-run several times to compare the NER performance
# of different pre-trained spaCy models.
# Do not run this cell until after the first pass.
# NOTE(review): in the visible notebook `nlp` is assigned only here, so on a
# fresh kernel this cell has to run before the NER loop that calls nlp(item) --
# confirm what "first pass" refers to.
nlp = spacy.load("en_core_web_md")

In [4]:
# Position in `texts` of the diary to display when spot-checking a model.
# The author started with 0 and checked that one first.
chunk = 1

In [5]:
# Build the corpus: one list element per row of df, taken from the "text" column.
texts = df["text"].tolist()

# Show the diary selected by `chunk` as a quick visual check.
texts[chunk]

'May 6th Very wet morning it has stopped me from out shooting; it did not rain very late but and windy. I went down to see if the boat was all right; I found it stuck in the mud so I let it stop. A man came to borrow my gun to shoot an Eagle ha wk which he had wounded with a stick; he came back in a little while and said he could not see it but had shot a parrot. In the evening I was playing at draughts but he beat me three games so I gave it up. Sunday May 9th I went across to Hindmarsh Islan d and saw such a lot of ducks that I wished I had taken the gun with me; it is very strange you see so many on Sunday when I go out on a week day I don’t see so many. May 10th I went out very early shooting and shot a teal coming home I picke d up a Cod fish which I suppose got killed with the salt water. We gave it to a black woman. When I got back I found that poor little Herbert had died while I was away; in the afternoon I went to see if Mr Roberts would bury him. I went from there down to th

## Named entity extraction for the texts

In [6]:
# Per-diary NER summaries; position i in each list corresponds to texts[i].
mentsTot = []  # total number of PERSON mentions
mentsDis = []  # {mention text: number of occurrences}
indsTot = []   # number of distinct PERSON mention strings

for item in texts:
    # Run the loaded spaCy pipeline on this diary.
    narrative = nlp(item)

    # Collect the text of every entity labelled PERSON. This is done once per
    # document, directly from narrative.ents, so a diary with no entities
    # yields empty results instead of reusing the previous diary's values
    # (or raising NameError when it is the first diary).
    mentions = [ent.text for ent in narrative.ents if ent.label_ == 'PERSON']

    # Counter keeps keys in first-seen order; convert to a plain dict so the
    # stored value has the same type and ordering as before.
    counts = dict(Counter(mentions))

    mentsTot.append(len(mentions))
    mentsDis.append(counts)
    # Distinct mention strings, e.g. 'Tom' and 'Tom Aldom' count separately.
    indsTot.append(len(counts))

# Sanity checks: all three lists should have one entry per diary.
print(len(mentsTot))
print(len(indsTot))
print(len(mentsDis))

# Results for the first diary.
print(mentsTot[0])
print(indsTot[0])
print(mentsDis[0])


4
4
4
3801
1267
{'Mrs Roberts': 19, 'Tom': 692, 'Hiern': 3, 'Worrier': 23, 'Jeff Miller': 2, 'Jimmy Roberts': 2, 'Roberts': 24, 'Frank': 271, 'Charra': 24, 'Brandy': 9, 'Weather': 4, 'Albert Campbell': 1, 'William Hasken': 1, 'Sailed': 1, 'Watsons': 3, 'Hierns': 1, 'Hoskin': 6, 'Frank Worrier Tom': 1, 'Boys Harnessed': 1, 'S.B. Frank': 1, 'Billy': 16, 'Mare': 4, 'Mrs Higgins': 40, 'Penong Young Jackson': 1, 'Charley Beck': 2, '7.A.M. Frank': 1, 'Bookabie': 11, 'Miller': 9, 'Boarey Scrub': 1, 'Said': 1, 'Bread': 51, 'Murray': 89, 'Beadon Came': 1, 'Bishop': 12, 'Beadon': 13, 'Wenyss': 7, 'Cook': 14, 'Le Mesurier': 91, 'Scroggins': 5, 'Wemyss': 34, 'Sundy': 1, 'Bob': 22, 'Roberts Mare': 1, 'Tom Aldom': 1, 'Melville': 47, 'Andrew Hawkes': 1, 'Gaskell': 62, 'Penong': 19, 'stin': 1, 'Capt Tullock': 7, 'coole': 1, 'Mrs Higgs': 1, 'Boy': 14, '8.A.M. Tom Aldom': 1, 'Jemany': 1, 'William': 6, 'Laura Bay': 3, 'Tom nigad': 1, 'waite': 7, 'ribbon carings': 1, 'Higgins': 29, 'breake': 1, 'S. Sat': 

## Self-references

Next, count 1st person singular pronouns (subjective and objective forms only), following Tackman, A. M., Sbarra, D. A., Carey, A. L., Donnellan, M. B., Horn, A. B., Holtzman, N. S., Edwards, T. S., Pennebaker, J. W., & Mehl, M. R. (2019). Depression, Negative Emotionality, and Self-Referential Language: A Multi-Lab, Multi-Measure, and Multi-Language-Task Research Synthesis. Journal of Personality and Social Psychology, 116(5), 817–834. https://doi.org/10.1037/pspp0000187.


In [7]:
# All first-person singular patterns. The leading/trailing spaces are part of
# each pattern, because the patterns are counted as literal substrings.
# NOTE(review): the contractions use a straight apostrophe ('), while the diary
# sample shown earlier contains a curly one (’) -- confirm "I’m" etc. are not missed.
pronounAll = [
    "I ", "I'm ", "I've ", "I'll ", "I'd ",
    " me ", "Me ", " myself ", "Myself ",
]
pronounAll

['I ', "I'm ", "I've ", "I'll ", "I'd ", ' me ', 'Me ', ' myself ', 'Myself ']

In [8]:
# Subjective first-person singular patterns: "I" and its contractions,
# each with a trailing space.
pronounSub = [
    "I ",
    "I'm ",
    "I've ",
    "I'll ",
    "I'd ",
]
pronounSub

['I ', "I'm ", "I've ", "I'll ", "I'd "]

In [9]:
# Objective first-person singular patterns: "me" and "myself", in a
# space-delimited lowercase form and a capitalised form with a trailing space.
pronounObj = [" me ", "Me ", " myself ", "Myself "]
pronounObj

[' me ', 'Me ', ' myself ', 'Myself ']

## Count first-person pronouns in every text

In [10]:
# Count all first-person pronoun patterns in each diary.
# For every text, add up the non-overlapping occurrences of each pattern in
# pronounAll, giving one total per diary.
# NOTE(review): str.count matches raw substrings, so "I " also matches any
# token that ends in a capital I, and " me " misses "me" followed by
# punctuation -- confirm this is acceptable for the analysis.
fppAll_Ct = [
    sum(text.count(pattern) for pattern in pronounAll)
    for text in texts
]

print(len(fppAll_Ct))
print(fppAll_Ct[:9])

4
[2082, 1121, 1030, 913]


In [11]:
# Count only the subjective first-person pronoun patterns in each diary.
# For every text, add up the non-overlapping occurrences of each pattern in
# pronounSub, giving one total per diary.
fppSub_Ct = [
    sum(text.count(pattern) for pattern in pronounSub)
    for text in texts
]

print(len(fppSub_Ct))
print(fppSub_Ct[:9])

4
[1522, 1013, 787, 802]


In [12]:
# Count only the objective first-person pronoun patterns in each diary.
# For every text, add up the non-overlapping occurrences of each pattern in
# pronounObj, giving one total per diary.
fppObj_Ct = [
    sum(text.count(pattern) for pattern in pronounObj)
    for text in texts
]

print(len(fppObj_Ct))
print(fppObj_Ct[:9])

4
[560, 108, 243, 111]


## Add new variables to metadata

In [13]:
# Attach the per-diary NER and pronoun measures to the metadata frame.
# Joining a dict iterates over its keys, so 'mentsDis' stores only the
# distinct mention strings as one comma-separated string; the per-name counts
# are not written to the frame.
new_columns = {
    'mentsDis': [', '.join(x) for x in mentsDis],
    'mentsTot': mentsTot,
    'indsTot': indsTot,
    'fppAll_Ct': fppAll_Ct,
    'fppSub_Ct': fppSub_Ct,
    'fppObj_Ct': fppObj_Ct,
}
for column_name, column_values in new_columns.items():
    df[column_name] = column_values

# Confirm the new columns are present.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 1 to 4
Data columns (total 32 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   docid             4 non-null      object 
 1   docyear           4 non-null      int64  
 2   docmonth          0 non-null      float64
 3   authorName        4 non-null      object 
 4   docauthorid       4 non-null      object 
 5   authorLocation    4 non-null      object 
 6   authorGender      4 non-null      object 
 7   nationalOrigin    3 non-null      object 
 8   irish             3 non-null      object 
 9   otherUK           3 non-null      object 
 10  relMin            4 non-null      bool   
 11  catholic          4 non-null      bool   
 12  otherChristian    4 non-null      bool   
 13  U                 4 non-null      bool   
 14  M                 4 non-null      bool   
 15  S                 4 non-null      bool   
 16  F                 4 non-null      bool   
 17  L

In [14]:
# Display the first rows of the frame (pandas shows up to five by default).
df.head()

Unnamed: 0,docid,docyear,docmonth,authorName,docauthorid,authorLocation,authorGender,nationalOrigin,irish,otherUK,...,scoreNeg,scoreNeu,scorePos,scoreCom,mentsDis,mentsTot,indsTot,fppAll_Ct,fppSub_Ct,fppObj_Ct
1,D0002,1883,,Anne F. Richards,D0002,Australia,F,English,False,True,...,0.057039,0.89367,0.049316,-0.178757,"Mrs Roberts, Tom, Hiern, Worrier, Jeff Miller,...",3801,1267,2082,1522,560
2,D0003,1858,,Henry H. Adams,D0003,Australia,M,,,,...,0.02814,0.916614,0.053863,0.070318,"Hindmarsh Islan, Herbert, Roberts, Whiting, Mi...",574,221,1121,1013,108
3,D0007,1865,,John Hart,D0007,Australia,M,English,False,True,...,0.050477,0.896772,0.052744,-0.017036,"John Hart, Mrs Singleton, Jo hn, Farr, Daly, R...",1706,712,1030,787,243
4,D0009,1871,,Edith C. Gwynne,D0009,Australia,F,English,False,True,...,0.039007,0.910292,0.050688,0.025624,"Edith Gwynne, Julia, Willie, Furguson, Emily B...",1209,817,913,802,111


In [15]:
# Write the frame to CSV in the working directory; with pandas defaults the
# row index is written as the first column.
df.to_csv("20240701_PhD_FinalData-Diary.csv")