In [25]:
import pandas as pd
import spacy
from spacy import displacy
import re

## NER TOKEN GUIDE

PERSON:      People, including fictional. <br>
NORP:        Nationalities or religious or political groups.<br>
FAC:         Buildings, airports, highways, bridges, etc.<br>
ORG:         Companies, agencies, institutions, etc.<br>
GPE:         Countries, cities, states.<br>
LOC:         Non-GPE locations, mountain ranges, bodies of water.<br>
PRODUCT:     Objects, vehicles, foods, etc. (Not services.)<br>
EVENT:       Named hurricanes, battles, wars, sports events, etc.<br>
WORK_OF_ART: Titles of books, songs, etc.<br>
LAW:         Named documents made into laws.<br>
LANGUAGE:    Any named language.<br>
DATE:        Absolute or relative dates or periods.<br>
TIME:        Times smaller than a day.<br>
PERCENT:     Percentage, including “%”.<br>
MONEY:       Monetary values, including unit.<br>
QUANTITY:    Measurements, as of weight or distance.<br>
ORDINAL:     “first”, “second”, etc.<br>
CARDINAL:    Numerals that do not fall under another type.<br>

In [49]:
# Colab-only step: upload the translated CSV from the local machine into the runtime.
# The return value of the upload call is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving Translated_conventbook_ChatGPT.csv to Translated_conventbook_ChatGPT.csv


In [50]:
# Read the uploaded translation CSV into a DataFrame and preview its first rows
csv_path = '/content/Translated_conventbook_ChatGPT.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,id,year,item_raw,item,nchar,charcum,groupanalysis,token_count,text_chunks,num_chunks,GPT_translated_text
0,1,1475,C [5v],<item_start> C [5v] <item_end>,30,30,1.0,15,['C [5v]'],1,"It looks like you have provided ""C [5v]"" but t..."
1,2,1475,"in dem 14xli iar ward ich, sr. Engel Varnbler...","<item_start> in dem 14xli iar ward ich, sr. En...",574,604,1.0,239,"['in dem 14xli iar ward ich, sr. Engel Varnb\u...",1,"In the year 1441, I, Sister Engel Varnblerin,..."
2,3,1475,in dem lviiij iar fiengend wir ain gemaind ain...,<item_start> in dem lviiij iar fiengend wir ai...,317,921,1.0,137,['in dem lviiij iar fiengend wir ain gemaind a...,1,"In the 59th year, a community was founded. Thi..."
3,4,1475,do wz priorin sr. Anna Krumin vnd sr. Vrsel Äb...,<item_start> do wz priorin sr. Anna Krumin vnd...,615,1536,2.0,273,['do wz priorin sr. Anna Krumin vnd sr. Vrsel ...,1,to the prior Sister Anna Krumin and the subpri...
4,5,1475,in dem 14lxxvj iar starb nser erwirdigi mter...,<item_start> in dem 14lxxvj iar starb nser er...,192,1728,2.0,92,['in dem 14lxxvj iar starb \uf050nser erwirdig...,1,"""In the year 1476, our revered mother prioress..."


All functions needed for the NLP task using spaCy

In [51]:
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is what perform_ner calls on each text
nlp = spacy.load("en_core_web_sm")

Run this cell only for the Convent Book dataset.

In [53]:
# Phrases whose presence marks a translation row for exclusion
phrases_to_filter = ["text", "the text", "this text"]

# Flag rows whose translation contains any of the phrases
# (case-insensitive regex alternation; missing values count as "no match")
contains_phrase = dataset['GPT_translated_text'].str.contains(
    '|'.join(phrases_to_filter), case=False, na=False
)

# Invert the flag so that True means "keep this row"
mask = ~contains_phrase

# New DataFrame holding only the rows that passed the filter
filtered_dataset = dataset.loc[mask]

In [54]:
def preprocess_text(text):
    """Collapse runs of whitespace in ``text`` into single spaces and trim both ends.

    Args:
        text: The text to normalise. ``None`` and float ``NaN`` (e.g. an empty
            cell in a DataFrame column) are treated as missing and yield an
            empty string instead of raising ``TypeError``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The whitespace-normalised text, or ``""`` for missing input.
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # Remove excessive whitespace
    return re.sub(r'\s+', ' ', text).strip()

In [55]:
# Pull the translated texts out of `dataset` as a plain Python list.
# NOTE(review): this reads `dataset`, not the `filtered_dataset` built in the filtering cell — confirm that is intended.
texts = dataset['GPT_translated_text'].tolist()

In [56]:
# Normalise every text with preprocess_text, keeping the original order
preprocessed_texts = list(map(preprocess_text, texts))

In [57]:
def perform_ner(text):
    """Run the spaCy pipeline ``nlp`` on ``text`` and list the entities it found.

    Returns:
        list: One ``(entity_text, entity_label)`` tuple per entity in
        ``doc.ents``, in the order spaCy reports them.
    """
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

In [58]:
# Run NER over every preprocessed text; one result list per text, same order as the input
all_ner_results = [perform_ner(text) for text in preprocessed_texts]

In [59]:
# Store each row's entity list in a new column
dataset['NER_spaCy'] = all_ner_results

# Preview the translation next to the entities extracted from it
spacy_preview = dataset[['GPT_translated_text', 'NER_spaCy']].head()
print(spacy_preview)

                                 GPT_translated_text  \
0  It looks like you have provided "C [5v]" but t...   
1  In the year 1441, I, Sister Engel Varnblerin,...   
2  In the 59th year, a community was founded. Thi...   
3  to the prior Sister Anna Krumin and the subpri...   
4  "In the year 1476, our revered mother prioress...   

                                           NER_spaCy  
0              [(5v, CARDINAL), (English, LANGUAGE)]  
1  [(the year 1441, DATE), (Engel Varnblerin, PE...  
2  [(the 59th year, DATE), (About 10, CARDINAL), ...  
3  [(Anna Krumin, PERSON), (Ursula Äberli, PERSON...  
4  [(the year 1476, DATE), (the day, DATE), (St. ...  


In [60]:
# Plain-text preview of the first rows, which now include the NER_spaCy column
print(dataset.head())

   id  year                                           item_raw  \
0   1  1475                                             C [5v]   
1   2  1475  in dem 14xli iar ward ich, sr. Engel Varnbler...   
2   3  1475  in dem lviiij iar fiengend wir ain gemaind ain...   
3   4  1475  do wz priorin sr. Anna Krumin vnd sr. Vrsel Äb...   
4   5  1475  in dem 14lxxvj iar starb nser erwirdigi mter...   

                                                item  nchar  charcum  \
0                     <item_start> C [5v] <item_end>     30       30   
1  <item_start> in dem 14xli iar ward ich, sr. En...    574      604   
2  <item_start> in dem lviiij iar fiengend wir ai...    317      921   
3  <item_start> do wz priorin sr. Anna Krumin vnd...    615     1536   
4  <item_start> in dem 14lxxvj iar starb nser er...    192     1728   

   groupanalysis  token_count  \
0            1.0           15   
1            1.0          239   
2            1.0          137   
3            2.0          273   
4    

In [61]:
def extract_entities(ner_results, entity_type):
    """Return the texts of all entities in ``ner_results`` labelled ``entity_type``.

    Args:
        ner_results: Iterable of entity records indexed as ``(text, label)``.
        entity_type: Label to keep, e.g. ``'PERSON'``.

    Returns:
        list: The matching entity texts, in their original order.
    """
    matches = []
    for entity in ner_results:
        if entity[1] == entity_type:
            matches.append(entity[0])
    return matches

# Add one column per entity type of interest; each cell holds that row's matching entity texts
spacy_entity_columns = {
    'spaCy_PERSON': 'PERSON',
    'spaCy_ORG': 'ORG',
    'spaCy_GPE': 'GPE',  # geopolitical entities
    'spaCy_LOC': 'LOC',
}
for column_name, label in spacy_entity_columns.items():
    dataset[column_name] = dataset['NER_spaCy'].apply(extract_entities, entity_type=label)
#dataset['EVENT'] = dataset['NER_spaCy'].apply(lambda x: extract_entities(x, 'EVENT'))
#dataset['WORK_OF_ART'] = dataset['NER_spaCy'].apply(lambda x: extract_entities(x, 'WORK_OF_ART'))
#dataset['PRODUCT'] = dataset['NER_spaCy'].apply(lambda x: extract_entities(x, 'PRODUCT'))

In [62]:
# Preview the first rows, now with the per-type spaCy entity columns
dataset.head()

Unnamed: 0,id,year,item_raw,item,nchar,charcum,groupanalysis,token_count,text_chunks,num_chunks,GPT_translated_text,NER_spaCy,spaCy_PERSON,spaCy_ORG,spaCy_GPE,spaCy_LOC
0,1,1475,C [5v],<item_start> C [5v] <item_end>,30,30,1.0,15,['C [5v]'],1,"It looks like you have provided ""C [5v]"" but t...","[(5v, CARDINAL), (English, LANGUAGE)]",[],[],[],[]
1,2,1475,"in dem 14xli iar ward ich, sr. Engel Varnbler...","<item_start> in dem 14xli iar ward ich, sr. En...",574,604,1.0,239,"['in dem 14xli iar ward ich, sr. Engel Varnb\u...",1,"In the year 1441, I, Sister Engel Varnblerin,...","[(the year 1441, DATE), (Engel Varnblerin, PE...","[Engel Varnblerin, Saint Margaret, Elsbet Rai...",[],[Milan],[]
2,3,1475,in dem lviiij iar fiengend wir ain gemaind ain...,<item_start> in dem lviiij iar fiengend wir ai...,317,921,1.0,137,['in dem lviiij iar fiengend wir ain gemaind a...,1,"In the 59th year, a community was founded. Thi...","[(the 59th year, DATE), (About 10, CARDINAL), ...",[],[],[],[]
3,4,1475,do wz priorin sr. Anna Krumin vnd sr. Vrsel Äb...,<item_start> do wz priorin sr. Anna Krumin vnd...,615,1536,2.0,273,['do wz priorin sr. Anna Krumin vnd sr. Vrsel ...,1,to the prior Sister Anna Krumin and the subpri...,"[(Anna Krumin, PERSON), (Ursula Äberli, PERSON...","[Anna Krumin, Ursula Äberli, Barbara Küchinmai...","[Sister Elsbet Blarerin, Sister Agnes Burgower...",[Pentecost],[]
4,5,1475,in dem 14lxxvj iar starb nser erwirdigi mter...,<item_start> in dem 14lxxvj iar starb nser er...,192,1728,2.0,92,['in dem 14lxxvj iar starb \uf050nser erwirdig...,1,"""In the year 1476, our revered mother prioress...","[(the year 1476, DATE), (the day, DATE), (St. ...",[],[],[St. Martin's],[]


In [63]:
# Write the DataFrame with the spaCy NER columns to CSV, leaving out the row index
dataset.to_csv('Translated_ConventBook_ChatGPT_NER.csv', index=False)

The second approach will use BERT for NER


In [64]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [65]:
# Load pre-trained NER model and tokenizer
# Both are loaded from the same checkpoint name so they match each other.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [66]:
# Build the NER pipeline from the loaded model and tokenizer.
# With aggregation_strategy="simple" the results are per-entity dicts
# (the printed output further down shows 'entity_group' and 'score' keys).
nlp_BERT = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [67]:
def preprocess_text(text):
    """Collapse runs of whitespace in ``text`` into single spaces and trim both ends.

    Args:
        text: The text to normalise. ``None`` and float ``NaN`` (e.g. an empty
            cell in a DataFrame column) are treated as missing and yield an
            empty string instead of raising ``TypeError``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The whitespace-normalised text, or ``""`` for missing input.
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # Remove excessive whitespace
    return re.sub(r'\s+', ' ', text).strip()

In [68]:
# Re-read the translated texts from `dataset` as a plain Python list for the BERT run
texts = dataset['GPT_translated_text'].tolist()

In [69]:
# Normalise every text with preprocess_text, keeping the original order
preprocessed_texts = list(map(preprocess_text, texts))

In [70]:
def perform_ner(text):
    """Run the BERT NER pipeline ``nlp_BERT`` on ``text`` and return its result unchanged.

    NOTE(review): this reuses the name of the spaCy-based ``perform_ner`` defined
    earlier in the notebook, so that version is shadowed once this cell has run.
    """
    return nlp_BERT(text)

In [71]:
# Run the BERT pipeline over every preprocessed text; one result per text, same order as the input
BERT_ner_results = [perform_ner(text) for text in preprocessed_texts]

In [72]:
# Attach each row's BERT pipeline output as a new column
dataset['NER_BERT'] = BERT_ner_results

# Show the translation alongside the BERT entities for the first rows
bert_preview = dataset[['GPT_translated_text', 'NER_BERT']].head()
print(bert_preview)

                                 GPT_translated_text  \
0  It looks like you have provided "C [5v]" but t...   
1  In the year 1441, I, Sister Engel Varnblerin,...   
2  In the 59th year, a community was founded. Thi...   
3  to the prior Sister Anna Krumin and the subpri...   
4  "In the year 1476, our revered mother prioress...   

                                            NER_BERT  
0  [{'entity_group': 'MISC', 'score': 0.99695885,...  
1  [{'entity_group': 'PER', 'score': 0.94694614, ...  
2  [{'entity_group': 'PER', 'score': 0.96859765, ...  
3  [{'entity_group': 'PER', 'score': 0.9900964, '...  
4  [{'entity_group': 'MISC', 'score': 0.9784345, ...  


In [73]:
def extract_entities(ner_results, entity_type):
    """Return the texts of all entities in ``ner_results`` labelled ``entity_type``.

    Accepts both result shapes produced in this notebook, so the function keeps
    working for the spaCy columns as well as the BERT ones:

    * ``(text, label)`` sequences, as built by the spaCy-based NER step;
    * dicts from the Hugging Face pipeline, read via their ``'word'`` and
      ``'entity_group'`` keys.

    Args:
        ner_results: Iterable of entity records in either shape.
        entity_type: Label to keep, e.g. ``'PERSON'`` (spaCy) or ``'PER'`` (BERT).

    Returns:
        list: The matching entity texts, in their original order.
    """
    matches = []
    for entity in ner_results:
        if isinstance(entity, dict):
            text, label = entity.get('word'), entity.get('entity_group')
        else:
            text, label = entity[0], entity[1]
        if label == entity_type:
            matches.append(text)
    return matches

# Add columns for specific entity types, read from the BERT results in 'NER_BERT'
# (previously these were computed from 'NER_spaCy', duplicating the spaCy columns).
# The printed NER_BERT output uses short group labels such as 'PER' and 'MISC',
# so the BERT label names are used here instead of spaCy's.
dataset['BERT_PERSON'] = dataset['NER_BERT'].apply(lambda x: extract_entities(x, 'PER'))
dataset['BERT_ORG'] = dataset['NER_BERT'].apply(lambda x: extract_entities(x, 'ORG'))
# NOTE(review): the model name suggests a CoNLL-03 tag set (PER/ORG/LOC/MISC), which has no
# GPE label; place names would then appear under LOC and this column stays empty.
# It is kept so the exported CSV schema does not change — confirm whether to drop it.
dataset['BERT_GPE'] = dataset['NER_BERT'].apply(lambda x: extract_entities(x, 'GPE')) #geopolitical entities
dataset['BERT_LOC'] = dataset['NER_BERT'].apply(lambda x: extract_entities(x, 'LOC'))
#dataset['EVENT'] = dataset['NER_spaCy'].apply(lambda x: extract_entities(x, 'EVENT'))
#dataset['WORK_OF_ART'] = dataset['NER_spaCy'].apply(lambda x: extract_entities(x, 'WORK_OF_ART'))
#dataset['PRODUCT'] = dataset['NER_spaCy'].apply(lambda x: extract_entities(x, 'PRODUCT'))

In [74]:
# Write the DataFrame again, now with the BERT columns, leaving out the row index.
# This uses the same filename as the earlier export, so that file is overwritten.
dataset.to_csv('Translated_ConventBook_ChatGPT_NER.csv', index=False)