In [1]:
### Named Entity Recognition (NER) to identify characteristics from an arbitrary piece of text.
### Using the Flair library for NER. Paper: https://www.aclweb.org/anthology/N19-4010.pdf

In [2]:
from flair.data import Sentence
from flair.models import SequenceTagger

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

### Load Flair's pretrained NER sequence tagger.
### The 'ner' key resolves to the flair/ner-english model on the HuggingFace ModelHub;
### it is downloaded automatically and then loaded from the local ~/.flair/models cache.
tagger = SequenceTagger.load('ner')

2021-04-11 22:01:50,205 --------------------------------------------------------------------------------
2021-04-11 22:01:50,205 The model key 'ner' now maps to 'https://huggingface.co/flair/ner-english' on the HuggingFace ModelHub
2021-04-11 22:01:50,206  - The most current version of the model is automatically downloaded from there.
2021-04-11 22:01:50,206  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner/en-ner-conll03-v0.4.pt)
2021-04-11 22:01:50,207 --------------------------------------------------------------------------------
2021-04-11 22:01:50,391 loading file /Users/qasimwani/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4


In [3]:
# Load the law dataset and discard the 'Unnamed: 0' column
# (presumably a row index saved by an earlier export -- it carries no data we use).
df = pd.read_csv('law_data.csv').drop(columns='Unnamed: 0')

In [4]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,Header,State,Year,Title,Category,Topic,Summary,StateCode
0,Ala. Code 37-16-1 et seq.,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Competition and regulation,Topic: Cooperatives,Allows electric utilities to also provide broa...,(a) To the extent not otherwise authorized by ...
1,Ala. Code 37-16-2,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Other,Topic: Legislative intent,Declares that the state intends to encourage t...,(a) The Legislature finds and declares the fol...
2,Ala. Code 37-16-3,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Definitions,Topic: Definition - Broadband (As defined by F...,"Defines ""advanced communications capabilities""...",(l) Advanced Communications Capabilities. The ...
3,Ala. Code 37-16-3,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Definitions,Topic: Definition - Broadband (Other speed),Defines broadband as an internet connection th...,(5) Broadband Services. The provision of conne...
4,Ala. Code 37-16-7,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Infrastructure access,Topic: Right of way (Easements),Specifies the terms under which a property own...,(a) If the owner of an interest in real proper...


In [5]:
# Extract the Summary column as a NumPy array; these are the texts fed to the tagger.
summaries = np.array(df['Summary'])

In [6]:
def generate_entities(text):
    """
    Run the NER tagger over a piece of text and collect the entities it finds.

    Args:
        text: raw text to tag; it is wrapped in a flair ``Sentence``.

    Returns:
        A ``(tokens, labels)`` pair of lists:
        - tokens: the plain-string text of each 'ner' span, in span order.
        - labels: every entry of each span's ``labels``, flattened in span order.
        Both lists are empty when no entity is found.
    """
    sentence = Sentence(text)
    # The predictions are read back from the sentence object below.
    tagger.predict(sentence)
    spans = list(sentence.get_spans('ner'))
    tokens = [span.to_plain_string() for span in spans]
    labels = [label for span in spans for label in span.labels]
    return tokens, labels

In [7]:
# Per-summary NER results; index i corresponds to summaries[i] (and to row i of df).
tokens = []
labels = []

# Generate the NER report: one comma-separated string of entities and one of
# labels per summary, or None when nothing was found.
for summary in tqdm(summaries):
    if not isinstance(summary, str):
        # A missing summary (e.g. NaN for an empty CSV cell) cannot be tagged.
        # Record None for it so the result lists stay aligned with the rows.
        tokens.append(None)
        labels.append(None)
        continue
    tk, lb = generate_entities(summary)
    # str.join only accepts strings, while the label entries may be flair Label
    # objects; stringify every item first so the join cannot raise TypeError.
    tokens.append(", ".join(map(str, tk)) or None)
    labels.append(", ".join(map(str, lb)) or None)

  0%|          | 0/851 [00:00<?, ?it/s]

In [8]:
# Attach the per-row NER results as two new columns.
df = df.assign(NER_tokens=tokens, NER_labels=labels)

In [9]:
# Preview again to confirm the NER_tokens / NER_labels columns were added.
df.head()

Unnamed: 0,Header,State,Year,Title,Category,Topic,Summary,StateCode,NER_tokens,NER_labels
0,Ala. Code 37-16-1 et seq.,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Competition and regulation,Topic: Cooperatives,Allows electric utilities to also provide broa...,(a) To the extent not otherwise authorized by ...,,
1,Ala. Code 37-16-2,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Other,Topic: Legislative intent,Declares that the state intends to encourage t...,(a) The Legislature finds and declares the fol...,,
2,Ala. Code 37-16-3,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Definitions,Topic: Definition - Broadband (As defined by F...,"Defines ""advanced communications capabilities""...",(l) Advanced Communications Capabilities. The ...,[FCC],[ORG (0.9971)]
3,Ala. Code 37-16-3,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Definitions,Topic: Definition - Broadband (Other speed),Defines broadband as an internet connection th...,(5) Broadband Services. The provision of conne...,,
4,Ala. Code 37-16-7,Alabama,2019,Broadband Using Electric Easements Accessibili...,Category: Infrastructure access,Topic: Right of way (Easements),Specifies the terms under which a property own...,(a) If the owner of an interest in real proper...,,


In [10]:
df.to_csv('updated_law_text.csv', index=False) # save the annotated frame; index=False keeps the row index out of the file