In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(101)

import sklearn
import tensorflow as tf

In [6]:
print(f'tensorflow version is {tf.__version__}')
print(f'sklearn version is {sklearn.__version__}')

tensorflow version is 2.14.0
sklearn version is 1.2.1


In [12]:
data = pd.read_csv("data/ner_datasetreference.csv",encoding='unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [13]:
data.shape

(1048575, 4)

In [15]:
data["Sentence #"].isna().sum()

1000616

In [17]:
data.isna().sum()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64

In [18]:
data["POS"].value_counts()

NN      145807
NNP     131426
IN      120996
DT       98454
JJ       78412
NNS      75840
.        47831
VBD      39379
,        32757
VBN      32328
VBZ      24960
CD       24695
VB       24211
CC       23716
TO       23061
RB       20252
VBG      19125
VBP      16158
PRP      13318
POS      11257
PRP$      8655
MD        6973
``        3728
WDT       3698
JJS       3034
JJR       2967
WP        2542
NNPS      2521
RP        2490
WRB       2184
$         1149
RBR       1055
:          795
RRB        679
LRB        678
EX         663
RBS        296
;          214
PDT        147
WP$         99
UH          24
FW           1
Name: POS, dtype: int64

In [46]:
data["Tag"].value_counts().index

Index(['O', 'B-geo', 'B-tim', 'B-org', 'I-per', 'B-per', 'I-org', 'B-gpe',
       'I-geo', 'I-tim', 'B-art', 'B-eve', 'I-art', 'I-eve', 'B-nat', 'I-gpe',
       'I-nat'],
      dtype='object')

In [32]:
data[data["Tag"]=="B-geo"].head()

Unnamed: 0,Sentence #,Word,POS,Tag
6,,London,NNP,B-geo
12,,Iraq,NNP,B-geo
65,,Hyde,NNP,B-geo
94,,Britain,NNP,B-geo
106,,Brighton,NNP,B-geo


In [47]:
#data[(data["Tag"]==i and data["Tag"]!="O") ]
#(data["Tag"]==i and data["Tag"]!="O")

In [62]:
for i in data["Tag"].value_counts().index[1:]:
    print(i)
    try:
        print(data[(data["Tag"]==i)].head(2))
    except:
        print()
    print()
    

B-geo
   Sentence #    Word  POS    Tag
6         NaN  London  NNP  B-geo
12        NaN    Iraq  NNP  B-geo

B-tim
    Sentence #       Word  POS    Tag
167        NaN  Wednesday  NNP  B-tim
211        NaN  Wednesday  NNP  B-tim

B-org
    Sentence #           Word  POS    Tag
97         NaN          Labor  NNP  B-org
154        NaN  International  NNP  B-org

I-per
    Sentence #         Word  POS    Tag
271        NaN      Mahmoud  NNP  I-per
272        NaN  Ahmadinejad  NNP  I-per

B-per
    Sentence #       Word  POS    Tag
42         NaN       Bush  NNP  B-per
270        NaN  President  NNP  B-per

I-org
    Sentence #    Word  POS    Tag
98         NaN   Party  NNP  I-org
155        NaN  Atomic  NNP  I-org

B-gpe
    Sentence #     Word POS    Tag
18         NaN  British  JJ  B-gpe
102        NaN  English  JJ  B-gpe

I-geo
    Sentence #   Word  POS    Tag
66         NaN   Park  NNP  I-geo
347        NaN  State  NNP  I-geo

I-tim
     Sentence # Word POS    Tag
1479        NaN   

In [51]:
data["Word"][:10]

0        Thousands
1               of
2    demonstrators
3             have
4          marched
5          through
6           London
7               to
8          protest
9              the
Name: Word, dtype: object

# Name Entity Recognition

NER is a subtask of information extraction that locates and classifies named entities in a text. The named entities could be organizations, persons, locations, times, etc.

For example,

Hydroxychloroquine has been studied for the treatment and prevention of coronavirus disease 2019. The named entities in this sentence are

- Hydroxychloroquine - Drugs
- coronavirus disease - Diseases

# Formulate an NER Problem

- Define the entity classes you want to extract. For example, gene, disease, drug, etc.
- Collect relevant text data and annotate the data
- Train NER model to predict those entities and classify them into the given categories

# Sequence Model Approach to NER
**Training**
- Collect a set of representative training examples (documents)
- Annotate each token for its entity class or other (O) using BIO encoding scheme
- Feature engineering
- Train a sequence tagger to predict the labels

**Test**
- Receive/sample a set of test examples
- Run sequence model inference to label each token
- Return the recognized entities

In natural language processing (NLP), particularly in Named Entity Recognition (NER) tasks, the BIO (Begin, Inside, Outside) tagging scheme is commonly used to annotate and encode sequences of words or tokens to indicate the boundaries of named entities within the text. The BIO scheme is used for labeling each token in a sequence as the beginning of an entity (B), inside an entity (I), or outside an entity (O).

### Encoding in the BIO Scheme:

- **B (Begin):** Indicates the beginning of an entity within a sequence. It marks the first token of a named entity.

- **I (Inside):** Indicates tokens inside an entity other than the first token. It follows a "B" tag and identifies subsequent tokens that are part of the same entity.

- **O (Outside):** Represents tokens that are not part of any named entity.

### Example of BIO Encoding:

Consider the sentence: "John Smith works at Google in New York."

For a named entity recognition task targeting entities like person names, organizations, and locations, the BIO encoding might look like this:

| Token | Tag       |
|-------|-----------|
| John  | B-Person  |
| Smith | I-Person  |
| works | O         |
| at    | O         |
| Google| B-Organization |
| in    | O         |
| New   | B-Location|
| York  | I-Location|
| .     | O         |

Here, the words "John" and "Smith" form a person entity, "Google" is an organization, and "New York" represents a location. Each token is tagged with its corresponding label using the BIO scheme, indicating the boundaries of the named entities in the text.

### Key Points:

- The BIO scheme allows for granular labeling of tokens within named entities, helping NER models to learn entity boundaries.
- It ensures that each token in a sequence is assigned a label indicating whether it is part of an entity, the beginning of an entity, or outside any entity.
- The BIO encoding scheme is commonly used in training data preparation for NER tasks, enabling the development of models that can recognize and extract named entities from text.

# Exploring the dataset
In this tutorial, ner dataset provided by Kaggle. It consists of four columns:

- sentence number
- word
- part of speech tag of the word,
- NER tag associated with each word.

Essential info about the tagged entities in the given dataset:

- geo = Geographical Entity
- org = Organization
- per = Person
- gpe = Geopolitical Entity
- tim = Time indicator
- art = Artifact
- eve = Event
- nat = Natural Phenomenon

# Data Preprocessing
We will have to preprocess the dataset, in such a way that every sentence is one row and later tokenise and encode each sentence input.

In [63]:
data=data.fillna(method='ffill')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [65]:
len(data['Tag'].value_counts())

17

In [66]:
data["Tag"].nunique()

17

We have to create a list of sentences and list of NER Tags associated with each sentence.

In [84]:
#agg_func = lambda s: [(building_name, building_number, 
#                       city,recipent,street_name,zip_code,state,country) 
#                      for building_name, building_number, city, recipent,
#                      street_name,zip_code,state,country in zip(s["building_name"].values .tolist(),
#                                                       s['building_number'].values.tolist(),
#                                                       s['city'].values.tolist(),
#                                                       s['recipent'].values.tolist(),
#                                                       s['street_name'].values.tolist(),
#                                                       s['state'].values.tolist(),
#                                                       s['country'].values.tolist(),
#                                                       s["zip_code"].values.tolist())]
#                                                       """

In [75]:
agg_func = lambda s:[(w,p,t) for w, p, t in zip(s["Word"].values.tolist(),
                                               s["POS"].values.tolist(),
                                               s["Tag"].values.tolist())] 

In [81]:
#data.groupby(['Sentence #']).count()

In [82]:
#data["Sentence #"].unique()

In [76]:
agg_func=data.groupby(['Sentence #']).apply(agg_func).reset_index().rename(columns={0:'Sentence_POS_Tag_Pair'})
agg_func.head()

Unnamed: 0,Sentence #,Sentence_POS_Tag_Pair
0,Sentence: 1,"[(Thousands, NNS, O), (of, IN, O), (demonstrat..."
1,Sentence: 10,"[(Iranian, JJ, B-gpe), (officials, NNS, O), (s..."
2,Sentence: 100,"[(Helicopter, NN, O), (gunships, NNS, O), (Sat..."
3,Sentence: 1000,"[(They, PRP, O), (left, VBD, O), (after, IN, O..."
4,Sentence: 10000,"[(U.N., NNP, B-geo), (relief, NN, O), (coordin..."


In [70]:
agg_data=data.groupby(['Address']).apply(agg_func).reset_index().rename(columns={0:'Sentence_POS_Tag_Pair'})
agg_data.head()

KeyError: 'Address'