Exploratory data analysis of the OntoNotes dataset, to gain insights that inform the templating of the dataset.

In [None]:
import pandas as pd
# Show up to 4000 rows when a frame is displayed.
pd.options.display.max_rows = 4000
# None means "do not truncate cell contents". The old sentinel -1 has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [5]:
# Placeholder: assign the loaded corpus here. Each element must be one
# sentence, given as rows of (word, tag) that pandas can turn into a
# two-column frame.
# NOTE(review): this comment used to say "Download CoNLL-2003", but the tags
# handled later in this notebook (PERSON, NORP, ...) are OntoNotes-style --
# confirm which corpus is meant.
conll = ""


def build_token_frame(sentences):
    """Stack per-sentence (word, tag) rows into one long token table.

    Each sentence becomes a frame with the columns ``word`` and ``tag`` plus a
    ``sentence_idx`` column holding its 0-based position in ``sentences``.
    The per-sentence row index is kept as is, so index labels repeat from one
    sentence to the next.

    When ``sentences`` is empty, an empty frame with the same three columns is
    returned instead of the ``ValueError`` raised by ``pd.concat([])``.
    """
    frames = []
    for sentence_idx, sentence in enumerate(sentences):
        frame = pd.DataFrame(sentence, columns=["word", "tag"])
        frame["sentence_idx"] = sentence_idx
        frames.append(frame)
    if not frames:
        return pd.DataFrame(columns=["word", "tag", "sentence_idx"])
    return pd.concat(frames)


ner_dataset = build_token_frame(conll)
ner_dataset.head(10)

In [6]:
# Entity types that are collapsed to the outside tag 'O'.
TAGS_TO_IGNORE = ['CARDINAL','FAC','LAW','LANGUAGE','TIME','DATE','ORDINAL','EVENT','QUANTITY','WORK_OF_ART','MONEY','PRODUCT','PERCENT']


def remote_unwanted_tags(x):
    """Return 'O' when the tag's entity type is listed in TAGS_TO_IGNORE.

    The entity type is everything after the first two characters of ``x``
    (for example the part after a ``B-`` or ``I-`` prefix). Tags of length
    one or zero, and tags with any other entity type, come back unchanged.
    """
    is_ignored = len(x) > 1 and x[2:] in TAGS_TO_IGNORE
    return 'O' if is_ignored else x

# Collapse the ignored entity types to 'O', then display sentence 3 as a spot check.
cleaned_tags = ner_dataset['tag'].apply(remote_unwanted_tags)
ner_dataset['tag'] = cleaned_tags
ner_dataset.loc[ner_dataset['sentence_idx'] == 3]

In [28]:
# One space-joined string per sentence, in order of first appearance, with
# repeated sentence strings collapsed into a single entry.
words_by_sentence = ner_dataset.groupby('sentence_idx', sort=False)['word']
sentences = words_by_sentence.agg(' '.join).unique().tolist()

In [34]:
len(sentences)
# Dump the sentence strings to disk, one per line.
with open("raw_sentences.txt", "w", encoding="utf8") as out_file:
    out_file.writelines("{}\n".format(sentence) for sentence in sentences)

#### Number of labels per tag

In [261]:
# Number of token rows carrying each distinct tag value.
ner_dataset.groupby('tag')['tag'].count()

In [264]:
# Map escaped tokens back to their literal characters. With a dict,
# Series.replace swaps whole cell values only, so ordinary words that merely
# contain one of these substrings are left untouched.
# Fix: '-LRB-' (left round bracket) used to be turned into ')' instead of '('.
TOKEN_REPLACEMENTS = {
    '-LRB-': '(',
    '-RRB-': ')',
    '``': '"',
    "''": '"',
    '/.': '.',
}
ner_dataset['word'] = ner_dataset['word'].replace(TOKEN_REPLACEMENTS)

In [265]:
from collections import Counter

# The 30 most frequent tokens across the whole table.
word_counts = Counter(ner_dataset['word'])
word_counts.most_common(30)

#### Add lead and lag words and tags to dataset_no_punct

In [267]:
import string

# Every single ASCII punctuation character plus a few multi-character tokens.
punct = list(string.punctuation)
punct.extend(["--", "''", "/."])
print(punct)

# .copy() makes dataset_no_punct an independent frame. Adding columns to a
# bare boolean-mask selection triggers pandas' SettingWithCopyWarning.
dataset_no_punct = ner_dataset[~ner_dataset.word.str.strip().isin(punct)].copy()

# Neighbouring words and tags once punctuation rows are dropped. shift() runs
# over the whole frame, so at a sentence edge the neighbour comes from the
# adjacent sentence.
dataset_no_punct['prev-word'] = dataset_no_punct['word'].shift(1)
dataset_no_punct['prev-prev-word'] = dataset_no_punct['word'].shift(2)
dataset_no_punct['next-word'] = dataset_no_punct['word'].shift(-1)
dataset_no_punct['prev-tag'] = dataset_no_punct['tag'].shift(1)
dataset_no_punct['next-tag'] = dataset_no_punct['tag'].shift(-1)
dataset_no_punct.head()

#### Add features for easier manipulation

In [268]:
# (new column, source column, shift): a positive shift looks at earlier rows,
# a negative shift at later rows.
CONTEXT_COLUMNS = [
    ('prev-word', 'word', 1),
    ('prev-prev-word', 'word', 2),
    ('next-word', 'word', -1),
    ('next-next-word', 'word', -2),
    ('prev-tag', 'tag', 1),
    ('next-tag', 'tag', -1),
]
for new_column, source_column, offset in CONTEXT_COLUMNS:
    ner_dataset[new_column] = ner_dataset[source_column].shift(offset)

#### Gather statistics on the first person token

In [269]:
# Rows of the punctuation-free table whose tag is B-PERSON.
is_person_start = dataset_no_punct['tag'] == 'B-PERSON'
bper = dataset_no_punct.loc[is_person_start]

In [270]:
# Most frequent tokens among the B-PERSON rows.
from collections import Counter

bper_word_counts = Counter(bper['word'])
bper_word_counts.most_common(20)

In [271]:
# Lowercased word that directly precedes each B-PERSON row, and its top 20.
prev_bper_token = bper['prev-word'].str.lower()
prev_token_counts = Counter(prev_bper_token)
prev_token_counts.most_common(20)

In [272]:
# Most frequent lowercased (two-back, one-back) word pairs before a B-PERSON row.
prev_prev_bper_token = bper['prev-prev-word']
lowered_context = (prev_prev_bper_token.str.lower(), prev_bper_token.str.lower())
two_prev_tokens = zip(*lowered_context)
Counter(two_prev_tokens).most_common(20)

In [273]:
# find "the" followed by B-PERSON
preceded_by_the = ner_dataset['prev-word'].str.lower() == "the"
starts_person = ner_dataset['tag'] == 'B-PERSON'
the_PERSON = ner_dataset[preceded_by_the & starts_person]

# Four-token window: the preceding word, the person token and the two words after it.
person_context = (
    the_PERSON['prev-word'] + " " + the_PERSON['word'] + " "
    + the_PERSON['next-word'] + " " + the_PERSON['next-next-word'].values
)
print(person_context)

In [296]:
## add metadata for nationalities (to differentiate between America, Americans and US citizen)
nationalities = pd.read_csv("../raw_data/nationalities.csv")
nationalities.head()

ner_dataset['metadata'] = None

# Build each lookup once as a set. The previous version rescanned the CSV
# columns (up to four of them) for every single token row.
# The pairs are checked in this order and the first match wins.
NATIONALITY_LOOKUPS = [
    ('COUNTRY', set(nationalities['country'].values)),
    ('NATIONALITY', set(nationalities['nationality'].values)),
    ('NATION_MAN', set(nationalities['man'].values)),
    ('NATION_WOMAN', set(nationalities['woman'].values)),
]


def get_nationality_as_metadata(row):
    """Return a nationality label for ``row['word']``, else the row's metadata.

    The lowercased word is looked up in the ``country``, ``nationality``,
    ``man`` and ``woman`` columns of the nationalities CSV, in that order,
    giving 'COUNTRY', 'NATIONALITY', 'NATION_MAN' or 'NATION_WOMAN'.
    If nothing matches, ``row['metadata']`` is returned unchanged.

    NOTE(review): only the token is lowercased, so this assumes the CSV
    values are stored in lowercase -- confirm against the file.
    """
    word = row['word'].lower()
    for label, known_values in NATIONALITY_LOOKUPS:
        if word in known_values:
            return label
    return row['metadata']


row = pd.Series({'word':'Frenchwoman','metadata':None})
print("Example: Frenchwoman -> ",get_nationality_as_metadata(row))

ner_dataset['metadata'] = ner_dataset.apply(get_nationality_as_metadata, axis=1)

In [297]:
# removing PERSON tags from sentences with a 'the' preceding the person:

def remove_tag_if_the_person(row):
    """Return 'O' for the first two tokens of a PERSON span introduced by "the".

    ``row`` must provide 'tag', 'prev-tag', 'prev-word' and 'prev-prev-word'.
    The tag becomes 'O' when
      * the row is B-PERSON and the previous word is "the" (any case), or
      * the row is I-PERSON, the previous row is B-PERSON and the word two
        rows back is "the" (any case).
    Every other row keeps its tag.
    """
    tag = row['tag']
    # str() keeps the comparison safe when a neighbouring word is missing (NaN).
    if tag == 'B-PERSON' and str(row['prev-word']).lower() == 'the':
        return 'O'
    # Second token of "the <B-PERSON> <I-PERSON>". The earlier version tested
    # prev-tag == 'I-PERSON' and tag == 'B-PERSON', i.e. the two tags swapped,
    # so this case was never cleared.
    if (tag == 'I-PERSON' and row['prev-tag'] == 'B-PERSON'
            and str(row['prev-prev-word']).lower() == 'the'):
        return 'O'
    return tag

def remove_tag_if_the_norp(row):
    """Return 'O' for the first two tokens of a NORP span introduced by "the".

    Only rows whose 'metadata' is None are affected. ``row`` must provide
    'tag', 'prev-tag', 'prev-word', 'prev-prev-word' and 'metadata'.
    The tag becomes 'O' when
      * the row is B-NORP and the previous word is "the" (any case), or
      * the row is I-NORP, the previous row is B-NORP and the word two rows
        back is "the" (any case).
    Every other row keeps its tag.
    """
    tag = row['tag']
    if row['metadata'] is not None:
        return tag
    # str() keeps the comparison safe when a neighbouring word is missing (NaN).
    if tag == 'B-NORP' and str(row['prev-word']).lower() == 'the':
        return 'O'
    # Second token of "the <B-NORP> <I-NORP>". The earlier version tested
    # prev-tag == 'I-NORP' and tag == 'B-NORP', i.e. the two tags swapped,
    # so this case was never cleared.
    if (tag == 'I-NORP' and row['prev-tag'] == 'B-NORP'
            and str(row['prev-prev-word']).lower() == 'the'):
        return 'O'
    return tag

# Cast the context words to str so the rules can call .lower() on every row,
# including rows where shift() left a missing value.
for context_column in ('prev-word', 'prev-prev-word'):
    ner_dataset[context_column] = ner_dataset[context_column].astype('str')

# Run the PERSON rule first, then the NORP rule on its result.
for tag_rule in (remove_tag_if_the_person, remove_tag_if_the_norp):
    ner_dataset['tag'] = ner_dataset.apply(tag_rule, axis=1)

In [299]:
# find "the" followed by B-NORP
preceded_by_the = ner_dataset['prev-word'].str.lower() == "the"
starts_norp = ner_dataset['tag'] == 'B-NORP'
the_NORP = ner_dataset[preceded_by_the & starts_norp]

# Four-token window followed by the row's metadata label in parentheses.
norp_context = (
    the_NORP['prev-word'] + " " + the_NORP['word'] + " "
    + the_NORP['next-word'] + " " + the_NORP['next-next-word'].values
    + " (" + the_NORP['metadata'] + ")"
)
print(norp_context)

In [276]:
def remove_tag_if_apostraphe_after_tag(row):
    """Return 'O' for a "'s" token whose previous tag is not 'O'.

    Any other row keeps its current tag.
    """
    if row['prev-tag'] == 'O' or row['word'] != "'s":
        return row['tag']
    return 'O'
# Apply the possessive rule to every row; the returned Series replaces the tag column.
ner_dataset['tag'] = ner_dataset.apply(remove_tag_if_apostraphe_after_tag,axis=1)

In [277]:
# Sentence ids of every row whose word is "president" (any case), then show
# the full first such sentence.
is_president = ner_dataset['word'].str.lower() == 'president'
sentences_with_president = ner_dataset.loc[is_president, 'sentence_idx']
ner_dataset[ner_dataset['sentence_idx'] == sentences_with_president.iloc[0]]

In [279]:
# Display all rows whose tag is B-PERSON.
ner_dataset[ner_dataset['tag']=='B-PERSON']

#### Adjacent tags

In [281]:
# Entity type = tag with its first two characters dropped, computed for the
# row's own tag and for the next-tag column.
for entity_column, tag_column in (('entity', 'tag'), ('next-entity', 'next-tag')):
    ner_dataset[entity_column] = ner_dataset[tag_column].str.slice(2)


In [286]:
# Rows where this token and the next are both tagged and their entity types differ.
has_entity = ner_dataset['tag'] != 'O'
next_has_entity = ner_dataset['next-tag'] != 'O'
entity_changes = ner_dataset['entity'] != ner_dataset['next-entity']
adjacent_idc = has_entity & next_has_entity & entity_changes

adjacent_rows = ner_dataset[adjacent_idc]
# The printed figure is the number of matching token rows.
print("sentences with duplicate different entities: ", str(len(adjacent_rows)))
adjacent_rows['sentence_idx']


In [289]:
# Display every token row of sentence 8759 (a hard-coded sentence_idx).
ner_dataset[ner_dataset['sentence_idx']==8759]

#### NORP values

In [293]:
# The 50 most frequent words among rows whose entity type is NORP.
is_norp = ner_dataset['entity'] == 'NORP'
norp_values = ner_dataset.loc[is_norp, 'word']
Counter(norp_values).most_common(50)

### The country?

In [311]:
# Tagged tokens directly preceded by the word "the".
# NOTE(review): this comparison is case-sensitive, unlike the other "the"
# look-ups in this notebook, which lowercase prev-word first -- confirm that
# a capitalised "The" should be excluded here.
is_tagged = ner_dataset['tag'] != 'O'
follows_the = ner_dataset['prev-word'] == 'the'
the_X_idx = follows_the & is_tagged
the_X_sentences = ner_dataset.loc[the_X_idx, 'sentence_idx']

# Display the whole first sentence that contains such a token.
first_the_X_sentence = the_X_sentences.values[0]
ner_dataset[ner_dataset['sentence_idx'] == first_the_X_sentence]