In [None]:
import numpy as np
import pandas as pd

# Processing Training Data

Importing the Training data file

In [None]:
df = pd.read_csv("training.csv")
df.head(10)

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,21826085_A,0,We,O
1,21826085_A,0,implemented,O
2,21826085_A,0,a,O
3,21826085_A,0,two,O
4,21826085_A,0,-,O
5,21826085_A,0,step,O
6,21826085_A,0,approach,O
7,21826085_A,0,to,O
8,21826085_A,0,detect,O
9,21826085_A,0,potential,O


## Checking the dataset for null values

In [None]:
df.isnull().sum()

PMID_Type          0
Sentence_Index     0
Token             19
Tag                0
dtype: int64

In [None]:
# Replace missing values in the Token column with the literal string 'NA'.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# df['Token']: the in-place form acts on an intermediate object, is deprecated
# in recent pandas, and does nothing once Copy-on-Write is enabled.
# NOTE(review): the missing tokens are presumably literal "NA"/"null"-like
# strings that read_csv parsed as NaN — confirm against the raw file.
df['Token'] = df['Token'].fillna('NA')
df.isnull().sum()

PMID_Type         0
Sentence_Index    0
Token             0
Tag               0
dtype: int64

In [None]:
#Value counts
df.Tag.value_counts()

O                 631474
B-TRIVIAL           8821
B-SYSTEMATIC        6655
B-ABBREVIATION      4535
B-FORMULA           4443
B-FAMILY            4087
I-SYSTEMATIC        2098
I-FAMILY            1403
I-TRIVIAL           1188
I-MULTIPLE           708
B-IDENTIFIER         671
I-FORMULA            595
B-MULTIPLE           201
I-IDENTIFIER          77
I-ABBREVIATION        69
B-NO CLASS            40
I-NO CLASS            10
Name: Tag, dtype: int64

In [None]:
# Drop the rows tagged B-MULTIPLE, I-IDENTIFIER, I-ABBREVIATION, B-NO CLASS or
# I-NO CLASS because of their low counts.
# NOTE(review): I-MULTIPLE is kept although B-MULTIPLE is dropped — confirm
# that this is intended.
tags_to_drop = ['B-MULTIPLE', 'I-IDENTIFIER', 'I-ABBREVIATION', 'B-NO CLASS', 'I-NO CLASS']
# .copy() detaches the filtered frame from the original so that the later
# column assignment on df does not raise SettingWithCopyWarning.
df = df[~df['Tag'].isin(tags_to_drop)].copy()
df.Tag.value_counts()

O                 631474
B-TRIVIAL           8821
B-SYSTEMATIC        6655
B-ABBREVIATION      4535
B-FORMULA           4443
B-FAMILY            4087
I-SYSTEMATIC        2098
I-FAMILY            1403
I-TRIVIAL           1188
I-MULTIPLE           708
B-IDENTIFIER         671
I-FORMULA            595
Name: Tag, dtype: int64

In [None]:
df.head(5)

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,21826085_A,0,We,O
1,21826085_A,0,implemented,O
2,21826085_A,0,a,O
3,21826085_A,0,two,O
4,21826085_A,0,-,O


In [None]:
# Remove the trailing type suffix (e.g. "_A"; "_T" also appears in the other
# files) from PMID_Type so that only the PMID itself remains.
# Matching the suffix explicitly, instead of slicing off the last two
# characters, keeps this cell safe to re-run: a second run leaves the already
# cleaned values untouched rather than cutting into the PMID digits.
# .assign() returns a new frame, so no SettingWithCopyWarning is raised on the
# filtered df.
df = df.assign(PMID_Type=df['PMID_Type'].str.replace(r'_[A-Za-z]$', '', regex=True))
df.head(5)

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,21826085,0,We,O
1,21826085,0,implemented,O
2,21826085,0,a,O
3,21826085,0,two,O
4,21826085,0,-,O


In [None]:
# Collapse the token-level rows into one row per PMID: each remaining column
# (Sentence_Index, Token, Tag) becomes a Python list of that PMID's values.
training_df = (
    df.groupby(['PMID_Type'])
      .agg(lambda column: column.tolist())
      .reset_index()
)
training_df

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,21826085,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[We, implemented, a, two, -, step, approach, t...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,22080034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...","[Aflatoxicosis, is, a, cause, of, economic, lo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,22080035,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, aim, of, this, study, was, to, investiga...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,22080037,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Mercury, induces, the, expression, of, cycloo...","[B-SYSTEMATIC, O, O, O, O, O, O, O, B-SYSTEMAT..."
4,22258629,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Toxic, effects, of, chromium, on, tannery, wo...","[O, O, O, B-SYSTEMATIC, O, O, O, O, O, O, O, O..."
...,...,...,...,...
2911,23644214,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Synthetic, tactics, of, new, class, of, 4-ami...","[O, O, O, O, O, O, B-SYSTEMATIC, O, O, O, O, O..."
2912,23644256,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Ibuprofen, plus, Isosorbide, Dinitrate, treat...","[B-TRIVIAL, O, B-SYSTEMATIC, I-SYSTEMATIC, O, ..."
2913,23644525,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Conjugated, polymers, and, small, organic, mo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2914,23645248,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Oestrogen, action, on, thyroid, progenitor, c...","[B-FAMILY, O, O, O, O, O, O, O, O, O, O, O, O,..."


# Processing Validation Data

In [None]:
df = pd.read_csv("validation.csv")
df

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,22006095_T,0,Serotonin,B-TRIVIAL
1,22006095_T,0,receptor,O
2,22006095_T,0,2A,O
3,22006095_T,0,(,O
4,22006095_T,0,HTR2A,O
...,...,...,...,...
662202,23646356_A,0,.,O
662203,23646356_A,1,Electron,O
662204,23646356_A,1,transmission,O
662205,23646356_A,1,spectroscopy,O


In [None]:
# Apply the same cleaning steps used for the training data to the validation file.
print(df.isnull().sum())
# Fill missing tokens by assigning the result back. The previous
# print(df['Token'].fillna('NA', inplace=True)) only printed None, and the
# chained in-place call is deprecated in recent pandas.
df['Token'] = df['Token'].fillna('NA')
print(df.isnull().sum())
print(df.Tag.value_counts())
tags_to_drop = ['B-MULTIPLE', 'I-IDENTIFIER', 'I-ABBREVIATION', 'B-NO CLASS', 'I-NO CLASS']
# .copy() detaches the filtered frame so the PMID_Type assignment below does
# not raise SettingWithCopyWarning.
df = df[~df['Tag'].isin(tags_to_drop)].copy()
print(df.Tag.value_counts())
# Strip the trailing type suffix ("_T" / "_A"). Matching it explicitly keeps the
# cell safe to re-run, unlike slicing off the last two characters.
df['PMID_Type'] = df['PMID_Type'].str.replace(r'_[A-Za-z]$', '', regex=True)
df


PMID_Type          0
Sentence_Index     0
Token             30
Tag                0
dtype: int64
None
PMID_Type         0
Sentence_Index    0
Token             0
Tag               0
dtype: int64
O                 626683
B-TRIVIAL           8963
B-SYSTEMATIC        6816
B-ABBREVIATION      4521
B-FAMILY            4223
B-FORMULA           4135
I-SYSTEMATIC        2119
I-FAMILY            1579
I-TRIVIAL           1076
I-MULTIPLE           650
B-IDENTIFIER         636
I-FORMULA            430
B-MULTIPLE           188
I-IDENTIFIER          84
I-ABBREVIATION        70
B-NO CLASS            32
I-NO CLASS             2
Name: Tag, dtype: int64
O                 626683
B-TRIVIAL           8963
B-SYSTEMATIC        6816
B-ABBREVIATION      4521
B-FAMILY            4223
B-FORMULA           4135
I-SYSTEMATIC        2119
I-FAMILY            1579
I-TRIVIAL           1076
I-MULTIPLE           650
B-IDENTIFIER         636
I-FORMULA            430
Name: Tag, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PMID_Type'] = df['PMID_Type'].str[:-2]


Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,22006095,0,Serotonin,B-TRIVIAL
1,22006095,0,receptor,O
2,22006095,0,2A,O
3,22006095,0,(,O
4,22006095,0,HTR2A,O
...,...,...,...,...
662202,23646356,0,.,O
662203,23646356,1,Electron,O
662204,23646356,1,transmission,O
662205,23646356,1,spectroscopy,O


In [None]:
# One row per PMID; every other column is gathered into a list of that PMID's values.
validation_df = (
    df.groupby(['PMID_Type'])
      .agg(lambda column: column.tolist())
      .reset_index()
)
validation_df

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,22006095,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Serotonin, receptor, 2A, (, HTR2A, ), gene, p...","[B-TRIVIAL, O, O, O, O, O, O, O, O, O, O, O, B..."
1,22056334,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Unborn, children, are, exposed, to, environme...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,22079313,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, fish, ingredient, N3-docosahexaenoic, ac...","[O, O, O, B-SYSTEMATIC, I-SYSTEMATIC, O, O, O,..."
3,22082827,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, harmful, effects, of, folkloric, uses, o...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,22236017,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, flower, volatile, compounds, (, FVCs, ),...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...
2902,23644925,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Neuritogenic, and, Neuroprotective, Effects, ...","[O, O, O, O, O, O, B-FAMILY, O, O, O, O, O, O,..."
2903,23645630,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Antagonistic, Basic, Helix-Loop-Helix, /, bZI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2904,23646041,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Self, -, assembling, doxorubicin, silk, hydro...","[O, O, O, B-TRIVIAL, O, O, O, O, O, O, O, O, O..."
2905,23646355,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, efficiency, and, insight, of, global, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


# Processing Evaluation Data

In [None]:

df = pd.read_csv("evaluation.csv")
df

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,23208951_T,0,Microstructured,O
1,23208951_T,0,",",O
2,23208951_T,0,functional,O
3,23208951_T,0,PVA,B-ABBREVIATION
4,23208951_T,0,hydrogels,O
...,...,...,...,...
571148,23343596_A,8,upon,O
571149,23343596_A,8,shared,O
571150,23343596_A,8,neurochemical,O
571151,23343596_A,8,systems,O


In [None]:
# Apply the same cleaning steps used for the training data to the evaluation file.
print(df.isnull().sum())
# Fill missing tokens by assigning the result back. The previous
# print(df['Token'].fillna('NA', inplace=True)) only printed None, and the
# chained in-place call is deprecated in recent pandas.
df['Token'] = df['Token'].fillna('NA')
print(df.isnull().sum())
print(df.Tag.value_counts())
tags_to_drop = ['B-MULTIPLE', 'I-IDENTIFIER', 'I-ABBREVIATION', 'B-NO CLASS', 'I-NO CLASS']
# .copy() detaches the filtered frame so the PMID_Type assignment below does
# not raise SettingWithCopyWarning.
df = df[~df['Tag'].isin(tags_to_drop)].copy()
print(df.Tag.value_counts())
# Strip the trailing type suffix ("_T" / "_A"). Matching it explicitly keeps the
# cell safe to re-run, unlike slicing off the last two characters.
df['PMID_Type'] = df['PMID_Type'].str.replace(r'_[A-Za-z]$', '', regex=True)
df

PMID_Type          0
Sentence_Index     0
Token             11
Tag                0
dtype: int64
None
PMID_Type         0
Sentence_Index    0
Token             0
Tag               0
dtype: int64
O                 540641
B-TRIVIAL           7808
B-SYSTEMATIC        5665
B-ABBREVIATION      4059
B-FAMILY            3622
B-FORMULA           3441
I-SYSTEMATIC        1687
I-FAMILY            1450
I-TRIVIAL           1139
I-MULTIPLE           576
B-IDENTIFIER         513
I-FORMULA            204
B-MULTIPLE           195
I-ABBREVIATION        59
I-IDENTIFIER          47
B-NO CLASS            41
I-NO CLASS             6
Name: Tag, dtype: int64
O                 540641
B-TRIVIAL           7808
B-SYSTEMATIC        5665
B-ABBREVIATION      4059
B-FAMILY            3622
B-FORMULA           3441
I-SYSTEMATIC        1687
I-FAMILY            1450
I-TRIVIAL           1139
I-MULTIPLE           576
B-IDENTIFIER         513
I-FORMULA            204
Name: Tag, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PMID_Type'] = df['PMID_Type'].str[:-2]


Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,23208951,0,Microstructured,O
1,23208951,0,",",O
2,23208951,0,functional,O
3,23208951,0,PVA,B-ABBREVIATION
4,23208951,0,hydrogels,O
...,...,...,...,...
571148,23343596,8,upon,O
571149,23343596,8,shared,O
571150,23343596,8,neurochemical,O
571151,23343596,8,systems,O


In [None]:
# One row per PMID; every other column is gathered into a list of that PMID's values.
evaluation_df = (
    df.groupby(['PMID_Type'])
      .agg(lambda column: column.tolist())
      .reset_index()
)
evaluation_df

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,21723361,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Effects, of, docosahexaenoic, acid, and, meth...","[O, O, B-SYSTEMATIC, I-SYSTEMATIC, O, B-SYSTEM..."
1,21838705,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Garlic, phytochemicals, and, garlic, suppleme...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,22064666,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Endothelial, nitric, oxide, synthase, genotyp...","[O, B-SYSTEMATIC, I-SYSTEMATIC, O, O, O, O, O,..."
3,22075688,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[On, the, benefit, of, magnetic, magnesium, na...","[O, O, O, O, O, B-SYSTEMATIC, O, O, O, O, O, B..."
4,22082826,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Scrotal, hyperthermia, has, been, known, as, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...
2473,23644911,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, effect, of, intranasal, oxytocin, treatm...","[O, O, O, O, B-TRIVIAL, O, O, O, O, O, O, O, O..."
2474,23645211,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Role, of, quercetin, in, cadmium, -induced, o...","[O, O, B-TRIVIAL, O, B-SYSTEMATIC, O, O, O, O,..."
2475,23645360,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[RNA, editing, is, one, of, the, post, -, tran...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2476,23645536,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Analysis, of, the, inhibitory, activity, of, ...","[O, O, O, O, O, O, O, O, O, O, O, B-FAMILY, O,..."


# Combining all three datasets into a single dataset for simplicity

In [None]:
data = pd.concat([training_df, validation_df,evaluation_df], ignore_index=True)
data

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,21826085,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[We, implemented, a, two, -, step, approach, t...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,22080034,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...","[Aflatoxicosis, is, a, cause, of, economic, lo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,22080035,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, aim, of, this, study, was, to, investiga...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,22080037,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Mercury, induces, the, expression, of, cycloo...","[B-SYSTEMATIC, O, O, O, O, O, O, O, B-SYSTEMAT..."
4,22258629,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Toxic, effects, of, chromium, on, tannery, wo...","[O, O, O, B-SYSTEMATIC, O, O, O, O, O, O, O, O..."
...,...,...,...,...
8296,23644911,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, effect, of, intranasal, oxytocin, treatm...","[O, O, O, O, B-TRIVIAL, O, O, O, O, O, O, O, O..."
8297,23645211,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Role, of, quercetin, in, cadmium, -induced, o...","[O, O, B-TRIVIAL, O, B-SYSTEMATIC, O, O, O, O,..."
8298,23645360,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[RNA, editing, is, one, of, the, post, -, tran...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
8299,23645536,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Analysis, of, the, inhibitory, activity, of, ...","[O, O, O, O, O, O, O, O, O, O, O, B-FAMILY, O,..."


Let's create a DataFrame in which each row corresponds to one document of the combined dataset (all tokens that share a PMID). Each row holds the space-joined text and a list of the tagged tokens, where every entry records the start and end character positions of the token, the token itself, and its entity tag. For example, in the second row, the word 'aflatoxin' is tagged as 'B-FAMILY' at character positions 197-206, 613-622, 741-750, 1107-1116, 1342-1351, and 1420-1429.

In [None]:
# For every row of `data`, build the space-joined text and the list of tagged
# tokens together with their character offsets inside that text.
words_list = data['Token']
tags_list = data['Tag']
sentence_list = []
word_tag_list = []

for doc_tokens, doc_tags in zip(words_list, tags_list):
    sentence_list.append(' '.join(doc_tokens))

    tagged_spans = []
    offset = 0  # start position of the current token in the joined text
    for token, tag in zip(doc_tokens, doc_tags):
        stop = offset + len(token)
        # Only tokens that carry an entity tag are recorded.
        if tag != 'O':
            tagged_spans.append([offset, stop, token, tag])
        # +1 accounts for the single space that ' '.join puts between tokens.
        offset = stop + 1
    word_tag_list.append(tagged_spans)

full_data = pd.DataFrame({"sentence_list": sentence_list, "Tag_list": word_tag_list})
full_data

Unnamed: 0,sentence_list,Tag_list
0,We implemented a two - step approach to detect...,"[[974, 985, haloperidol, B-TRIVIAL]]"
1,Aflatoxicosis is a cause of economic losses in...,"[[197, 206, aflatoxin, B-FAMILY], [613, 622, a..."
2,The aim of this study was to investigate the e...,"[[268, 274, copper, B-SYSTEMATIC], [557, 564, ..."
3,Mercury induces the expression of cyclooxygena...,"[[0, 7, Mercury, B-SYSTEMATIC], [65, 71, nitri..."
4,Toxic effects of chromium on tannery workers a...,"[[17, 25, chromium, B-SYSTEMATIC], [71, 79, Ch..."
...,...,...
8296,The effect of intranasal oxytocin treatment on...,"[[25, 33, oxytocin, B-TRIVIAL], [426, 434, Oxy..."
8297,Role of quercetin in cadmium -induced oxidativ...,"[[8, 17, quercetin, B-TRIVIAL], [21, 28, cadmi..."
8298,RNA editing is one of the post - transcription...,"[[411, 421, lincomycin, B-TRIVIAL], [430, 441,..."
8299,Analysis of the inhibitory activity of Abeliop...,"[[89, 95, aldose, B-FAMILY], [178, 186, phenol..."


In [None]:
full_data.head(5)

Unnamed: 0,sentence_list,Tag_list
0,We implemented a two - step approach to detect...,"[[974, 985, haloperidol, B-TRIVIAL]]"
1,Aflatoxicosis is a cause of economic losses in...,"[[197, 206, aflatoxin, B-FAMILY], [613, 622, a..."
2,The aim of this study was to investigate the e...,"[[268, 274, copper, B-SYSTEMATIC], [557, 564, ..."
3,Mercury induces the expression of cyclooxygena...,"[[0, 7, Mercury, B-SYSTEMATIC], [65, 71, nitri..."
4,Toxic effects of chromium on tannery workers a...,"[[17, 25, chromium, B-SYSTEMATIC], [71, 79, Ch..."


In [None]:
full_data.to_csv('full_data.csv')

Creating a list, `valid_data_list`, that stores the data in a format suitable for training an NER model. Each item holds the text and its entities, where every entity is recorded as [start, end, token, tag].

In [None]:
# Turn each row of full_data into a {'text', 'entities'} record, where
# 'entities' is that row's list of [start, end, token, tag] entries.
# Built in a single pass: the earlier version first stored an empty list under
# 'entities' and then immediately overwrote it.
valid_data_list = [
    {'text': text, 'entities': entities}
    for text, entities in zip(full_data['sentence_list'], full_data['Tag_list'])
]

In [None]:
valid_data_list[5]

{'text': 'Preliminary investigation of a number of plant extracts for allelopathic activity using seed germination inhibition bioassay showed a promising activity of the water extract of the aerial parts of Mikania scandens . Activity - guided fractionation of the M . scandens extract led to the isolation of the highly allelopathic active compound mikanolide , with minimum inhibitory concentration of 0.083 µM mL(-1) . As M . scandens is a highly abundant invasive plant in Sri Lanka and other South Asian countries , this plant could be developed as an environment friendly natural herbicide , either in crude form as shredded plant material or as pure mikanolide , which is the major constituent ( ∼0.02% ) in the plant .',
 'entities': [[340, 350, 'mikanolide', 'B-TRIVIAL'],
  [647, 657, 'mikanolide', 'B-TRIVIAL']]}

Splitting Data

In [None]:
#!pip install scikit-learn
from sklearn.model_selection import train_test_split

# Hold out 25% of the combined rows for validation; the fixed random_state
# makes the split repeatable. Each part is renumbered from 0.
df_train, df_validation = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.25, random_state=101)
)

# Persist both splits. to_csv also writes the index, which shows up as an
# 'Unnamed: 0' column when the files are loaded again.
df_train.to_csv('df_train.csv')
df_validation.to_csv('df_val.csv')

print(df_train)
print(df_validation)

     PMID_Type                                     Sentence_Index  \
0     23600432  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1     23583456  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2     23264448  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3     23349489  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
4     23528611  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
...        ...                                                ...   
6220  23194512  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...   
6221  23623417  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
6222  23578522  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
6223  23361383  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
6224  23400943  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...   

                                                  Token  \
0     [Introduction, :, 5-Lipoxygenase, (, 5-LO, ), ...   
1     [A, comprehensive, machine, -, readable, view,.

# Now that the data is ready, we start with modeling

In [None]:
#!pip install torch
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
!pip install transformers datasets tokenizers seqeval[gpu] -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m14.9 MB/s[0

In [None]:
!pip install datasets
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from datasets import load_dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
dataset = load_dataset('csv', data_files={'train': "df_train.csv", 'validation': 'df_val.csv'})
dataset

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-99e0537640f90b9c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-99e0537640f90b9c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'PMID_Type', 'Sentence_Index', 'Token', 'Tag'],
        num_rows: 6225
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'PMID_Type', 'Sentence_Index', 'Token', 'Tag'],
        num_rows: 2076
    })
})

In [None]:
from ast import literal_eval
import datasets

def convert_to_list_of_tokens(example):
    """Parse the stringified "Token" and "Tag" lists of one example back into lists.

    The example is updated in place and returned. Other fields, including
    "Sentence_Index", are left as they are.
    """
    for column in ("Token", "Tag"):
        example[column] = literal_eval(example[column])
    return example

# Apply the conversion function to your dataset
Annotated_dataset = dataset.map(convert_to_list_of_tokens)
Annotated_dataset

Map:   0%|          | 0/6225 [00:00<?, ? examples/s]

Map:   0%|          | 0/2076 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'PMID_Type', 'Sentence_Index', 'Token', 'Tag'],
        num_rows: 6225
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'PMID_Type', 'Sentence_Index', 'Token', 'Tag'],
        num_rows: 2076
    })
})

In [None]:
Annotated_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'PMID_Type', 'Sentence_Index', 'Token', 'Tag'],
        num_rows: 6225
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'PMID_Type', 'Sentence_Index', 'Token', 'Tag'],
        num_rows: 2076
    })
})

In [None]:
# Every distinct tag that occurs in the training split, in sorted order.
label_list = sorted({tag for doc_tags in Annotated_dataset["train"]["Tag"] for tag in doc_tags})
label_list

['B-ABBREVIATION',
 'B-FAMILY',
 'B-FORMULA',
 'B-IDENTIFIER',
 'B-SYSTEMATIC',
 'B-TRIVIAL',
 'I-FAMILY',
 'I-FORMULA',
 'I-MULTIPLE',
 'I-SYSTEMATIC',
 'I-TRIVIAL',
 'O']

In [None]:
from datasets import ClassLabel, Sequence

#label_list = sorted(list(set([tag for tags in Annotated_dataset["train"]["Tag"] for tag in tags])))

# Build the tag <-> id mapping once, rather than constructing a new ClassLabel
# for every example inside the mapped function.
tag_feature = ClassLabel(names=label_list)

# Convert each example's string tags to integer ids and store them in "Tag".
Annotated_dataset = Annotated_dataset.map(
    lambda example: {"Token": example["Token"], "Tag": tag_feature.str2int(example["Tag"])}
)

# Record the label names in the dataset schema. Assigning into
# Annotated_dataset["train"].features["Tag"] does not change the dataset (the
# feature stayed a plain int64 sequence), so the column is cast instead, for
# every split.
Annotated_dataset = Annotated_dataset.cast_column("Tag", Sequence(ClassLabel(names=label_list)))

Map:   0%|          | 0/6225 [00:00<?, ? examples/s]

Map:   0%|          | 0/2076 [00:00<?, ? examples/s]

In [None]:
Annotated_dataset["train"].features["Tag"]

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

# MODEL 1- BERT base uncased

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the first training example (already split into words) and look at
# how its sub-word tokens map back to the original words.
example_text = Annotated_dataset['train'][0]
tokenized_input = tokenizer(example_text["Token"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()

# As we can see, word_ids() returns a list with the same number of elements as
# our processed input ids, mapping special tokens to None and all other tokens
# to their respective word. This way, we can align the labels with the
# processed input ids.
# (Written as comments: a bare triple-quoted string at the end of the cell is an
# expression, so the notebook echoed it as cell output.)
print(word_ids)

[None, 0, 1, 2, 2, 2, 2, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, 19, 20, 21, 22, 23, 23, 24, 24, 24, 24, 24, 25, 26, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 38, 39, 40, 41, 41, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 53, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 72, 72, 73, 74, 75, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 93, 93, 93, 93, 94, 95, 96, 96, 96, 97, 98, 99, 100, 101, 102, 102, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 114, 115, 116, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 128, 129, 129, 129, 129, 130, 131, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 155, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 1

' As we can see, it returns a list with the same number of elements as our processed input ids, \nmapping special tokens to None and all other tokens to their respective word. \nThis way, we can align the labels with the processed input ids. '

In [None]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'introduction',
 ':',
 '5',
 '-',
 'lip',
 '##ox',
 '##y',
 '##genase',
 '(',
 '5',
 '-',
 'lo',
 ')',
 'is',
 'a',
 'crucial',
 'enzyme',
 'of',
 'the',
 'ara',
 '##chi',
 '##don',
 '##ic',
 'acid',
 '(',
 'aa',
 ')',
 'cascade',
 'and',
 'cat',
 '##aly',
 '##zes',
 'the',
 'formation',
 'of',
 'bio',
 '##active',
 'le',
 '##uk',
 '##ot',
 '##rien',
 '##es',
 '(',
 'lt',
 '##s',
 ')',
 'with',
 'the',
 'help',
 'of',
 'flap',
 ',',
 'the',
 '5',
 '-',
 'lo',
 '-',
 'act',
 '##ivating',
 'protein',
 '.',
 'lt',
 '##s',
 'are',
 'inflammatory',
 'media',
 '##tors',
 'playing',
 'a',
 'path',
 '##op',
 '##hy',
 '##sio',
 '##logical',
 'role',
 'in',
 'different',
 'diseases',
 'like',
 'asthma',
 ',',
 'allergic',
 'r',
 '##hini',
 '##tis',
 'as',
 'well',
 'as',
 'cardiovascular',
 'diseases',
 'and',
 'certain',
 'types',
 'of',
 'cancer',
 '.',
 'with',
 'the',
 'rising',
 'number',
 'of',
 'indications',
 'for',
 'anti',
 '-',
 'lt',
 'therapy',
 ',',
 '5',
 '-',
 'lo',
 '

In [None]:
len(example_text['Tag']), len(tokenized_input["input_ids"])

(233, 303)

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: batch mapping with a "Token" entry (one list of words per
            example) and a "Tag" entry (one list of per-word labels per example).
        label_all_tokens: when True, every sub-token of a word receives that
            word's label; when False, only the first sub-token is labelled and
            the remaining ones get -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        aligned labels per example.
    """
    tokenized_inputs = tokenizer(examples["Token"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["Tag"]):
        row = []
        last_word = None
        # word_ids() gives, for each sub-token, the index of the word it came
        # from, or None for special tokens.
        for word in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word is None:
                # Special token: -100 so it is ignored by the loss function.
                row.append(-100)
            elif word == last_word and not label_all_tokens:
                # Continuation sub-token of the same word, masked on request.
                row.append(-100)
            else:
                # First sub-token of a word, or a continuation that keeps the label.
                row.append(word_labels[word])
            last_word = word
        aligned_labels.append(row)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
# Peek at a one-example batch (slice 4:5 of the train split); slicing returns a dict of lists.
Annotated_dataset['train'][4:5]

{'Unnamed: 0': [4],
 'PMID_Type': [23528611],
 'Sentence_Index': ['[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]'],
 'Token': [['Epidermal',
   'growth',


In [None]:
# Run the alignment function on a one-example batch and print the raw result.
q = tokenize_and_align_labels(Annotated_dataset['train'][4:5])
print(q)

{'input_ids': [[101, 4958, 18688, 9067, 3930, 5387, 10769, 24054, 1052, 3211, 1011, 18610, 21208, 2015, 22935, 3860, 2302, 15189, 3896, 2006, 1996, 14234, 1999, 23760, 25808, 3512, 1019, 1013, 1020, 11265, 8458, 2890, 6593, 20936, 6924, 11432, 1012, 9099, 18908, 25761, 1997, 4958, 18688, 9067, 3930, 5387, 10769, 1006, 1041, 25708, 2099, 1007, 14828, 2011, 1043, 1011, 5250, 1011, 11211, 13833, 2038, 2042, 20467, 1999, 2195, 22935, 1006, 26226, 1007, 3785, 1010, 2164, 23760, 29048, 1010, 2540, 4945, 1010, 15050, 1998, 21449, 23760, 13181, 21281, 1012, 2174, 1010, 1996, 17261, 4022, 1997, 1041, 25708, 2099, 23586, 1999, 2122, 3785, 2003, 2747, 4242, 1012, 2364, 7863, 1997, 1996, 2556, 2817, 2001, 2000, 8556, 15050, 1010, 21449, 1998, 25125, 3896, 1997, 1041, 25708, 2099, 23586, 2011, 1052, 3211, 1011, 18610, 1999, 1996, 23760, 25808, 3512, 11888, 14234, 4295, 1006, 23616, 2094, 1007, 2944, 1012, 11432, 9601, 1019, 1013, 1020, 11265, 8458, 2890, 6593, 16940, 1006, 1019, 1013, 1020, 26807, 

In [None]:
# Print every sub-token of the first example next to its aligned label id.
# The token is left-aligned and padded with underscores to a width of 40.
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
ep______________________________________ 11
##ider__________________________________ 11
##mal___________________________________ 11
growth__________________________________ 11
factor__________________________________ 11
receptor________________________________ 11
inhibitor_______________________________ 11
p_______________________________________ 3
##ki____________________________________ 3
-_______________________________________ 3
166_____________________________________ 3
govern__________________________________ 11
##s_____________________________________ 11
cardiovascular__________________________ 11
protection______________________________ 11
without_________________________________ 11
beneficial______________________________ 11
effects_________________________________ 11
on______________________________________ 11
the_____________________________________ 11
kidney__________________________________ 11
in________________________________

In [None]:
## Applying on entire data
# batched=True hands the function whole batches, which is what its per-batch
# word_ids(batch_index=i) lookup expects.
tokenized_datasets = Annotated_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/6225 [00:00<?, ? examples/s]

Map:   0%|          | 0/2076 [00:00<?, ? examples/s]

In [None]:
# Inspect the first processed training example.
tokenized_datasets['train'][0]

{'Unnamed: 0': 0,
 'PMID_Type': 23600432,
 'Sentence_Index': '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]',
 'Token': ['Introduction',
  ':',
  '5-Lipoxygenase',
  '(',
  '5-LO',
  ')',
  'is',
  'a',
  'crucial',
  'enzyme',
  'of',
  'the',
  'arachidonic',
  'acid',
  '(',
  'AA',
  ')',
  'cascade',
  'and',

In [None]:
# Load bert-base-uncased with a token-classification head.
# NOTE(review): num_labels=12 is hard-coded; presumably it has to equal len(label_list) — confirm.
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=12)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [None]:
# Define the training arguments for the BERT run
from transformers import TrainingArguments, Trainer


args = TrainingArguments(
# Output directory. NOTE(review): the RoBERTa section later in this notebook
# passes the same "test-ner" value.
"test-ner",
# Evaluate once per epoch (the training output table has one row per epoch).
evaluation_strategy = "epoch",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=3,
weight_decay=0.01,
)

In [None]:
# Collator built from the tokenizer; it is handed to the Trainer as data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval metric used by compute_metrics.
# NOTE(review): the cell output echoes this line, which looks like a deprecation
# warning for load_metric — confirm against the installed datasets version.
metric = datasets.load_metric("seqeval")
example = Annotated_dataset['train'][0]

# Turn the example's integer tag ids back into tag strings via label_list.
labels = [label_list[i] for i in example["Tag"]]
labels

  metric = datasets.load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-TRIVIAL',
 'I-TRIVIAL',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-FAMILY',
 'O',
 'B-FAMILY',
 'O',
 'B-FAMILY',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O

In [None]:
# Sanity check: scoring the labels against themselves gives 1.0 for every score in the output.
metric.compute(predictions=[labels], references=[labels])

{'FAMILY': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 3},
 'TRIVIAL': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [None]:
def compute_metrics(eval_preds):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        eval_preds: pair ``(logits, label_ids)``. The class is chosen with an
            argmax over axis 2 of the logits; softmax is not needed because it
            would not change the argmax.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy" taken from the
        metric's overall_* results.
    """
    logits, gold_ids = eval_preds
    pred_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Drop every position whose gold label is -100 (masked positions).
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    results = metric.compute(predictions=predictions, references=true_labels)

    summary = {}
    for short_name in ("precision", "recall", "f1", "accuracy"):
        summary[short_name] = results["overall_" + short_name]
    return summary

In [None]:
# Wire the model, training arguments, tokenized train/validation splits,
# collator, tokenizer and metric function into a Trainer.
trainer = Trainer(
   model,
   args,
   train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)


In [None]:
# Fine-tune the model; the per-epoch validation metrics appear in the output table.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.133998,0.778049,0.78566,0.781836,0.959611
2,0.221200,0.110916,0.801344,0.846727,0.823411,0.966952
3,0.096800,0.107672,0.806504,0.864843,0.834655,0.968739


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1170, training_loss=0.147719971746461, metrics={'train_runtime': 1978.5942, 'train_samples_per_second': 9.439, 'train_steps_per_second': 0.591, 'total_flos': 4759330603765320.0, 'train_loss': 0.147719971746461, 'epoch': 3.0})

In [None]:
## Save the model trained for 3 epochs to the "ner_model_3" directory
model.save_pretrained("ner_model_3")

In [None]:
## Save the tokenizer files to the "tokenizer" directory
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
# Label maps that get written into the saved model's config.json.
# id2label keeps string keys, which is the form JSON object keys take anyway.
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
# label2id maps each tag to its integer class index: the transformers config
# documents this map as Dict[str, int], so the ids are stored as ints, not strings.
label2id = {
    label: i for i,label in enumerate(label_list)
}

# It's time for prediction!

In [None]:
# Patch the saved model's config.json so it carries the real tag names
# (id2label / label2id) instead of the defaults written by save_pretrained.
import json

config_path = "ner_model_3/config.json"
# Context managers make sure both handles are closed, and the rewritten file is
# flushed to disk, before the model is reloaded from this directory.
with open(config_path) as config_file:
    config = json.load(config_file)
config["id2label"] = id2label
config["label2id"] = label2id
with open(config_path, "w") as config_file:
    json.dump(config, config_file)

In [None]:
# Reload the fine-tuned model from the directory whose config.json was patched above.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model_3")
from transformers import pipeline

# NOTE: the name nlp is rebound to a spaCy pipeline in a later cell of this notebook.
# No aggregation option is passed, and the printed results contain one entry per
# sub-token (e.g. 'doc', '##osa', ...).
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)
example = "Effects of docosahexaenoic acid and methylmercury on child's brain development due to consumption of fish by Finnish mother during pregnancy : a probabilistic modeling approach . Fish contains both beneficial substances e.g . docosahexaenoic acids but also harmful compounds e.g . methylmercury . Importantly , the health effects caused by these two substances can be evaluated in one common end point , intelligence quotient ( IQ ) , providing a more transparent analysis . We estimated health effects of maternal fish consumption on child's central nervous system by creating a model with three alternative maternal fish consumption scenarios ( lean fish , fatty fish , and current fish consumption ) . Additionally , we analyzed impacts of both regular fish consumption and extreme fish consumption habits . At the individual level , the simulated net effects were small , encompassing a range of one IQ point in all scenarios . Fatty fish consumption , however , clearly generated a beneficial net IQ effect , and lean fish consumption evoked an adverse net IQ effect . In view of the current fish consumption pattern of Finnish mothers the benefits and risks seem to more or less compensate each other . This study clearly shows the significance of which fish species are consumed during pregnancy and lactation , and the results can be generalized to apply to typical western population fish consumption habits ."
ner_results = nlp(example)
print(ner_results)


[{'entity': 'B-SYSTEMATIC', 'score': 0.8939415, 'index': 3, 'word': 'doc', 'start': 11, 'end': 14}, {'entity': 'B-SYSTEMATIC', 'score': 0.9282967, 'index': 4, 'word': '##osa', 'start': 14, 'end': 17}, {'entity': 'B-SYSTEMATIC', 'score': 0.9481607, 'index': 5, 'word': '##he', 'start': 17, 'end': 19}, {'entity': 'B-SYSTEMATIC', 'score': 0.96345955, 'index': 6, 'word': '##xa', 'start': 19, 'end': 21}, {'entity': 'B-SYSTEMATIC', 'score': 0.9646003, 'index': 7, 'word': '##eno', 'start': 21, 'end': 24}, {'entity': 'B-SYSTEMATIC', 'score': 0.9444547, 'index': 8, 'word': '##ic', 'start': 24, 'end': 26}, {'entity': 'I-SYSTEMATIC', 'score': 0.8015042, 'index': 9, 'word': 'acid', 'start': 27, 'end': 31}, {'entity': 'B-SYSTEMATIC', 'score': 0.9878562, 'index': 11, 'word': 'methyl', 'start': 36, 'end': 42}, {'entity': 'B-SYSTEMATIC', 'score': 0.9815501, 'index': 12, 'word': '##mer', 'start': 42, 'end': 45}, {'entity': 'B-SYSTEMATIC', 'score': 0.9773691, 'index': 13, 'word': '##cu', 'start': 45, 'en

In [None]:
import random

# Pick a random validation document; no seed is set, so the choice differs between runs.
sentence_no = random.randint(0 , len(valid_data_list)- 1)
text = valid_data_list[sentence_no]['text']
entities = valid_data_list[sentence_no]['entities']
print(text, entities)

# NOTE(review): nlp is the transformers NER pipeline at this point, and the next
# cell rebinds both nlp and doc, so this result appears to be unused — confirm.
doc = nlp(text)

Synthesis and physicochemical properties of new tripodal amphiphiles bearing fatty acids as a hydrophobic group . Saturated fatty acids ( FA ) were grafted using tyrosine as a spacer group to the cyclotriphosphazene ring along with equimolar hydrophilic methoxy poly(ethylene glycol ) ( MPEG ) in cis-nongeminal way . Seven new cyclotriphosphazene amphiphiles were prepared from combinations of hydrophilic MPEGs with different molecular weights of 350 , 550 , 750 and 1000 and four different fatty acids of different hydrophobicity including , myristic , palmitic and stearic acids . These steric amphiphiles bearing fatty acids as a hydrophobic group were found to form more stable micelles with very low critical micelle concentrations ( CMC ) ( 2.95-7.80mg / L ) compared with oligopeptide analogues , and their highly hydrophobic core environment is unique and potentially useful for various biomedical applications . [[77, 82, 'fatty', 'B-FAMILY'], [83, 88, 'acids', 'I-FAMILY'], [114, 123, 'Sa

O                 631474
B-TRIVIAL           8821
B-SYSTEMATIC        6655
B-ABBREVIATION      4535
B-FORMULA           4443
B-FAMILY            4087
I-SYSTEMATIC        2098
I-FAMILY            1403
I-TRIVIAL           1188
I-MULTIPLE           708
B-IDENTIFIER         671
I-FORMULA            595

> Indented block



In [None]:
import spacy
from spacy.tokens import Span
from spacy import displacy

# Rebinds nlp (previously the transformers NER pipeline) to spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Define colors for the entity tags (one hex colour per tag string)
colors = {"B-FAMILY": "#F67DE3", "B-SYSTEMATIC": "#7DF6D9", "I-FAMILY": "#FFFFFF", "I-SYSTEMATIC" : "#2AFF00","O" : "#157403","B-TRIVIAL" : "#B6FF01",
          "B-ABBREVIATION" : "#FC6262","B-FORMULA" : "#04A094","I-TRIVIAL" : "#44F15B","I-MULTIPLE" : "#1A57D1","B-IDENTIFIER" : "#870197","I-FORMULA" : "#FF2626"}
options = {"colors": colors}

# Parse the sampled text into a spaCy Doc; the custom entities are attached to it further down.
doc = nlp(text)

# Function to merge overlapping entities
def merge_overlapping_spans(spans):
    """Fuse spans that overlap or touch into single spans.

    The spans are ordered by start (longer span first on ties). A span whose
    start is <= the end of the last kept span is fused into it: the fused span
    runs from the kept start to the larger of the two ends and takes the label
    of the later span. All other spans are kept unchanged.

    Args:
        spans: iterable of spaCy ``Span`` objects belonging to one Doc.

    Returns:
        List of spans in start order; an empty list when ``spans`` is empty
        (previously this case raised IndexError).
    """
    ordered = sorted(spans, key=lambda x: (x.start, -x.end))
    if not ordered:
        # Nothing to merge, e.g. when no entity offsets matched token boundaries.
        return []
    merged_spans = [ordered[0]]
    for span in ordered[1:]:
        last = merged_spans[-1]
        if span.start <= last.end:
            # Build the fused span on the Doc the spans came from instead of
            # relying on a module-level `doc` variable.
            merged_spans[-1] = Span(last.doc, last.start, max(last.end, span.end), label=span.label)
        else:
            merged_spans.append(span)
    return merged_spans

custom_ents = []
# Each entity record unpacks into (char start, char end, surface text, tag);
# only the offsets and the tag are used to build the span.
for start, end, label, tag in entities:
    span = doc.char_span(start, end, label=tag)
    # Records for which char_span returns None are skipped.
    if span is not None:
        custom_ents.append(span)

# Merge overlapping entities
merged_ents = merge_overlapping_spans(custom_ents)
# Replace the Doc's entities with the merged custom spans.
doc.ents = tuple(merged_ents)

# Display the text with highlighted entities
displacy.render(doc, style="ent", options=options, jupyter=True)


# MODEL 2- RoBERTa

In [None]:
from transformers import BertTokenizerFast, RobertaTokenizerFast
from transformers import RobertaForTokenClassification, RobertaTokenizer

In [None]:
from datasets import load_dataset
# Load the train and validation splits from their CSV files.
dataset = load_dataset('csv', data_files={'train': "df_train.csv", 'validation': 'df_val.csv'})



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from ast import literal_eval
import datasets

# Parse the stringified list columns of one example back into Python lists
def convert_to_list_of_tokens(example):
    """Replace the "Token" and "Tag" strings of ``example`` with parsed lists.

    Both fields hold the string form of a Python list; each is evaluated with
    ``literal_eval`` and written back in place. "Sentence_Index" is left as it is.

    Args:
        example: mapping with string-valued "Token" and "Tag" entries.

    Returns:
        The same ``example`` object, updated in place.
    """
    for column in ("Token", "Tag"):
        example[column] = literal_eval(example[column])
    return example

# Apply the conversion function to your dataset
Annotated_dataset = dataset.map(convert_to_list_of_tokens)
# Sorted vocabulary of every tag string that occurs in the training split.
label_list = sorted({tag for tags in Annotated_dataset["train"]["Tag"] for tag in tags})
from datasets import ClassLabel

# Build the ClassLabel once and reuse it, instead of constructing a new one for
# every example inside the map callback.
tag_feature = ClassLabel(names=label_list)

Annotated_dataset = Annotated_dataset.map(
    lambda example: {"Token": example["Token"], "Tag": tag_feature.str2int(example["Tag"])}
    # The string tags are converted to integer ids and stored back in the "Tag" field
)

# Updating the features for the "train" split to use ClassLabel
# NOTE(review): the cell output still reports Sequence(Value(int64)) for "Tag", so this
# assignment does not appear to change the dataset's schema — confirm whether a real
# cast (e.g. cast_column) is wanted here.
Annotated_dataset["train"].features["Tag"] = tag_feature
Annotated_dataset["train"].features["Tag"]



Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

In [None]:
# Fast RoBERTa tokenizer. add_prefix_space=True is set because the inputs are
# passed as pre-split words (is_split_into_words=True in the calls below).
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

In [None]:
# Tokenize the first training example (already split into words) to inspect the result.
example_text = Annotated_dataset['train'][0]

tokenized_input = tokenizer(example_text["Token"], is_split_into_words=True)

# Sub-token strings for the encoded ids.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

# For each sub-token, the index of the word it came from (None for special tokens,
# as the printed list shows at its first position).
word_ids = tokenized_input.word_ids()

print(word_ids)

[None, 0, 1, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 23, 24, 24, 24, 24, 24, 25, 26, 26, 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 38, 39, 40, 41, 41, 42, 43, 44, 44, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 53, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 72, 72, 73, 74, 75, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 93, 93, 94, 95, 96, 96, 96, 97, 98, 99, 100, 101, 102, 102, 102, 103, 104, 105, 106, 107, 107, 108, 109, 110, 111, 112, 113, 114, 115, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 128, 129, 129, 129, 129, 130, 131, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172,

In [None]:
# Same conversion as in the previous cell, repeated here to display the sub-tokens.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['<s>',
 'ĠIntroduction',
 'Ġ:',
 'Ġ5',
 '-',
 'L',
 'ip',
 'oxy',
 'gen',
 'ase',
 'Ġ(',
 'Ġ5',
 '-',
 'LO',
 'Ġ)',
 'Ġis',
 'Ġa',
 'Ġcrucial',
 'Ġenzyme',
 'Ġof',
 'Ġthe',
 'Ġar',
 'ach',
 'id',
 'onic',
 'Ġacid',
 'Ġ(',
 'ĠAA',
 'Ġ)',
 'Ġcascade',
 'Ġand',
 'Ġcataly',
 'zes',
 'Ġthe',
 'Ġformation',
 'Ġof',
 'Ġbio',
 'active',
 'Ġle',
 'uk',
 'ot',
 'rien',
 'es',
 'Ġ(',
 'ĠL',
 'Ts',
 'Ġ)',
 'Ġwith',
 'Ġthe',
 'Ġhelp',
 'Ġof',
 'ĠFL',
 'AP',
 'Ġ,',
 'Ġthe',
 'Ġ5',
 '-',
 'LO',
 '-',
 'activ',
 'ating',
 'Ġprotein',
 'Ġ.',
 'ĠL',
 'Ts',
 'Ġare',
 'Ġinflammatory',
 'Ġmedi',
 'ators',
 'Ġplaying',
 'Ġa',
 'Ġpath',
 'ophys',
 'iological',
 'Ġrole',
 'Ġin',
 'Ġdifferent',
 'Ġdiseases',
 'Ġlike',
 'Ġasthma',
 'Ġ,',
 'Ġallergic',
 'Ġrh',
 'in',
 'itis',
 'Ġas',
 'Ġwell',
 'Ġas',
 'Ġcardiovascular',
 'Ġdiseases',
 'Ġand',
 'Ġcertain',
 'Ġtypes',
 'Ġof',
 'Ġcancer',
 'Ġ.',
 'ĠWith',
 'Ġthe',
 'Ġrising',
 'Ġnumber',
 'Ġof',
 'Ġindications',
 'Ġfor',
 'Ġanti',
 '-',
 'LT',
 'Ġtherapy',
 'Ġ,',

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and spread word tags over sub-tokens.

    Args:
        examples: batch mapping with "Token" (a list of words per example) and
            "Tag" (a list of per-word labels per example).
        label_all_tokens: True gives every sub-token its word's label; False
            labels only the first sub-token of a word and sets the rest to -100.

    Returns:
        The tokenizer output, extended with a "labels" entry that holds one
        aligned label list per example.
    """
    tokenized_inputs = tokenizer(examples["Token"], truncation=True, is_split_into_words=True)

    def align(word_labels, word_ids):
        # Turn per-word labels into per-sub-token labels for one example.
        row, last_word = [], None
        for word in word_ids:
            if word is None:
                # set -100 as the label for special tokens
                row.append(-100)
            elif word != last_word or label_all_tokens:
                row.append(word_labels[word])
            else:
                # mask the sub-word pieces after the first one
                row.append(-100)
            last_word = word
        return row

    tokenized_inputs["labels"] = [
        align(word_labels, tokenized_inputs.word_ids(batch_index=batch_idx))
        for batch_idx, word_labels in enumerate(examples["Tag"])
    ]
    return tokenized_inputs

In [None]:
# Tokenize both splits and align their labels, batch by batch.
tokenized_datasets = Annotated_dataset.map(tokenize_and_align_labels, batched=True)
# Size the classification head from the tag vocabulary instead of a hard-coded 12,
# so it always matches the id2label / label2id maps that are built from label_list
# and written into the saved config (a mismatch would break reloading the model).
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=len(label_list))



Map:   0%|          | 0/2076 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [None]:
# Define the training arguments for the RoBERTa run
from transformers import TrainingArguments, Trainer


args = TrainingArguments(
# Output directory. NOTE(review): same "test-ner" value as in the BERT section
# earlier in this notebook.
"test-ner",
# Evaluate once per epoch (the training output table has one row per epoch).
evaluation_strategy = "epoch",
learning_rate=2e-5,
per_device_train_batch_size=10,
per_device_eval_batch_size=10,
num_train_epochs=5,
weight_decay=0.01,
)
# Collator built from the (RoBERTa) tokenizer; it is handed to the Trainer as data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval metric used by compute_metrics.
metric = datasets.load_metric("seqeval")
example = Annotated_dataset['train'][0]
# Turn the example's integer tag ids back into tag strings via label_list.
labels = [label_list[i] for i in example["Tag"]]
# Scores the labels against themselves; the result is neither stored nor the last
# expression of the cell, so it is not displayed.
metric.compute(predictions=[labels], references=[labels])
def compute_metrics(eval_preds):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        eval_preds: pair ``(logits, label_ids)``; predicted class ids are the
            argmax over axis 2 of the logits.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy" taken from the
        metric's overall_* results.
    """
    logits, gold_ids = eval_preds
    pred_ids = np.argmax(logits, axis=2)

    predictions, true_labels = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        pred_tags, gold_tags = [], []
        for pred, gold in zip(pred_row, gold_row):
            # Positions labelled -100 are excluded from scoring.
            if gold == -100:
                continue
            pred_tags.append(label_list[pred])
            gold_tags.append(label_list[gold])
        predictions.append(pred_tags)
        true_labels.append(gold_tags)

    results = metric.compute(predictions=predictions, references=true_labels)

    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

In [None]:
# Wire the RoBERTa model, training arguments, tokenized train/validation splits,
# collator, tokenizer and metric function into a Trainer, then fine-tune.
trainer = Trainer(
   model,
   args,
   train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)
# Per-epoch validation metrics appear in the output table.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2094,0.106216,0.791048,0.84366,0.816507,0.967054
2,0.0966,0.087604,0.84808,0.865754,0.856826,0.974373
3,0.0671,0.083763,0.858464,0.8857,0.871869,0.977059
4,0.0531,0.079226,0.86461,0.891811,0.878,0.978319
5,0.0333,0.080451,0.871542,0.895787,0.883498,0.979054


TrainOutput(global_step=3115, training_loss=0.08164032305415905, metrics={'train_runtime': 3189.095, 'train_samples_per_second': 9.76, 'train_steps_per_second': 0.977, 'total_flos': 7548711300331560.0, 'train_loss': 0.08164032305415905, 'epoch': 5.0})

In [None]:
## Save the model trained for 5 epochs, then patch its config with the tag names
import json
model.save_pretrained("roberta_model_5")
# NOTE(review): the BERT section of this notebook also saves its tokenizer to the
# "tokenizer" directory, so this call writes into the same location — confirm that
# is intended.
tokenizer.save_pretrained("tokenizer")
# id2label keeps string keys, which is the form JSON object keys take anyway.
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
# label2id maps each tag to its integer class index: the transformers config
# documents this map as Dict[str, int], so the ids are stored as ints, not strings.
label2id = {
    label: i for i,label in enumerate(label_list)
}

config_path = "roberta_model_5/config.json"
# Context managers make sure both handles are closed, and the rewritten file is
# flushed to disk, before the model is reloaded from this directory.
with open(config_path) as config_file:
    config = json.load(config_file)
config["id2label"] = id2label
config["label2id"] = label2id
with open(config_path, "w") as config_file:
    json.dump(config, config_file)

In [None]:
# Reload the fine-tuned RoBERTa model from the directory whose config.json was patched above.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("roberta_model_5")
from transformers import pipeline
# NOTE: the name nlp is rebound to a spaCy pipeline in a later cell of this notebook.
# No aggregation option is passed, and the printed results contain one entry per
# sub-token (e.g. 'Ġdoc', 'osa', ...).
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)
example = "Effects of docosahexaenoic acid and methylmercury on child's brain development due to consumption of fish by Finnish mother during pregnancy : a probabilistic modeling approach . Fish contains both beneficial substances e.g . docosahexaenoic acids but also harmful compounds e.g . methylmercury . Importantly , the health effects caused by these two substances can be evaluated in one common end point , intelligence quotient ( IQ ) , providing a more transparent analysis . We estimated health effects of maternal fish consumption on child's central nervous system by creating a model with three alternative maternal fish consumption scenarios ( lean fish , fatty fish , and current fish consumption ) . Additionally , we analyzed impacts of both regular fish consumption and extreme fish consumption habits . At the individual level , the simulated net effects were small , encompassing a range of one IQ point in all scenarios . Fatty fish consumption , however , clearly generated a beneficial net IQ effect , and lean fish consumption evoked an adverse net IQ effect . In view of the current fish consumption pattern of Finnish mothers the benefits and risks seem to more or less compensate each other . This study clearly shows the significance of which fish species are consumed during pregnancy and lactation , and the results can be generalized to apply to typical western population fish consumption habits ."
ner_results = nlp(example)
print(ner_results)


[{'entity': 'B-SYSTEMATIC', 'score': 0.9954053, 'index': 3, 'word': 'Ġdoc', 'start': 11, 'end': 14}, {'entity': 'B-SYSTEMATIC', 'score': 0.99550664, 'index': 4, 'word': 'osa', 'start': 14, 'end': 17}, {'entity': 'B-SYSTEMATIC', 'score': 0.9960276, 'index': 5, 'word': 'he', 'start': 17, 'end': 19}, {'entity': 'B-SYSTEMATIC', 'score': 0.99624264, 'index': 6, 'word': 'xa', 'start': 19, 'end': 21}, {'entity': 'B-SYSTEMATIC', 'score': 0.9965258, 'index': 7, 'word': 'eno', 'start': 21, 'end': 24}, {'entity': 'B-SYSTEMATIC', 'score': 0.9966859, 'index': 8, 'word': 'ic', 'start': 24, 'end': 26}, {'entity': 'I-SYSTEMATIC', 'score': 0.982547, 'index': 9, 'word': 'Ġacid', 'start': 27, 'end': 31}, {'entity': 'B-SYSTEMATIC', 'score': 0.99749076, 'index': 11, 'word': 'Ġmethyl', 'start': 36, 'end': 42}, {'entity': 'B-SYSTEMATIC', 'score': 0.9972108, 'index': 12, 'word': 'mer', 'start': 42, 'end': 45}, {'entity': 'B-SYSTEMATIC', 'score': 0.9970818, 'index': 13, 'word': 'c', 'start': 45, 'end': 46}, {'

In [None]:
import random

# Pick a random validation document; no seed is set, so the choice differs between runs.
sentence_no = random.randint(0 , len(valid_data_list)- 1)
text = valid_data_list[sentence_no]['text']
entities = valid_data_list[sentence_no]['entities']
print(text, entities)

# NOTE(review): nlp is the transformers NER pipeline at this point, and the next
# cell rebinds both nlp and doc, so this result appears to be unused — confirm.
doc = nlp(text)

Synthesis and physicochemical properties of new tripodal amphiphiles bearing fatty acids as a hydrophobic group . Saturated fatty acids ( FA ) were grafted using tyrosine as a spacer group to the cyclotriphosphazene ring along with equimolar hydrophilic methoxy poly(ethylene glycol ) ( MPEG ) in cis-nongeminal way . Seven new cyclotriphosphazene amphiphiles were prepared from combinations of hydrophilic MPEGs with different molecular weights of 350 , 550 , 750 and 1000 and four different fatty acids of different hydrophobicity including , myristic , palmitic and stearic acids . These steric amphiphiles bearing fatty acids as a hydrophobic group were found to form more stable micelles with very low critical micelle concentrations ( CMC ) ( 2.95-7.80mg / L ) compared with oligopeptide analogues , and their highly hydrophobic core environment is unique and potentially useful for various biomedical applications . [[77, 82, 'fatty', 'B-FAMILY'], [83, 88, 'acids', 'I-FAMILY'], [114, 123, 'Sa

In [None]:
import spacy
from spacy.tokens import Span
from spacy import displacy

# Rebinds nlp (previously the transformers NER pipeline) to spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Define colors for the entity tags (one hex colour per tag string)
colors = {"B-FAMILY": "#F67DE3", "B-SYSTEMATIC": "#7DF6D9", "I-FAMILY": "#FFFFFF", "I-SYSTEMATIC" : "#2AFF00","O" : "#157403","B-TRIVIAL" : "#B6FF01",
          "B-ABBREVIATION" : "#FC6262","B-FORMULA" : "#04A094","I-TRIVIAL" : "#44F15B","I-MULTIPLE" : "#1A57D1","B-IDENTIFIER" : "#870197","I-FORMULA" : "#FF2626"}
options = {"colors": colors}

# Parse the sampled text into a spaCy Doc; the custom entities are attached to it further down.
doc = nlp(text)

# Function to merge overlapping entities
def merge_overlapping_spans(spans):
    """Fuse spans that overlap or touch into single spans.

    The spans are ordered by start (longer span first on ties). A span whose
    start is <= the end of the last kept span is fused into it: the fused span
    runs from the kept start to the larger of the two ends and takes the label
    of the later span. All other spans are kept unchanged.

    Args:
        spans: iterable of spaCy ``Span`` objects belonging to one Doc.

    Returns:
        List of spans in start order; an empty list when ``spans`` is empty
        (previously this case raised IndexError).
    """
    ordered = sorted(spans, key=lambda x: (x.start, -x.end))
    if not ordered:
        # Nothing to merge, e.g. when no entity offsets matched token boundaries.
        return []
    merged_spans = [ordered[0]]
    for span in ordered[1:]:
        last = merged_spans[-1]
        if span.start <= last.end:
            # Build the fused span on the Doc the spans came from instead of
            # relying on a module-level `doc` variable.
            merged_spans[-1] = Span(last.doc, last.start, max(last.end, span.end), label=span.label)
        else:
            merged_spans.append(span)
    return merged_spans

custom_ents = []
# Each entity record unpacks into (char start, char end, surface text, tag);
# only the offsets and the tag are used to build the span.
for start, end, label, tag in entities:
    span = doc.char_span(start, end, label=tag)
    # Records for which char_span returns None are skipped.
    if span is not None:
        custom_ents.append(span)

# Merge overlapping entities
merged_ents = merge_overlapping_spans(custom_ents)
# Replace the Doc's entities with the merged custom spans.
doc.ents = tuple(merged_ents)

# Display the text with highlighted entities
displacy.render(doc, style="ent", options=options, jupyter=True)
