In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout

# Предобработка, токенизация и паддинг

In [2]:
df = pd.read_csv("ner_datasetreference.csv", encoding="latin1")
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
df.isna().sum()

Sentence #    1000616
Word               10
POS                 0
Tag                 0
dtype: int64

In [4]:
df.Word.dropna(inplace=True)

In [5]:
num_classes = df.Tag.nunique()
num_classes

17

In [6]:
df = df.ffill()
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [7]:
df.Tag = LabelEncoder().fit_transform(df.Tag)
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,16
1,Sentence: 1,of,IN,16
2,Sentence: 1,demonstrators,NNS,16
3,Sentence: 1,have,VBP,16
4,Sentence: 1,marched,VBN,16
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,16
1048571,Sentence: 47959,responded,VBD,16
1048572,Sentence: 47959,to,TO,16
1048573,Sentence: 47959,the,DT,16


In [8]:
O_tag = df.Tag[0]

In [9]:
grouped_by_sentence_df = df.groupby('Sentence #').agg({'Word': list, 'Tag': list}).reset_index(drop=True)
grouped_by_sentence_df

Unnamed: 0,Word,Tag
0,"[Thousands, of, demonstrators, have, marched, ...","[16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16..."
1,"[Iranian, officials, say, they, expect, to, ge...","[3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16..."
2,"[Helicopter, gunships, Saturday, pounded, mili...","[16, 16, 7, 16, 16, 16, 16, 16, 2, 16, 16, 16,..."
3,"[They, left, after, a, tense, hour-long, stand...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"
4,"[U.N., relief, coordinator, Jan, Egeland, said...","[2, 16, 16, 6, 14, 16, 7, 16, 2, 16, 3, 16, 3,..."
...,...,...
47954,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[16, 16, 16, 6, 14, 16, 16, 16, 16, 16, 16, 16..."
47955,"[On, Thursday, ,, Iranian, state, media, publi...","[16, 7, 16, 3, 16, 16, 16, 16, 16, 16, 16, 16,..."
47956,"[Following, Iran, 's, disputed, June, 12, elec...","[16, 2, 16, 16, 7, 15, 16, 16, 16, 16, 16, 16,..."
47957,"[Since, then, ,, authorities, have, held, publ...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."


In [10]:
X_train, X_test, y_train, y_test = train_test_split(grouped_by_sentence_df.Word, grouped_by_sentence_df.Tag, test_size=0.2, random_state=42)

In [11]:
X_train

7707     [South, Korea, 's, government, Tuesday, also, ...
26089    [When, the, Lion, found, that, he, could, not,...
308      [The, cost, of, major, food, commodities, has,...
24975    [Argentina, 's, Lionel, Messi, tied, the, matc...
471      [In, addition, to, 65,000, regular, H1-B, visa...
                               ...                        
11284    [During, an, address, Wednesday, marking, the,...
44732    [General, Abizaid, made, the, remarks, during,...
38158    [Milosevic, had, been, on, trial, at, the, Uni...
860      [Lieberman, introduced, the, bill, with, Repub...
15795    [She, said, divulging, the, contents, of, the,...
Name: Word, Length: 38367, dtype: object

In [12]:
y_train

7707     [2, 10, 16, 16, 7, 16, 16, 16, 16, 16, 16, 16,...
26089    [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1...
308      [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 7, 15...
24975             [5, 16, 6, 14, 16, 16, 16, 7, 16, 7, 16]
471      [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1...
                               ...                        
11284    [16, 16, 16, 7, 16, 16, 5, 16, 16, 2, 10, 16, ...
44732    [5, 13, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16...
38158    [6, 16, 16, 16, 16, 16, 16, 5, 13, 16, 16, 16,...
860        [6, 16, 16, 16, 16, 5, 13, 6, 5, 16, 6, 14, 16]
15795    [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1...
Name: Tag, Length: 38367, dtype: object

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
grouped_by_sentence_df.Word.apply(len).max()

104

In [15]:
MAX_LEN = 128

In [16]:
def encode_words_tags(sentences, MAX_LEN):
    input_ids = []
    attention_masks = []
    for sentence in sentences:
        inputs = tokenizer.encode_plus(sentence, 
                                       return_attention_mask=True,
                                       add_special_tokens=True, 
                                       max_length=MAX_LEN,
                                       is_split_into_words=True,
                                       padding='max_length', 
                                       truncation=True)
        
        input_ids.append(inputs['input_ids'])
        attention_masks.append(inputs['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

In [17]:
train_input_ids, train_attention_masks = encode_words_tags(X_train, MAX_LEN)

In [18]:
test_input_ids, test_attention_masks = encode_words_tags(X_test, MAX_LEN)

In [19]:
len(train_input_ids[0])

128

In [20]:
train_tags = np.array(y_train.apply(lambda x: x + [0] * (MAX_LEN - len(x))).tolist())
test_tags = np.array(y_test.apply(lambda x: x + [0] * (MAX_LEN - len(x))).tolist())

# Определение модели

In [21]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [22]:
input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_masks = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_masks")

In [23]:
bert_output = bert_model(input_ids, attention_mask=attention_masks)[0]
bert_output

<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'tf_bert_model')>

In [24]:
dropout = Dropout(0.2)(bert_output)

In [25]:
dense_layer = Dense(64, activation="selu")(dropout)
outputs = Dense(num_classes, activation="softmax")(dense_layer)

In [26]:
model = Model(inputs=[input_ids, attention_masks], outputs=outputs)

In [27]:
# Удаляются теги О и паддинг из y_true и соответствующие элменты из y_pred при помощи маски
# считаем кол-во одникаовых элеметов в полученных матрицых и делим на всю длину одной матрицы
def excluding_accuracy(y_true, y_pred):
    mask = tf.logical_or(tf.equal(y_true, 0), tf.equal(y_true, O_tag))
    excluded_y_true = tf.boolean_mask(y_true, ~mask)
    excluded_y_pred = tf.boolean_mask(y_pred, ~mask)
    
    excluded_y_pred = tf.argmax(excluded_y_pred, axis=-1) # перевод вероятнойстей принадлежности к классу в классы
    
    excluded_y_true = tf.cast(excluded_y_true, tf.int32) # нужно для совместимости
    excluded_y_pred = tf.cast(excluded_y_pred, tf.int32)

    correct_predictions = tf.equal(excluded_y_true, excluded_y_pred)
    accuracy = tf.reduce_sum(tf.cast(correct_predictions, tf.float32)) / tf.cast(tf.size(excluded_y_true), tf.float32)
    return accuracy


In [28]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy', excluding_accuracy])

In [29]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_masks (InputLaye  [(None, 128)]                0         []                            
 r)                                                                                               
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 )                           ngAndCrossAttentions(last_   40         'attention_masks[0][0]']     
                             hidden_state=(None, 128, 7                                       

In [30]:
early_stopping = EarlyStopping(monitor='val_loss', patience=4)

In [31]:
model.fit([train_input_ids, train_attention_masks], train_tags, validation_data=([test_input_ids, test_attention_masks], test_tags), epochs=3, batch_size=32)

Epoch 1/3
   3/1199 [..............................] - ETA: 20:46:58 - loss: 4.8480 - accuracy: 0.3252 - excluding_accuracy: 0.0286


KeyboardInterrupt



Для анализа обучения, можно посмотреть на графики функции потерь по эпохам на тренировочном датасете, по нему можно увидеть сходимость модели к оптимальным параметрам, а также заметить если вдруг наступило переобучение (хотя его должен предотвратить early stopping и дропаут). График accuracy является не совсем информативным, из-за преобладания тегов "О", в данном случае нас больше будет интересовать, как выглядит график кастомной метрики excluding_accuracy, по нему можно определить растет ли точность модели.
Эти жже графики необходимо исследовать и на валидационном наборе данных. Еще можно посмотреть на confusion matrix, там будет явно заметно, если модель путает какие-то теги между собой. На основе этого можно провети анализ ошибок конкретных тегов, например, модель может ошибаться в них из-за того, что у нее было мало примеров. 