<a href="https://colab.research.google.com/github/olgasem10/Compling-tasks/blob/master/RNN_POS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, InputLayer, Embedding, Conv1D, Input, Flatten, concatenate, MaxPool1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter

In [0]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [0]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [0]:
len(tagged_sentences)

3914

In [0]:
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(sentence)
    sentence_tags.append(tags)

In [0]:
char_sents = [[list(word) for word in sent] for sent in sentences]

### Обработка данных

In [0]:
sent_train, sent_test, char_sent_train, char_sent_test, tag_train, tag_test = train_test_split(sentences, char_sents, sentence_tags, test_size=0.2, random_state=0)

In [0]:
sent_train[0]

('Edward', 'L.', 'Kane', 'succeeded', 'Mr.', 'Taylor', 'as', 'chairman', '.')

In [0]:
vocab = Counter()
for sent in sent_train:
    sent = [word.lower() for word in sent]
    vocab.update(sent)

In [0]:
filtered_vocab = {word for word in vocab if vocab[word] > 5}

In [0]:
len(filtered_vocab)

1679

In [0]:
word2id = {'PAD':0,'UNK':1}    
for i,word in enumerate(filtered_vocab):
      word2id[word] = i + 2

id2word = {i:word for word, i in word2id.items()}

In [0]:
tag2id = {'PAD':0}  
for tags in tag_train:
    for tag in tags:
      if tag.lower() not in tag2id:
        tag2id[tag.lower()] = len(tag2id)

id2tag = {i:tag for tag, i in tag2id.items()}

In [0]:
len(id2tag)

47

In [0]:
def data2ints(data, smth2id):
  int_data = []
  for seq in data:
      int_seq = []
      for i in seq:
          try:
            int_seq.append(smth2id[i.lower()])
          except KeyError:
            int_seq.append(smth2id['UNK'])
  
      int_data.append(int_seq)
  return int_data

In [0]:
X_train_ids, X_test_ids = data2ints(sent_train, word2id), data2ints(sent_test, word2id)
y_train_ids, y_test_ids = data2ints(tag_train, tag2id), data2ints(tag_test, tag2id)


print(X_train_ids[0])
print(X_test_ids[0])
print(y_train_ids[0])
print(y_test_ids[0])

[341, 1, 1, 1, 561, 1, 593, 1529, 907]
[507, 839, 499, 1162, 467, 642, 1168, 1, 334, 1, 1314, 1518, 947, 907]
[1, 1, 1, 2, 1, 1, 3, 4, 5]
[18, 19, 21, 24, 10, 25, 24, 18, 21, 14, 3, 7, 15, 5]


In [0]:
MAX_LEN = max(len(x) for x in X_train_ids)
MAX_LEN

128

In [0]:
X_train, X_test = pad_sequences(X_train_ids, maxlen=MAX_LEN, padding='post'), pad_sequences(X_test_ids, maxlen=MAX_LEN, padding='post')
y_train_pad, y_test_pad = pad_sequences(y_train_ids, maxlen=MAX_LEN, padding='post'), pad_sequences(y_test_ids, maxlen=MAX_LEN, padding='post')

In [0]:
print(X_train.shape, y_train_pad.shape, X_test.shape, y_test_pad.shape)

(3131, 128) (3131, 128) (783, 128) (783, 128)


In [0]:
y_train, y_test = to_categorical(y_train_pad, num_classes=len(tag2id)), to_categorical(y_test_pad, num_classes=len(tag2id))

In [0]:
y_test.shape

(783, 128, 47)

### Данные для второго инпута

In [0]:
alphabet = Counter()
for sent in char_sent_train:
  for word in sent:
    word = [char.lower() for char in word]
    alphabet.update(word)

len(alphabet)

52

In [0]:
char2id = {'PAD':0,'UNK':1}    
for i,char in enumerate(alphabet):
      char2id[char] = i + 2

id2char = {i:char for char, i in char2id.items()}

In [0]:
X_char_train_ids = [data2ints(sent, char2id) for sent in char_sent_train]
X_char_test_ids = [data2ints(sent, char2id) for sent in char_sent_test]

In [0]:
MAX_CHAR_LEN = max(max(len(word) for word in sent) for sent in X_char_train_ids)
MAX_CHAR_LEN

24

In [0]:
def chars_pad(data, MAX_LEN, MAX_CHAR_LEN):
  pad_char = np.zeros((len(data), MAX_LEN, MAX_CHAR_LEN))
  for i, s in enumerate(data):
      for j, word in enumerate(s):
        for k, char in enumerate(word):
          try:
              pad_char[i][j][k] = char
          except:
            continue
  return pad_char

In [0]:
X_char_train = chars_pad(X_char_train_ids, MAX_LEN, MAX_CHAR_LEN)
X_char_test = chars_pad(X_char_test_ids, MAX_LEN, MAX_CHAR_LEN)

In [0]:
print(X_char_train.shape, X_char_test.shape)

(3131, 128, 24) (783, 128, 24)


### Сеть

In [0]:
input1 = Input(shape=(MAX_LEN,), name='input1')
embeddings1 = Embedding(input_dim=len(word2id), output_dim=100, name='embeddings', mask_zero=True)(input1)

bi = Bidirectional(LSTM(16, return_sequences=True), merge_mode='concat', name='bi')(embeddings1)
drop = Dropout(0.2, name = 'drop')(bi)


input2 = Input(shape=(MAX_LEN, MAX_CHAR_LEN,), name='input2')
embeddings2 = TimeDistributed(Embedding(input_dim=len(char2id), output_dim=40), name='embeddings2')(input2)
conv1 = TimeDistributed(Conv1D(kernel_size=3, filters=30, strides = 1), name='conv1')(embeddings2)
drop2 = Dropout(0.2, name = 'drop2')(conv1)
flat = TimeDistributed(Flatten(), name='flat')(drop2)

concat = concatenate([drop, flat], name='concat')

bi2 = Bidirectional(LSTM(16, return_sequences=True), merge_mode='concat', name='bi2')(concat)

outputs = TimeDistributed(Dense(len(tag2id), activation='softmax'), name='output')(bi2)

model1 = tf.keras.Model(inputs=[input1, input2], outputs=outputs)
model1.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
model1.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input2 (InputLayer)             [(None, 128, 24)]    0                                            
__________________________________________________________________________________________________
input1 (InputLayer)             [(None, 128)]        0                                            
__________________________________________________________________________________________________
embeddings2 (TimeDistributed)   (None, 128, 24, 40)  2160        input2[0][0]                     
__________________________________________________________________________________________________
embeddings (Embedding)          (None, 128, 100)     168100      input1[0][0]                     
___________________________________________________________________________________________

In [0]:
model1.fit([X_train, X_char_train], y_train, validation_data=([X_test, X_char_test], y_test), batch_size=128, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f1c16868be0>

### Функция для разметки предложений

In [0]:
def tagger(sent, model):
  sent_ids = data2ints([sent], word2id)
  length = len(sent)
  pad_sent = pad_sequences(sent_ids, maxlen=MAX_LEN, padding='post')
  char_sent = [[list(word) for word in s] for s in [sent]]
  char_sent_ids = [data2ints(sent, char2id) for sent in char_sent]
  padded_chars = chars_pad(char_sent_ids, MAX_LEN, MAX_CHAR_LEN)
  tags_ids = np.argmax(model.predict([pad_sent, padded_chars]), axis=2).tolist()[0][:length]
  tags = [id2tag[ind] for ind in tags_ids]
  tagged_sent = [(word, tag) for word, tag in zip(sent, tags)]   
  return tagged_sent


In [0]:
tagger(sent_test[100], model1)

[('David', 'nnp'),
 ('A.', 'nnp'),
 ('DiLoreto', 'nnp'),
 (',', ','),
 ('president', 'nn'),
 ('of', 'in'),
 ('metal', 'nnp'),
 ('container', 'nnp'),
 ('division', 'nn'),
 (',', ','),
 ('was', 'vbd'),
 ('named', 'vbn'),
 ('*-1', '-none-'),
 ('to', 'to'),
 ('the', 'dt'),
 ('additional', 'jj'),
 ('post', 'nn'),
 ('of', 'in'),
 ('group', 'nn'),
 ('vice', 'nn'),
 ('president', 'nn'),
 (',', ','),
 ('packaging', 'vbg'),
 ('products', 'nns'),
 (',', ','),
 ('at', 'in'),
 ('this', 'dt'),
 ('packaging', 'vbg'),
 (',', ','),
 ('industrial', 'jj'),
 ('and', 'cc'),
 ('aerospace', 'jj'),
 ('products', 'nns'),
 ('concern', 'nn'),
 (',', ','),
 ('*-1', '-none-'),
 ('succeeding', 'nnp'),
 ('Delmont', 'nnp'),
 ('A.', 'nnp'),
 ('Davis', 'nnp'),
 (',', ','),
 ('who', 'wp'),
 ('*T*-3', '-none-'),
 ('was', 'vbd'),
 ('named', 'vbn'),
 ('*-2', '-none-'),
 ('president', 'nn'),
 ('and', 'cc'),
 ('chief', 'nn'),
 ('operating', 'nn'),
 ('officer', 'nn'),
 ('in', 'in'),
 ('August', 'nnp'),
 ('.', '.')]