In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Embedding, Bidirectional, TimeDistributed, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import drive
import json

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the data
# paths used further down in this notebook live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the tagged corpus from Drive. The file content is JSON even though
# the extension is .txt. The encoding is stated explicitly because the
# words are Devanagari text and the platform default encoding is not
# guaranteed to be UTF-8.
with open('/content/drive/MyDrive/Cleaned_Sentences_Task/cleaned_sentences.txt', 'r', encoding='utf-8') as file:
    tagged_sents = json.load(file)

In [None]:
# Sanity check: show the first record exactly as it came out of the JSON file.
print(tagged_sents[0])

[['आग', 'nn'], ['की', 'psp']]


In [None]:
# Convert the first two [word, tag] lists of every record into tuples.
# Only positions 0 and 1 are read, so each record must contain at least
# two pairs; anything beyond the second pair is ignored.
tagged_sentences = [
    [tuple(record[0]), tuple(record[1])]
    for record in tagged_sents
]

In [None]:
# Sanity check: the first record should now be a list of (word, tag) tuples.
print(tagged_sentences[0])

[('आग', 'nn'), ('की', 'psp')]


In [None]:
# Split the (word, tag) pairs into two parallel structures: one list of
# words and one list of lower-cased tags per record, in the same order.
sentences = [[word for word, _ in pairs] for pairs in tagged_sentences]
tags = [[tag.lower() for _, tag in pairs] for pairs in tagged_sentences]

In [None]:
# Sanity check: words and tags of the first record should line up by position.
print(sentences[0])
print(tags[0])

['आग', 'की']
['nn', 'psp']


In [None]:
# Build the word vocabulary from every sentence (Tokenizer created with its
# default settings), then replace each word by its integer index.
# NOTE(review): the vocabulary is fitted on the full corpus, including the
# samples later held out for evaluation — confirm this is intended.
tokenizer_sents = Tokenizer()
tokenizer_sents.fit_on_texts(sentences)
tokenized_sents = tokenizer_sents.texts_to_sequences(sentences)

In [None]:
# Sanity check: integer-encoded form of the first sentence.
print(tokenized_sents[0])

[632, 3]


In [None]:
# Separate Tokenizer for the tag set: fit it on all tag lists, then map
# each tag to its integer index. tokenizer_tags is reused later to size the
# output layer and to decode predicted indices back into tag strings.
tokenizer_tags = Tokenizer()
tokenizer_tags.fit_on_texts(tags)
tokenized_tags = tokenizer_tags.texts_to_sequences(tags)

In [None]:
# Sanity check: integer-encoded tags of the first record.
tokenized_tags[0]

[1, 2]

In [None]:
# Sequence-labelling network: embedding -> three stacked bidirectional LSTMs
# -> two per-time-step dense layers -> per-time-step softmax over the tags.
# Every LSTM uses return_sequences=True so one prediction is produced per
# input position. input_length=2 matches the length the sequences are
# padded to before training.
# Both "+ 1" terms leave room for index 0 in addition to the indices listed
# in each tokenizer's word_index.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer_sents.word_index) + 1,
        output_dim=128,
        input_length=2,
        trainable=True,
    ),
    Bidirectional(LSTM(512, return_sequences=True, activation='relu')),
    Dropout(0.2),
    Bidirectional(LSTM(256, return_sequences=True, activation='relu')),
    Bidirectional(LSTM(128, return_sequences=True, activation='relu')),
    Dropout(0.2),
    TimeDistributed(Dense(128, activation='relu')),
    Dropout(0.2),
    TimeDistributed(Dense(64, activation='relu')),
    TimeDistributed(Dense(len(tokenizer_tags.word_index) + 1, activation='softmax')),
])

In [None]:
# categorical_crossentropy expects one-hot targets, which is why the tag
# indices are passed through to_categorical before fit/evaluate.
# Accuracy is tracked under the metric name 'acc'.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 128)            2589952   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 2, 1024)           2625536   
_________________________________________________________________
dropout_3 (Dropout)          (None, 2, 1024)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 2, 512)            2623488   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 2, 256)            656384    
_________________________________________________________________
dropout_4 (Dropout)          (None, 2, 256)            0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 2, 128)           

In [None]:
# Force every word sequence and tag sequence to exactly 2 positions
# (the model's input_length). padding='post' puts any fill values at the
# end of a short sequence rather than the start.
padded_sents = pad_sequences(tokenized_sents, maxlen=2, padding='post')
padded_tags = pad_sequences(tokenized_tags, maxlen=2, padding='post')

In [None]:
# Train on everything except the last 4000 samples, which the evaluation
# cell uses as a held-out test set. validation_split=0.2 additionally
# reserves part of the training slice for per-epoch validation.
#
# num_classes is passed explicitly so the one-hot width always equals the
# size of the model's softmax layer. Without it, to_categorical infers the
# width from the largest tag index present in this slice, which would
# produce a shape mismatch if the highest-numbered tag only occurs in the
# held-out samples.
model.fit(
    padded_sents[0: -4000],
    to_categorical(padded_tags[0: -4000], num_classes=len(tokenizer_tags.word_index) + 1),
    batch_size=128,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fdf0f1cd7f0>

In [None]:
# Score the model on the last 4000 samples, which were excluded from fit().
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the 'acc' metric.
held_out_targets = to_categorical(padded_tags[-4000: ], num_classes=len(tokenizer_tags.word_index) + 1)
held_out_scores = model.evaluate(padded_sents[-4000: ], held_out_targets)
print("Accuracy of the model =", held_out_scores[1])

Accuracy of the model = 0.9352499842643738


In [None]:
# Gold tag indices of the last five samples, converted to plain Python ints
# so they can be decoded and compared against the predictions.
actual = [list(row) for row in padded_tags[-5: ]]
actuals = [[int(value) for value in row] for row in actual]

In [None]:
# Predicted tag indices for the last five samples, as plain Python ints.
# Sequential.predict_classes is deprecated and absent from newer TensorFlow
# releases; taking the argmax over the softmax axis of model.predict() is
# the documented replacement for a multi-class model and gives the same
# class indices.
predictions = [
    [int(class_index) for class_index in row]
    for row in model.predict(padded_sents[-5: ]).argmax(axis=-1)
]
print(predictions)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
[[7, 7], [8, 10], [8, 24], [2, 5], [4, 2]]


In [None]:
# Decode predicted and gold index sequences back into space-separated tag
# strings with the tag tokenizer, then print both for a side-by-side look.
pred = tokenizer_tags.sequences_to_texts(predictions)
act = tokenizer_tags.sequences_to_texts(actuals)
print(pred)
print(act)

['prp prp', 'cc nnc', 'cc inj', 'psp jj', 'nnp psp']
['prp prp', 'cc nnc', 'cc inj', 'psp jj', 'nnp psp']


In [None]:
# Predict a tag index for every position of every sample in the corpus.
# argmax over the last (softmax) axis of model.predict() replaces the
# deprecated Sequential.predict_classes, which newer TensorFlow releases no
# longer provide.
predictions = model.predict(padded_sents).argmax(axis=-1)

In [None]:
# Turn each row of predicted indices into a list, then decode every row
# into one space-separated tag string.
predict = list(map(list, predictions))
final_predict = tokenizer_tags.sequences_to_texts(predict)

In [None]:
# Flatten the per-sample tag strings into a single list of tags, splitting
# each decoded string on single spaces.
rnn_tags = [tag for decoded in final_predict for tag in decoded.split(" ")]

In [None]:
# Sanity check: sequences are padded to length 2, so two predicted tags per
# record are expected; the two printed counts should be equal.
print(len(tags) * 2)
print(len(rnn_tags))

662728
662728


In [None]:
# Persist the flat list of predicted tags as JSON on Drive.
# The file is only written, so plain 'w' is used instead of 'w+', and the
# encoding is stated explicitly rather than relying on the platform default.
with open('/content/drive/MyDrive/Cleaned_Sentences_Task/rnn_predictions.txt', 'w', encoding='utf-8') as file:
    json.dump(rnn_tags, file)