In [1]:
import nltk
import re
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from collections import Counter

In [2]:
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\peaks\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\peaks\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print(tagged_sentences[0])
print("Tagged sentences :", len(tagged_sentences))
print("Tagged words :", len(nltk.corpus.treebank.tagged_words()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences : 3914
Tagged words : 100676


In [187]:
def prepare(tagged_sentences):
    sentences, sentence_tags = [], []
    for tagged_sentence in tagged_sentences:
        sentence, tags = zip(*tagged_sentence)
        sentences.append(np.array(sentence))
        sentence_tags.append(np.array(tags))
    return sentences, sentence_tags

In [188]:
X,y = prepare(tagged_sentences)

In [189]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2)

In [190]:
words, tags = set([]), set([])

for s in X_train:
    for w in s:
        words.add(w.lower())

for ts in y_train:
    for t in ts:
        tags.add(t)

CREATE WORD INDEX

In [191]:
word2index = {w: i for i,w in enumerate(list(words),2)}
word2index['-PAD-'] = 0
word2index['-OOV-'] = 1

In [192]:
tag2index = {t: i for i, t in enumerate(list(tags),1)}
tag2index['-PAD-'] = 0

CREATE INDEX VECTOR WITH PADDING

In [193]:
train_X, test_X, train_y, test_y = [],[],[],[]

In [194]:
def w2index(sentence):
    result = []
    for s in sentence:
        s_int = []
        for w in s:
            try:
                s_int.append(word2index[w.lower()])
            except KeyError:
                s_int.append(word2index['-OOV-'])
        result.append(s_int)
    return result
def t2index(tagging):
    result = []
    for s in tagging:
        sss = []
        for t in s:
            sss.append(tag2index[t])
        result.append(sss)
    return result

In [197]:
train_X = w2index(X_train)
test_X = w2index(X_test)
train_y = t2index(y_train)
test_y = t2index(y_test)
print(train_X[0])
print(test_X[0])
print(train_y[0])
print(test_y[0])



[153, 7801, 7190, 6586, 8480, 5056, 5840, 2962, 3675, 230, 9955, 5090, 7278, 5466, 7190, 7661, 5419, 5141]
[1931, 904, 3724, 1160, 5141]
[39, 39, 31, 20, 36, 3, 40, 24, 29, 4, 33, 21, 17, 36, 31, 36, 36, 44]
[29, 3, 40, 4, 44]


In [198]:
Max_length = len(max(train_X, key=len))
print(Max_length)

271


In [199]:
from keras.preprocessing.sequence import pad_sequences
train_X = pad_sequences(train_X,maxlen=Max_length,padding='post')
test_X = pad_sequences(test_X,maxlen=Max_length,padding='post')
train_y = pad_sequences(train_y,maxlen=Max_length,padding='post')
test_y = pad_sequences(test_y,maxlen=Max_length,padding='post')


In [200]:
print(train_X[0])

[ 153 7801 7190 6586 8480 5056 5840 2962 3675  230 9955 5090 7278 5466
 7190 7661 5419 5141    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

Building Model

In [201]:
import tensorflow as tf
import keras
from keras.layers import *

In [202]:
model = keras.models.Sequential(
    [
        InputLayer(input_shape=(Max_length,)),
        Embedding(len(word2index),128),
        Bidirectional(LSTM(256, return_sequences=True)),
        TimeDistributed(Dense(len(tag2index))),
        Activation('softmax')
    ] 
)
    

In [203]:
model.compile(loss='categorical_crossentropy',optimizer=keras.optimizers.Adam(0.001),metrics=['accuracy'])

In [204]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 271, 128)          1301376   
                                                                 
 bidirectional_8 (Bidirecti  (None, 271, 512)          788480    
 onal)                                                           
                                                                 
 time_distributed_8 (TimeDi  (None, 271, 47)           24111     
 stributed)                                                      
                                                                 
 activation_8 (Activation)   (None, 271, 47)           0         
                                                                 
Total params: 2113967 (8.06 MB)
Trainable params: 2113967 (8.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [205]:
print(len(tag2index))
print(len(train_y[0]))
print(len(train_X[0]))

47
271
271


In [206]:
def to_categortical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [207]:
cat_y_train = np.array(to_categortical(train_y, len(tag2index)))

In [208]:
model.fit(train_X,cat_y_train,batch_size=128,epochs=20)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x17ef2cb98d0>

In [209]:
cat_y_test = np.array(to_categortical(test_y, len(tag2index)))
scores = model.evaluate(test_X,cat_y_test)



In [212]:
print(f"{model.metrics_names[1]}:{scores[1]*100}%")

accuracy:99.04521107673645%


In [214]:
test = ["He reads many books in the library".split(),"Although she completed her literature review".split()]
print(test)

[['He', 'reads', 'many', 'books', 'in', 'the', 'library'], ['Although', 'she', 'completed', 'her', 'literature', 'review']]


In [217]:
sample = w2index(test)
padded_sample = pad_sequences(sample,maxlen=Max_length,padding='post')

In [219]:
prediction = model.predict(padded_sample)



In [220]:
print(prediction)

[[[1.8640072e-03 4.4722561e-04 3.5835899e-05 ... 2.1408070e-04
   9.6128148e-04 7.4752490e-04]
  [4.5205005e-02 5.5212267e-03 7.0775985e-03 ... 8.3648181e-04
   9.4772261e-03 1.5221503e-02]
  [7.3485631e-03 1.7491283e-03 3.3296183e-02 ... 5.4759999e-05
   3.4532154e-03 8.9769540e-03]
  ...
  [9.9992001e-01 4.3011880e-07 1.1311869e-09 ... 2.1744210e-05
   2.1017263e-09 7.4617628e-09]
  [9.9987280e-01 6.9727122e-07 1.7467490e-09 ... 2.3579663e-05
   3.2616077e-09 1.2133955e-08]
  [9.9981087e-01 1.0384198e-06 2.4860405e-09 ... 2.3846491e-05
   4.7196109e-09 1.8633127e-08]]

 [[1.8216176e-03 2.2501107e-03 1.6676892e-02 ... 1.2285743e-06
   5.1843049e-03 2.1493409e-02]
  [9.0344646e-04 5.9950794e-04 9.8472621e-05 ... 1.1074911e-04
   2.7836266e-03 1.0340712e-03]
  [3.2397746e-03 2.1991485e-03 2.0707571e-03 ... 6.9810456e-05
   6.3462229e-03 2.4792381e-02]
  ...
  [9.9992001e-01 4.3011755e-07 1.1311913e-09 ... 2.1744398e-05
   2.1017343e-09 7.4617770e-09]
  [9.9987280e-01 6.9726855e-07 1.746

In [221]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for cat_seq in sequences:
        token_sequence = []
        for cat in cat_seq:
            token_sequence.append(index[np.argmax(cat)])

        token_sequences.append(token_sequence)
    return token_sequence


In [223]:
answer = logits_to_tokens(prediction,{i:t for t,i in tag2index.items()})

In [229]:
print(test[1])
print(answer[:len(sample[1])])

['Although', 'she', 'completed', 'her', 'literature', 'review']
['IN', 'PRP', 'VBD', 'PRP$', 'NN', 'NN']
