# build the dataset from kaggle

In [None]:
https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

In [110]:
pwd

'/Users/couali/Desktop/ai-workshop/other/cs230-code-examples/pytorch/nlp/test'

In [109]:
"""Read, split and save the kaggle dataset for our model"""

import csv
import os
import sys


def load_dataset(path_csv):
    """Loads dataset into memory from csv file"""
    # Open the csv file, need to specify the encoding for python3
    use_python3 = sys.version_info[0] >= 3
    with (open(path_csv, encoding="windows-1252") if use_python3 else open(path_csv)) as f:
        csv_file = csv.reader(f, delimiter=',')
        dataset = []
        words, tags = [], []

        # Each line of the csv corresponds to one word
        for idx, row in enumerate(csv_file):
            if idx == 0: continue
            sentence, word, pos, tag = row
            # If the first column is non empty it means we reached a new sentence
            if len(sentence) != 0:
                if len(words) > 0:
                    assert len(words) == len(tags)
                    dataset.append((words, tags))
                    words, tags = [], []
            try:
                word, tag = str(word), str(tag)
                words.append(word)
                tags.append(tag)
            except UnicodeDecodeError as e:
                print("An exception was raised, skipping a word: {}".format(e))
                pass

    return dataset


def save_dataset(dataset, save_dir):
    """Writes sentences.txt and labels.txt files in save_dir from dataset
    Args:
        dataset: ([(["a", "cat"], ["O", "O"]), ...])
        save_dir: (string)
    """
    # Create directory if it doesn't exist
    print("Saving in {}...".format(save_dir))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Export the dataset
    with open(os.path.join(save_dir, 'sentences.txt'), 'w') as file_sentences:
        with open(os.path.join(save_dir, 'labels.txt'), 'w') as file_labels:
            for words, tags in dataset:
                file_sentences.write("{}\n".format(" ".join(words)))
                file_labels.write("{}\n".format(" ".join(tags)))
    print("- done.")


# Check that the dataset exists (you need to make sure you haven't downloaded the `ner.csv`)
path_dataset = '../data/kaggle/ner_dataset.csv'
msg = "{} file not found. Make sure you have downloaded the right dataset".format(path_dataset)
assert os.path.isfile(path_dataset), msg

# Load the dataset into memory
print("Loading Kaggle dataset into memory...")
dataset = load_dataset(path_dataset)
print("- done.")

# Split the dataset into train, val and split (dummy split with no shuffle)
train_dataset = dataset[:int(0.7*len(dataset))]
val_dataset = dataset[int(0.7*len(dataset)) : int(0.85*len(dataset))]
test_dataset = dataset[int(0.85*len(dataset)):]

# Save the datasets to files
save_dataset(train_dataset, '../data/kaggle/train')
save_dataset(val_dataset, '../data/kaggle/val')
save_dataset(test_dataset, '../data/kaggle/test')

Loading Kaggle dataset into memory...
- done.
Saving in ../data/kaggle/train...
- done.
Saving in ../data/kaggle/val...
- done.
Saving in ../data/kaggle/test...
- done.


In [36]:
data_path = '../data/kaggle/train/'

In [54]:
with open(data_path+'sentences.txt', 'r') as f:
    sentences = f.read().split('\n')
with open(data_path+'labels.txt', 'r') as f:
    sentence_tags = f.read().split('\n')

print(sentences[10])
print(sentence_tags[10])
print(len(sentences), ' examples')

The step will allow the facility to operate at full capacity .
O O O O O O O O O O O O
33571  examples


In [40]:
sentences =[sent.split(' ') for sent in sentences]
sentence_tags =[sent.split(' ') for sent in sentence_tags]

In [58]:
print(sentences[193])
print(sentence_tags[193])

Democrats will now have a better chance to protect or expand their one-seat majority in the Senate .
O O O O O O O O O O O O O O O O B-org O


In [42]:
print(len(sentences[9]), len(sentence_tags[9]))

25 25


In [43]:
import numpy as np
sentences = np.array(sentences)
sentence_tags = np.array(sentence_tags)

In [44]:
from sklearn.model_selection import train_test_split
 
 
(train_sentences, 
test_sentences, 
train_tags, 
test_tags) = train_test_split(sentences, sentence_tags, test_size=0.1)
 

In [45]:

words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())

for ts in train_tags:
    for t in ts:
        tags.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding

In [46]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []
 
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)
 
for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)
 
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])
 
for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[13167, 11174, 2486, 9148, 5586, 5494, 2997, 7739, 3783, 6357, 7069, 5195, 9223, 22448, 2486, 12219, 6373, 22137, 24493, 12219, 5105, 22612, 8229, 9223, 2178, 5825, 24493, 263, 12931]
[12219, 2905, 10963, 14864, 12917, 23478, 17878, 18798, 20604, 21601, 19457, 15493, 5105, 25290, 2178, 20604, 13949, 12931]
[11, 11, 11, 16, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]


In [47]:

MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH)  # 271

104


In [48]:
from keras.preprocessing.sequence import pad_sequences
 
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[13167 11174  2486  9148  5586  5494  2997  7739  3783  6357  7069  5195
  9223 22448  2486 12219  6373 22137 24493 12219  5105 22612  8229  9223
  2178  5825 24493   263 12931     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
[12219  2905 10963 14864 12917 23478 17878 18798 20604 21601 19457 15493
  5105 25290  2178 20604 13949 12931     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 

In [49]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 
 
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
 
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 104, 128)          3319680   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 104, 512)          788480    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 104, 19)           9747      
_________________________________________________________________
activation_2 (Activation)    (None, 104, 19)           0         
Total params: 4,117,907
Trainable params: 4,117,907
Non-trainable params: 0
_________________________________________________________________


In [50]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [52]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
print(cat_train_tags_y[0])
 

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [59]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=1, validation_split=0.2)

Train on 24170 samples, validate on 6043 samples
Epoch 1/1


<keras.callbacks.History at 0x7fd493d34ac8>

In [61]:
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")   # acc: 99.09751977804825
 

acc: 98.71575342820755


In [98]:
test_samples = [
    "At the Group of Eight summit in Scotland , Japanese Prime Minister Junichiro Koizumi said he is outraged by the London attacks .He noted terrorist acts must not be forgivable . ".split(),
    "Sarin gas attacks on the Tokyo subway system in 1995 killed 12 people and injured thousands .".split(),
    
]
print(test_samples)

[['At', 'the', 'Group', 'of', 'Eight', 'summit', 'in', 'Scotland', ',', 'Japanese', 'Prime', 'Minister', 'Junichiro', 'Koizumi', 'said', 'he', 'is', 'outraged', 'by', 'the', 'London', 'attacks', '.He', 'noted', 'terrorist', 'acts', 'must', 'not', 'be', 'forgivable', '.'], ['Sarin', 'gas', 'attacks', 'on', 'the', 'Tokyo', 'subway', 'system', 'in', '1995', 'killed', '12', 'people', 'and', 'injured', 'thousands', '.']]


In [99]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
 
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)

[[19324 12219 17976 24493  7401 21401  2486 17131 22612 21191  7969  7033
  22505  1107 25276 12427 20988 23251 13153 12219 10478   624     1 11613
  21814  7506 12131  7581 20604     1 12931     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [    1  2611   624  6768 12219 21930  5271 25414  2486  6043  4212 24558
  23143  7739 24340 20856 12931     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0

In [100]:

predictions = model.predict(test_samples_X)
print(predictions, predictions.shape)

[[[1.30730506e-04 2.70952191e-02 1.51855536e-04 ... 2.98023410e-02
   1.23702004e-04 1.37163370e-04]
  [1.50635271e-04 2.47660521e-02 1.41025623e-04 ... 2.72389241e-02
   1.16164381e-04 1.27625288e-04]
  [2.07328048e-04 3.29984613e-02 1.91689513e-04 ... 3.90817411e-02
   1.65869395e-04 1.78201371e-04]
  ...
  [1.00000000e+00 2.18545963e-11 1.02937203e-09 ... 5.35006872e-10
   3.05242664e-10 5.91202087e-10]
  [1.00000000e+00 2.90565298e-11 1.84811366e-09 ... 6.71890377e-10
   5.35562261e-10 1.04345066e-09]
  [1.00000000e+00 4.12342174e-11 3.29064354e-09 ... 9.17855680e-10
   9.29676391e-10 1.81660753e-09]]

 [[1.61607997e-04 3.70741487e-02 2.04422424e-04 ... 4.51217741e-02
   1.75203968e-04 1.90763574e-04]
  [1.29431326e-04 2.56545246e-02 1.46122664e-04 ... 3.11382655e-02
   1.17775424e-04 1.30371976e-04]
  [1.45706101e-04 2.25255229e-02 1.35804468e-04 ... 2.79560611e-02
   1.08024717e-04 1.17837269e-04]
  ...
  [1.00000000e+00 2.13960290e-11 1.01572173e-09 ... 5.29156219e-10
   3.02887

In [101]:

def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

In [102]:

print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', 

In [103]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [69]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 
 
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])
 
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 104, 128)          3319680   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 104, 512)          788480    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 104, 19)           9747      
_________________________________________________________________
activation_3 (Activation)    (None, 104, 19)           0         
Total params: 4,117,907
Trainable params: 4,117,907
Non-trainable params: 0
_________________________________________________________________


In [106]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=3, validation_split=0.2)

Train on 24170 samples, validate on 6043 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fd437ee38d0>

In [107]:
predictions = model.predict(test_samples_X)
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))

[['O', 'O', 'O', 'O', 'I-org', 'O', 'O', 'B-geo', 'O', 'B-gpe', 'B-per', 'I-per', 'I-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-'], ['O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD

In [108]:
    "At the Group of Eight summit in Scotland , Japanese Prime Minister Junichiro Koizumi said he is outraged by the London attacks .He noted terrorist acts must not be forgivable . ".split(),
    "Sarin gas attacks on the Tokyo subway system in 1995 killed 12 people and injured thousands .".split(),

(['Sarin',
  'gas',
  'attacks',
  'on',
  'the',
  'Tokyo',
  'subway',
  'system',
  'in',
  '1995',
  'killed',
  '12',
  'people',
  'and',
  'injured',
  'thousands',
  '.'],)