In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("train_data_3.csv", names=['Word','Tag','Sentence #'], skiprows=1)

In [2]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

1225

In [3]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

3

In [4]:
tags

['Variable', '0', 'Value']

In [5]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [6]:
getter = SentenceGetter(data)


In [7]:
sent = getter.get_next()


In [8]:
print(sent)

None


In [9]:
sentences = getter.sentences
print(sentences)

[[('16.16.5.', '0'), ('The', '0'), ('OPEN', '0'), ('operation', '0'), ('creates', '0'), ('and/or', '0'), ('opens', '0'), ('a', '0'), ('regular', '0'), ('file', '0'), ('in', '0'), ('a', '0'), ('directory', '0'), ('with', '0'), ('the', '0'), ('provided', '0'), ('name', '0'), ('<NULL>.', '0')], [('If', '0'), ('the', '0'), ('file', '0'), ('does', '0'), ('not', '0'), ('exist', '0'), ('at', '0'), ('the', '0'), ('server', '0'), ('and', '0'), ('creation', '0'), ('is', '0'), ('desired', '0'), ('specification', '0'), ('of', '0'), ('the', '0'), ('method', '0'), ('of', '0'), ('creation', '0'), ('is', '0'), ('provided', '0'), ('by', '0'), ('the', '0'), ('openhow', '0'), ('parameter', '0'), ('<NULL>.', '0')], [('The', '0'), ('client', '0'), ('has', '0'), ('the', '0'), ('choice', '0'), ('of', '0'), ('three', '0'), ('creation', '0'), ('methods:', '0'), ('UNCHECKED4', '0'), ('GUARDED4', '0'), ('or', '0'), ('EXCLUSIVE4', '0'), ('If', '0'), ('the', '0'), ('current', 'Variable'), ('filehandle', 'Variable'

In [10]:
max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [11]:
word2idx["current"]

97

In [12]:
tag2idx["0"]


1

In [13]:
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in sentences]

Using TensorFlow backend.


In [14]:
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words)


In [15]:
y = [[tag2idx[w[1]] for w in s] for s in sentences]


In [16]:
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["0"])


In [17]:
from keras.utils import to_categorical


In [18]:
y = [to_categorical(i, num_classes=n_tags) for i in y]


In [19]:
from sklearn.model_selection import train_test_split


In [20]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)


In [21]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

In [22]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

In [23]:
model = Model(input, out)


In [24]:
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])


In [25]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 75)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 75, 20)            24520     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 75, 100)           28400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 75, 50)            5050      
_________________________________________________________________
crf_1 (CRF)                  (None, 75, 3)             168       
Total params: 58,138
Trainable params: 58,138
Non-trainable params: 0
_________________________________________________________________


In [26]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=20,
                    validation_split=0.1, verbose=1)

Train on 243 samples, validate on 27 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [27]:
hist = pd.DataFrame(history.history)


In [28]:
# import matplotlib.pyplot as plt
# plt.style.use("ggplot")
# plt.figure(figsize=(12,12))
# plt.plot(hist["acc"])
# plt.plot(hist["val_acc"])
# plt.show()

In [29]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report


In [30]:
test_pred = model.predict(X_te, verbose=1)




In [31]:
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)

In [32]:
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))


F1-score: 47.2%


In [33]:
print(classification_report(test_labels, pred_labels))


             precision    recall  f1-score   support

   Variable       0.67      0.22      0.33        18
          0       0.71      0.45      0.56        55
      Value       0.00      0.00      0.00         7

avg / total       0.64      0.36      0.46        80



In [34]:
i = 30
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_te[i], -1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
length = 0
for w, t, pred in zip(X_te[i], true, p[0]):
    length+=1
    if w != 0:
        print("{:15}: {:5} {}".format(words[w-1], tags[t], tags[pred]))

Word           ||True ||Pred
The            : 0     0
server         : 0     0
has            : 0     0
two            : 0     0
choices        : 0     0
<NULL>.        : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0     0
ENDPAD         : 0   