In [8]:
import pandas as pd
import numpy as np
import keras
data = pd.read_csv("C:/Users/natha/source/repos/CreateRNNTrainFiles/CreateRNNTrainFiles/Data/Output/test1.csv", delimiter = ',', quoting = 0)
data[:10]

Unnamed: 0,Sentence,PosTag,Tag,Word
0,1,NN,Python (programming language),python
1,1,VBZ,S,is
2,1,DT,S,an
3,1,JJ,0,interpreted
4,1,NN,0,high-level
5,1,NN,0,programming
6,1,NN,0,language
7,1,IN,S,for
8,1,JJ,0,general-purpose
9,1,.,Punc,.


In [16]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words)

tags = list(set(data["Tag"].values))
n_tags = len(tags)

In [19]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["PosTag"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [26]:
getter = SentenceGetter(data)
sentences = getter.sentences
sentences[0]

[('python', 'NN', 'Python (programming language)'),
 ('is', 'VBZ', 'S'),
 ('an', 'DT', 'S'),
 ('interpreted', 'JJ', '0'),
 ('high-level', 'NN', '0'),
 ('programming', 'NN', '0'),
 ('language', 'NN', '0'),
 ('for', 'IN', 'S'),
 ('general-purpose', 'JJ', '0'),
 ('.', '.', 'Punc')]

In [28]:
max_len = 25
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
word2idx["python"]

47

In [34]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words-1)

y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=n_tags-1)

y = [to_categorical(i, num_classes=n_tags) for i in y]

In [35]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

In [39]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

input = Input(shape=(max_len,))

model = Embedding(input_dim=n_words + 1, output_dim = 20, input_length=max_len, mask_zero=True)(input)

model = Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(model)

model = TimeDistributed(Dense(50, activation="relu"))(model)

crf = CRF(n_tags)

out = crf(model)

In [42]:
model = Model(input, out)

model.compile(optimizer="rmsprop", loss=crf.loss_function,metrics=[crf.accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 25)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 25, 20)            2040      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 25, 100)           28400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 25, 50)            5050      
_________________________________________________________________
crf_1 (CRF)                  (None, 25, 13)            858       
Total params: 36,348
Trainable params: 36,348
Non-trainable params: 0
_________________________________________________________________


In [45]:
history = model.fit(x_train, np.array(y_train), batch_size=32, epochs=5,
                    validation_split=0.1, verbose=1)

Train on 7 samples, validate on 1 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [56]:
test_pred = model.evaluate(x_test, np.array(y_test))
test_pred



[2.042203187942505, 0.6399999856948853]