In [1]:
import spacy
import numpy as np
import wikipedia, string

nlp = spacy.load('en')

from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Input, Dropout, concatenate, Conv1D, Lambda
from keras.preprocessing.sequence import pad_sequences
from keras.layers.wrappers import TimeDistributed
from keras.layers.pooling import MaxPooling1D
from keras import regularizers
from keras import losses

from layers import ChainCRF

Using TensorFlow backend.


In [2]:
def corpus_builder(subjects):
    """receives a list of topics, retrieves corresponding Wikipedia articles and assembles a text corpus"""
    text = ""
    for subj in subjects:
        text += wikipedia.page(subj).content
    return text.lower()

def remove_punc(text):
    return text.translate(str.maketrans('','',string.punctuation))

text = corpus_builder(['linguistics', 
                       'noam chomsky', 
                       'computational linguistics', 
                       'natural language processing',
                       'semantics',
                       'pragmatics',
                       'phonetics',
                       'phonology',
                       'syntax',
                       'george lakoff',
                       'cognitive linguistics'])

y_1 = [] # corresponds to POS tags
y_2 = [] # corresponds to dependency tags

X_ = text.split('.')
X_ = [remove_punc(x) for x in X_]

X = []
for sent in X_:
    tokens = []
    pos_tags = []
    dep_tags = []
    doc = nlp(sent)
    for token in doc:
        pos_tags.append(str(token.pos_))
        dep_tags.append(str(token.dep_))
        tokens.append(str(token)) # convert 'spacy.tokens.token.Token' to 'string'
    X.append(tokens)
    y_1.append(pos_tags)
    y_2.append(dep_tags)

# if there are empty (unknown) tags, insert 'UNK' to signal unknown
y_1 = [['UNK'] if y == [] else y for y in y_1]
y_2 = [['UNK'] if y == [] else y for y in y_2]

In [3]:
all_pos_tags = list(set([elem for sublist in y_1 for elem in sublist]))
all_dep_tags = list(set([elem for sublist in y_2 for elem in sublist]))

pos2idx = {label:i for (i,label) in enumerate(all_pos_tags, start=1)}
pos2idx[0] = 0 
idx2pos = {v: k for k, v in pos2idx.items()}

dep2idx = {label:i for (i,label) in enumerate(all_dep_tags, start=1)}
dep2idx[0] = 0
idx2dep = {v: k for k, v in dep2idx.items()}

# integer-encode labels 
y_1 = [[pos2idx[pos] for pos in sent] for sent in y_1]
y_2 = [[dep2idx[dep] for dep in sent] for sent in y_2]

In [4]:
# integer encode words in the sequence 
words = list(set([elem for sublist in X for elem in sublist]))
w2idx = {label:i for (i,label) in enumerate(words, start=1)}

idx2w = {v: k for k, v in w2idx.items()}

X = [np.array([w2idx[w] for w in sent]) for sent in X]

# zero-pad all the sequences smaller than max_length (max len of sents)
max_length = len(max(X, key=len))

X_padded = pad_sequences(X, maxlen=max_length, padding='post')
y_1_padded = pad_sequences(y_1, maxlen=max_length, padding='post')
y_2_padded = pad_sequences(y_2, maxlen=max_length, padding='post')

In [5]:
# one-hot encode the labels 
from keras.utils import to_categorical

# y_1
pos_idx = np.array(list(idx2pos.keys()))
pos_vec = to_categorical(pos_idx)
one_hot_y_1 = dict(zip(pos_idx, pos_vec))

y_1_padded = np.array([[one_hot_y_1[l] for l in labels] for labels in y_1_padded])

# y_2
dep_idx = np.array(list(idx2dep.keys()))
dep_vec = to_categorical(dep_idx)
one_hot_y_2 = dict(zip(dep_idx, dep_vec))

y_2_padded = np.array([[one_hot_y_2[l] for l in labels] for labels in y_2_padded])

In [6]:
n_classes_pos = len(pos2idx)
n_classes_dep = len(dep2idx)

# multi-task learning (no shared layers)
def MTL():
    # pos 
    visible_pos = Input(shape=(max_length,), dtype='int32', name='visible_pos')
    embed1 = Embedding(input_dim=len(words)+1, output_dim=512)(visible_pos)
    conv1 = Conv1D(300, 2, activation="relu", padding="same", name='conv1', 
                        kernel_regularizer=regularizers.l2(0.001))(embed1)
    lstm1 = Bidirectional(LSTM(100,return_sequences=True, name='lstm1', dropout=0.5))(conv1)
    tmd1 = TimeDistributed(Dense(100, activation='relu', name='dense1_relu'), name='TimeDistributed1')(lstm1) 
    output_pos = TimeDistributed(Dense(n_classes_pos, name='dense1'), name='TimeDistributed2')(tmd1) 
    
    # dep
    visible_dep = Input(shape=(max_length,), dtype='int32', name='visible_dep')
    embed2 = Embedding(input_dim=len(words)+1, output_dim=512)(visible_dep)
    conv2 = Conv1D(300, 2, activation="relu", padding="same", name='conv2', 
                        kernel_regularizer=regularizers.l2(0.001))(embed2)
    lstm2 = Bidirectional(LSTM(100,return_sequences=True, name='lstm2', dropout=0.5))(conv2)
    tmd2 = TimeDistributed(Dense(100, activation='relu', name='dense2_relu'), name='TimeDistributed3')(lstm2) 
    output_dep = TimeDistributed(Dense(n_classes_dep, name='dense2'), name='TimeDistributed4')(tmd2) 
    
    model = Model(inputs=[visible_pos, visible_dep], outputs=[output_pos, output_dep]) 
    model.compile(loss=[losses.categorical_crossentropy, losses.categorical_crossentropy],
                  optimizer='adam', 
                  metrics=['mae', 'acc'])
    print(model.summary())
    return model    

In [15]:
# inspired by: https://goo.gl/U4ydx5
def MTL_enhanced():
    # pos (pos tagging is taken to be the main objective)
    main_input = Input(shape=(max_length,), dtype='int32', name='main_input')
    embed1 = Embedding(input_dim=len(words)+1, output_dim=512)(main_input)
    conv1 = Conv1D(100, 2, activation="relu", padding="same", name='conv1', 
                        kernel_regularizer=regularizers.l2(0.001))(embed1)
    lstm1 = Bidirectional(LSTM(100, name='lstm1', dropout=0.5, return_sequences=True))(conv1)
    # dep as an auxiliary task
    aux_output = TimeDistributed(Dense(n_classes_dep, name='dense1', activation='sigmoid'), name='TimeDistributed1')(lstm1)
    aux_input = Input(shape=(max_length,), dtype='int32', name='aux_input')
    embed2 = Embedding(input_dim=len(words)+1, output_dim=512)(aux_input)
    x = concatenate([embed2, lstm1])
    # We stack a deep densely-connected network on top
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    main_output = TimeDistributed(Dense(n_classes_pos, name='dense2'), name='TimeDistributed2')(x) 

    model = Model(inputs=[main_input, aux_input], outputs=[main_output, aux_output]) 
    model.compile(loss=[losses.categorical_crossentropy, losses.categorical_crossentropy],
                  optimizer='adam', 
                  metrics=['acc'],
                  loss_weights=[1., 0.2])
    print(model.summary())
    return model     

In [18]:
y = tuple(zip(y_1_padded, y_2_padded))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.1, random_state=42)

# we look at pos tags first
y_train_pos = np.array([y[0] for y in y_train])
y_test_pos = np.array([y[0] for y in y_test])

y_train_dep = np.array([y[1] for y in y_train])
y_test_dep = np.array([y[1] for y in y_test])

model = MTL_enhanced()

model.fit({'main_input': X_train, 'aux_input': X_train},
          {'TimeDistributed2': y_train_pos, 'TimeDistributed1': y_train_dep},
          epochs=15, batch_size=32)

# model.fit([X_train, X_train], [y_train_pos, y_train_dep], validation_split=0, batch_size=16, epochs=5)

# since we are not using early stopping and our data is small already, we set validation split to zero
# model.fit(X_train, y_train, validation_split=0, batch_size=16, epochs=5)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 164)          0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 164, 512)     3259904     main_input[0][0]                 
__________________________________________________________________________________________________
aux_input (InputLayer)          (None, 164)          0                                            
__________________________________________________________________________________________________
conv1 (Conv1D)                  (None, 164, 100)     102500      embedding_11[0][0]               
__________________________________________________________________________________________________
embedding_

<keras.callbacks.History at 0x1a7b573b70>

In [19]:
model.evaluate([X_test, X_test], [y_test_pos, y_test_dep])



[15.818660499635806,
 15.681115757335316,
 0.12599670665323243,
 0.030714573723471854,
 0.96608546253078242]

In [20]:
model.metrics_names

['loss',
 'TimeDistributed2_loss',
 'TimeDistributed1_loss',
 'TimeDistributed2_acc',
 'TimeDistributed1_acc']