In [1]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input, optimizers
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [2]:
from keras.layers import concatenate

In [3]:
import pandas as pd
from itertools import chain

In [4]:
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
# changed line:
#from keras.utils import to_categorical
from tensorflow.keras.utils import to_categorical

In [5]:
from gensim.models import KeyedVectors

In [6]:
# reproducibility 
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [7]:
# conll data
path_train ='../Data/dev.conll' #adapt
path_eval = '../Data/test.conll' # adapt

paths = [path_train, path_eval]

# change to test if you are evaluating on test:
eval_split = 'test'

In [8]:
path_emb = '../Model/wiki-news-300d-1M.vec'

In [9]:
# model output path
output_path = 'lsmt-out.csv' # adapt

### Data peperation

In [10]:
import pandas as pd
def convert_data(paths):
    data = []
    sent_id = 0
    
    all_dfs = []
    
    for path in paths:
        split = path.split('/')[-1].split('.')[0]
        
        df = pd.read_csv(path, sep='\t', header=None, on_bad_lines='skip', engine='python',
                         names=["id", "word", "lemma", "pos-univ", "pos", "morph", "head", "basic_dep", "enhanced_dep" , "space", "predicate", "label"])
        # Remove missing values
        df = df.dropna()
        
        # Add 'Split' column
        df['Split'] = split

        # Add 'Sentence #' column
        df['Start'] = df['id'].shift(1) >= df['id']
        df['Sentence #'] = df['Start'].cumsum() + sent_id
        sent_id = df['Sentence #'].iloc[-1]

        # Remove the temporary 'Start' column
        df = df.drop(columns=['Start'])

        # Append DataFrame to the list
        all_dfs.append(df)

    # Concatenate all DataFrames
    result = pd.concat(all_dfs, ignore_index=True)
    
    return result

In [11]:
data = convert_data(paths)

In [12]:
data['label'].value_counts()

_             175852
V               9777
ARG1            6561
ARG0            3466
ARG2            2338
ARGM-TMP        1095
ARGM-ADV         977
ARGM-MOD         819
ARGM-ADJ         479
ARGM-LOC         449
ARGM-NEG         431
ARGM-DIS         368
ARGM-MNR         323
ARGM-EXT         217
ARG3             151
ARGM-LVB         144
ARGM-PRR         144
ARGM-PRP         136
R-ARG0           127
R-ARG1           118
ARGM-CAU         116
C-ARG1           105
ARG4             103
ARGM-DIR          95
ARGM-PRD          94
ARGM-GOL          50
C-V               36
ARGM-COM          27
ARGM-CXN          26
R-ARGM-LOC        19
C-ARG2            14
C-ARGM-CXN        12
R-ARGM-TMP        10
R-ARGM-MNR        10
C-ARG3             9
C-ARG0             7
R-ARG2             5
ARGM-REC           4
C-ARGM-LOC         4
R-ARGM-ADV         2
ARG5               2
C-ARGM-EXT         2
ARG1-DSP           2
ARGA               2
C-ARGM-MNR         1
R-ARGM-COM         1
R-ARGM-CAU         1
R-ARG3       

In [13]:
# Check the lengths of sentences after creating the DataFrame
sentences = data.groupby("Sentence #")["word"].apply(list).tolist()
sentence_lengths = [len(sentence) for sentence in sentences]

max_length = max(sentence_lengths)
max_length_idx = sentence_lengths.index(max_length)

print("Max length after reading dataset:", max_length)
print("Sentence index:", max_length_idx)
print("Sentence:", sentences[max_length_idx])


Max length after reading dataset: 81
Sentence index: 5037
Sentence: ['(', 'You', 'do', "n't", 'need', 'to', 'use', 'their', 'site', ',', 'you', 'can', 'opt', '-', 'out', 'of', 'sharing', 'your', 'information', ',', 'you', 'do', "n't", 'need', 'to', 'send', 'stuff', 'to', 'anyone', 'with', 'a', 'Gmail', 'account', ',', 'and', 'if', '--', 'wonder', 'of', 'wonders', '--', 'you', "'re", 'worried', 'that', 'you', 'might', 'send', 'something', 'to', 'someone', 'who', 'would', 'forward', 'an', 'excerpt', 'to', 'someone', 'who', 'would', 'then', 'store', 'it', 'on', 'a', 'Gmail', 'account', '...', 'you', 'have', 'far', ',', 'far', 'too', 'much', 'time', 'on', 'your', 'hands', ')', '.']


In [14]:
def get_dict_map(data, token_or_tag, embedding_model=None):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'word':
        vocab = list(set(data['word'].to_list()))
    else:
        vocab = list(set(data['label'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}   
    
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(data, 'word')
tag2idx, idx2tag = get_dict_map(data, 'label')



In [15]:
n_vocab = len(token2idx)
n_tags = len(tag2idx)

print(n_vocab)
print(n_tags)

7858
51


In [16]:
w2v_model = KeyedVectors.load_word2vec_format(path_emb, binary=False)

In [17]:
# Create embedding matrix with zero vectors for oov words
emb_dim = 300
embedding_matrix = np.zeros((len(token2idx) + 1, emb_dim))
print(embedding_matrix.shape)
for word, i in token2idx.items():
    # You may have to change the following line to:
    # if word in w2v_model:
    if word in w2v_model.key_to_index:
        embedding_vector = w2v_model[word]
    else:
        embedding_vector = None
        # If you want to check OOV words:
        #print('couldnt find:', word, i)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

(7859, 300)


In [18]:
# Check dimensions, store number of vector dimensions in variable
print(embedding_matrix.shape)
emb_dim = embedding_matrix.shape[1]
print(emb_dim)

(7859, 300)
300


In [19]:
# Add index info to dataframe

data['Word_idx'] = data['word'].map(token2idx)
data['Tag_idx'] = data['label'].map(tag2idx)
data['Predicate_idx'] = data['label'].apply(lambda x: 1 if x == 'V' else 0)
data[:-10]

Unnamed: 0,id,word,lemma,pos-univ,pos,morph,head,basic_dep,enhanced_dep,space,predicate,label,Split,Sentence #,Word_idx,Tag_idx,Predicate_idx
0,1,From,from,ADP,IN,_,3,case,3:case,_,_,_,dev,0,417,45,0
1,2,the,the,DET,DT,Definite=Def|PronType=Art,3,det,3:det,_,_,_,dev,0,3104,45,0
2,3,AP,AP,PROPN,NNP,Number=Sing,4,obl,4:obl:from,_,_,ARG2,dev,0,5161,9,0
3,4,comes,come,VERB,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,0,root,0:root,_,come.03,V,dev,0,897,27,1
4,5,this,this,DET,DT,Number=Sing|PronType=Dem,6,det,6:det,_,_,_,dev,0,4049,45,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204720,6,in,in,SCONJ,IN,_,7,mark,7:mark,_,_,_,test,9770,2227,45,0
204721,7,diagnosing,diagnose,VERB,VBG,VerbForm=Ger,5,advcl,5:advcl:in,SpaceAfter=No,diagnose.01,_,test,9770,7620,45,0
204722,8,",",",",PUNCT,",",_,9,punct,9:punct,_,_,_,test,9770,7584,45,0
204723,9,addressing,address,VERB,VBG,VerbForm=Ger,7,conj,5:advcl:in|7:conj:and,_,address.02,_,test,9770,6088,45,0


In [20]:
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(['Sentence #'], as_index=False)[
    ["id", "word", "lemma", "pos-univ", "pos", "morph", "head", "basic_dep", 
     "enhanced_dep", "space", "predicate", "label", 'Word_idx', 'Tag_idx','Predicate_idx', 'Split']
].agg(lambda x: list(x))


In [21]:
# Change eval_split from 'dev' to test to run on test data
def get_pad_train_test_val(data_group, data, eval_split='train', n_vocab = n_vocab):

    #get max token and tag length
    n_token = len(list(set(data['word'].to_list())))
    n_tag = len(list(set(data['label'].to_list())))
    print(n_tag)

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist() 
    maxlen = max([len(s) for s in tokens])
    print ("this is maxlen:", maxlen)
    # value should be the number of items in the vocb?
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int64', padding='post', value= n_vocab)
    print('padding', len(pad_tokens[0]))
    # I used the code below to check the if the padded vectors are set to 0:
#     for token in pad_tokens:
#         print(token[-1])
# #         print(embedding_matrix[token[-1]])
#         break

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int64', padding='post', value= tag2idx["R-ARGM-ADJ"])
    n_tags = len(tag2idx)
    print ("n_tags:",n_tags)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    # Pad predicates
    predicates = data_group['Predicate_idx'].tolist()
    pad_predicates = pad_sequences(predicates, maxlen=maxlen, dtype='int64', padding='post', value=0)
    
    train_tokens = []
    dev_tokens = []
    train_tags = []
    dev_tags = []
    train_predicates = []
    dev_predicates = []

    for i, row in data_group.iterrows():
        if 'test' in row['Split']:
            train_tokens.append(pad_tokens[i])
            train_tags.append(pad_tags[i])
            train_predicates.append(pad_predicates[i])
        elif 'dev' in row['Split']:
            dev_tokens.append(pad_tokens[i])
            dev_tags.append(pad_tags[i])
            dev_predicates.append(pad_predicates[i])

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        #'\ntest_tokens length:', len(test_tokens),
        #'\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(dev_tokens),
        '\nval_tags:', len(dev_tags))
 
    return np.array(train_tokens), np.array(dev_tokens), np.array(train_predicates), np.array(dev_predicates), np.array(train_tags), np.array(dev_tags)

train_tokens, dev_tokens, train_predicates,dev_predicates, train_tags, dev_tags = get_pad_train_test_val(data_group, data, eval_split= eval_split)

51
this is maxlen: 81
padding 81
n_tags: 51
train_tokens length: 4799 
train_tokens length: 4799 
val_tokens: 4972 
val_tags: 4972


In [23]:
input_dim = len(list(set(data['word'].to_list()))) +1
output_dim = emb_dim # number of dimensions
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', 
      input_dim, '\noutput_dim: ', 
      output_dim, '\ninput_length: ', 
      input_length, '\nn_tags: ', n_tags)
print('emb dim', emb_dim)

input_dim:  7859 
output_dim:  300 
input_length:  81 
n_tags:  51
emb dim 300


In [24]:
def get_bilstm_lstm_model_v2(embedding_matrix, embedding_dim):
    word_input = Input(shape=(input_length,), dtype='int32', name='word_input')
    predicate_input = Input(shape=(input_length,), dtype='int32', name='predicate_input')

    embedding_layer = Embedding(len(token2idx) + 1,
                                 embedding_dim,
                                 weights=[embedding_matrix],
                                 input_length=input_length,
                                 trainable=False)(word_input)

    predicate_embedding = Embedding(2, embedding_dim, input_length=input_length)(predicate_input)

    merged_embeddings = concatenate([embedding_layer, predicate_embedding])

    bilstm = Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat')(merged_embeddings)
    
    output = TimeDistributed(Dense(n_tags, activation="softmax"))(bilstm)

    model = Model(inputs=[word_input, predicate_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
    model.summary()

    return model


In [25]:
print(train_tags.shape)
print(dev_tags.shape)

n_tags


(4799, 81, 51)
(4972, 81, 51)


51

In [27]:
def train_model(X, X_pred, y, model):
    loss = list()
    for i in range(8):
        hist = model.fit([X, X_pred], y, batch_size=200, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [28]:
results = pd.DataFrame()
embedding_dim = 300 # dimensions of the word2vec vectors
model_bilstm_lstm_v2 = get_bilstm_lstm_model_v2(embedding_matrix, embedding_dim)
results['with_add_lstm'] = train_model(dev_tokens, dev_predicates, dev_tags, model_bilstm_lstm_v2)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 word_input (InputLayer)        [(None, 81)]         0           []                               
                                                                                                  
 predicate_input (InputLayer)   [(None, 81)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 81, 300)      2357700     ['word_input[0][0]']             
                                                                                                  
 embedding_1 (Embedding)        (None, 81, 300)      600         ['predicate_input[0][0]']        
                                                                                              



In [30]:
# print("Evaluate on test data")
# results = model_bilstm_lstm_v2.evaluate([dev_tokens, dev_predicates], np.array(dev_tags), batch_size=1)
# print("test loss, test acc:", results)

In [31]:
y_pred = model_bilstm_lstm_v2.predict([train_tokens, train_predicates])



In [32]:
# get dimension index with highest prob (--> label)
y_pred = np.argmax(y_pred, axis=-1)
y_dev =  np.argmax(dev_tags, axis=-1)
print(len(y_pred))

4799


In [37]:
from sklearn.metrics import classification_report
import numpy as np

# Convert the predictions and true labels to their original tag forms (not one-hot encoded)
predicted_tags = np.argmax(y_pred, axis=-1)
true_tags = np.argmax(np.array(dev_tags), axis=-1)

# Create a reverse mapping from tag indices to tag names
idx2tag = {i: tag for tag, i in tag2idx.items()}

predicted_tags_names = []
true_tags_names = []

for true_seq, pred_seq in zip(true_tags, predicted_tags):
    for true_tag, pred_tag in zip(true_seq.ravel(), pred_seq.ravel()):
        # Ignore padding values when both true_tag and pred_tag are padding tags
        #if not (true_tag == tag2idx["R-ARGM-ADJ"] and pred_tag == tag2idx["R-ARGM-ADJ"]):
            predicted_tags_names.append(idx2tag[pred_tag])
            true_tags_names.append(idx2tag[true_tag])

# Generate the classification report
report = classification_report(true_tags_names, predicted_tags_names, zero_division=0)

print(report)


              precision    recall  f1-score   support

        ARG0       0.00      0.00      0.00     405.0
        ARG1       0.00      0.00      0.00     329.0
        ARG2       0.00      0.00      0.00      18.0
        ARG3       0.00      0.00      0.00       2.0
    ARGM-ADJ       0.00      0.00      0.00      30.0
    ARGM-ADV       0.00      0.00      0.00      42.0
    ARGM-CAU       0.00      0.00      0.00      11.0
    ARGM-DIS       0.00      0.00      0.00      89.0
    ARGM-EXT       0.00      0.00      0.00       4.0
    ARGM-LOC       0.00      0.00      0.00       5.0
    ARGM-LVB       0.00      0.00      0.00       8.0
    ARGM-MNR       0.00      0.00      0.00      14.0
    ARGM-MOD       0.00      0.00      0.00      21.0
    ARGM-NEG       0.00      0.00      0.00       4.0
    ARGM-PRD       0.00      0.00      0.00       2.0
    ARGM-TMP       0.00      0.00      0.00      19.0
  C-ARGM-CXN       0.00      0.00      0.00       0.0
  R-ARGM-LOC       0.00    

In [34]:
#predicted_tags_names

In [38]:
from collections import Counter
my_counter = Counter(predicted_tags_names)
my_counter

Counter({'C-ARGM-CXN': 4568, 'R-ARGM-LOC': 231})