# Event detection

In [1]:
# Guard flag that keeps the logging setup in the training cell from being repeated
# (a repeated setup would attach another handler to the root logger, so every
# message would be printed more than once).
# Only initialise the flag when it does not exist yet: re-running this cell after
# the setup has already happened must not re-arm the guard.
if 'first_run' not in globals():
    first_run = True

In [11]:
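# This script trains the BiLSTM architecture (with a Softmax classifier) for event detection:
# it tags the tacMixed dataset, predicting the chunk_BIO column.
# The code uses the word embeddings stored in HistoGlove.txt.
# Note: the script was apparently adapted from a part-of-speech tagging example that used the
# universal dependency dataset (http://universaldependencies.org/) and the embeddings by
# Komninos et al. (https://www.cs.york.ac.uk/nlp/extvec/); neither is used below.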
from __future__ import print_function
import os
import logging
import sys
from neuralnets.BiLSTM import BiLSTM
from util.preprocessing import perpareDataset, loadDatasetPickle

# Disabled snippet: it is wrapped in a bare string literal, so none of it executes.
# NOTE(review): presumably switched off because `__file__` is not available when this
# code runs as a notebook cell -- confirm, and delete the snippet if it is no longer needed.
'''
# :: Change into the working dir of the script ::
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)
'''

if first_run:
    # Flip the guard immediately so that re-running this cell skips the setup
    # and does not attach a second handler to the root logger.
    first_run = False

    # :: Logging level ::
    loggingLevel = logging.INFO

    # Handler that writes the bare message text (no timestamp / level prefix) to stdout.
    formatter = logging.Formatter('%(message)s')
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(formatter)
    ch.setLevel(loggingLevel)

    # Configure the root logger and hook the handler in.
    logger = logging.getLogger()
    logger.setLevel(loggingLevel)
    logger.addHandler(ch)


######################################################
#
# Data preprocessing
#
######################################################
datasets = {
    # Name of the dataset
    'tacMixed': dict(
        # Column number -> name given to that column
        columns={0: 'tokens', 1: 'lemma', 2: 'POS', 5: 'chunk_BIO'},
        # Which column we like to predict
        label='chunk_BIO',
        # Should we evaluate on this task? Set true always for single task setups
        evaluate=True,
        commentSymbol=None,
    ),
}


# Path on your computer to the word embeddings
# (previously tried alternative: 'glove.6B.300d.txt')
embeddingsPath = 'HistoGlove.txt'


# Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder.
# `perpareDataset` (sic) is spelled exactly as it is exported by util.preprocessing.
# NOTE(review): the return value is presumably the path of the generated pickle file
# (it is handed straight to loadDatasetPickle below) -- confirm against util.preprocessing.
pickleFile = perpareDataset(embeddingsPath, datasets)


######################################################
#
# The training of the network starts here
#
######################################################


# Load the embeddings and the dataset back from the pickle file produced above.
# The three returned objects are passed to the model via setMappings / setDataset.
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Network hyperparameters handed to the BiLSTM constructor, one setting per line.
params = {
    'classifier': ['Softmax'],
    'LSTM-Size': [75, 75],
    'dropout': (0.25, 0.25),
    # 'tokens', 'lemma' and 'POS' are column names from the dataset definition
    'featureNames': ['tokens', 'lemma', 'casing', 'POS'],
    'addFeatureDimensions': 10,
    'miniBatchSize': 32,
    'earlyStopping': 10,
}

# Build the network from the hyperparameters, then hand it the mappings/embeddings
# and the dataset before training. Keep this call order as is.
model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults('./tacMixedHistoGlove.csv') #Path to store performance scores for dev / test
# NOTE(review): the [bracketed] parts look like placeholders that the library fills in
# when it saves a model -- confirm against neuralnets.BiLSTM.
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5" #Path to store models
# Train for up to 50 epochs.
# NOTE(review): params['earlyStopping'] is 10, so training may presumably end earlier -- confirm.
model.fit(epochs=50)

Generate new embeddings files for a dataset
Read file: HistoGlove.txt
Added words: 6
:: Transform tacMixed dataset ::
:: Create Train Matrix ::
Unknown-Tokens: 4.00%
:: Create Dev Matrix ::
Unknown-Tokens: 3.59%
:: Create Test Matrix ::
Unknown-Tokens: 1.33%
DONE - Embeddings file saved: pkl/tacMixed_HistoGlove.pkl
--- tacMixed ---
2388 train sentences
1919 dev sentences
1985 test sentences
LSTM-Size: [75, 75]
_____________________________________________________________________________________________________________________________
Layer (type)                             Output Shape               Param #        Connected to                              
words_input (InputLayer)                 (None, None)               0                                                        
_____________________________________________________________________________________________________________________________
lemma_input (InputLayer)                 (None, None)               0             

Wrong BIO-Encoding 8/1787 labels, 0.45%
Wrong BIO-Encoding 8/1787 labels, 0.45%
Test-Data: Prec: 0.693, Rec: 0.178, F1: 0.2835

Scores from last epoch:
  Train-Score: 0.8848
  Dev-Score: 0.6516

2.94 sec for evaluation

--------- Epoch 10 -----------
4.75 sec for training (55.03 total)
-- tacMixed --
Wrong BIO-Encoding 1/3662 labels, 0.03%
Wrong BIO-Encoding 1/3662 labels, 0.03%
Train-Data: Prec: 0.888, Rec: 0.917, F1: 0.9026
Wrong BIO-Encoding 1/2084 labels, 0.05%
Wrong BIO-Encoding 1/2084 labels, 0.05%
Dev-Data: Prec: 0.570, Rec: 0.723, F1: 0.6375
Wrong BIO-Encoding 6/1958 labels, 0.31%
Wrong BIO-Encoding 6/1958 labels, 0.31%
Test-Data: Prec: 0.679, Rec: 0.192, F1: 0.2990

Scores from last epoch:
  Train-Score: 0.9026
  Dev-Score: 0.6375

2.91 sec for evaluation

--------- Epoch 11 -----------
4.76 sec for training (59.79 total)
-- tacMixed --
Wrong BIO-Encoding 4/3688 labels, 0.11%
Wrong BIO-Encoding 4/3688 labels, 0.11%
Train-Data: Prec: 0.895, Rec: 0.930, F1: 0.9119
Wrong BIO-Enco