In [0]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
	#nltk.download("mac_morpho")
	#nltk.download('stopwords')
	#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import mac_morpho, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from keras import initializers
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

Using TensorFlow backend.


In [0]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [0]:
def pre_process():
    current_dir = os.getcwd()
    train_text = open('/content/drive/My Drive/nlp/corpus/macmorpho-train.txt', 'r')
    dev_text = open('/content/drive/My Drive/nlp/corpus/macmorpho-dev.txt', 'r')
    test_text = open('/content/drive/My Drive/nlp/corpus/macmorpho-test.txt', 'r')

    sentences_train = []
    sentences_dev = []
    sentences_test = []
    #TRAIN DATA
    for line in train_text.readlines():
    	tudo = line.replace('\n', '').split(' ')
    	train = [nltk.tag.util.str2tuple(word, sep='_') for word in tudo]
    	sentences_train.append(train)
    	#print(train)
    MAX_LEGTH = 248


    classes = set([x[1] for x in train]) 
    #print(classes)

    #DEV DATA
    for line in dev_text.readlines():
    	tudo = line.replace('\n', '').split(' ')
    	dev = [nltk.tag.util.str2tuple(word, sep='_') for word in tudo]
    	sentences_dev.append(dev)
	#print(dev)

    #TEST DATA
    for line in test_text.readlines():
    	tudo = line.replace('\n', '').split(' ')
    	test = [nltk.tag.util.str2tuple(word, sep='_') for word in tudo]
    	sentences_test.append(test)
    	#print(test)

    print("Pre-processing done")
    return sentences_train, classes, sentences_dev,sentences_test
    #return sentences_train, train, classes, sentences_dev, dev, sentences_test, test

def splits_sentences(sentences_train, sentences_dev, sentences_test):
	#split words and tags
	train_sentence_words, train_sentence_tags =[], [] 
	for tagged_sentence in sentences_train:
	    sentence, tags = zip(*tagged_sentence)
	    train_sentence_words.append(np.array(sentence))
	    train_sentence_tags.append(np.array(tags))

	dev_sentence_words, dev_sentence_tags =[], [] 
	for tagged_sentence in sentences_dev:
	    sentence, tags = zip(*tagged_sentence)
	    dev_sentence_words.append(np.array(sentence))
	    dev_sentence_tags.append(np.array(tags))

	test_sentence_words, test_sentence_tags =[], [] 
	for tagged_sentence in sentences_test:
	    sentence, tags = zip(*tagged_sentence)
	    test_sentence_words.append(np.array(sentence))
	    test_sentence_tags.append(np.array(tags))

	#converts to numbers    
	return convert_to_numbers(train_sentence_words, train_sentence_tags, dev_sentence_words, dev_sentence_tags, test_sentence_words, test_sentence_tags)

def convert_to_numbers(train_sentence_words, train_sentence_tags, dev_sentence_words, dev_sentence_tags, test_sentence_words, test_sentence_tags):
	#converting to numbers
	words, tags = set([]), set([])
	 
	for s in train_sentence_words :
	    for w in s:
	        words.add(w.lower())
 
	for ts in train_sentence_tags:
	    for t in ts:
	        tags.add(t)

	word2index = {w: i + 2 for i, w in enumerate(list(words))}
	word2index['-PAD-'] = 0  # The special value used for padding
	word2index['-OOV-'] = 1  # The special value used for OOVs

	tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
	tag2index['-PAD-'] = 0  # The special value used to padding

	#converting the word dataset to numbers

	train_sentences_X, val_sentences_X, test_sentences_X, train_tags_y, val_tags_y, test_tags_y = [], [], [], [], [], []

	#train	 
	for s in train_sentence_words:
	    s_int = []
	    for w in s:
	        try:
	            s_int.append(word2index[w.lower()])
	        except KeyError:
	            s_int.append(word2index['-OOV-'])
	 
	    train_sentences_X.append(s_int)

	#validation
	for s in dev_sentence_words:
	    s_int = []
	    for w in s:
	        try:
	            s_int.append(word2index[w.lower()])
	        except KeyError:
	            s_int.append(word2index['-OOV-'])
	 
	    val_sentences_X.append(s_int)
	#test 
	for s in test_sentence_words:
	    s_int = []
	    for w in s:
	        try:
	            s_int.append(word2index[w.lower()])
	        except KeyError:
	            s_int.append(word2index['-OOV-'])
	 
	    test_sentences_X.append(s_int)
	

	for s in train_sentence_tags:
	    train_tags_y.append([tag2index[t] for t in s])
	
	for s in dev_sentence_tags:
	    val_tags_y.append([tag2index[t] for t in s])
	 
	for s in test_sentence_tags:
	    test_tags_y.append([tag2index[t] for t in s])
	 
	print("Done number converting")

	return word2index, tag2index, train_sentences_X, val_sentences_X, test_sentences_X, train_tags_y, val_tags_y, test_tags_y

def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

def create_model(window_size,train_sentences_X,train_tags_y,epochs,batch_size,val_sentences_X,val_tags_y, tag2index):
    
    model = Sequential()
    model.add(InputLayer(input_shape=(248,)))
    model.add(Embedding(len(word2index), 128))
    model.add(Bidirectional(LSTM(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(len(tag2index))))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer ='rmsprop',metrics=['accuracy', ignore_class_accuracy(0), single(0), single(1), single(2), single(3), single(4), single(5), single(6),
                                                                                 single(7), single(8), single(9), single(10), single(11), single(12), single(13), single(14), 
                                                                                 single(15), single(16), single(17), single(18), single(19), single(20), single(21), single(22), single(23), single(24), single(25), single(26), single(27)])
    print(model.summary())
    model.fit(train_sentences_X,train_tags_y,epochs=epochs, batch_size=batch_size,verbose=1,  validation_split=0.2)#validation_data=(val_sentences_X,val_tags_y))
    return model

def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

def single(INTERESTING_CLASS_ID):
  def single_class_accuracy(y_true, y_pred):
      class_id_true = K.argmax(y_true, axis=-1)
      class_id_preds = K.argmax(y_pred, axis=-1)
      # Replace class_id_preds with class_id_true for recall here
      accuracy_mask = K.cast(K.equal(class_id_preds, INTERESTING_CLASS_ID), 'int32')
      class_acc_tensor = K.cast(K.equal(class_id_true, class_id_preds), 'int32') * accuracy_mask
      class_acc = K.sum(class_acc_tensor) / K.maximum(K.sum(accuracy_mask), 1)
      return class_acc
  return single_class_accuracy

def main(window_size, epochs,batch_size, train, classes, dev, test):

    data_train,classes_train,vectorizer,corpus = return_training_data(train, window_size, epochs)
    
    data_val,classes_val,vectorizer2,corpus2 = return_validation_data(dev, window_size,epochs)

    model = create_model(window_size,data_train,classes_train,epochs,batch_size,data_val,classes_val)
    #generating test samples

    data_test = []
    classes_test = []
    data_test,classes_test,valor_test_por_classe,resultado_test_por_classe = return_testing_data(vectorizer, window_size, corpus, test)

    resultado = str(window_size) + '-' + str(epochs)


    #checa se a header existe
    if os.path.exists("results/total_accuracy.csv"):
        header_exists = True
    else:
        header_exists = False

    # if it does not exist, save the header
    with open("/content/drive/My Drive/nlp/results/total_accuracy.csv", "a+") as f:
        if not header_exists:
            f.write("window_size,epochs,accuracy\n")
        f.write(str(window_size)+","+str(epochs)+","+str(model.evaluate(data_test,classes_test,batch_size=batch_size,verbose=2)[1])+"\n")


    with open("/content/drive/My Drive/nlp/results/"+resultado+'.csv', "w") as f:
        f.write("index,accuracy\n")

    classes_list = vectorizer.get_feature_names()# will be used to return each class's accuracy, but without using an index

    for index in valor_test_por_classe:
        score = model.evaluate(valor_test_por_classe[index], resultado_test_por_classe[index], batch_size = batch_size, verbose = 2)
        with open("/content/drive/My Drive/nlp/results/"+resultado+".csv","a") as f:
                f.write(str(classes_list[index])+","+str(score[1])+"\n")

    #graph_by_class("/content/drive/My Drive/nlp/results/"+resultado+".csv",window_size,epochs) # generating graphics
    del model

In [0]:
#pre process
sentences_train, classes, sentences_dev,sentences_test = pre_process()

Pre-processing done


In [0]:
#splits
word2index, tag2index, train_sentences_X, val_sentences_X, test_sentences_X, train_tags_y, val_tags_y, test_tags_y = splits_sentences(sentences_train, sentences_dev,sentences_test)

Done number converting


In [0]:
#pad
window_size = 3
batch_size = 90
epochs=2

train_sentences_X = pad_sequences(train_sentences_X, maxlen=248, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=248, padding='post')
val_sentences_X = pad_sequences(val_sentences_X, maxlen=248, padding='post')
val_sentences_X = pad_sequences(val_sentences_X, maxlen=248, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=248, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=248, padding='post')


In [0]:
#model
model = create_model(window_size,train_sentences_X,to_categorical(train_tags_y, len(tag2index)),epochs,batch_size,val_sentences_X,to_categorical(val_tags_y, len(tag2index)), tag2index)






Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 248, 128)          6046464   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 248, 512)          788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 248, 27)           13851     
_________________________________________________________________
activation_1 (Activation)    (None, 248, 27)           0         
Total params: 6,848,795
Trainable params: 6,848,795
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 30358 samples, validate on 7590 samples
Epoch 1/2





Epoch 2/2


In [0]:

scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(scores)
print(f"{model.metrics_names[1]}: {scores[2] * 100}")
 

[0.02015901155547602, 0.9938480502864173, 0.9119116343409843, 1.0, 0.764216168268918, 0.9683652829737939, 0.8946121664828393, 0.973425810750413, 0.94483836461234, 0.6623761910647986, 0.14418744367677983, 0.8472593681365085, 0.9629768540001534, 0.0, 0.7926543467546772, 0.06408330830079102, 0.3732852708521077, 0.2998564800907847, 0.6984036844702636, 0.9571518542518057, 0.5468442308334167, 0.8767106183509339, 0.0, 0.7797464810074396, 0.9108951088122716, 0.8547706318031272, 0.9995071591144277, 0.22162144120690233, 0.7862092683216526, 0.9511625326136471, 0.0]
acc: 91.19116343409843


In [0]:
print('batch_size, epochs, acc_fake, acc_real')
print(batch_size, ',', epochs, ',', scores[1], ',',scores[2])


batch_size, epochs, acc_fake, acc_real
90 , 2 , 0.9938480502864173 , 0.9119116343409843


In [0]:
#print("loss,fake_acc,acc,-PAD-, V, PREP+ADV, CUR, PREP, PROADJ, PU, PREP+ART, PRO-KS, N, KC, PCP, PREP+PROPESS, PROPESS, NUM, IN, ADV-KS, PREP+PRO-KS, PREP+PROSUB, KS, NPROP, ART, ADJ, ADV, PDEN, PROSUB, PREP+PROADJ")
print(batch_size, ",", epochs, ",", str(scores)[1:-1])

90 , 2 , 0.02015901155547602, 0.9938480502864173, 0.9119116343409843, 1.0, 0.764216168268918, 0.9683652829737939, 0.8946121664828393, 0.973425810750413, 0.94483836461234, 0.6623761910647986, 0.14418744367677983, 0.8472593681365085, 0.9629768540001534, 0.0, 0.7926543467546772, 0.06408330830079102, 0.3732852708521077, 0.2998564800907847, 0.6984036844702636, 0.9571518542518057, 0.5468442308334167, 0.8767106183509339, 0.0, 0.7797464810074396, 0.9108951088122716, 0.8547706318031272, 0.9995071591144277, 0.22162144120690233, 0.7862092683216526, 0.9511625326136471, 0.0
