# Named Entity Recognition Project - CAS 764

### Import Libraries

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import nltk
from nltk.cluster import KMeansClusterer
from nltk.classify import MaxentClassifier
import keras
import keras.backend as K
from keras import Sequential
from keras.layers import Activation, Dense, Flatten, Dropout
from keras.optimizers import Adam
from keras.layers.convolutional import *
from keras.utils import np_utils
from keras.models import model_from_json
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import pickle
from sklearn.preprocessing import LabelEncoder
from nltk.metrics.scores import (precision, recall)
import collections
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from nltk.classify import ClassifierI
from statistics import mode
import random
from random import randint
import re
from sklearn import cluster
from sklearn import metrics

Using TensorFlow backend.


### Import Datsets

In [2]:
datapath1 = './NEEL2006/'
datapath2 = './CoNLL2003/'

neel_training_file = open(datapath1 + 'NEEL2016-training.tsv', 'r')
neel_training_data = neel_training_file.readlines()
neel_training_file.close()

neel_training_labels_file = open(datapath1 + 'NEEL2016-training_neel.gs', 'r')
neel_training_labels_data = neel_training_labels_file.readlines()
neel_training_labels_file.close()

neel_validation_file = open(datapath1 + 'NEEL2016-dev.tsv', 'r')
neel_validation_data = neel_validation_file.readlines()
neel_validation_file.close()

neel_validation_labels_file = open(datapath1 + 'NEEL2016-dev_neel.gs', 'r')
neel_validation_labels_data = neel_validation_labels_file.readlines()
neel_validation_labels_file.close()

neel_testing_file = open(datapath1 + 'NEEL2016-test.tsv', 'r')
neel_testing_data = neel_testing_file.readlines()
neel_testing_file.close()

neel_testing_labels_file = open(datapath1 + 'NEEL2016-test_neel.gs', 'r')
neel_testing_labels_data = neel_testing_labels_file.readlines()
neel_testing_labels_file.close()

conll_training_file = open(datapath2 + 'eng.train', 'r')
conll_training_data = conll_training_file.readlines()
conll_training_file.close()

conll_testA_file = open(datapath2 + 'eng.testa', 'r')
conll_testA_data = conll_testA_file.readlines()
conll_testA_file.close()

conll_testB_file = open(datapath2 + 'eng.testb', 'r')
conll_testB_data = conll_testB_file.readlines()
conll_testB_file.close()

conll_testC_file = open(datapath2 + 'eng.testc', 'r')
conll_testC_data = conll_testC_file.readlines()
conll_testC_file.close()

# Pre-processing

### CONLL

In [3]:
def conll_preprocess(data):
    data_dummy = data[:]
    for i in range(len(data)):
        if data[i] != "\n":
            split_data = data[i].split()
            ne_tag = split_data[-1]
            if ne_tag == 'I-ORG':
                split_data[-1] = 'ORG'
            elif ne_tag == 'I-MISC':
                split_data[-1] = 'MISC'
            elif ne_tag == 'I-PER':
                split_data[-1] = 'PER'
            elif ne_tag == 'I-LOC':
                split_data[-1] = 'LOC'
            elif ne_tag == 'B-ORG':
                split_data[-1] = 'ORG'
            elif ne_tag == 'B-MISC':
                split_data[-1] = 'MISC'
            elif ne_tag == 'B-LOC':
                split_data[-1] = 'LOC'
            else:
                split_data[-1] = 'O'
            data_dummy[i] = " ".join(split_data)
    return data_dummy

conll_training = conll_preprocess(conll_training_data) 
conll_testing = conll_preprocess(conll_testA_data) + conll_preprocess(conll_testB_data) + conll_preprocess(conll_testC_data)

### NEEL

In [4]:
def neel_preprocess(data):
    processed = []      
    for i in range(len(data)):
        if data[i] != '\n':
            line = data[i].rstrip()
            split_index = line.find(',')
            entry = [line[1:split_index-1], line[split_index+2:-1]]
            processed.append(entry)
    return processed

def neel_preprocess_labels(data):
    labels = []
    for i in range(len(data)):
        if data[i] != '\n':
            line = data[i].rstrip().split()
            t = line[5]
            if t == 'Person':
                t = 'PER'
            elif t == 'Product':
                t = 'MISC'
            elif t == 'Thing':
                t = 'MISC'
            elif t == 'Organization':
                t = 'ORG'
            elif t == 'Location':
                t = 'LOC'
            elif t == 'Character':
                t = 'PER'
            elif t == 'Event':
                t = 'MISC'
            else:
                t = 'O'
            labels.append([line[0], line[1], line[2], t])
    return labels

def generate_word_labels(data, labels):
    all_labels = []
    for i in range(len(data)): 
        sentence_labels = []
        for word in range(len(data[i][1].split())):
            sentence_labels.append('O')
        for j in range(len(labels)): 
            if data[i][0] == labels[j][0]:
                start_index = int(labels[j][1])
                end_index = int(labels[j][2])
                w = data[i][1][start_index:end_index].split()
                for ele in range(len(w)):
                    ind = [idx for idx, wo in enumerate(data[i][1].split(), 1) if w[ele] in wo]
                    for index in ind:
                        sentence_labels[index-1] = labels[j][3]
        all_labels.append(sentence_labels)
    return all_labels

def generate_pos_tags(data):
    all_pos_tags = []
    for i in range(len(data)):
        sentence_tags = nltk.pos_tag(data[i][1].split())
        tag_list = []
        for tag in range(len(sentence_tags)):
            tag_list.append(sentence_tags[tag][1])
        all_pos_tags.append(tag_list)
    return all_pos_tags

def reformat(data, labels, pos_tags):
    reformatted_dataset = []
    for i in range(len(data)):
        word_list = data[i][1].split()
        for word in range(len(word_list)):
            output = word_list[word] + ' ' + pos_tags[i][word] + ' ' + ' X ' + labels[i][word]
            reformatted_dataset.append(output)
        reformatted_dataset.append('\n')
    return reformatted_dataset

def convert(initial_dataset, initial_labelset):
    a = neel_preprocess(initial_dataset)
    b = neel_preprocess_labels(initial_labelset)
    c = generate_word_labels(a, b)
    d = generate_pos_tags(a)
    e = reformat(a, c, d)
    return e

nltk.download('averaged_perceptron_tagger')
neel_training = convert(neel_training_data, neel_training_labels_data)
neel_testing = convert(neel_testing_data, neel_testing_labels_data)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/marshallwice/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
def clean(data):
    cleaned_data = []
    for i in range(len(data)):
        if data[i] != '\n':
            entry = data[i].split()
            cleaned_word = ''.join(c for c in entry[0] if  c not in '!$%^&*-–=_’+“{}[]\|;:,<.>/?`~")(')
            if (len(cleaned_word) != 0) and (cleaned_word[0] != '\'') and (cleaned_word[-1] != '\''):
                clean_word = ''.join(c for c in cleaned_word if  c not in '\'')
                if ('http' not in clean_word) and (any(c.isalpha() for c in clean_word) == True) and (clean_word != 'RT'):
                    if clean_word[0] == '#':
                        cleaned_data.append(' '.join(['hashtag', entry[1], entry[2], entry[3]]))
                    elif clean_word[0] == '@':
                        cleaned_data.append(' '.join(['sign', entry[1], entry[2], entry[3]]))
                    else:
                        cleaned_data.append(' '.join([clean_word, entry[1], entry[2], entry[3]]))
        else:
            cleaned_data.append('\n')
    return cleaned_data

cleaned_conll_training = clean(conll_training)
cleaned_conll_testing = clean(conll_testing)

cleaned_neel_training = clean(neel_training)
cleaned_neel_testing = clean(neel_testing)

# Convolutional Neural Network (CNN)

### Load Word Vector Model

In [6]:
glove_input_file = './glove/glove.6B.100d.txt'
word2vec_output_file = './glove/glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

filename = './glove/glove.6B.100d.txt.word2vec'
w2v_model = KeyedVectors.load_word2vec_format(filename, binary=False)

### Pre-processing

In [7]:
def restructure(data):
    matrix = []
    cnn_X = []
    cnn_Y = []
    cnn_Z = []
    for i in range(len(data)):
        if data[i] != data[0]:
            if data[i] != "\n":    
                matrix.append(data[i].rstrip().split())
            elif len(matrix)>0:
                matrix = np.array(matrix)
                cnn_X.append(matrix[:,0])
                cnn_Y.append(matrix[:,3])
                cnn_Z.append(matrix[:,1])
                matrix = []
                
    return cnn_X, cnn_Y, cnn_Z

def vectorize(word_matrix):
    matrixvex = []
    for matrix in word_matrix:
        wordvex = []
        for w in range(matrix.shape[0]):
            try:
                wordvex.append(w2v_model[matrix[w].lower()])
            except KeyError:
                wordvex.append(w2v_model['the'])
        matrixvex.append(np.array(wordvex))
    return np.array(matrixvex)
    
def pad_words(dataset):
    data_matrix = [] 
    for i in range(len(dataset)):
        sentence = dataset[i]
        sentence_matrix = []
        if len(sentence) == 1:
            word_matrix = np.zeros((3, 100))
            word_matrix[1] = sentence
            sentence_matrix.append(word_matrix)

        elif len(sentence) == 2:
            for i in range(len(sentence)):
                word_matrix = np.zeros((3, 100))
                if i == 0:
                    word_matrix[1] = sentence[i]
                    word_matrix[2] = sentence[i+1]
                elif i == len(sentence)-1:
                    word_matrix[0] = sentence[i-1]
                    word_matrix[1] = sentence[i]
                sentence_matrix.append(word_matrix)

        else:
            for i in range(len(sentence)):
                word_matrix = np.zeros((3, 100))
                if i == 0:
                    word_matrix[1] = sentence[i]
                    word_matrix[2] = sentence[i+1]
                elif i == len(sentence)-1:
                    word_matrix[0] = sentence[i-1]
                    word_matrix[1] = sentence[i]
                else:
                    word_matrix[0] = sentence[i-1]
                    word_matrix[1] = sentence[i]
                    word_matrix[2] = sentence[i+1]
                sentence_matrix.append(word_matrix)
            
        data_matrix.append(np.array(sentence_matrix)) 
        
    data_matrix = np.array(data_matrix)
    flattened_data_matrix = []
    for i in range(len(data_matrix)):
        for j in range(data_matrix[i].shape[0]):
            flattened_data_matrix.append(data_matrix[i][j])
            
    return np.array(flattened_data_matrix)

def pad_labels(dataset):
    padded_dataset = []
    for i in range(len(dataset)):
        for j in range(len(dataset[i])):
            padded_dataset.append(dataset[i][j])
    return np.array(padded_dataset)

def cnn_preprocess(dataset):
    cnn_x, cnn_y, cnn_z = restructure(dataset) 
    cnn_x = pad_words(vectorize(cnn_x))
    cnn_x = cnn_x.reshape(cnn_x.shape[0], 3, 100, 1)
    cnn_y = pad_labels(cnn_y)
    encoder = LabelEncoder()
    encoder.fit(cnn_y)
    encoded_Y = encoder.transform(cnn_y)
    cnn_y = np_utils.to_categorical(encoded_Y)
    return cnn_x, cnn_y

cnn_train_x, cnn_train_y = cnn_preprocess(cleaned_conll_training + cleaned_neel_training)
cnn_test_x, cnn_test_y = cnn_preprocess(cleaned_conll_testing + cleaned_neel_testing)

### Training

In [8]:
def create_model():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(3, 100, 1)))
    model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(5, activation='softmax'))
    return model

def train_cnn(train_x, train_y, batch_size, num_cycles):
    cnn_model = create_model()
    cnn_model.summary()
    cnn_model.compile(Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
    cnn_model.fit(train_x, train_y, batch_size=batch_size, epochs=num_cycles, shuffle=False, verbose=2)
    return cnn_model

def save_model(cnn_model, with_name):
    model_json = cnn_model.to_json()
    with open(with_name + ".json", "w") as json_file:
        json_file.write(model_json)
    cnn_model.save_weights(with_name + ".h5")
    
def load_model(with_name):
    json_file = open(with_name + '.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(with_name + ".h5")
    return loaded_model

# Train Model & Save to File
# -----------------------------------------------------------------------------------------------------

# cnn_model = train_cnn(cnn_train_x, cnn_train_y, 32, 20)
# save_model(cnn_model, 'combinedgeneralcnnmodel')

# Load Model from File
# -----------------------------------------------------------------------------------------------------

cnn_model = load_model('combinedgeneralcnnmodel')
cnn_model.compile(Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

### Evaluation

In [9]:
def form_cnn(cnn_list):
    new_list = []
    for i in range(len(cnn_list)):
        if cnn_list[i] == 0:
            new_list.append('LOC')
        elif cnn_list[i] == 1:
            new_list.append('MISC')
        elif cnn_list[i] == 2:
            new_list.append('O')
        elif cnn_list[i] == 3:
            new_list.append('ORG')
        elif cnn_list[i] == 4:
            new_list.append('PER')
    return new_list

def cnn_evaluation(model, test_x, test_y):
    num_correct = 0
    for i in range(len(test_x)):
        if np.where(test_y[i]==1)[0] == model.predict_classes(test_x[i].reshape(1, 3, 100, 1)):
            num_correct+=1

    acc = model.evaluate(test_x, test_y, verbose=2)[1]*100
    
    Y_test = np.argmax(test_y, axis=1)
    Y_pred = model.predict_classes(test_x)
    Y_test = form_cnn(Y_test)
    Y_pred = form_cnn(Y_pred)
    
    print('Predicted', num_correct, 'out of', len(test_x), 'giving an accuracy of {}%'.format(round(acc, 2)))
    print('\n')
    print(classification_report(Y_test, Y_pred))
    
cnn_evaluation(cnn_model, cnn_test_x, cnn_test_y)

Predicted 104500 out of 114442 giving an accuracy of 91.31%


             precision    recall  f1-score   support

        LOC       0.65      0.91      0.76      4062
       MISC       0.59      0.51      0.55      2680
          O       0.96      0.96      0.96     96665
        ORG       0.86      0.42      0.56      4689
        PER       0.65      0.80      0.71      6346

avg / total       0.92      0.91      0.91    114442



# Maximum Entropy Markov Model (MEMM)

### Pre-processing

In [10]:
def memm_preprocess(data):
    new_data = []
    for i in range(len(data)):
        if data[i] != "\n":    
            new_data.append(data[i].rstrip().split())
        else:
            new_data.append(['-END-END-END-', '-END-', '-END-', '-END-'])
    
    new_data[:] = [x for x in new_data if x != ['-DOCSTART-', '-X-', 'O', 'O']]
        
    if new_data[0] == ['-END-END-END-', '-END-', '-END-', '-END-']:
        new_data = np.delete(new_data, 0, 0)
    
    new_data = np.array(new_data)
    if len(new_data[0]) == 4:
        new_data = np.delete(new_data, 2, 1)

    tuple_data = []
    for i in range(len(new_data)):
        tuple_data.append((new_data[i]))
    
    return tuple_data
    
memm_train = memm_preprocess(cleaned_conll_training+cleaned_neel_training)
memm_test = memm_preprocess(cleaned_conll_testing+cleaned_neel_testing)

### Training

In [11]:
def memm_find_features(current_word, next_word, previous_word, previous_netag):
    return {'current_word':current_word, 
            'next_word':next_word, 
            'previous_word':previous_word, 
            'previous_netag':previous_netag}

def fetch_featureset(dataset):
    featureset = []
    for i in range(len(dataset)):
        if dataset[i][0] != '-END-END-END-':
            if i == 0:
                current_word = dataset[i][0]
                next_word = dataset[i+1][0]
                previous_word = ""
                previous_netag = ""
            elif dataset[i-1][0] == '-END-END-END-':
                current_word = dataset[i][0]
                next_word = dataset[i+1][0] 
                previous_word = ""
                previous_netag = ""
            else:
                try:
                    current_word = dataset[i][0]
                    if dataset[i+1][0] != '-END-END-END-':
                        next_word = dataset[i+1][0] 
                    else:
                        next_word = ""
                    previous_word = dataset[i-1][0] 
                    previous_netag = dataset[i-1][2]
                except IndexError:
                    current_word = dataset[i][0]
                    next_word = ""
                    previous_word = dataset[i-1][0] 
                    previous_netag = dataset[i-1][2]
            featureset.append([memm_find_features(current_word, next_word, previous_word, previous_netag), dataset[i][2]])
    return featureset

def fetch_test_featureset(dataset):
    featureset = []
    for i in range(len(dataset)):
        if dataset[i][0] != '-END-END-END-':
            if i == 0:
                current_word = dataset[i][0]
                next_word = dataset[i+1][0]
                previous_word = ""
                previous_netag = ""
            elif dataset[i-1][0] == '-END-END-END-':
                current_word = dataset[i][0]
                next_word = dataset[i+1][0] 
                previous_word = ""
                previous_netag = ""
            else:
                try:
                    current_word = dataset[i][0]
                    if dataset[i+1][0] != '-END-END-END-':
                        next_word = dataset[i+1][0] 
                    else:
                        next_word = ""
                    previous_word = dataset[i-1][0] 
                    previous_netag = memm_classifier.classify(memm_find_features(current_word, next_word, previous_word, previous_netag))
                except IndexError:
                    current_word = dataset[i][0]
                    next_word = ""
                    previous_word = dataset[i-1][0] 
                    previous_netag = memm_classifier.classify(memm_find_features(current_word, next_word, previous_word, previous_netag))
            featureset.append([memm_find_features(current_word, next_word, previous_word, previous_netag), dataset[i][2]])
    return featureset

memm_test_featuresets = fetch_featureset(memm_test)
        
# Train Model & Save to File
# -----------------------------------------------------------------------------------------------------

# memm_train_featuresets = fetch_featureset(memm_train)
# memm_classifier = nltk.classify.MaxentClassifier.train(memm_train_featuresets, max_iter=20)

# memm_save_classifier = open('combinedmemmclassifier.pickle', 'wb')
# pickle.dump(memm_classifier, memm_save_classifier)
# memm_save_classifier.close()

# Load Model from File
# -----------------------------------------------------------------------------------------------------

memm_save_classifier_f = open('combinedmemmclassifier.pickle', 'rb')
memm_classifier = pickle.load(memm_save_classifier_f)
memm_save_classifier_f.close()

### Evaluation

In [12]:
def memm_evaluate(testset, classifier, test_feats):
    new_memm_test = []
    for i in range(len(testset)):
        if testset[i][0] != '-END-END-END-':
            new_memm_test.append(testset[i])

    num_correct = 0
    pred_list = []
    ground_truths = []
    for i in range(len(new_memm_test)):
        prediction = classifier.classify(test_feats[i][0])
        pred_list.append(prediction)
        ground_truths.append(new_memm_test[i][2])
        if prediction == new_memm_test[i][2]:
            num_correct = num_correct + 1
    acc = (nltk.classify.accuracy(classifier, test_feats))*100
    
    print('Predicted', num_correct, 'out of', len(new_memm_test), 'giving an accuracy of {}%'.format(round(acc, 2)))
    print('\n')
    print(classification_report(ground_truths, pred_list))

memm_evaluate(memm_test, memm_classifier, memm_test_featuresets)

Predicted 104424 out of 114658 giving an accuracy of 91.07%


             precision    recall  f1-score   support

        LOC       0.84      0.73      0.78      4062
       MISC       0.49      0.62      0.55      2680
          O       0.95      0.96      0.95     96881
        ORG       0.80      0.65      0.72      4689
        PER       0.67      0.66      0.66      6346

avg / total       0.91      0.91      0.91    114658



# Naive Bayes Classification (NBC)

### Pre-processing

In [13]:
def nbc_preprocess(data):
    new_data = []
    for i in range(len(data)):
        if data[i] != "\n":    
            new_data.append(data[i].rstrip().split())
    
    new_data[:] = [x for x in new_data if x != ['-DOCSTART-', '-X-', 'O', 'O']]

    new_data = np.array(new_data)
    if len(new_data[0]) == 4:
        new_data = np.delete(new_data, 2, 1)

    tuple_data = []
    for i in range(len(new_data)):
        tuple_data.append((new_data[i]))
            
    return tuple_data

nbc_train = nbc_preprocess(cleaned_conll_training + cleaned_neel_training)
nbc_test = nbc_preprocess(cleaned_conll_testing + cleaned_neel_testing)

### Training

In [14]:
def nbc_find_features(word, pos):
    return {'first_letter': word[0], 'pos': pos, 'word':word}

def nbc_train_classifier(trainset, testset):
    train_feats = [(nbc_find_features(word, pos_tag), ne_tag) for (word, pos_tag, ne_tag) in trainset]
    test_feats = [(nbc_find_features(word, pos_tag), ne_tag) for (word, pos_tag, ne_tag) in testset]
    classifier = nltk.NaiveBayesClassifier.train(train_feats)
    return train_feats, test_feats, classifier

nbc_train_featuresets, nbc_test_featuresets, nbc_classifier = nbc_train_classifier(nbc_train, nbc_test)

### Evaluation

In [15]:
def nbc_evaluate(classifier, testset, test_featureset):   
    num_correct = 0
    pred_list = []
    ground_truths = []
    for i in range(len(testset)):
        prediction = classifier.classify(nbc_find_features(testset[i][0], testset[i][1]))
        pred_list.append(prediction)
        ground_truths.append(testset[i][2])
        if prediction == testset[i][2]:
            num_correct = num_correct + 1
    
    acc = (nltk.classify.accuracy(classifier, test_featureset))*100
    print('Predicted', num_correct, 'out of', len(testset), 'giving an accuracy of {}%'.format(round(acc, 2)))
    print('\n')
    print(classification_report(ground_truths, pred_list))
    
nbc_evaluate(nbc_classifier, nbc_test, nbc_test_featuresets)

Predicted 96619 out of 114658 giving an accuracy of 84.27%


             precision    recall  f1-score   support

        LOC       0.47      0.82      0.60      4062
       MISC       0.30      0.63      0.40      2680
          O       0.99      0.87      0.92     96881
        ORG       0.46      0.66      0.55      4689
        PER       0.44      0.73      0.55      6346

avg / total       0.90      0.84      0.86    114658



# Unsupervised Methods

### Bootstrapping (K-Means)

In [16]:
kcnn_x, kcnn_y, kcnn_z = restructure(cleaned_conll_training + cleaned_neel_training) 

res = vectorize(kcnn_x)
flattened = []
for sent in res:
    for wordvec in sent:
        flattened.append(wordvec)
flattened = np.array(flattened)

kmeans = cluster.KMeans(n_clusters=5)
kmeans.fit(flattened)
klabels = kmeans.labels_
kcentroids = kmeans.cluster_centers_

kcnn_x = pad_words(vectorize(kcnn_x))
kcnn_train_x = kcnn_x.reshape(kcnn_x.shape[0], 3, 100, 1)

encoder = LabelEncoder()
encoder.fit(klabels)
encoded_Y = encoder.transform(klabels)
kcnn_train_y = np_utils.to_categorical(encoded_Y)

# Train Model & Save to File
# -----------------------------------------------------------------------------------------------------
# kcnn_model = train_cnn(kcnn_train_x, kcnn_train_y, 32, 10)
# save_model(kcnn_model, 'kcnnmodel')

# Load Model from File
# -----------------------------------------------------------------------------------------------------
kcnn_model = load_model('kcnnmodel')
kcnn_model.compile(Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

### Evaluate Bootsrapping Method

In [17]:
cnn_evaluation(kcnn_model, cnn_test_x, cnn_test_y)

Predicted 28859 out of 114442 giving an accuracy of 25.22%


             precision    recall  f1-score   support

        LOC       0.08      0.35      0.13      4062
       MISC       0.01      0.13      0.02      2680
          O       0.84      0.24      0.37     96665
        ORG       0.00      0.01      0.00      4689
        PER       0.19      0.60      0.29      6346

avg / total       0.73      0.25      0.34    114442



### Ensemble

In [18]:
quarter = int(len(kcnn_train_y) / 4)
elabels = np.concatenate((np.concatenate((kcnn_train_y[:quarter], cnn_train_y[quarter:len(kcnn_train_y)-quarter])), kcnn_train_y[len(kcnn_train_y)-quarter:]))

# Train Model & Save to File
# -----------------------------------------------------------------------------------------------------
# ecnn_model = train_cnn(kcnn_train_x, elabels, 32, 10)
# save_model(ecnn_model, 'ecnnmodel')

# Load Model from File
# -----------------------------------------------------------------------------------------------------
ecnn_model = load_model('ecnnmodel')
ecnn_model.compile(Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

### Ensemble Evaluation

In [19]:
cnn_evaluation(ecnn_model, cnn_test_x, cnn_test_y)

Predicted 28500 out of 114442 giving an accuracy of 24.9%


             precision    recall  f1-score   support

        LOC       0.07      0.44      0.11      4062
       MISC       0.01      0.12      0.02      2680
          O       1.00      0.24      0.38     96665
        ORG       0.09      0.41      0.15      4689
        PER       0.09      0.26      0.14      6346

avg / total       0.85      0.25      0.34    114442

