# NER Machine Learning

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import gensim
from gensim.models import KeyedVectors
import pandas as pd
import sys
import csv

In [2]:
def extract_embeddings_as_features_and_gold(conllfile,word_embedding_model):
    '''
    Extract embedding features and gold labels from a tab-separated conll file.

    Each row with more than three columns contributes one example: the
    embedding of the token in the first column and the label in the last
    column. Shorter rows (e.g. empty lines marking sentence boundaries) are
    skipped. Tokens missing from the embedding model get an all-zero vector.

    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model supporting
        ``token in model`` and ``model[token]``
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors

    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    ### This code was partially inspired by code included in the HLT course, obtained from https://github.com/cltl/ma-hlt-labs/, accessed in May 2020.
    labels = []
    features = []

    # Width of the zero vector used for out-of-vocabulary tokens. Use the
    # model's own dimensionality when it exposes one; otherwise keep the
    # original default of 300.
    dimensions = getattr(word_embedding_model, 'vector_size', 300)

    # The context manager guarantees the file is closed, even on error.
    with open(conllfile, 'r', encoding='utf8', newline='') as conllinput:
        csvreader = csv.reader(conllinput, delimiter='\t', quotechar='|')
        for row in csvreader:
            # check for cases where empty lines mark sentence boundaries (which some conll files do).
            if len(row) > 3:
                if row[0] in word_embedding_model:
                    vector = word_embedding_model[row[0]]
                else:
                    vector = [0] * dimensions
                features.append(vector)
                labels.append(row[-1])

    return features, labels

In [3]:
def extract_embeddings_features(inputfile,word_embedding_model):
    '''
    Extract embedding features (without labels) from a tab-separated conll file.

    Each row with more than three columns contributes the embedding of the
    token in its first column. Shorter rows (e.g. empty lines marking sentence
    boundaries) are skipped. Tokens missing from the embedding model get an
    all-zero vector.

    :param inputfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model supporting
        ``token in model`` and ``model[token]``
    :type inputfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors

    :return features: list of vector representation of tokens
    '''
    ### This code was partially inspired by code included in the HLT course, obtained from https://github.com/cltl/ma-hlt-labs/, accessed in May 2020.
    features = []

    # Width of the zero vector used for out-of-vocabulary tokens. Use the
    # model's own dimensionality when it exposes one; otherwise keep the
    # original default of 300.
    dimensions = getattr(word_embedding_model, 'vector_size', 300)

    # The context manager guarantees the file is closed, even on error.
    with open(inputfile, 'r', encoding='utf8', newline='') as conllinput:
        csvreader = csv.reader(conllinput, delimiter='\t', quotechar='|')
        for row in csvreader:
            # check for cases where empty lines mark sentence boundaries (which some conll files do).
            if len(row) > 3:
                if row[0] in word_embedding_model:
                    vector = word_embedding_model[row[0]]
                else:
                    vector = [0] * dimensions
                features.append(vector)

    return features

In [4]:
feature_indexes = {'token': 0, 'pos': 1, 'tag': 2, 'previous': 4, 'latter': 5, 'capitals': 6,'stemm':7,'lemma':8}

def extract_features_and_selected_labels(trainingfile, selected_features):
    '''
    Extract features and gold labels from a preprocessed file with the training data and return them as lists

    Every non-empty row of the tab-separated file yields one feature
    dictionary and one gold label. Empty rows are skipped, which matches how
    classify_data skips blank lines when it writes predictions.

    :param trainingfile: path to training file
    :param selected_features: list of features that will be used to train the model
        (each name must be a key of ``feature_indexes``)

    :type trainingfile: string
    :type selected_features: list

    :return features: features as a list of dictionaries
    :return gold_labels: list of gold labels

    :raises KeyError: if a selected feature name is not in ``feature_indexes``
    :raises IndexError: if a non-empty row has fewer columns than required
    '''
    features = []
    gold_labels = []

    # The context manager guarantees the file is closed, even on error.
    with open(trainingfile, 'r', encoding='utf8', newline='') as conllinput:
        csvreader = csv.reader(conllinput, delimiter='\t', quotechar='|')

        for row in csvreader:
            # csv.reader yields an empty list for a blank line; there is nothing to extract from it.
            if not row:
                continue

            # Only extract the selected features
            features.append({feature_name: row[feature_indexes[feature_name]]
                             for feature_name in selected_features})

            # Gold is in the column with index 3 (the fourth column)
            gold_labels.append(row[3])

    return features, gold_labels

In [5]:
def extract_features_and_labels(trainingfile):
    '''
    Read a whitespace-separated conll file and return token features with gold labels

    Every non-blank line yields one feature dictionary holding only the token
    (first column) and one gold label (last column).

    :param trainingfile: path to training file
    :type trainingfile: string

    :return data: list of dictionaries of the form {'token': <token>}
    :return targets: list of gold labels
    '''
    # TIP: information on how to integrate more features can be found here:
    # https://scikit-learn.org/stable/modules/feature_extraction.html
    token_dicts, gold = [], []

    with open(trainingfile, 'r', encoding='utf8') as handle:
        for raw_line in handle:
            fields = raw_line.rstrip('\n').split()
            # blank lines carry no token
            if not fields:
                continue
            token_dicts.append({'token': fields[0]})
            # gold is in the last column
            gold.append(fields[-1])

    return token_dicts, gold

In [6]:
feature_to_index = {'token': 0, 'pos': 1, 'tag': 2, 'previous': 4, 'latter': 5, 'capitals': 6,'stemm':7,'lemma':8}

def extract_features(testfile, selected_features):
    '''Extract features from a preprocessed file with the test data and return them as a list

    Every non-empty row of the tab-separated file yields one feature
    dictionary. Empty rows are skipped, which matches how classify_data skips
    blank lines when it writes predictions.

    :param testfile: path to test file
    :param selected_features: list of features that were selected to train the model
        (each name must be a key of ``feature_to_index``)

    :type testfile: string
    :type selected_features: list

    :return features: features as a list of dictionaries

    :raises KeyError: if a selected feature name is not in ``feature_to_index``
    :raises IndexError: if a non-empty row has fewer columns than required'''

    features = []

    # The context manager guarantees the file is closed, even on error.
    with open(testfile, 'r', encoding='utf8', newline='') as conllinput:
        csvreader = csv.reader(conllinput, delimiter='\t', quotechar='|')

        for row in csvreader:
            # csv.reader yields an empty list for a blank line; there is nothing to extract from it.
            if not row:
                continue
            features.append({feature_name: row[feature_to_index[feature_name]]
                             for feature_name in selected_features})

    return features

In [7]:
# Load token-only feature dictionaries and gold labels from the conll training file.
data, targets = extract_features_and_labels("datas/conll2003.train.conll")

In [8]:
# Preview the first ten feature dictionaries.
print(data[:10])

[{'token': 'EU'}, {'token': 'rejects'}, {'token': 'German'}, {'token': 'call'}, {'token': 'to'}, {'token': 'boycott'}, {'token': 'British'}, {'token': 'lamb'}, {'token': '.'}, {'token': 'Peter'}]


In [9]:
# Preview the first ten gold labels.
print(targets[:10])

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'B-PER']


    def extract_features(inputfile):
        '''Extract token-only features from a whitespace-separated conll file

        Every non-blank line yields one dictionary holding the token found in
        its first column.

        NOTE: this one-argument variant shares its name with the two-argument
        extract_features(testfile, selected_features); if both are defined in
        the same namespace, the later definition replaces the earlier one.

        :param inputfile: path to input file
        :type inputfile: string

        :return data: list of dictionaries of the form {'token': <token>}'''

        data = []
        with open(inputfile, 'r', encoding='utf8') as infile:
            for line in infile:
                components = line.rstrip('\n').split()
                # blank lines carry no token
                if len(components) > 0:
                    token = components[0]
                    feature_dict = {'token':token}
                    data.append(feature_dict)
        return data

In [10]:
# Extract all eight preprocessed features from the training file and preview the first ten rows.
data = extract_features("./conll2003.train_extracted_features.conll",["token","pos","tag","previous","latter","capitals","stemm","lemma"])
print(data[:10])

[{'token': 'token', 'pos': 'pos', 'tag': 'tag', 'previous': 'previous', 'latter': 'latter', 'capitals': 'capitals', 'stemm': 'stemm', 'lemma': 'lemma'}, {'token': 'rejects', 'pos': 'VBZ', 'tag': 'B-VP', 'previous': ' ', 'latter': 'German', 'capitals': '0', 'stemm': 'reject', 'lemma': 'reject'}, {'token': 'German', 'pos': 'JJ', 'tag': 'B-NP', 'previous': 'rejects', 'latter': 'call', 'capitals': '0', 'stemm': 'german', 'lemma': 'German'}, {'token': 'call', 'pos': 'NN', 'tag': 'I-NP', 'previous': 'German', 'latter': 'to', 'capitals': '0', 'stemm': 'call', 'lemma': 'call'}, {'token': 'to', 'pos': 'TO', 'tag': 'B-VP', 'previous': 'call', 'latter': 'boycott', 'capitals': '0', 'stemm': 'to', 'lemma': 'to'}, {'token': 'boycott', 'pos': 'VB', 'tag': 'I-VP', 'previous': 'to', 'latter': 'British', 'capitals': '0', 'stemm': 'boycott', 'lemma': 'boycott'}, {'token': 'British', 'pos': 'JJ', 'tag': 'B-NP', 'previous': 'boycott', 'latter': 'lamb', 'capitals': '0', 'stemm': 'british', 'lemma': 'British

In [11]:
def create_classifier(train_features, train_targets, modelname):

    '''Create a classifier and train it with vectorized features and corresponding gold labels

    input train_features: features (list of dictionaries) to be transformed into vectors
    input train_targets: gold labels corresponding to features
    input modelname: name of the model that will be trained; one of
        'logreg' (LogisticRegression), 'NB' (MultinomialNB) or 'SVM' (LinearSVC)

    output model: trained classifier
    output vec: DictVectorizer fitted on train_features

    raises ValueError: if modelname is not one of the supported names'''

    if modelname == 'logreg':
        # max_iter is raised explicitly; see this discussion of the lbfgs ConvergenceWarning:
        # https://stackoverflow.com/questions/61814494/what-is-this-warning-convergencewarning-lbfgs-failed-to-converge-status-1
        model = LogisticRegression(max_iter=10000)
    elif modelname == 'NB':
        model = MultinomialNB()
    elif modelname == 'SVM':
        model = LinearSVC(max_iter=10000)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `model` further down.
        raise ValueError(
            "Unknown modelname {!r}; expected 'logreg', 'NB' or 'SVM'".format(modelname))

    vec = DictVectorizer()

    features_vectorized = vec.fit_transform(train_features)
    model.fit(features_vectorized, train_targets)

    return model, vec

In [12]:
def create_classifier_embeddings(train_features, train_labels):
    '''
    Train a linear SVM directly on already-vectorized features

    No vectorizer is involved: the features are passed to the classifier
    unchanged.

    input train_features: vector representations of the training tokens
    input train_labels: gold labels corresponding to features

    output model: trained LinearSVC classifier
    '''
    # fit() returns the fitted estimator itself, so it can be returned directly
    return LinearSVC(max_iter=10000).fit(train_features, train_labels)

In [13]:
def classify_data(model, vec, inputdata, outputfile,selected_features):
    '''
    Predict a label for every row of inputdata and write the annotated rows to outputfile

    Each non-blank line of inputdata is copied to outputfile with a tab and
    its predicted label appended; blank lines are not written.

    input model: trained classifier that will make predictions
    input vec: fitted vectorizer used to transform the extracted feature dictionaries
    input inputdata: path to the preprocessed input file
    input outputfile: path to output file, where each line plus its prediction will be written
    input selected_features: list of feature names the model was trained with

    output predictions: the predictions returned by model.predict
    '''
    # Extracting features from input data
    features = extract_features(inputdata,selected_features)
    features = vec.transform(features)

    # Making prediction
    predictions = model.predict(features)

    # Writing the results; both files are closed by the context manager,
    # even if writing fails halfway through.
    with open(inputdata, 'r', encoding='utf8') as infile, \
            open(outputfile, 'w', encoding='utf8') as outfile:
        counter = 0
        for line in infile:
            stripped = line.rstrip('\n')
            # Only non-blank lines consume a prediction.
            if stripped.split():
                outfile.write(stripped + '\t' + predictions[counter] + '\n')
                counter += 1

    return predictions

In [14]:
def classify_data_embeddings(model, inputdata, outputfile, word_embedding_model):
    '''
    Predict labels for inputdata from word embeddings and write the annotated lines to outputfile

    Each non-blank line of inputdata is copied to outputfile with a tab and
    its predicted label appended; blank lines are not written.

    NOTE: extract_embeddings_features only yields a vector for rows with more
    than three tab-separated columns, while every non-blank line consumes a
    prediction here, so the input is expected to contain only such rows and
    blank lines.

    input model: classifier that will make predictions
    input inputdata: path to input data
    input outputfile: path to output file, where the predictions for each feature will be written
    input word_embedding_model : embedding model
    '''
    # extracting features
    features = extract_embeddings_features(inputdata,word_embedding_model)

    # making predictions with extracted features
    predictions = model.predict(features)

    # Write results to an outputfile; both files are closed by the context
    # manager, even if writing fails halfway through.
    with open(inputdata, 'r', encoding='utf8') as infile, \
            open(outputfile, 'w', encoding='utf8') as outfile:
        counter = 0
        for line in infile:
            stripped = line.rstrip('\n')
            # Only non-blank lines consume a prediction.
            if stripped.split():
                outfile.write(stripped + '\t' + predictions[counter] + '\n')
                counter += 1

In [15]:
def main(system_type,argv=None):
    '''
    Train and apply the NER classifiers on the hard-coded conll2003 files

    input system_type: "with_features" trains logistic regression, naive Bayes
        and SVM models on the extracted features; "word_embeddings" trains an
        SVM on word embeddings. Any other value does nothing.
    input argv: command-line arguments; defaults to sys.argv. Currently not
        used further, because all paths below are hard-coded.
    '''

    #a very basic way for picking up commandline arguments
    if argv is None:
        argv = sys.argv

    # LR - NB - SVM with extracted features
    trainingfile = "conll2003.train_extracted_features.conll"
    inputfile = "conll2003.dev_extracted_features.conll"
    outputfile = "output.conll2003_features"
    language_model = "./models/GoogleNews-vectors-negative300.bin.gz"

    # SVM with embeddings
    trainingfile_svm = "datas/conll2003.train.conll"
    inputfile_svm = "datas/conll2003.dev.conll"
    outputfile_svm = "svm_output.conll2003_embeddings"

    if system_type == "with_features":

        # selecting features to train the model
        selected_features = ["token","pos","tag","previous","latter","capitals","stemm","lemma"]

        # getting the selected training features and gold labels
        training_features, gold_labels = extract_features_and_selected_labels(trainingfile,selected_features)

        # Training three different models with the features,
        # Classifying the data and writing the result to new conll files
        for modelname in ['logreg', 'NB', 'SVM']:

            # One output file per model; computed once and reused below.
            model_outputfile = outputfile.replace('.conll','.' + modelname + '.conll')

            ml_model, vec = create_classifier(training_features, gold_labels, modelname)
            classify_data(ml_model, vec, inputfile, model_outputfile, selected_features)

            # Rename the last column (the predictions) to 'NER2'. Assigning
            # the columns directly avoids set_axis(..., inplace=False), whose
            # `inplace` argument no longer exists in recent pandas versions.
            dataframe = pd.read_table(model_outputfile)
            dataframe.columns = [*dataframe.columns[:-1], 'NER2']
            dataframe.to_csv(model_outputfile, sep='\t')

    elif system_type == "word_embeddings":

        # loading the pretrained word embedding model
        embedding_model = gensim.models.KeyedVectors.load_word2vec_format(language_model, binary=True)

        # extracting the features and gold label
        training_features, gold_labels = extract_embeddings_as_features_and_gold(trainingfile_svm, embedding_model)

        # creating the classification model; only the first 50000 training examples are used
        ml_model = create_classifier_embeddings(training_features[:50000], gold_labels[:50000])

        predictions_file = outputfile_svm.replace('.conll','.embedded.conll')
        classify_data_embeddings(ml_model, inputfile_svm, predictions_file, embedding_model)

        # Rename the last column (the predictions) to 'NER2'; see the note on
        # set_axis in the branch above.
        data_frame = pd.read_table(predictions_file)
        data_frame.columns = [*data_frame.columns[:-1], 'NER2']
        data_frame.to_csv(outputfile_svm.replace('.conll','.embedded_last_50000.conll'), sep='\t')

In [16]:
# Run the pipeline in word-embedding mode (SVM trained on embedding vectors).
main(system_type="word_embeddings")