# NER Machine Learning

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import gensim
from gensim.models import KeyedVectors
import pandas as pd
import sys
import csv

In [2]:
def extract_embeddings_as_features_and_gold(conllfile,word_embedding_model):
    '''
    Function that extracts features and gold labels using word embeddings
    
    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    ### This code was partially inspired by code included in the HLT course, obtained from https://github.com/cltl/ma-hlt-labs/, accessed in May 2020.
    labels = []
    features = []
    
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        #check for cases where empty lines mark sentence boundaries (which some conll files do).
        if len(row) > 3:
            if row[0] in word_embedding_model:
                vector = word_embedding_model[row[0]]
            else:
                vector = [0]*300
            features.append(vector)
            labels.append(row[-1])

    return features, labels

In [3]:
def extract_embeddings_features(inputfile,word_embedding_model):
    '''
    This function extracts features from embeddings
    
    :param inputfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    '''
    ### This code was partially inspired by code included in the HLT course, obtained from https://github.com/cltl/ma-hlt-labs/, accessed in May 2020.
    features = []
    
    conllinput = open(inputfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    
    for row in csvreader:
        #check for cases where empty lines mark sentence boundaries (which some conll files do).
        if len(row) > 3:
            if row[0] in word_embedding_model:
                vector = word_embedding_model[row[0]]
            else:
                vector = [0]*300
            features.append(vector)

    return features

In [4]:
feature_indexes = {'token': 0, 'pos': 1, 'tag': 2, 'previous': 4, 'latter': 5, 'capitals': 6,'stemm':7,'lemma':8}

def extract_features_and_selected_labels(trainingfile, selected_features):
    '''
    Extract features and gold labels from a preprocessed file with the training data and return them as lists
    
    :param trainingfile: path to training file
    :param selected_features: list of features that will be used to train the model
    
    :type trainingfile: string
    :type selected_features: list
    
    :return features: features as a list of dictionaries
    :return gold_labels: list of gold labels
    '''
    features = []
    gold_labels = []
    
    conllinput = open(trainingfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    
    for row in csvreader:
        feature_value = {}
        # Only extract the selected features
        for feature_name in selected_features:
            row_index = feature_indexes.get(feature_name)
            feature_value[feature_name] = row[row_index]
        features.append(feature_value)
        
        # Gold is in the third column
        gold_labels.append(row[3])
                
    return features, gold_labels

In [5]:
def extract_features_and_labels(trainingfile):
    
    data = []
    targets = []
    # TIP: recall that you can find information on how to integrate features here:
    # https://scikit-learn.org/stable/modules/feature_extraction.html
    with open(trainingfile, 'r', encoding='utf8') as infile:
        for line in infile:
            components = line.rstrip('\n').split()
            if len(components) > 0:
                token = components[0]
                feature_dict = {'token':token}
                data.append(feature_dict)
                #gold is in the last column
                targets.append(components[-1])

    return data, targets

In [6]:
feature_to_index = {'token': 0, 'pos': 1, 'tag': 2, 'previous': 4, 'latter': 5, 'capitals': 6,'stemm':7,'lemma':8}

def extract_features(testfile, selected_features):
    '''Extract features from a preprocessed file with the test data and return them as a list
    
    :param trainingfile: path to test file
    :param selected_features: list of features that were selected to train the model
    
    :type testfile: string
    :type selected_features: list
    
    :return features: features as a list of dictionaries'''

    features = []
    gold_labels = []
    
    conllinput = open(testfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    
    for row in csvreader:
        feature_value = {}
        for feature_name in selected_features:
            row_index = feature_to_index.get(feature_name)
            feature_value[feature_name] = row[row_index]
        features.append(feature_value)
                
    return features

In [7]:
def create_classifier(train_features, train_targets, modelname):
    
    '''Create a classifier and train it with vectorized features and corresponding gold labels
    
    input train_features: features to be transformed into vectors
    input train_labels: gold labels corresponding to features
    input modelname: name of the model that will be trained
    
    output model: trained classifier
    output vec: DictVectorizer'''
   
    if modelname ==  'logreg':
        # TIP: you may need to solve this: https://stackoverflow.com/questions/61814494/what-is-this-warning-convergencewarning-lbfgs-failed-to-converge-status-1
        model = LogisticRegression(max_iter=10000)
    if modelname == 'NB':
        model = MultinomialNB()
    if modelname == 'SVM':
        model = LinearSVC(max_iter=10000)
        
    vec = DictVectorizer()
    
    features_vectorized = vec.fit_transform(train_features)
    model.fit(features_vectorized, train_targets)
    
    return model, vec

In [8]:
def create_classifier_embeddings(train_features, train_labels):
    '''
    Create an SVM classifier and train it with vectorized features and corresponding gold labels
    
    input train_features: features to be transformed into vectors
    input train_labels: gold labels corresponding to features
    
    output model: trained classifier
    '''

    model = LinearSVC(max_iter=10000)
    model.fit(train_features, train_labels)
    
    return model

In [9]:
def classify_data(model, vec, inputdata, outputfile,selected_features):
  
    # Extracting features from input data
    features = extract_features(inputdata,selected_features)
    features = vec.transform(features)
    
    # Making prediction
    predictions = model.predict(features)
    
    # Writing the results
    outfile = open(outputfile, 'w')
    counter = 0
    for line in open(inputdata, 'r'):
        if len(line.rstrip('\n').split()) > 0:
            outfile.write(line.rstrip('\n') + '\t' + predictions[counter] + '\n')
            counter += 1
    outfile.close()


In [10]:
def classify_data_embeddings(model, inputdata, outputfile, word_embedding_model):
    '''
    This function creates a classifier for making predictions embedded data
    
    input model: classifier that will make predictions
    input inputdata: path to input data
    input outputfile: path to output file, where the predictions for each feature will be written
    input word_embedding_model : embedding model
    '''
    # extracting features
    features = extract_embeddings_features(inputdata,word_embedding_model)
    
    # making predictions with extracted features
    predictions = model.predict(features)
    
    # Write results to an outputfile
    outfile = open(outputfile, 'w')
    counter = 0
    for line in open(inputdata, 'r'):
        if len(line.rstrip('\n').split()) > 0:
            outfile.write(line.rstrip('\n') + '\t' + predictions[counter] + '\n')
            counter += 1
    outfile.close()

In [73]:
def main(system_type, argv=None):
    
    #a very basic way for picking up commandline arguments
    if argv is None:
        argv = sys.argv    
    
    trainingfile = "/Users/orbaytopal/Desktop/VUAI/Master/Machine learning NLP/ma-ml4nlp-labs-main/data/conll2003.train.conll"
    inputfile = "/Users/orbaytopal/Desktop/VUAI/Master/Machine learning NLP/ma-ml4nlp-labs-main/data/conll2003.dev.conll"
    outputfile = "output.conll2003_features"
    language_model = "/Users/orbaytopal/Desktop/VUAI/Master/Machine learning NLP/ma-ml4nlp-labs-main/data/GoogleNews-vectors-negative300.bin"
    
    
    if system_type == "with_features":
    # selecting features to train the model
        selected_features = ["token","pos","tag","previous","latter","capitals","stemm","lemma"]

        # getting the selected training features and gold labels
        training_features, gold_labels = extract_features_and_selected_labels(trainingfile,selected_features)

        # Training three different models with the features, 
        # Classifying the data and writing the result to new conll files
        for modelname in ['logreg', 'NB', 'SVM']:

            ml_model, vec = create_classifier(training_features, gold_labels, modelname)
            classify_data(ml_model, vec, inputfile, outputfile.replace('.conll','.' + modelname + '.conll'),selected_features)

            dataframe = pd.read_table(outputfile.replace('.conll','.' + modelname + '.conll'))
            dataframe = dataframe.set_axis([*dataframe.columns[:-1], 'NER2'], axis=1, inplace=False)
            dataframe.to_csv(outputfile.replace('.conll','.' + modelname + '.conll'), sep='\t')
        
    elif system_type == "word_embeddings":

    # creating a language model
        language_model = gensim.models.KeyedVectors.load_word2vec_format(language_model, binary=True)

        # extracting the features and gold label
        training_features, gold_labels = extract_embeddings_as_features_and_gold(trainingfile, language_model)

        # creating the classification model
        ml_model = create_classifier_embeddings(training_features[:1], gold_labels[:1])
        classify_data_embeddings(ml_model, inputfile, outputfile.replace('.conll','.embedded.conll'), language_model)

        data_frame = pd.read_table(outputfile.replace('.conll','.embedded.conll'))
        data_frame = data_frame.set_axis([*data_frame.columns[:-1], 'NER2'], axis=1, inplace=False)
        data_frame.to_csv(outputfile.replace('.conll','.embedded2.conll'), sep='\t')
    

In [60]:
#main(system_type="with_features")

In [11]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [12]:
trainingfile = "datas/conll2003.train.conll"
inputfile = "datas/conll2003.dev.conll"
outputfile = "output.conll2003_features"
language_model = "models/GoogleNews-vectors-negative300.bin.gz"

language_model = gensim.models.KeyedVectors.load_word2vec_format(language_model, binary=True)

In [13]:
# X_train, Y_train
training_features, gold_labels = extract_embeddings_as_features_and_gold(trainingfile, language_model)
# X_test, y_test
test_features, tests_gold_labels = extract_embeddings_as_features_and_gold(inputfile, language_model)

In [14]:
model = LinearSVC(max_iter=3000)

In [15]:
model.fit(training_features[:100000], gold_labels[:100000])

In [16]:
prediction = model.predict(test_features)
a =classification_report(tests_gold_labels,prediction)
print(confusion_matrix(tests_gold_labels, prediction))
#param_grid = {'C':[1,10,100,1000]}
param_grid = { 'C':[1,100,1000],'max_iter':[1000,3000]}
grid = GridSearchCV(LinearSVC(),param_grid,refit = True, verbose=2)

[[ 1430    22   177    32    10     3    19    36   108]
 [   41   644    36    27     1    15    11    11   136]
 [   99    51   886    32     6     7    59    52   149]
 [   26     7    17  1303     3     3     2   365   116]
 [   22     0    17     9   115     3    43    10    38]
 [    3    33    14     3    10   150     8    10   115]
 [   32     9   127    26    34    13   278    24   208]
 [   27     4    27   249     1     1    12   747   239]
 [   13    28    67     8     3    30    42    16 42552]]


In [17]:
print(a)

              precision    recall  f1-score   support

       B-LOC       0.84      0.78      0.81      1837
      B-MISC       0.81      0.70      0.75       922
       B-ORG       0.65      0.66      0.65      1341
       B-PER       0.77      0.71      0.74      1842
       I-LOC       0.63      0.45      0.52       257
      I-MISC       0.67      0.43      0.53       346
       I-ORG       0.59      0.37      0.45       751
       I-PER       0.59      0.57      0.58      1307
           O       0.97      1.00      0.98     42759

    accuracy                           0.94     51362
   macro avg       0.72      0.63      0.67     51362
weighted avg       0.93      0.94      0.93     51362



In [18]:
LinearSVC().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'loss', 'max_iter', 'multi_class', 'penalty', 'random_state', 'tol', 'verbose'])

In [19]:
grid.fit(training_features[:100000], gold_labels[:100000])
print("Best Parameters \n: ",grid.best_params_)
gold_labels
predic = grid.predict(test_features)
print(classification_report(tests_gold_labels,predic))
print(confusion_matrix(tests_gold_labels, predic))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END .................................C=1, max_iter=1000; total time= 1.2min
[CV] END .................................C=1, max_iter=1000; total time= 1.0min
[CV] END .................................C=1, max_iter=1000; total time= 1.1min
[CV] END .................................C=1, max_iter=1000; total time= 1.1min
[CV] END .................................C=1, max_iter=1000; total time= 1.1min
[CV] END .................................C=1, max_iter=3000; total time= 1.1min
[CV] END .................................C=1, max_iter=3000; total time=  59.0s
[CV] END .................................C=1, max_iter=3000; total time= 1.1min
[CV] END .................................C=1, max_iter=3000; total time= 1.2min
[CV] END .................................C=1, max_iter=3000; total time= 1.1min
[CV] END ...............................C=100, max_iter=1000; total time= 4.7min
[CV] END ...............................C=100, ma

# NB - LG - SVM with Extracted Features

In [51]:
trainingfile = "conll2003.train_extracted_features.conll"
inputfile = "conll2003.dev_extracted_features.conll"
#outputfile = "output.conll2003_features"
#language_model = "/Users/orbaytopal/Desktop/VUAI/Master/Machine learning NLP/ma-ml4nlp-labs-main/data/GoogleNews-vectors-negative300.bin"
    
selected_features = ["token","pos","tag","previous","latter","capitals","stemm","lemma"]
training_features, gold_labels = extract_features_and_selected_labels(trainingfile, selected_features)
test_features, test_gold_labels = extract_features_and_selected_labels(inputfile,selected_features)

In [53]:
#NB
vec = DictVectorizer()
model_NB = MultinomialNB()
features_vectorized = vec.fit_transform(training_features)
model_NB.fit(features_vectorized, gold_labels)

In [54]:
features = vec.transform(test_features)
prediction_NB = model_NB.predict(features)
print(classification_report(tests_gold_labels,prediction_NB))
print(confusion_matrix(tests_gold_labels, prediction_NB))

              precision    recall  f1-score   support

       B-LOC       0.80      0.87      0.83      1837
      B-MISC       0.86      0.75      0.80       922
       B-ORG       0.79      0.73      0.76      1341
       B-PER       0.93      0.75      0.83      1842
       I-LOC       0.92      0.50      0.65       257
      I-MISC       0.92      0.45      0.61       346
       I-ORG       0.81      0.60      0.69       751
       I-PER       0.93      0.71      0.81      1307
           O       0.97      0.99      0.98     42759

    accuracy                           0.95     51362
   macro avg       0.88      0.71      0.77     51362
weighted avg       0.95      0.95      0.95     51362

[[ 1595    11    72    12     2     0     1     3   141]
 [   48   694    22     9     0     2     2     2   143]
 [   95    18   975    21     0     0    23     3   206]
 [   74     1    16  1373     0     0     2    22   354]
 [   37     2     5     0   128     0    46    14    25]
 [    7   

In [58]:
#LR
model_LG = LogisticRegression(max_iter=10000)
vec = DictVectorizer()
features_vectorized = vec.fit_transform(training_features)
model_LG.fit(features_vectorized, gold_labels)
features = vec.transform(test_features)
prediction_LG = model_LG.predict(features)

print(classification_report(tests_gold_labels,prediction_LG))
print(confusion_matrix(tests_gold_labels, prediction_LG))

              precision    recall  f1-score   support

       B-LOC       0.91      0.86      0.88      1837
      B-MISC       0.93      0.77      0.84       922
       B-ORG       0.85      0.76      0.81      1341
       B-PER       0.88      0.89      0.88      1842
       I-LOC       0.91      0.78      0.84       257
      I-MISC       0.90      0.65      0.76       346
       I-ORG       0.86      0.70      0.77       751
       I-PER       0.81      0.94      0.87      1307
           O       0.99      0.99      0.99     42759

    accuracy                           0.97     51362
   macro avg       0.89      0.82      0.85     51362
weighted avg       0.97      0.97      0.97     51362

[[ 1574     8    75    49     3     0     9    11   108]
 [   20   712    34    28     0     4     4    14   106]
 [   59    14  1024    81     0     2    26    38    97]
 [   39     2    14  1640     0     1     4    39   103]
 [    6     0     2     0   200     3     8    19    19]
 [    7   

In [59]:
#SVM
model_SVM = LinearSVC(max_iter=10000)
vec = DictVectorizer()
features_vectorized = vec.fit_transform(training_features)
model_SVM.fit(features_vectorized, gold_labels)
features = vec.transform(test_features)
prediction_SVM = model_SVM.predict(features)

print(classification_report(tests_gold_labels,prediction_SVM))
print(confusion_matrix(tests_gold_labels, prediction_SVM))

              precision    recall  f1-score   support

       B-LOC       0.91      0.89      0.90      1837
      B-MISC       0.92      0.81      0.87       922
       B-ORG       0.87      0.79      0.83      1341
       B-PER       0.89      0.90      0.89      1842
       I-LOC       0.90      0.81      0.85       257
      I-MISC       0.83      0.68      0.75       346
       I-ORG       0.85      0.74      0.79       751
       I-PER       0.86      0.95      0.91      1307
           O       0.99      1.00      0.99     42759
         ner       0.00      0.00      0.00         0

    accuracy                           0.97     51362
   macro avg       0.80      0.76      0.78     51362
weighted avg       0.97      0.97      0.97     51362

[[ 1639     6    73    44     7     1     7     6    54     0]
 [   18   750    26    19     0     4     3     8    94     0]
 [   52    16  1062    80     0     3    32    20    76     0]
 [   32     4    13  1661     0     0     4    43   