# Features Ablation Analysis

In [1]:
# libraries

import sklearn
import csv
import gensim
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Step 1 : A Basic Classifier

In [2]:
trainfile = './conll2003.train_extracted_features.conll'
testfile = './conll2003.dev_extracted_features.conll'

def extract_features_token_only_and_labels(conllfile):
    '''Function that extracts features and gold label from preprocessed conll (here: tokens only).
    
    :param conllfile: path to the (preprocessed) conll file
    :type conllfile: string
    
    
    :return features: a list of dictionaries, with key-value pair providing the value for the feature `token' for individual instances
    :return labels: a list of gold labels of individual instances
    '''
    
    features = []
    labels = []
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        
        # Preprocessing the file so that all rows with instances should contain 6 values,
        # the others are empty lines indicating the beginning of a sentence
        if len(row) > 6:
            #structuring feature value pairs as key-value pairs in a dictionary
            #the first column in the conll file represents tokens
            feature_value = {'Token': row[0]}
            features.append(feature_value)
            #The 3rd column provides the gold label (= the correct answer). 
            labels.append(row[3])
    
    return features, labels

In [3]:
#extract features and labels:
feature_values, labels = extract_features_token_only_and_labels(trainfile)

In [4]:
feature_values[:20]

[{'Token': 'token'},
 {'Token': 'rejects'},
 {'Token': 'German'},
 {'Token': 'call'},
 {'Token': 'to'},
 {'Token': 'boycott'},
 {'Token': 'British'},
 {'Token': 'lamb'},
 {'Token': '.'},
 {'Token': 'Peter'},
 {'Token': 'Blackburn'},
 {'Token': 'BRUSSELS'},
 {'Token': '1996-08-22'},
 {'Token': 'The'},
 {'Token': 'European'},
 {'Token': 'Commission'},
 {'Token': 'said'},
 {'Token': 'on'},
 {'Token': 'Thursday'},
 {'Token': 'it'}]

In [5]:
labels[:20]

['ner',
 'O',
 'B-MISC',
 'O',
 'O',
 'O',
 'B-MISC',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'B-LOC',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O']

In [6]:
def create_vectorizer_and_classifier(features, labels):
    '''
    Function that takes feature-value pairs and gold labels as input and trains a logistic regression classifier
    
    :param features: feature-value pairs
    :param labels: gold labels
    :type features: a list of dictionaries
    :type labels: a list of strings
    
    :return lr_classifier: a trained LogisticRegression classifier
    :return vec: a DictVectorizer to which the feature values are fitted. 
    '''
    
    vec = DictVectorizer()
    # fit creates a mapping between observed feature values and dimensions in a one-hot vector, 
    # transform represents the current values as a vector 
    tokens_vectorized = vec.fit_transform(features)
    lr_classifier = LogisticRegression(solver='saga')
    lr_classifier.fit(tokens_vectorized, labels)
    
    return lr_classifier, vec

In [7]:
lr_classifier, vectorizer = create_vectorizer_and_classifier(feature_values, labels)

In [8]:
lr_classifier

In [9]:
vectorizer

# Step 2: Evaluation

In [10]:
def get_predicted_and_gold_labels_token_only(testfile, vectorizer, classifier):
    '''
    Function that extracts features and runs classifier on a test file returning predicted and gold labels
    
    :param testfile: path to the (preprocessed) test file
    :param vectorizer: vectorizer in which the mapping between feature values and dimensions is stored
    :param classifier: the trained classifier
    :type testfile: string
    :type vectorizer: DictVectorizer
    :type classifier: LogisticRegression()
    
    
    
    :return predictions: list of output labels provided by the classifier on the test file
    :return goldlabels: list of gold labels as included in the test file
    '''
    
    #we use the same function as above (guarantees features have the same name and form)
    sparse_feature_reps, goldlabels = extract_features_token_only_and_labels(testfile)
    #we need to use the same fitting as before, so now we only transform the current features according to this mapping (using only transform)
    test_features_vectorized = vectorizer.transform(sparse_feature_reps)
    predictions = classifier.predict(test_features_vectorized)
    
    return predictions, goldlabels

In [11]:
predictions, goldlabels = get_predicted_and_gold_labels_token_only(testfile, vectorizer, lr_classifier)

In [12]:
predictions[:20]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC',
       'O', 'B-LOC', 'B-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O'],
      dtype='<U6')

In [13]:
goldlabels[:20]

['gold',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'B-MISC',
 'I-MISC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O']

In [14]:
def print_confusion_matrix(predictions, goldlabels):
    '''
    Function that prints out a confusion matrix
    
    :param predictions: predicted labels
    :param goldlabels: gold standard labels
    :type predictions, goldlabels: list of strings
    '''
    
    
    
    #based on example from https://datatofish.com/confusion-matrix-python/ 
    data = {'Gold':    goldlabels, 'Predicted': predictions    }
    df = pd.DataFrame(data, columns=['Gold','Predicted'])

    confusion_matrix = pd.crosstab(df['Gold'], df['Predicted'], rownames=['Gold'], colnames=['Predicted'])
    print (confusion_matrix)

In [15]:
print_confusion_matrix(predictions, goldlabels)

Predicted  B-LOC  B-MISC  B-ORG  B-PER  I-LOC  I-MISC  I-ORG  I-PER      O
Gold                                                                      
B-LOC       1305      15    101      4      7       0      6      4    395
B-MISC        41     603     14      8      0      12      2      1    241
B-ORG         78      23    690      5     11       3     38     14    479
B-PER         16       3      2    873      0       0      1    104    843
I-LOC         13       2      1      0    150       3     13      6     69
I-MISC         2      27      7      2      7     145      2      4    150
I-ORG         36      11     47      4     38       5    263      5    342
I-PER          6       2      5    102      0       0      1    292    899
O              3      10      4      0      1      10     11      1  42718
gold           0       0      0      0      0       0      0      0      1


In [16]:
def print_precision_recall_fscore(predictions, goldlabels):
    '''
    Function that prints out precision, recall and f-score
    
    :param predictions: predicted output by classifier
    :param goldlabels: original gold labels
    :type predictions, goldlabels: list of strings
    '''
    
    precision = metrics.precision_score(y_true=goldlabels,
                        y_pred=predictions,
                        average='macro')

    recall = metrics.recall_score(y_true=goldlabels,
                     y_pred=predictions,
                     average='macro')


    fscore = metrics.f1_score(y_true=goldlabels,
                 y_pred=predictions,
                 average='macro')

    print('P:', precision, 'R:', recall, 'F1:', fscore)

In [17]:
print_precision_recall_fscore(predictions, goldlabels)

  _warn_prf(average, modifier, msg_start, len(result))


P: 0.7302667260105565 R: 0.49283025204234604 F1: 0.5737103836701725


# Step 3: A More Elaborate System

In [18]:
#defines the column in which each feature is located (note: you can also define headers and use csv.DictReader)
feature_to_index = {'token': 0, 'pos': 1,'tag':2,'ner':3, 'previous': 4, 'latter': 5, 'capitals': 6,'stemm':7,'lemma':8}

In [19]:
def extract_features_and_gold_labels(conllfile, selected_features):
    '''Function that extracts features and gold label from preprocessed conll (here: tokens only).
    
    :param conllfile: path to the (preprocessed) conll file
    :type conllfile: string
    
    
    :return features: a list of dictionaries, with key-value pair providing the value for the feature `token' for individual instances
    :return labels: a list of gold labels of individual instances
    '''
    
    features = []
    labels = []
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    
    for row in csvreader:
        
        if len(row) > 6:
            feature_value = {}
            
            for feature_name in selected_features:
                row_index = feature_to_index.get(feature_name)
                feature_value[feature_name] = row[row_index]
            
            features.append(feature_value)
            
            #The last column provides the gold label (= the correct answer). 
            labels.append(row[3])
    
    return features, labels

In [20]:
#define which from the available features will be used (names must match key names of dictionary feature_to_index)
all_features = ['token','previous', 'latter', 'capitals','stemm','lemma']

sparse_feature_reps, labels = extract_features_and_gold_labels(trainfile, all_features)

In [21]:
sparse_feature_reps[:6]

[{'token': 'token',
  'previous': 'previous',
  'latter': 'latter',
  'capitals': 'capitals',
  'stemm': 'stemm',
  'lemma': 'lemma'},
 {'token': 'rejects',
  'previous': ' ',
  'latter': 'German',
  'capitals': '0',
  'stemm': 'reject',
  'lemma': 'reject'},
 {'token': 'German',
  'previous': 'rejects',
  'latter': 'call',
  'capitals': '0',
  'stemm': 'german',
  'lemma': 'German'},
 {'token': 'call',
  'previous': 'German',
  'latter': 'to',
  'capitals': '0',
  'stemm': 'call',
  'lemma': 'call'},
 {'token': 'to',
  'previous': 'call',
  'latter': 'boycott',
  'capitals': '0',
  'stemm': 'to',
  'lemma': 'to'},
 {'token': 'boycott',
  'previous': 'to',
  'latter': 'British',
  'capitals': '0',
  'stemm': 'boycott',
  'lemma': 'boycott'}]

In [22]:
labels[:6]

['ner', 'O', 'B-MISC', 'O', 'O', 'O']

In [23]:
def get_predicted_and_gold_labels(testfile, vectorizer, classifier, selected_features):
    '''
    Function that extracts features and runs classifier on a test file returning predicted and gold labels
    
    :param testfile: path to the (preprocessed) test file
    :param vectorizer: vectorizer in which the mapping between feature values and dimensions is stored
    :param classifier: the trained classifier
    :type testfile: string
    :type vectorizer: DictVectorizer
    :type classifier: LogisticRegression()
    
    
    
    :return predictions: list of output labels provided by the classifier on the test file
    :return goldlabels: list of gold labels as included in the test file
    '''
    
    #we use the same function as above (guarantees features have the same name and form)
    features, goldlabels = extract_features_and_gold_labels(testfile, selected_features)
    #we need to use the same fitting as before, so now we only transform the current features according to this mapping (using only transform)
    test_features_vectorized = vectorizer.transform(features)
    predictions = classifier.predict(test_features_vectorized)
    
    return predictions, goldlabels

In [28]:
lr_classifier, vectorizer = create_vectorizer_and_classifier(sparse_feature_reps, labels)



In [29]:
vectorizer

In [30]:
lr_classifier

In [31]:
predictions, goldlabels = get_predicted_and_gold_labels(testfile, vectorizer, lr_classifier, all_features)

In [34]:
predictions[:10]

array(['ner', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
      dtype='<U6')

In [35]:
goldlabels[:10]

['gold', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [36]:
print_confusion_matrix(predictions, goldlabels)

Predicted  B-LOC  B-MISC  B-ORG  B-PER  I-LOC  I-MISC  I-ORG  I-PER      O  \
Gold                                                                         
B-LOC       1536      11     65     13      3       0      3      2    204   
B-MISC        17     706     11      5      0       7      1      1    174   
B-ORG         36      16    955     20      0       0     13     15    286   
B-PER         15       1     11   1270      0       0      4     23    518   
I-LOC          6       0      1      0    193       2      3      6     46   
I-MISC         1      25      2      0      0     211      0      3    104   
I-ORG         28       5     13      4      9       7    485      7    193   
I-PER          3       0      3     18      0       0      0    907    376   
O              6       4     12      8      0       4      9     24  42691   
gold           0       0      0      0      0       0      0      0      0   

Predicted  ner  
Gold            
B-LOC        0  
B-MISC      

In [37]:
print_precision_recall_fscore(predictions, goldlabels)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


P: 0.7597343180173812 R: 0.609317205364778 F1: 0.6732806711261443


    with tokens only
P: 0.7302667260105565 R: 0.49283025204234604 F1: 0.5737103836701725

    with all features
P: 0.7597343180173812 R: 0.609317205364778 F1: 0.6732806711261443

# Step 4: Feature Ablation Analysis

In [38]:
# example of system with just one additional feature
#define which from the available features will be used (names must match key names of dictionary feature_to_index)
selected_features = ['token','latter', 'capitals']

feature_values, labels = extract_features_and_gold_labels(trainfile, selected_features)
#we can use the same function as before for creating the classifier and vectorizer
lr_classifier, vectorizer = create_vectorizer_and_classifier(feature_values, labels)
#when applying our model to new data, we need to use the same features
predictions, goldlabels = get_predicted_and_gold_labels(testfile, vectorizer, lr_classifier, selected_features)
print_confusion_matrix(predictions, goldlabels)
print_precision_recall_fscore(predictions, goldlabels)



Predicted  B-LOC  B-MISC  B-ORG  B-PER  I-LOC  I-MISC  I-ORG  I-PER      O
Gold                                                                      
B-LOC       1330       5     60      6      4       0      3     13    416
B-MISC        19     655      4      4      0      14      2      1    223
B-ORG         54      18    781      9      1       2     54     32    390
B-PER         12       1     10    960      0       0      1     83    775
I-LOC         17       0      6      0    150       5     13      4     62
I-MISC         2      41      1      2      1     166      0      5    128
I-ORG         24       7     57      2     10       5    355     16    275
I-PER          4       0      4     83      0       0      0    596    620
O              2       2     17      2      1       8      4     50  42672
gold           0       0      0      0      0       0      0      0      1


  _warn_prf(average, modifier, msg_start, len(result))


P: 0.7768221298070672 R: 0.5528116749098522 F1: 0.6391668575108358


In [39]:
# example of system with just one additional feature
#define which from the available features will be used (names must match key names of dictionary feature_to_index)
selected_features = ['token','pos', 'capitals','stemm']

feature_values, labels = extract_features_and_gold_labels(trainfile, selected_features)
#we can use the same function as before for creating the classifier and vectorizer
lr_classifier, vectorizer = create_vectorizer_and_classifier(feature_values, labels)
#when applying our model to new data, we need to use the same features
predictions, goldlabels = get_predicted_and_gold_labels(testfile, vectorizer, lr_classifier, selected_features)
print_confusion_matrix(predictions, goldlabels)
print_precision_recall_fscore(predictions, goldlabels)



Predicted  B-LOC  B-MISC  B-ORG  B-PER  I-LOC  I-MISC  I-ORG  I-PER      O
Gold                                                                      
B-LOC       1488      12    125     11      7       0     14    116     64
B-MISC        34     662     33     12      0      11     14     45    111
B-ORG         85      17    888     15     13       3     68    137    115
B-PER         17       3     26   1166      0       0      7    518    105
I-LOC         20       2      4      0    169       4     17     25     16
I-MISC         2      31     18      1      9     169      7     31     78
I-ORG         45       5     64     12     41       5    360     83    136
I-PER         13       3     16    120      0       0      7   1108     40
O              6      11     68     11      1       9     26    158  42468
gold           0       0      0      0      0       0      0      0      1


  _warn_prf(average, modifier, msg_start, len(result))


P: 0.7058240414856609 R: 0.6289568936938837 F1: 0.6532858079231936


# Part 2: One-hot versus Embeddings


In [40]:
# create classifier with caps feature only and print vectorizer, then with token only (but you see less)

selected_features = ['capitals']

feature_values, labels = extract_features_and_gold_labels(trainfile, selected_features)

#creating a vectorizing
vectorizer = DictVectorizer()
#fitting the values to dimensions (creating a mapping) and transforming the current observations according to this mapping
capitalization_vectorized = vectorizer.fit_transform(feature_values)
print(capitalization_vectorized.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Using word embeddings

In [41]:
# this step takes a while
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin.gz',
                                                                       binary=True)  

In [44]:
def extract_embeddings_as_features_and_gold(conllfile,word_embedding_model):
    '''
    Function that extracts features and gold labels using word embeddings
    
    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    labels = []
    features = []
    
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        if len(row) > 6:
            if row[0] in word_embedding_model:
                vector = word_embedding_model[row[0]]
            else:
                vector = [0]*300
            features.append(vector)
            labels.append(row[3])
    return features, labels

In [45]:
dense_feature_representations, labels = extract_embeddings_as_features_and_gold(trainfile,word_embedding_model)

In [57]:
print('Extracting dense features...')
dense_feature_representations[:1]

Extracting dense features...


[array([ 0.04174805,  0.20410156, -0.26757812,  0.29882812, -0.11181641,
        -0.01470947,  0.16992188, -0.09423828,  0.04785156,  0.05810547,
        -0.07128906, -0.13867188,  0.04589844,  0.00604248, -0.15917969,
         0.10888672,  0.14648438,  0.0145874 ,  0.08398438, -0.23535156,
        -0.13378906, -0.02783203,  0.06982422, -0.22558594,  0.05493164,
        -0.19042969, -0.3125    ,  0.04541016,  0.09277344,  0.01342773,
        -0.01275635, -0.30664062, -0.07275391,  0.1640625 , -0.00075531,
        -0.25976562,  0.28710938,  0.10546875, -0.17382812,  0.09277344,
         0.06542969,  0.00534058,  0.2734375 , -0.05688477, -0.01367188,
        -0.203125  , -0.00601196,  0.11816406, -0.04980469, -0.22851562,
        -0.03808594, -0.04785156, -0.03417969, -0.03979492, -0.33203125,
        -0.02612305, -0.3125    ,  0.02172852, -0.09033203, -0.20800781,
         0.08740234,  0.21289062,  0.11865234, -0.21386719,  0.01428223,
        -0.11767578, -0.265625  , -0.02539062,  0.1

In [58]:
def create_classifier(features, labels):
    '''
    Function that creates classifier from features represented as vectors and gold labels
    
    :param features: list of vector representations of tokens
    :param labels: list of gold labels
    :type features: list of vectors
    :type labels: list of strings
    
    :returns trained logistic regression classifier
    '''
    
    
    lr_classifier = LogisticRegression(solver='saga')
    lr_classifier.fit(features, labels)
    
    return lr_classifier

In [59]:
print('Training classifier....')
classifier = create_classifier(dense_feature_representations, labels)

Training classifier....




In [60]:
def label_data_using_word_embeddings(testfile, word_embedding_model, classifier):
    '''
    Function that extracts word embeddings as features and gold labels from test data and runs a classifier
    
    :param testfile: path to test file
    :param word_embedding_model: distributional semantic model
    :param classifier: trained classifier
    :type testfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    :type classifier: LogisticRegression
    
    :return predictions: list of predicted labels
    :return labels: list of gold labels
    '''
    
    dense_feature_representations, labels = extract_embeddings_as_features_and_gold(testfile,word_embedding_model)
    predictions = classifier.predict(dense_feature_representations)
    
    return predictions, labels


In [61]:
predicted, gold = label_data_using_word_embeddings(testfile, word_embedding_model, classifier)
print_confusion_matrix(predictions, goldlabels)
print_precision_recall_fscore(predicted, gold)

Predicted  B-LOC  B-MISC  B-ORG  B-PER  I-LOC  I-MISC  I-ORG  I-PER      O
Gold                                                                      
B-LOC       1488      12    125     11      7       0     14    116     64
B-MISC        34     662     33     12      0      11     14     45    111
B-ORG         85      17    888     15     13       3     68    137    115
B-PER         17       3     26   1166      0       0      7    518    105
I-LOC         20       2      4      0    169       4     17     25     16
I-MISC         2      31     18      1      9     169      7     31     78
I-ORG         45       5     64     12     41       5    360     83    136
I-PER         13       3     16    120      0       0      7   1108     40
O              6      11     68     11      1       9     26    158  42468
gold           0       0      0      0      0       0      0      0      1


  _warn_prf(average, modifier, msg_start, len(result))


P: 0.6599033972113301 R: 0.5870458883302929 F1: 0.617882004486319


In [62]:
def extract_word_embedding(token, word_embedding_model):
    '''
    Function that returns the word embedding for a given token out of a distributional semantic model and a 300-dimension vector of 0s otherwise
    
    :param token: the token
    :param word_embedding_model: the distributional semantic model
    :type token: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :returns a vector representation of the token
    '''
    if token in word_embedding_model:
        vector = word_embedding_model[token]
    else:
        vector = [0]*300
    return vector

def extract_feature_values(row, selected_features):
    '''
    Function that extracts feature value pairs from row
    
    :param row: row from conll file
    :param selected_features: list of selected features
    :type row: string
    :type selected_features: list of strings
    
    :returns: dictionary of feature value pairs
    '''
    feature_values = {}
    for feature_name in selected_features:
        r_index = feature_to_index.get(feature_name)
        feature_values[feature_name] = row[r_index]
        
    return feature_values


def create_vectorizer_traditional_features(feature_values):
    '''
    Function that creates vectorizer for set of feature values
    
    :param feature_values: list of dictionaries containing feature-value pairs
    :type feature_values: list of dictionairies (key and values are strings)
    
    :returns: vectorizer with feature values fitted
    '''
    vectorizer = DictVectorizer()
    vectorizer.fit(feature_values)
    
    return vectorizer
        
    
    
def combine_sparse_and_dense_features(dense_vectors, sparse_features):
    '''
    Function that takes sparse and dense feature representations and appends their vector representation
    
    :param dense_vectors: list of dense vector representations
    :param sparse_features: list of sparse vector representations
    :type dense_vector: list of arrays
    :type sparse_features: list of lists
    
    :returns: list of arrays in which sparse and dense vectors are concatenated
    '''
    
    combined_vectors = []
    sparse_vectors = np.array(sparse_features.toarray())
    
    for index, vector in enumerate(sparse_vectors):
        combined_vector = np.concatenate((vector,dense_vectors[index]))
        combined_vectors.append(combined_vector)
    return combined_vectors


In [66]:
def extract_traditional_features_and_embeddings_plus_gold_labels(conllfile, word_embedding_model, vectorizer=None):
    '''
    Function that extracts traditional features as well as embeddings and gold labels using word embeddings for current and preceding token
    
    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    labels = []
    dense_vectors = []
    traditional_features = []
    
    conllinput = open(conllfile, 'r')
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    for row in csvreader:
        if len(row) > 6:
            token_vector = extract_word_embedding(row[0], word_embedding_model)
            pt_vector = extract_word_embedding(row[1], word_embedding_model)
            dense_vectors.append(np.concatenate((token_vector,pt_vector)))
            #mixing very sparse representations (for one-hot tokens) and dense representations is a bad idea
            #we thus only use other features with limited values
            other_features = extract_feature_values(row, ['capitals','pos','tag'])
            traditional_features.append(other_features)
            #adding gold label to labels
            labels.append(row[3])
            
    #create vector representation of traditional features
    if vectorizer is None:
        #creates vectorizer that provides mapping (only if not created earlier)
        vectorizer = create_vectorizer_traditional_features(traditional_features)
    sparse_features = vectorizer.transform(traditional_features)
    combined_vectors = combine_sparse_and_dense_features(dense_vectors, sparse_features)
    
    return combined_vectors, vectorizer, labels

In [67]:
def label_data_with_combined_features(testfile, classifier, vectorizer, word_embedding_model):
    '''
    Function that labels data with model using both sparse and dense features
    '''
    feature_vectors, vectorizer, goldlabels = extract_traditional_features_and_embeddings_plus_gold_labels(testfile, word_embedding_model, vectorizer)
    predictions = classifier.predict(feature_vectors)
    
    return predictions, goldlabels


In [68]:
print('Extracting Features...')
feature_vectors, vectorizer, gold_labels = extract_traditional_features_and_embeddings_plus_gold_labels(trainfile, word_embedding_model)
print('Training classifier....')
lr_classifier = create_classifier(feature_vectors, gold_labels)
print('Running the evaluation...')
predictions, goldlabels = label_data_with_combined_features(testfile, lr_classifier, vectorizer, word_embedding_model)
print_confusion_matrix(predictions, goldlabels)
print_precision_recall_fscore(predictions, goldlabels)

Extracting Features...
Training classifier....




Running the evaluation...
Predicted  B-LOC  B-MISC  B-ORG  B-PER  I-LOC  I-MISC  I-ORG  I-PER      O
Gold                                                                      
B-LOC       1516      22    156     28     18       2     18     18     59
B-MISC        33     669     42     15      3      15     15     19    111
B-ORG        109      38    919     57     13       8     80     29     88
B-PER         30       8     36   1518      9       0     10    154     77
I-LOC         13       0      6      3    158       5     40     14     18
I-MISC         4      34     16      2     10     167     11     13     89
I-ORG         39      14     73     15     39      13    366     45    147
I-PER          7       3     16     84      3       5     16   1140     33
O              6      36     73     11      7      16     27    127  42455
gold           0       0      0      0      0       0      0      0      1


  _warn_prf(average, modifier, msg_start, len(result))


P: 0.691322419392516 R: 0.6510204039255932 F1: 0.6669061336101958
