# SVM with Embeddings

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
import gensim
from gensim.models import KeyedVectors
import pandas as pd
import sys
import csv

In [25]:
def extract_embeddings_as_features_and_gold(conllfile,word_embedding_model):
    '''
    Function that extracts features and gold labels using word embeddings

    Every row with more than three tab-separated columns is treated as a token:
    its first column is looked up in the embedding model and its last column is
    used as the gold label. Rows with three or fewer columns (e.g. the empty
    lines that mark sentence boundaries) are skipped.

    :param conllfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type conllfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors

    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''
    ### This code was partially inspired by code included in the HLT course, obtained from https://github.com/cltl/ma-hlt-labs/, accessed in May 2020.
    labels = []
    features = []

    # Tokens missing from the model get an all-zero vector. Its length follows
    # the model's own dimensionality when the model exposes `vector_size`;
    # otherwise it falls back to the previously hard-coded 300.
    dimensions = getattr(word_embedding_model, 'vector_size', 300)

    # The context manager closes the file even if parsing raises.
    with open(conllfile, 'r') as conllinput:
        csvreader = csv.reader(conllinput, delimiter='\t', quotechar='|')
        for row in csvreader:
            #check for cases where empty lines mark sentence boundaries (which some conll files do).
            if len(row) > 3:
                token = row[0]
                if token in word_embedding_model:
                    vector = word_embedding_model[token]
                else:
                    vector = [0] * dimensions
                features.append(vector)
                labels.append(row[-1])

    return features, labels

In [26]:
# Locations of the CoNLL-2003 splits and the base name used for prediction files.
trainingfile = "datas/conll2003.train.conll"
inputfile = "datas/conll2003.dev.conll"
outputfile = "output.conll2003_features"

# `language_model` first holds the path to the pretrained vectors and is then
# rebound to the loaded model; later cells expect the loaded model under this name.
language_model = "./models/GoogleNews-vectors-negative300.bin.gz"
language_model = KeyedVectors.load_word2vec_format(
    language_model,
    binary=True,
)

# Embed every token of the training split and collect the matching gold labels.
training_features, gold_labels = extract_embeddings_as_features_and_gold(
    trainingfile,
    language_model,
)

In [43]:
#print(training_features[:5])

In [28]:
# Sanity check: show the first five gold labels read from the training file.
print(gold_labels[:5])

['B-ORG', 'O', 'B-MISC', 'O', 'O']


In [29]:
# Sanity check: the feature list and the label list should have the same length,
# since both are appended to once per token row.
print(len(training_features))
print(len(gold_labels))

203621
203621


In [30]:
def create_classifier_embeddings(train_features, train_labels):
    '''
    Fit a linear SVM on embedding features and their gold labels

    input train_features: one feature vector per token, passed to the classifier as given
    input train_labels: gold labels, aligned one-to-one with train_features

    output model: the fitted LinearSVC instance
    '''
    # LinearSVC.fit returns the estimator itself, so fitting and returning chain directly.
    return LinearSVC(max_iter=10000).fit(train_features, train_labels)

In [31]:
# Train on the complete training set. The previous call sliced both lists to
# their first ten entries, so the classifier saw only a handful of tokens.
svm_model = create_classifier_embeddings(training_features, gold_labels)

In [32]:
# Bare expression: as the last statement of a notebook cell it displays the classifier.
svm_model

In [33]:
def extract_embeddings_features(inputfile,word_embedding_model):
    '''
    This function extracts features from embeddings

    Every row with more than three tab-separated columns is treated as a token
    and its first column is looked up in the embedding model. Rows with three
    or fewer columns (e.g. the empty lines that mark sentence boundaries) are
    skipped.

    :param inputfile: path to conll file
    :param word_embedding_model: a pretrained word embedding model
    :type inputfile: string
    :type word_embedding_model: gensim.models.keyedvectors.Word2VecKeyedVectors

    :return features: list of vector representation of tokens
    '''
    ### This code was partially inspired by code included in the HLT course, obtained from https://github.com/cltl/ma-hlt-labs/, accessed in May 2020.
    features = []

    # Tokens missing from the model get an all-zero vector. Its length follows
    # the model's own dimensionality when the model exposes `vector_size`;
    # otherwise it falls back to the previously hard-coded 300.
    dimensions = getattr(word_embedding_model, 'vector_size', 300)

    # The context manager closes the file even if parsing raises.
    with open(inputfile, 'r') as conllinput:
        csvreader = csv.reader(conllinput, delimiter='\t', quotechar='|')
        for row in csvreader:
            #check for cases where empty lines mark sentence boundaries (which some conll files do).
            if len(row) > 3:
                token = row[0]
                if token in word_embedding_model:
                    vector = word_embedding_model[token]
                else:
                    vector = [0] * dimensions
                features.append(vector)

    return features

In [41]:
def classify_data_embeddings(model, inputdata, outputfile, word_embedding_model):
    '''
    This function predicts a label for the tokens of a conll file and writes them to a new file

    Each non-empty line of the input is copied to the output with a tab and a
    predicted label appended; empty lines are not written.

    input model: classifier that will make predictions
    input inputdata: path to input data
    input outputfile: path to output file, where the predictions for each feature will be written
    input word_embedding_model : embedding model
    '''
    # extracting features
    features = extract_embeddings_features(inputdata, word_embedding_model)

    # making predictions with ALL extracted features; predicting for only the
    # first two would leave the write loop below without labels from the third
    # token onwards. An input without any token rows yields no predictions.
    predictions = model.predict(features) if len(features) > 0 else []

    # Write results to an outputfile; both files are closed even if writing fails.
    with open(inputdata, 'r') as infile, open(outputfile, 'w') as outfile:
        counter = 0
        for line in infile:
            stripped = line.rstrip('\n')
            # NOTE(review): this keeps every non-empty line, whereas feature
            # extraction keeps rows with more than three tab-separated columns.
            # The two only stay aligned if all non-empty lines have at least
            # four columns -- confirm for the data in use.
            if len(stripped.split()) > 0:
                outfile.write(stripped + '\t' + predictions[counter] + '\n')
                counter += 1

In [None]:
# Write the predictions next to the dev data, then re-save with the prediction
# column named 'NER2'.
embedded_path = outputfile.replace('.conll','.embedded.conll')
classify_data_embeddings(svm_model, inputfile, embedded_path, language_model)

# NOTE(review): read_table is called with its defaults, so the first line of the
# file presumably ends up as the column header -- confirm this is intended.
data_frame = pd.read_table(embedded_path)

# Rename only the last column. Assigning to `.columns` replaces
# `set_axis(..., inplace=False)`, whose `inplace` keyword is rejected by
# pandas 2.x.
data_frame.columns = [*data_frame.columns[:-1], 'NER2']
data_frame.to_csv(outputfile.replace('.conll','.embedded2.conll'), sep='\t')

In [None]:
# Bare reference to the function object; this does not call it.
classify_data_embeddings

In [44]:
# Extract embedding features (without labels) for the training file and keep
# them in `features`.
features = extract_embeddings_features(trainingfile,language_model)
# Uncomment to inspect the first two feature vectors.
#features[:2]