# Feature Extraction


In [165]:
# libraries

import numpy as np
import pandas as pd
import csv
import nltk
import sys
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
#nltk.download('omw-1.4') 

In [166]:
def read_conll_file(conll_file_path, delimiter='\t'):
    
    '''
    This function reads a conll file and returns its rows
    
    :param conll_file_path: the path to the conll file
    :param delimiter: the character that separates the columns of a row
    :type conll_file_path: string
    :type delimiter: string
    
    :returns: a one-shot iterator over the rows of the file; every row is a
        list with the column strings and a blank line gives an empty list
    '''
    
    # Parse all rows while the file is open so that the handle is closed before
    # returning (a reader bound to an open file would keep the handle alive).
    # newline='' is what the csv module expects for the file objects it reads.
    with open(conll_file_path, 'r', newline='') as conll_file:
        rows = list(csv.reader(conll_file, delimiter=delimiter))
    
    # An iterator keeps the usage of a csv reader: for-loops, enumerate and next()
    return iter(rows)

In [167]:
def tokenization_feature(conll_file):
    
    '''
    This function collects the tokens (first column) of a conll file
    
    The first row of the file is always skipped and rows without any
    column (blank lines) are left out.
    
    :param conll_file: the path to the conll file
    :type conll_file: string
    
    :returns: a list with the first column of every remaining row
    '''
    
    rows = read_conll_file(conll_file)
    
    return [
        columns[0]
        for position, columns in enumerate(rows)
        if position != 0 and columns
    ]

In [168]:
def capitalization_feature(tokens):
    
    '''
    This function marks the tokens that are written fully in capitals
    
    A token gets 1 when str.isupper() is true for it, so it needs at least
    one letter and no lowercase letter ("USA" gives 1, "Germany" gives 0).
    
    :param tokens: the list of tokenized data
    :type tokens: list
    
    :returns: a list with 1 (upper case) or 0 (not upper case) for every token
    '''

    return [int(token.isupper()) for token in tokens]

In [169]:
def lemmatization_feature(tokens):
    
    '''
    This function looks up the lemma of every token with the WordNet lemmatizer
    
    Only the token itself is passed to the lemmatizer, no part-of-speech tag.
    
    :param tokens: the list of tokenized data
    :type tokens: list
    
    :returns: a list with one lemma per token, in the order of the input
    '''
    lemmatizer = WordNetLemmatizer()
    
    return [lemmatizer.lemmatize(token) for token in tokens]

In [170]:
def prev_latter_token(tokens):
    
    '''
    This function extracts the previous and the latter token of every token
    
    The first token has no previous token and the last token has no latter
    token; both get a single space (' ') instead. The tokens are handled as
    one flat sequence, sentence boundaries are not taken into account.
    
    :param tokens: the list of tokenized data
    :type tokens: list
    
    :returns: two lists (previous tokens, latter tokens) with the same
        length as tokens
    '''

    padding = ' '
    tokens = list(tokens)
    
    # Without this guard the padding would give lists of length 1 for no tokens
    if not tokens:
        return [], []
    
    # Shift the sequence one position in each direction. The last token gets
    # the padding as latter token (a loop that only updates the latter token
    # inside the list bounds would repeat the last token there).
    previous_tokens = [padding] + tokens[:-1]
    latter_tokens = tokens[1:] + [padding]
    
    return previous_tokens,latter_tokens

In [171]:
def stemming_feature(tokens):
    
    '''
    This function reduces every token to its stem with the Porter stemmer
    
    :param tokens: the list of tokenized data
    :type tokens: list
    
    :returns: a list with one stem per token, in the order of the input
    '''
    stemmer = PorterStemmer()
    
    return [stemmer.stem(token) for token in tokens]

In [174]:
def _write_feature_file(input_path, output_path, column_names):
    
    '''
    This function extracts the token features of one conll file and saves them
    
    :param input_path: the path to the conll file to read
    :param output_path: the path of the tab-separated file to write
    :param column_names: the names for the columns that are already in the file
    :type input_path: string
    :type output_path: string
    :type column_names: list
    
    :returns: None, the extended table is written to output_path
    '''
    
    # NOTE(review): read_csv is called without header=None, so pandas takes the
    # first line of the file as header before the names are overwritten below.
    # tokenization_feature skips the first row as well, which keeps the feature
    # lists aligned with the rows of the frame. Confirm that the first line of
    # the input is not a data row that should be kept.
    data = pd.read_csv(input_path, sep='\t')
    data.columns = column_names
    
    tokens = tokenization_feature(input_path)
    previous_tokens, latter_tokens = prev_latter_token(tokens)
    capitalized_tokens = capitalization_feature(tokens)
    stemmed_tokens = stemming_feature(tokens)
    lemmatized_tokens = lemmatization_feature(tokens)
    
    # Assigning a list raises a ValueError when its length differs from the frame
    data['previous'] = previous_tokens
    data['latter'] = latter_tokens
    data['capitals'] = capitalized_tokens
    data['stemm'] = stemmed_tokens
    data['lemma'] = lemmatized_tokens
    
    data.to_csv(output_path, sep='\t', index=False)


def main(argv=None):
    
    '''
    This function extracts the features of the training and the test file
    
    The results are written to "conll2003.train_extracted_features.conll" and
    "conll2003.dev_extracted_features.conll" in the working directory.
    
    :param argv: the command line arguments (sys.argv when not given); argv[1]
        is the path to the training file and argv[2] the path to the test file
    :type argv: list
    '''
    
    if argv is None:
        argv = sys.argv
    
    # Used when no paths are given on the command line
    trainingfile = "datas/conll2003.train.conll"
    testfile = "datas/conll2003.dev.conll"
    
    # Take the paths from the command line when both are given. A first
    # argument that looks like an option is not a data path (a Jupyter kernel
    # is started with "-f <connection file>"), then the defaults are kept.
    paths = argv[1:3]
    if len(paths) == 2 and not paths[0].startswith('-'):
        trainingfile, testfile = paths
    
    _write_feature_file(trainingfile,
                        "conll2003.train_extracted_features.conll",
                        ["token","pos","tag","ner"])
    _write_feature_file(testfile,
                        "conll2003.dev_extracted_features.conll",
                        ["token","pos","tag","gold"])

In [175]:
# Start the feature extraction only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()