In [18]:
import spacy
import nltk
from sklearn.feature_extraction import DictVectorizer
import os
from lxml import etree
from nltk.corpus import stopwords

# globals
nlp_spacy = spacy.load('en_core_web_sm')

data_file_path = "tagged/"
annotated_index = "annotated_data"

In [19]:
# methods for reading data
def get_lines(filename, flines):
    lines = {}
    with open(data_file_path + filename) as f:
        content = f.readlines()
        content = [x.strip() for x in content]

        for l in flines:
            lines[l] = content[l+1]

    return lines


def get_doc(filename):
    xml_file = etree.parse(data_file_path + filename)
    doc = {
        "title": "",
        "lines": [],
        "file": filename
    }
    
    for elem in xml_file.iter():
        if elem.tag == 'TITLE':
            doc['title'] = elem.text
        if elem.tag == 'A-S' or elem.tag == 'S' and 'AZ' in elem.attrib.keys():
            line_dict = {
                "text": elem.text.strip(),
                "label": elem.attrib['AZ'],
                "id": int(elem.attrib['ID'].split('-')[-1])
            }
            doc["lines"].append(line_dict)
    return doc

def get_all_docs():
    all_docs = []
    files = os.listdir(data_file_path)
    for f in files:
        all_docs.append(get_doc(f))
    return all_docs

all_docs = get_all_docs()

In [23]:
# this cell contains methods to extract features
def line_zone(ln):
    """
    The docuement is divided into 7 zones
    """
    if ln >= 0 and ln <= 20:
        return 'A'
    elif ln >= 21 and ln <= 40:
        return 'B'
    elif ln >= 41 and ln <= 60:
        return 'C'
    elif ln >= 61 and ln <= 80:
        return 'D'
    elif ln >= 81 and ln <= 100:
        return 'E'
    elif ln >= 101 and ln <= 120:
        return 'F'
    elif ln >= 121 and ln <= 140:
        return 'G'
    elif ln >= 141 and ln <= 160:
        return 'H'
    else:
        return 'I'


def title_word(line, title):
    """
    If the line contains word appearing in the title
    """
    stop_words = set(stopwords.words('english'))
    line_tokens = nltk.word_tokenize(line) 
    title_tokens = nltk.word_tokenize(title)
    
    filtered_line = [w for w in line_tokens if not w in stop_words] 
    filtered_title = [w for w in title_tokens if not w in stop_words]
    
    for w in line_tokens:
        if w in title_tokens:
            print(w + " found in title " + title)
            return True
    return False

def get_main_verb(line):
    """
    return the main verb in a sentence
    """
    doc = nlp_spacy(line)
    for t in doc:
        if t.dep_ == 'ROOT':
            return t
    # else return the first verb
    for t in doc:
        if t.pos_ == 'VERB':
            return t
    # sentence has no verb
    return "NoVerb"

def get_vtense(line):
    """
    return the tense of main verb
    """
    vrb = str(get_main_verb(line))
    
    if vrb == "NoVerb":
        return vrb

    ws = nltk.pos_tag(nltk.word_tokenize(line))
    
    vtag = ""
    for a,b in ws:
        if a == vrb:
            vtag = b
    
    future = ['MD']
    present = ["VBP", "VBZ","VBG"]
    past = ["VBD", "VBN"]
    
    if vtag in future:
        return "future"
    if vtag in present:
        return "present"
    if vtag in past:
        return "past"
    
    return "future"

def modal_present(line):
    ws = nltk.pos_tag(nltk.word_tokenize(line))
    
    for a,b in ws:
        if b == 'MD':
            return True
    return False

def citation(line):
    """
    If the line contains any citations
    """
    line = line.lower()
    if '[' or ']' or 'cit' or 'cite' or 'extend' or 'citation':
        return True
    return False


def long_sent(line):
    """
    If the sentence is long
    """
    words = line.split(' ')
    if len(words) > 20:
        return True
    return False


def get_feature_vector(line, title):
    """
    Given a line return the corresponding feature vector
    """
    lt = line['text']
    print(lt, title)
    fv = [line_zone(line['id']), title_word(lt, title), citation(lt), long_sent(lt),
        modal_present(lt), get_vtense(lt)]
    return fv

In [24]:
# now for each line in the documents we create the corresponding feature vectors matrix
get_feature_vector(all_docs[0]['lines'][0], all_docs[0]['title'])

An experiment designed to explore the relationship between tagging accuracy and the nature of the tagset is described , using corpora in English , French and Swedish .  Tagset Design and Inflected Languages 
and found in title  Tagset Design and Inflected Languages 


['A', True, True, True, False, 'past']