In [1]:
'''
Model to train for differentiating restrictive vs. non-restrictive

@author: Mustafa Bal
'''

import sys, time, string
from itertools import chain
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
from classifier import Classifier
from coling_baseline import ColingBaselineClassifier
import SpaCyParserWrapper
import pycrfsuite

In [14]:
def get_input (input_file):
    '''
    get input file by utf8 encoding. Read the file content then return it.
    '''
    sentence = []
    result_list = []

    with open(input_file, 'r') as f:
        for line in f.readlines():
            data = line.strip().split('\t')
            #(0)posInSentence
            #(1)spelling1, (2)spelling2, (3)spelling3
            #(4)type1, (5)type2
            #(6)unk1, (7)unk2
            #(8)connected1, (9)connected2,  
            #(-2)RESTR/NON-RESTR, (-1)modifierType
            
            #(0): posInSentence
            #(1): (0)spelling1, (1)spelling2, (2)spelling3
            #(2): (0)type1, (1)type2
            #(3): (0)unk1, (1)unk2
            #(4): (0)connected1, (1)connected2,  
            #(5): RESTR/NON-RESTR
            #(6): modifiertype
    
            if data == ['']: # end of sentence
                result_list.append(sentence)
                sentence = []
            else:
                tup = (data[0],data[1],data[2],data[3],data[4],data[5],data[6],data[7],data[8],data[9],data[-2],data[-1])
                sentence.append(tup)
        f.close()
    return result_list

In [21]:
def set_features(sentence, i):
    '''
    Set features for each word in a sentence.
    '''
    word = sentence[i][0]
    print(word)
    # Set the features of the word
    features = [
        'word.lower=' + word.lower(),
        'word.length=' + str(len(word)),
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'word.isdot=%s' % isdot(word),    
        'word.isdash=%s' % isdash(word),          
        'word.iscomma=%s' % iscomma(word),
    ]
    if i > 0:
        # Set the features of relationship with previous word.
        word1 = sentence[i-1][0]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.length=' + str(len(word1)),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isdigit=%s' % word1.isdigit(), 
            '-1:word.isdot=%s' % isdot(word1),    
            '-1:word.isdash=%s' % isdash(word1),  
            '-1:word.iscomma=%s' % iscomma(word1),         
        ])
    else:
        features.append('Begin_Of_Sentence')
        
    if i < len(sentence)-1:
        # Set the features of relationship with next word.
        word1 = sentence[i+1][0]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.length=' + str(len(word1)),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:word.isdot=%s' % isdot(word1),    
            '+1:word.isdash=%s' % isdash(word1),  
            '+1:word.iscomma=%s' % iscomma(word1), 
        ])
    else:
        features.append('End_Of_Sentence')
                
    return features

def isdot(word):
    return True if word in '.' else False

def isdash(word):
    return True if word in '-' else False

def iscomma(word):
    return True if word in ',' else False

def ispunctuation(word): 
    return True if word in string.punctuation else False

def get_features(sent):
    return [set_features(sent, i) for i in range(len(sent))]

def get_labels(sent):
    return [x[11] for x in sent]

def get_tokens(sent):
    return [(x[1],x[10]) for x in sent]   

In [25]:
def bio_classification_report(y_true, y_pred):
    '''
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "N" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master) to calculate averages properly!
    '''
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'N'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [26]:
def main(training_file, testing_file, model_file):
    
    start = time.time()
    
    # Get training and testing set of data
    training_set = get_input(training_file)
    testing_set = get_input(testing_file)
    
    # Get features of each word on training set
    X_train = [get_features(s) for s in training_set]
    y_train = [get_labels(s) for s in training_set]
    
    # Get features of each word on testing set
    X_test = [get_features(s) for s in testing_set]
    y_test = [get_labels(s) for s in testing_set]

    # Create trainer model of CRF
    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 3.0,   # coefficient for L1 penalty
        'c2': 1e-20,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
        })  
    
    # Train the model and save the trained model into model_file
    trainer.train(model_file)
    #print ("Log of last iteration={}".format(trainer.logparser.iterations[-1]))

    # Initial tagger for prediction task
    trained_model = pycrfsuite.Tagger()
    trained_model.open(model_file) # Load the trained model.
        
    # Get prediction tag results from trained model
    y_pred = [trained_model.tag(xseq) for xseq in X_test]
    
    # Print the Precision, Recall, and F-1 score
    print(bio_classification_report(y_test, y_pred))
    
    end = time.time()
    print('CRF model has been generated.')
    print('runtime:', end - start)

In [28]:
training_file = "corpus/train.txt"
testing_file = "corpus/test.txt"
model_file = "model.crfsuite"

main(training_file, testing_file, model_file)

             precision    recall  f1-score   support

  APPOS-MOD       0.00      0.00      0.00        78
    INF-MOD       0.00      0.00      0.00        55
POSTADJ-MOD       0.00      0.00      0.00         2
     PP-MOD       0.00      0.00      0.00       135
 PREADJ-MOD       0.00      0.00      0.00       109
PREVERB-MOD       0.00      0.00      0.00        20
     RC-MOD       0.00      0.00      0.00        13

avg / total       0.88      0.94      0.91      6625

CRF model has been generated.
('runtime:', 2.63782000541687)
