##### Libraries 

In [1]:
from sklearn.model_selection import train_test_split
import pycrfsuite

##### Pre-processing datset for chunk tag sequence prediction 

* Collecting the two ConLL datasets and combining them
* All annotation tags available as Lists 

In [2]:
## Read
anno_text_ank = open("Data/ConLLformat_annotator_ank.txt").read()
anno_text_ian = open("Data/ConLLformat_annotator_ian.txt").read()

## Combine
anno_data = anno_text_ian.split("\n")
anno_data += anno_text_ank.split("\n")

In [3]:
## The annotation tags
tag = ['$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '``']
ner_tags = ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']
parser = ['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp']
actor_claim_tag = ['O','B-ACT','I-ACT','B-CLAIM','I-CLAIM']


##### Custom Functions to convert data into dataset:
* Format: dataset = [[(w1,t11,t12..)],[(w2,t21,t22..)]] 
* where w = word, t = tags

In [4]:
## Raw annotated data into dataset for training/testing
def dataset_creator(split_data,actor_claim_tag):
    dataset = []
    i = 0
    newLine = []
    prev_label = 'O'
    for line in split_data[1:]:
        l = tuple(line.split("\t"))
        if(len(l)==8):
            dataset.append(newLine)
            newLine = []
            if(len(l[1:-2])==4):
                temp_l = list(l[1:-2]).append(prev_label)
            else:
                temp_l = list(l[1:-2])
                prev_label = temp_l[-1]
        else:
            if(len(l[1:-1])<4):
                newLine = []
            elif(len(l[1:-1])==4):
                temp_l = list(l[1:-1])
                temp_l.append(prev_label)                
            else:
                temp_l = list(l[1:-1])
                prev_label = temp_l[-1]
                
        if(temp_l[-1] not in actor_claim_tag):
            temp_l[-1]='O'
        
        newLine.append(tuple(temp_l))
    return dataset

In [5]:
## dataset is created
data = dataset_creator(anno_data,actor_claim_tag)

##### Feature extraction
* Will create and assign features for CRF model to train with using all the other tags

In [6]:
## Converts each word to feature consumable by a CRF model
def word_make_features(doc, i):
    word = doc[i][0]
    tag = doc[i][1]
    ner_tags = doc[i][2]
    parser= doc[i][3]
    try:
        actor_claim_tag = doc[i][4]
    except:
        print(doc[i])

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word.tag =' + tag,
        'word.ner_tags=' + ner_tags,
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'word.parser=' + parser
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        tag1 = doc[i-1][1]
        ner_tags1 = doc[i-1][2]
        parser1 = doc[i-1][3]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.tag =' + tag1,
            '-1:word.ner_tags=' + ner_tags1,
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:parser=' + parser1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        tag1 = doc[i+1][1]
        ner_tags1 = doc[i+1][2]
        parser1 = doc[i+1][3]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.tag =' + tag1,
            '-1:word.ner_tags=' + ner_tags1,
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:parser=' + parser1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [7]:
## Combine the featire creation to features per sentence consumable by the model
## A function for extracting features in documents
def features_extraction(doc):
    return [word_make_features(doc, i) for i in range(len(doc))]

## A function fo generating the list of labels for each document
def extract_label_per_sent(doc):
    Label = []
    for (token,tag, ner_tags,parser,label) in doc:
        Label.append(label)
    return(Label)

## Collecting all features and labels
all_features = [features_extraction(doc) for doc in data]
all_labels = [extract_label_per_sent(doc) for doc in data]

## The dataset fully prepared
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.15)

#####  Training a model
* CRF model with c1 and c2 penalties

In [8]:
## In pycrfsuite, A CRF model in can be trained by first creating a 
## trainer, and then submit the training data and corresponding labels
## to the trainer. After that, set the parameters and call train() to 
## start the training process
## CRFSuite: http://www.chokkan.org/software/crfsuite/manual.html#idp8849114176

trainer = pycrfsuite.Trainer(verbose=True)

# initiate training data to the trainer
for x_features, y_labels in zip(X_train, y_train):
    trainer.append(x_features, y_labels)

# Set the parameters of the model
trainer.set_params({
    'c1': 0.85,   # coefficient for L1 penalty
    'c2': 0.0094,  # coefficient for L2 penalty
    # maximum number of iterations
    'max_iterations': 500,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})



In [9]:
# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
# trainer.train('Pickles/crf_KyotoData_version_1.model')

#####  Testing and Evaluating the model
* checking the poutput of tagger for random test inputs adn create an evaluation matrix for the classification problem

In [10]:
## Once the tagger is done and dusted we test it on the test data
tagger = pycrfsuite.Tagger()
tagger.open('Pickles/crf_KyotoData_version_1.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
i = 13
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s \t(%s)" % (y, x))


and 	(O)
i 	(O)
-- 	(O)
i 	(O)
feel 	(O)
that 	(O)
for 	(O)
a 	(O)
simple 	(O)
reason 	(O)
: 	(O)
  	(O)


In [11]:
# dataset for evaluation
labels = {}
line_count = len(actor_claim_tag)
for i in actor_claim_tag:
    labels[i]=line_count
    line_count-=1
print(labels)

{'O': 5, 'B-ACT': 4, 'I-ACT': 3, 'B-CLAIM': 2, 'I-CLAIM': 1}


In [12]:
import numpy as np
from sklearn.metrics import classification_report

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

In [13]:
# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=actor_claim_tag))

              precision    recall  f1-score   support

           O       0.28      0.11      0.16      1244
       B-ACT       0.23      0.05      0.08       101
       I-ACT       0.59      0.17      0.27       128
     B-CLAIM       0.45      0.12      0.18        78
     I-CLAIM       0.88      0.97      0.92     10459

    accuracy                           0.86     12010
   macro avg       0.49      0.28      0.32     12010
weighted avg       0.81      0.86      0.82     12010

