##### Libraries 

In [1]:
from sklearn.model_selection import train_test_split
import pycrfsuite
import pickle

##### Pre-processing datset for chunk tag sequence prediction 

* Collecting the two ConLL datasets and combining them
* All annotation tags available as Lists 

In [2]:

# Read
anno_text_ank = open("ConLLformat_annotator_ank.txt").read()
anno_text_ian = open("ConLLformat_annotator_ian.txt").read()
## Reading the claims cluster to include it in the tags
with open("clustered_claims_final1.pkl", 'rb') as f:
    claim_text = pickle.load(f)[0]

## Combine
anno_data = anno_text_ank.split("\n") + anno_text_ian.split("\n")


In [3]:
len(claim_text['cluster'])

468

In [4]:
## The annotation tags
tag = ['$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '``']
ner_tags = ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']
parser = ['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp']
actor_claim_tag = ['O','B-ACT','I-ACT','B-CLAIM','I-CLAIM']
new_actor_claim_tag = ['O','B-ACT','I-ACT','B-CLAIM-0','I-CLAIM-0','B-CLAIM-1','I-CLAIM-1','B-CLAIM-2','I-CLAIM-2','B-CLAIM-3','I-CLAIM-3','B-CLAIM-4','I-CLAIM-4']

##### Custom Functions to convert data into dataset:
* Format: dataset = [[(w1,t11,t12..)],[(w2,t21,t22..)]] 
* where w = word, t = tags

In [5]:
anno_data[286]

'18\tcould\tMD\t\taux\tO\t'

In [8]:
## Raw annotated data into dataset for training/testing
def dataset_creator(split_data,actor_claim_tag,no_of_claims):
    sup_op_cnt = []
    dataset = []
    claim_count = 0


    the_end = False    ## the function end bool
    i = 0              ## line_no-1
    while not the_end:
            
        linebreak = False   ## for reading per_line of ACTUAL sentences
        line_arr = []
        found_claim = False
        
        newLine = []
        
        while not linebreak:
            ## feature error handling
            try:
                per_tag = split_data[i].split("\t")
                xoxo = per_tag[3]
            except:
                if(claim_count>=(no_of_claims-1)):
                    the_end = True
                i+=1
                linebreak = True
                break
            ## features corrections
            if(len(per_tag)<7):    
                per_tag = per_tag[:-1]+['O','']    
            #######################################
            ## 1. appending to newLine       START
            if(len(line_arr)==0):
                line_arr.append(per_tag[0])
                newLine.append(per_tag[1:7])
                i+=1
            else:
                if(int(per_tag[0]) >= int(line_arr[-1])):
                    line_arr.append(per_tag[0])
                    newLine.append(per_tag[1:7])
                    i+=1
                else:
                    linebreak = True
            ## 1. appending to newLine          END
            #######################################
        ##############################################################
        modLine = []
        for j in newLine:
            ## 2. CLAIM hunting       START
            if(j[4] not in actor_claim_tag):   ## a. if not in act_claim_tag, use 'O'
                j[4] = 'O'
            elif(j[4][2:]=='CLAIM'):           ## b. if a CLAIM tag
                if(j[4][0] == 'B' and not found_claim):
#                     print(i,claim_count)
                    sup_op_cnt.append(j[5])
                    found_claim = True
                j[4] = j[4] + "-" + str(claim_text['cluster'][claim_count])
            modLine.append(tuple(j[:-1]))        
            ## 2. CLAIM hunting          END
            ##############################################################

        if(found_claim):   # increment only if atleast one claim is found
            claim_count+=1
            
        dataset.append(modLine)
    print(i,claim_count)
    return dataset,sup_op_cnt
                    
        

In [9]:
## dataset is created
no_of_claims = len(claim_text['cluster'])
data,sup_opp = dataset_creator(anno_data,actor_claim_tag,no_of_claims)

27
['28', 'China', 'NNP', 'GPE', 'pobj', 'O', '']
41
['42', 'later', 'RB', '', 'advmod', 'O', '']
31
['32', 'position', 'NN', '', 'dobj', 'O', '']
55
['56', 'autonomy', 'NN', '', 'pobj', 'O', '']
51
['52', 'warming', 'NN', '', 'pobj', 'O', '']
32
['33', 'gases', 'NNS', '', 'dobj', 'O', '']
46
['47', 'industries', 'NNS', '', 'conj', 'O', '']
100
['101', 'Illness', 'NNP', '', 'pobj', 'O', '']
101
['102', ' ', '_SP', '', 'dobj', 'O', '']
63
['64', 'use', 'NN', '', 'pobj', 'O', '']
27
['28', 'now', 'RB', '', 'advmod', 'O', '']
36
['37', 'temperature', 'NN', '', 'compound', 'O', '']
37
['38', 'shifts', 'NNS', '', 'pobj', 'O', '']
58
['59', 'emissions', 'NNS', '', 'pobj', 'O', '']
51
['52', 'change', 'NN', '', 'pobj', 'O', '']
39
['40', 'scientist', 'NN', '', 'pobj', 'O', '']
36
['37', '-', 'NNS', '', 'dep', 'O', '']
37
['38', 'chairmen', 'NNS', '', 'pobj', 'O', '']
51
['52', 'industry', 'JJ', '', 'amod', 'O', '']
52
['53', 'rollbacks', 'NNS', '', 'pobj', 'O', '']
50
['51', 'anti', 'JJ', 'NO

In [10]:
print(no_of_claims)

468


In [11]:
sup_opp[-1]

'-'

In [12]:
with open("supp_opp_final1.pkl", 'wb') as f:
    pickle.dump(sup_opp, f)

##### Feature extraction
* Will create and assign features for CRF model to train with using all the other tags

In [13]:
## Converts each word to feature consumable by a CRF model
def word_make_features(doc, i):
    word = doc[i][0]
    tag = doc[i][1]
    ner_tags = doc[i][2]
    parser= doc[i][3]
    try:
        actor_claim_tag = doc[i][4]
    except:
        print(doc[i])

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word.tag =' + tag,
        'word.ner_tags=' + ner_tags,
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'word.parser=' + parser
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        tag1 = doc[i-1][1]
        ner_tags1 = doc[i-1][2]
        parser1 = doc[i-1][3]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.tag =' + tag1,
            '-1:word.ner_tags=' + ner_tags1,
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:parser=' + parser1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        tag1 = doc[i+1][1]
        ner_tags1 = doc[i+1][2]
        parser1 = doc[i+1][3]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.tag =' + tag1,
            '-1:word.ner_tags=' + ner_tags1,
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:parser=' + parser1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [34]:
## Combine the featire creation to features per sentence consumable by the model
## A function for extracting features in documents
def features_extraction(doc):
    return [word_make_features(doc, i) for i in range(len(doc))]

## A function fo generating the list of labels for each document
def extract_label_per_sent(doc):
    Label = []
    for (token,tag, ner_tags,parser,label) in doc:
        Label.append(label)
    return(Label)

## Collecting all features and labels
all_features = [features_extraction(doc) for doc in data]
all_labels = [extract_label_per_sent(doc) for doc in data]

## The dataset fully prepared
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.30)

#####  Training a model
* CRF model with c1 and c2 penalties

In [35]:
## In pycrfsuite, A CRF model in can be trained by first creating a 
## trainer, and then submit the training data and corresponding labels
## to the trainer. After that, set the parameters and call train() to 
## start the training process
## CRFSuite: http://www.chokkan.org/software/crfsuite/manual.html#idp8849114176

trainer = pycrfsuite.Trainer(verbose=True)

# initiate training data to the trainer
for x_features, y_labels in zip(X_train, y_train):
    trainer.append(x_features, y_labels)

# Set the parameters of the model
trainer.set_params({
    'c1': 0.85,   # coefficient for L1 penalty
    'c2': 0.0094,  # coefficient for L2 penalty
    # maximum number of iterations
    'max_iterations': 1000,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})



In [36]:
# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
#      'c1': 0.85,
#     'c2': 0.0094
#0# trainer.train('crf_KyotoData_version_all_withKmeans_300iter.model') #  15% test split(so got some zero label warnings from test set) 
#1# trainer.train('crf_KyotoData_version_all_withKmeans_500iter.model')   #  25% test split
#2# trainer.train('crf_KyotoData_version_all_withKmeans_1000iter.model')      #  30% test split

Feature generation
type: crf1d (1-st order)
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 21276
Seconds required: 0.197

L-BFGS optimization
c1: 0.850000
c2: 0.009400
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 37169.028894
Feature norm: 1.000000
Error norm: 21933.496245
Active features: 18248
Line search trials: 1
Line search step: 0.000005
Seconds required for this iteration: 0.388

***** Iteration #2 *****
Loss: 35189.650690
Feature norm: 1.111794
Error norm: 11978.534639
Active features: 16026
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.207

***** Iteration #3 *****
Loss: 33561.635183
Feature norm: 1.207630
Error norm: 10457.099630
Active features: 12016
Line search trials: 1
Line search step: 1.000000
Second

### freeze test/train splits to display results later with the model 

In [37]:
## Freezing test/train for displaying model features later
#1#
# with open("testtrain_withKmeans_500iter.pkl", 'wb') as f:
#     pickle.dump([X_train, X_test, y_train, y_test], f)  
#2#
# with open("testtrain_withKmeans_1000iter.pkl", 'wb') as f:
#     pickle.dump([X_train, X_test, y_train, y_test], f)  

#####  Testing and Evaluating the model
* checking the poutput of tagger for random test inputs adn create an evaluation matrix for the classification problem

In [38]:
## Once the tagger is done and dusted we test it on the test data
tagger = pycrfsuite.Tagger()
tagger.open('crf_KyotoData_version_all_withKmeans_1000iter.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
i = 299
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s \t(%s)" % (y, x))


should 	(O)
n't 	(O)
the 	(O)
family 	(O)
of 	(O)
man 	(O)
do 	(O)
the 	(O)
same 	(O)
for 	(O)
its 	(O)
home 	(O)
? 	(O)


In [39]:
# dataset for evaluation
labels = {}
line_count = len(new_actor_claim_tag)
for i in new_actor_claim_tag:
    labels[i]=line_count
    line_count-=1
print(labels)

{'O': 13, 'B-ACT': 12, 'I-ACT': 11, 'B-CLAIM-0': 10, 'I-CLAIM-0': 9, 'B-CLAIM-1': 8, 'I-CLAIM-1': 7, 'B-CLAIM-2': 6, 'I-CLAIM-2': 5, 'B-CLAIM-3': 4, 'I-CLAIM-3': 3, 'B-CLAIM-4': 2, 'I-CLAIM-4': 1}


In [40]:
import numpy as np
from sklearn.metrics import classification_report

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
# truths = np.array([labels[tag] for row in y_test for tag in row])
truths = [] 
for row in y_test:
    for tag in row:
        if(tag == ''):
            print('there is empty tag')
            tag='O'
        truths.append(labels[tag])
truths = np.array(truths)

In [41]:
## FOR CLUSTERED INPUT AND OUTPUT 
# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=new_actor_claim_tag))

              precision    recall  f1-score   support

           O       0.28      0.04      0.07       383
       B-ACT       0.00      0.00      0.00        20
       I-ACT       0.00      0.00      0.00        51
   B-CLAIM-0       0.00      0.00      0.00         8
   I-CLAIM-0       0.50      0.10      0.17       145
   B-CLAIM-1       0.50      0.12      0.20         8
   I-CLAIM-1       0.37      0.06      0.10       438
   B-CLAIM-2       0.00      0.00      0.00        33
   I-CLAIM-2       0.15      0.09      0.11       941
   B-CLAIM-3       0.08      0.02      0.03        89
   I-CLAIM-3       0.37      0.20      0.26       177
   B-CLAIM-4       0.39      0.17      0.24       135
   I-CLAIM-4       0.92      0.98      0.95     23208

    accuracy                           0.89     25636
   macro avg       0.27      0.14      0.16     25636
weighted avg       0.85      0.89      0.87     25636

