##### Libraries 

In [1]:
from sklearn.model_selection import train_test_split
import pycrfsuite
import pickle
import numpy as np
from sklearn.metrics import classification_report

##### Pre-processing datset for chunk tag sequence prediction 

* Collecting the two ConLL datasets and combining them
* All annotation tags available as Lists 

In [2]:
datafolder = "Data/"
savefolder = "Save/"
pickle_folder = "Pickles/"

# Read
anno_text_ank = open(savefolder+"ConLLformat_annotator_ank.txt").read()
anno_text_ian = open(savefolder+"ConLLformat_annotator_ian.txt").read()
## Reading the claims cluster to include it in the tags
with open(pickle_folder+"clustered_claims_final3.pkl", 'rb') as f:
    claim_text = pickle.load(f)[0]

## Combine
anno_data = anno_text_ank.split("\n") + anno_text_ian.split("\n")


In [3]:
len(claim_text['claims'])

550

In [4]:
## The annotation tags
tag = ['$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '``']
ner_tags = ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']
parser = ['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp']
actor_claim_tag = ['O','B-ACT','I-ACT','B-CLAIM','I-CLAIM']
new_actor_claim_tag = ['O','B-ACT','I-ACT','B-CLAIM-0','I-CLAIM-0','B-CLAIM-1','I-CLAIM-1','B-CLAIM-2','I-CLAIM-2','B-CLAIM-3','I-CLAIM-3','B-CLAIM-4','I-CLAIM-4']

##### Custom Functions to convert data into dataset:
* Format: dataset = [[(w1,t11,t12..)],[(w2,t21,t22..)]] 
* where w = word, t = tags

In [5]:
## Raw annotated data into dataset for training/testing
def clus_dataset_creator(split_data,actor_claim_tag,no_of_claims):
    sup_op_cnt = []
    dataset = []
    claim_count = 0


    the_end = False    ## the function end bool
    i = 0              ## line_no-1
    while not the_end:
            
        linebreak = False   ## for reading per_line of ACTUAL sentences
        line_arr = []
        found_claim = False
        
        newLine = []
        
        while not linebreak:
            ## feature error handling
            try:
                per_tag = split_data[i].split("\t")
                xoxo = per_tag[3]
            except:
                if(claim_count>=(no_of_claims-1)):
                    the_end = True
                i+=1
                linebreak = True
                break
            ## features corrections
            if(len(per_tag)<7):    
                per_tag = per_tag[:-1]+['O','']    
            #######################################
            ## 1. appending to newLine       START
            if(len(line_arr)==0):
                line_arr.append(per_tag[0])
                newLine.append(per_tag[1:7])
                i+=1
            else:
                if(int(per_tag[0]) >= int(line_arr[-1])):
                    line_arr.append(per_tag[0])
                    newLine.append(per_tag[1:7])
                    i+=1
                else:
                    linebreak = True
            ## 1. appending to newLine          END
        ##############################################################
        modLine = []
        for j in newLine:
            ## 2. CLAIM hunting       START
            if(j[4] not in actor_claim_tag):   ## a. if not in act_claim_tag, use 'O'
                j[4] = 'O'
            elif(j[4][2:]=='CLAIM'):           ## b. if a CLAIM tag
                if(j[4][0] == 'B' and not found_claim):
                    sup_op_cnt.append(j[5])
                    found_claim = True
                    
                try:
                    j[4] = j[4] + "-" + str(claim_text['cluster'][claim_count])
                except:
                    break
            modLine.append(tuple(j[:-1]))        
            ## 2. CLAIM hunting          END
            ##############################################################

        if(found_claim):   # increment only if atleast one claim is found
            claim_count+=1
            
        dataset.append(modLine)
    return dataset,sup_op_cnt
                    
        

In [6]:
## Raw annotated data into dataset for training/testing
def noclus_dataset_creator(split_data,actor_claim_tag):
    sup_op_cnt = []
    dataset = []
    claim_count = 0


    the_end = False    ## the function end bool
    i = 0              ## line_no-1
    while not the_end:
            
        linebreak = False   ## for reading per_line of ACTUAL sentences
        line_arr = []

        
        newLine = []
        
        while not linebreak:
            ## feature error handling
            try:
                per_tag = split_data[i].split("\t")
                xoxo = per_tag[3]
            except:
                if(i>=(len(split_data)-1)):
                    the_end = True
                i+=1
                linebreak = True
                break
            ## features corrections
            if(len(per_tag)<7):    
                per_tag = per_tag[:-1]+['O','']    
            #######################################
            ## Appending to newLine       START
            if(len(line_arr)==0):
                line_arr.append(per_tag[0])
                newLine.append(per_tag[1:6])
                i+=1
            else:
                if(int(per_tag[0]) >= int(line_arr[-1])):
                    line_arr.append(per_tag[0])
                    newLine.append(per_tag[1:6])
                    i+=1
                else:
                    linebreak = True
            ## Appending to newLine          END
            #######################################
            
        dataset.append(newLine)
    return dataset

In [7]:
## dataset is created
no_of_claims = len(claim_text['cluster'])
clus_data , sup_opp = clus_dataset_creator(anno_data,actor_claim_tag,no_of_claims)
noclus_data = noclus_dataset_creator(anno_data,actor_claim_tag)

In [8]:
with open(pickle_folder+"supp_opp_final3.pkl", 'wb') as f:
    pickle.dump(sup_opp, f)

##### Feature extraction
* Will create and assign features for CRF model to train with using all the other tags

In [9]:
## Converts each word to feature consumable by a CRF model
def word_make_features(doc, i):
    word = doc[i][0]
    tag = doc[i][1]
    ner_tags = doc[i][2]
    parser= doc[i][3]
    try:
        actor_claim_tag = doc[i][4]
    except:
        print(doc[i])

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word.tag =' + tag,
        'word.ner_tags=' + ner_tags,
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'word.parser=' + parser
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        tag1 = doc[i-1][1]
        ner_tags1 = doc[i-1][2]
        parser1 = doc[i-1][3]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.tag =' + tag1,
            '-1:word.ner_tags=' + ner_tags1,
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:parser=' + parser1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        tag1 = doc[i+1][1]
        ner_tags1 = doc[i+1][2]
        parser1 = doc[i+1][3]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.tag =' + tag1,
            '-1:word.ner_tags=' + ner_tags1,
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:parser=' + parser1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [10]:
## Combine the featire creation to features per sentence consumable by the model
## A function for extracting features in documents
def features_extraction(doc):
    return [word_make_features(doc, i) for i in range(len(doc))]

## A function fo generating the list of labels for each document
def extract_label_per_sent(doc):
    Label = []
    for (token,tag, ner_tags,parser,label) in doc:
        Label.append(label)
    return(Label)

def model_data(data,test_split):
    ## Collecting all features and labels
    all_features = [features_extraction(doc) for doc in data]
    all_labels = [extract_label_per_sent(doc) for doc in data]

    ## The dataset fully prepared
    return(train_test_split(all_features, all_labels, test_size=test_split))
    

#### For Clustered data and unclustered data

In [11]:
cX_train, cX_test, cy_train, cy_test =  model_data(clus_data,0.30)
X_train, X_test, y_train, y_test =  model_data(noclus_data,0.30)

#####  Training a model
* CRF model with c1 and c2 penalties

#### a . Unclustered data 

In [12]:
## In pycrfsuite, A CRF model in can be trained by first creating a 
## trainer, and then submit the training data and corresponding labels
## to the trainer. After that, set the parameters and call train() to 
## start the training process
## CRFSuite: http://www.chokkan.org/software/crfsuite/manual.html#idp8849114176

trainer = pycrfsuite.Trainer(verbose=True)

# initiate training data to the trainer
for x_features, y_labels in zip(X_train, y_train):
    trainer.append(x_features, y_labels)

# Set the parameters of the model
trainer.set_params({
    'c1': 0.85,   # coefficient for L1 penalty
    'c2': 0.0094,  # coefficient for L2 penalty
    # maximum number of iterations
    'max_iterations': 1500,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})



#### b. Clustered data 

In [16]:
ctrainer = pycrfsuite.Trainer(verbose=True)

# initiate training data to the trainer
for cx_features, cy_labels in zip(cX_train, cy_train):
    ctrainer.append(cx_features, cy_labels)

# Set the parameters of the model
ctrainer.set_params({
    'c1': 0.85,   # coefficient for L1 penalty
    'c2': 0.0094,  # coefficient for L2 penalty
    # maximum number of iterations
    'max_iterations': 1500,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})



### Frozen CRF models 

In [17]:
# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
#      'c1': 0.85,
#     'c2': 0.0094
## clustered claims
#1# ctrainer.train(pickle_folder+'crf_KyotoData_version_all_withKmeans_500iter.model')   #  25% test split  # some zero-labels in testsplit
#2# ctrainer.train(pickle_folder+'crf_KyotoData_version_all_withKmeans_1000iter.model')      #  30% test split
# ctrainer.train(pickle_folder+'crf_KyotoData_version_all_withKmeans_1500iter.model')      #  30% test split
# ctrainer.train(pickle_folder+'crf_KyotoData_version_all_withKmeans_100iter_v2.model')      #  30% test split
ctrainer.train(pickle_folder+'crf_KyotoData_version_all_withKmeans_1500iter_v3.model')      #  30% test split


## unclustered claims
#1# trainer.train(pickle_folder+'crf_KyotoData_version_all_noClus_500iter.model')    #  25% test split
#2# trainer.train(pickle_folder+'crf_KyotoData_version_all_noClus_1000iter.model')       #  30% test split
#3# trainer.train(pickle_folder+'crf_KyotoData_version_all_noClus_1500iter.model')        #  30% test split
# trainer.train(pickle_folder+'crf_KyotoData_version_all_noClus_1500iter_v3.model')        #  30% test split

Feature generation
type: crf1d (1-st order)
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 24152
Seconds required: 0.241

L-BFGS optimization
c1: 0.850000
c2: 0.009400
num_memories: 6
max_iterations: 1500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 43064.071886
Feature norm: 1.000000
Error norm: 30808.604991
Active features: 21049
Line search trials: 1
Line search step: 0.000004
Seconds required for this iteration: 0.495

***** Iteration #2 *****
Loss: 40024.358563
Feature norm: 1.128896
Error norm: 14468.579785
Active features: 18233
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.251

***** Iteration #3 *****
Loss: 38834.210066
Feature norm: 1.223833
Error norm: 11660.304059
Active features: 13314
Line search trials: 1
Line search step: 1.000000
Second

### Frozen test/train splits to display results later with the model 

In [19]:
## Freezing test/train for displaying model features later
## Clustered
#1#
# with open(pickle_folder+"testtrain_withKmeans_500iter.pkl", 'wb') as f:
#     pickle.dump([X_train, X_test, y_train, y_test], f)  
#2#
# with open(pickle_folder+"testtrain_withKmeans_1000iter.pkl", 'wb') as f:
#     pickle.dump([X_train, X_test, y_train, y_test], f)  
#3#
# with open(pickle_folder+"testtrain_withKmeans_1500iter.pkl", 'wb') as f:
#     pickle.dump([cX_train, cX_test, cy_train, cy_test], f)
#4#
# with open(pickle_folder+"testtrain_withKmeans_1500iter_v2.pkl", 'wb') as f:
#     pickle.dump([cX_train, cX_test, cy_train, cy_test], f)
#5#
# with open(pickle_folder+"testtrain_withKmeans_100iter_v2.pkl", 'wb') as f:
#     pickle.dump([cX_train, cX_test, cy_train, cy_test], f)
#6#
with open(pickle_folder+"testtrain_withKmeans_1500iter_v3.pkl", 'wb') as f:
    pickle.dump([cX_train, cX_test, cy_train, cy_test], f)

## Unclustered
#1#
# with open(pickle_folder+"testtrain_noClus_500iter.pkl", 'wb') as f:
#     pickle.dump([X_train, X_test, y_train, y_test], f)  
#2#
# with open(pickle_folder+"testtrain_noClus_1000iter.pkl", 'wb') as f:
#     pickle.dump([X_train, X_test, y_train, y_test], f) 
# #3#
# with open(pickle_folder+"testtrain_noClus_1500iter.pkl", 'wb') as f:
#     pickle.dump([X_train, X_test, y_train, y_test], f)
# #4#
with open(pickle_folder+"testtrain_noClus_1500iter_v3.pkl", 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)

#####  Testing and Evaluating the model
* checking the poutput of tagger for random test inputs adn create an evaluation matrix for the classification problem

### A. CLUSTERED DATA RESULTS

In [20]:
## Once the tagger is done and dusted we test it on the test data
tagger = pycrfsuite.Tagger()
tagger.open(pickle_folder+'crf_KyotoData_version_all_withKmeans_1500iter_v3.model')  ## with claim cluster
             # pickle_folder+"crf_KyotoData_version_all_noClus_1500iter.model"    ## without claim cluster

def prediction(X_test):
    y_pred = [tagger.tag(xseq) for xseq in X_test]
    return y_pred

def tagger_func(i,X_test,y_pred):
    for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
        print("%s \t(%s)" % (y, x))
    return 0


In [21]:
# example outputs
# Let's take a look at a random sample in the testing set
# tagger_func(321,cX_test,cy_pred) # forclus1000
# tag_choice = 1
cy_pred = prediction(cX_test)
tagger_func(131,cX_test,cy_pred) # forclus1000
tag_choice = 1

mr. 	(O)
razali 	(O)
, 	(O)
who 	(O)
was 	(O)
a 	(O)
major 	(O)
player 	(O)
in 	(O)
the 	(O)
rio 	(O)
conference 	(O)
in 	(O)
1992 	(O)
, 	(O)
said 	(O)
that 	(O)
not 	(O)
only 	(O)
were 	(O)
indicators 	(O)
of 	(O)
environmental 	(O)
destruction 	(O)
worse 	(O)
this 	(O)
year 	(O)
, 	(O)
but 	(O)
also 	(O)
the 	(O)
spirit 	(O)
of 	(O)
rio 	(O)
was 	(O)
gone 	(O)
. 	(O)
''we 	(O)
reached 	(O)
the 	(O)
zenith 	(O)
of 	(O)
our 	(O)
enthusiasm 	(O)
and 	(O)
commitment 	(O)
for 	(O)
sustainable 	(O)
development 	(O)
and 	(O)
the 	(O)
environment 	(O)
in 	(O)
1992 	(O)
, 	(O)
'' 	(O)
he 	(O)
said 	(O)
. 	(O)
'' 	(O)


In [22]:
tag = [actor_claim_tag,new_actor_claim_tag]
# dataset for evaluation
labels = {}
line_count = len(tag[tag_choice])
for i in tag[tag_choice]:
    labels[i]=line_count
    line_count-=1
print(labels)

{'O': 13, 'B-ACT': 12, 'I-ACT': 11, 'B-CLAIM-0': 10, 'I-CLAIM-0': 9, 'B-CLAIM-1': 8, 'I-CLAIM-1': 7, 'B-CLAIM-2': 6, 'I-CLAIM-2': 5, 'B-CLAIM-3': 4, 'I-CLAIM-3': 3, 'B-CLAIM-4': 2, 'I-CLAIM-4': 1}


In [23]:
# Convert the sequences of tags into a 1-dimensional array
c_predictions = []
# predictions = np.array([labels[tag] for row in y_pred for tag in row])
for row in cy_pred:
    for tag in row:
        if(tag == ''):
            print('there is empty tag')
            tag='O'
        c_predictions.append(labels[tag])
c_predictions = np.array(c_predictions)
# truths = np.array([labels[tag] for row in y_test for tag in row])
c_truths = [] 
for row in cy_test:
    for tag in row:
        if(tag == ''):
            print('there is empty tag')
            tag='O'
        c_truths.append(labels[tag])
c_truths = np.array(c_truths)

In [24]:
# Print out the classification report
print(classification_report(
    c_truths, c_predictions,
    target_names= new_actor_claim_tag))

              precision    recall  f1-score   support

           O       0.54      0.16      0.25       274
       B-ACT       0.50      0.15      0.23        20
       I-ACT       0.29      0.09      0.14       579
   B-CLAIM-0       0.12      0.03      0.04        39
   I-CLAIM-0       0.00      0.00      0.00       114
   B-CLAIM-1       0.00      0.00      0.00        10
   I-CLAIM-1       0.32      0.04      0.07       204
   B-CLAIM-2       0.00      0.00      0.00        17
   I-CLAIM-2       0.24      0.05      0.09      1311
   B-CLAIM-3       0.13      0.02      0.03       117
   I-CLAIM-3       0.38      0.16      0.22       249
   B-CLAIM-4       0.45      0.15      0.23       176
   I-CLAIM-4       0.91      0.99      0.95     29125

    accuracy                           0.90     32235
   macro avg       0.30      0.14      0.17     32235
weighted avg       0.85      0.90      0.87     32235



### B. UNCLUSTERED DATA RESULTS 

In [25]:
## Once the tagger is done and dusted we test it on the test data
tagger = pycrfsuite.Tagger()
tagger.open(pickle_folder+'crf_KyotoData_version_all_noClus_1500iter_v3.model')  ## with claim cluster
             # pickle_folder+"crf_KyotoData_version_all_noClus_1500iter.model"    ## without claim cluster

def prediction(X_test):
    y_pred = [tagger.tag(xseq) for xseq in X_test]
    return y_pred

def tagger_func(i,X_test,y_pred):
    for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
        print("%s \t(%s)" % (y, x))
    return 0


In [26]:
# example outputs
# tagger_func(282,X_test,y_pred) # forNOTclus1000
# tagger_func(244,X_test,y_pred) # forNOTclus1000
# tag_choice = 0
y_pred = prediction(X_test)
tagger_func(141,X_test,y_pred) # forNOTclus1000
tag_choice = 0

any 	(O)
pollution 	(O)
pegged 	(O)
to 	(O)
economic 	(O)
activity 	(O)
is 	(O)
unprecedented 	(O)
and 	(O)
unwarranted 	(O)
. 	(O)


In [27]:
tag = [actor_claim_tag,new_actor_claim_tag]
# dataset for evaluation
labels = {}
line_count = len(tag[tag_choice])
for i in tag[tag_choice]:
    labels[i]=line_count
    line_count-=1
print(labels)

{'O': 5, 'B-ACT': 4, 'I-ACT': 3, 'B-CLAIM': 2, 'I-CLAIM': 1}


In [28]:
# Convert the sequences of tags into a 1-dimensional array
predictions = []
# predictions = np.array([labels[tag] for row in y_pred for tag in row])
for row in y_pred:
    for tag in row:
        if(tag == ''):
            print('there is empty tag')
            tag='O'
        predictions.append(labels[tag])
predictions = np.array(predictions)
# truths = np.array([labels[tag] for row in y_test for tag in row])
truths = [] 
for row in y_test:
    for tag in row:
        if(tag == ''):
            print('there is empty tag')
            tag='O'
        truths.append(labels[tag])
truths = np.array(truths)

there is empty tag
there is empty tag
there is empty tag


In [29]:
# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=actor_claim_tag))

              precision    recall  f1-score   support

           O       0.37      0.15      0.21      2377
       B-ACT       0.20      0.05      0.08       208
       I-ACT       0.33      0.08      0.12       264
     B-CLAIM       0.37      0.08      0.13       170
     I-CLAIM       0.92      0.98      0.95     29139

    accuracy                           0.90     32158
   macro avg       0.44      0.26      0.30     32158
weighted avg       0.86      0.90      0.87     32158

