In [41]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import os
import numpy as np

print(sklearn.__version__)

0.19.1


In [42]:
# Divide in train and test files [80:20] 

# Directory having content

doc_dir = '../../../../Data/input/rectified/content'

train_file_list = []
test_file_list = []

for f in os.listdir(doc_dir):
    #Random Sampling
    if np.random.uniform(0,1)< 0.8:
        train_file_list.append(f)
    else:
        test_file_list.append(f)

print (len(train_file_list))
print (len(test_file_list))

870
215


In [43]:
# Build Tagged dataset to be fed in nltk hmm tagger
# It is a list of tagged sentences. 
# training_data = [ [(word, tag), (word, tag).....]
# [(word, tag), (word, tag).....]
#]

# Directory having tags
tag_dir = '../../../../Data/input/rectified/new_tags'

training_data = []

for f in train_file_list:
    training_sentences =[]
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    lines_in_word_file = []
    lines_in_tag_file = []
    with open(word_file_path, "rt") as word_file:
        for line in word_file:
            lines_in_word_file.append(line)
    with open(tag_file_path, "rt") as tag_file:
        for line in tag_file:
            lines_in_tag_file.append(line)
    if (len(lines_in_word_file) == len(lines_in_tag_file)) and len(lines_in_word_file) > 0:
        for i in range(len(lines_in_word_file)):
            word_in_file = lines_in_word_file[i].split()
            tag_in_file = lines_in_tag_file[i].split()
            pairs_in_line = []
            length = min(len(word_in_file), len(tag_in_file))
            #Create the word_tag pair
            for j in range(length):
                pairs_in_line.append((word_in_file[j], tag_in_file[j]));
            training_sentences.append(pairs_in_line)
    if len(training_sentences) > 0:
        training_data.extend(training_sentences)

print(training_data[0])

[('In', 'O'), ('a', 'O'), ('major', 'O'), ('breakthrough', 'O'), ('for', 'O'), ('security', 'O'), ('agencies', 'O'), ('in', 'O'), ('India', 'LOC_Others'), (',', 'O'), ('Yasin', 'PER_Accused'), ('Bhatkal', 'PER_Accused'), (',', 'O'), ('a', 'O'), ('key', 'O'), ('conspirator', 'O'), ('in', 'O'), ('several', 'O'), ('bomb', 'O'), ('blasts', 'O'), ('and', 'O'), ('co-founder', 'O'), ('of', 'O'), ('the', 'O'), ('banned', 'O'), ('Indian', 'ORG_Accused'), ('Mujahideen', 'ORG_Accused'), ('(', 'O'), ('IM', 'ORG_Accused'), (')', 'O'), (',', 'O'), ('and', 'O'), ('another', 'O'), ('IM', 'ORG_Accused'), ('operative', 'O'), ('Asadullah', 'PER_Accused'), ('Akhtar', 'PER_Accused'), ('alias', 'O'), ('Haddi', 'PER_Accused'), (',', 'O'), ('were', 'O'), ('arrested', 'O'), ('from', 'O'), ('the', 'O'), ('India-Nepal', 'LOC_Accused'), ('border', 'O'), ('in', 'O'), ('Bihar', 'LOC_Accused'), ('Raxaul', 'LOC_Accused'), ('town', 'LOC_Accused'), ('on', 'O'), ('Thursday', 'O'), ('.', 'O'), ('Yasin', 'PER_Accused'), (

In [44]:
def word2features(sent, i):
    word = sent[i][0]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]
    if i > 0:
        word1 = sent[i-1][0]
        tag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
           # '-1:tag1=' + tag1,
        ])
    else:
        features.append('BOS')
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('EOS')
        
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2tags(sent):
    return [tag for word, tag in sent]

def sent2words(sent):
    return [word for word, tag in sent] 

In [45]:
print (training_data[0])
print (sent2features(training_data[0])[0])
print (sent2tags(training_data[0])[0])
print (sent2words(training_data[0])[0])

[('In', 'O'), ('a', 'O'), ('major', 'O'), ('breakthrough', 'O'), ('for', 'O'), ('security', 'O'), ('agencies', 'O'), ('in', 'O'), ('India', 'LOC_Others'), (',', 'O'), ('Yasin', 'PER_Accused'), ('Bhatkal', 'PER_Accused'), (',', 'O'), ('a', 'O'), ('key', 'O'), ('conspirator', 'O'), ('in', 'O'), ('several', 'O'), ('bomb', 'O'), ('blasts', 'O'), ('and', 'O'), ('co-founder', 'O'), ('of', 'O'), ('the', 'O'), ('banned', 'O'), ('Indian', 'ORG_Accused'), ('Mujahideen', 'ORG_Accused'), ('(', 'O'), ('IM', 'ORG_Accused'), (')', 'O'), (',', 'O'), ('and', 'O'), ('another', 'O'), ('IM', 'ORG_Accused'), ('operative', 'O'), ('Asadullah', 'PER_Accused'), ('Akhtar', 'PER_Accused'), ('alias', 'O'), ('Haddi', 'PER_Accused'), (',', 'O'), ('were', 'O'), ('arrested', 'O'), ('from', 'O'), ('the', 'O'), ('India-Nepal', 'LOC_Accused'), ('border', 'O'), ('in', 'O'), ('Bihar', 'LOC_Accused'), ('Raxaul', 'LOC_Accused'), ('town', 'LOC_Accused'), ('on', 'O'), ('Thursday', 'O'), ('.', 'O'), ('Yasin', 'PER_Accused'), (

In [46]:
x_train = [sent2features(s) for s in training_data]
y_train = [sent2tags(s) for s in training_data]

In [47]:
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(x_train, y_train):
    trainer.append(xseq, yseq)

In [48]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [49]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [50]:
# Train and save the model
trainer.train('osint-bomb-blast.crfsuite')

In [51]:
tagger = pycrfsuite.Tagger()
tagger.open('osint-bomb-blast.crfsuite')

<contextlib.closing at 0x7f7c5b214b00>

In [52]:
# Create test data
test_data = []
for f in test_file_list:
    test_sentences =[]
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    lines_in_word_file = []
    lines_in_tag_file = []
    with open(word_file_path, "rt") as word_file:
        for line in word_file:
            lines_in_word_file.append(line)
    with open(tag_file_path, "rt") as tag_file:
        for line in tag_file:
            lines_in_tag_file.append(line)
    if (len(lines_in_word_file) == len(lines_in_tag_file)) and len(lines_in_word_file) > 0:
        for i in range(len(lines_in_word_file)):
            word_in_file = lines_in_word_file[i].split()
            tag_in_file = lines_in_tag_file[i].split()
            pairs_in_line = []
            length = min(len(word_in_file), len(tag_in_file))
            #Create the word_tag pair
            for j in range(length):
                pairs_in_line.append((word_in_file[j], tag_in_file[j]));
            test_sentences.append(pairs_in_line)
    if len(test_sentences) > 0:
        test_data.extend(test_sentences)

print(test_data[0])

[('The', 'O'), ('charge', 'O'), ('sheet', 'O'), ('filed', 'O'), ('by', 'O'), ('the', 'O'), ('Anti-Terrorism', 'ORG_Others'), ('Squad', 'ORG_Others'), ('(', 'ORG_Others'), ('ATS', 'ORG_Others'), (')', 'ORG_Others'), ('of', 'ORG_Others'), ('the', 'ORG_Others'), ('Rajasthan', 'ORG_Others'), ('police', 'ORG_Others'), ('against', 'O'), ('five', 'O'), ('accused', 'O'), ('in', 'O'), ('the', 'O'), ('2007', 'O'), ('Ajmer', 'LOC_Event'), ('dargah', 'LOC_Event'), ('blast', 'O'), ('case', 'O'), ('on', 'O'), ('Friday', 'O'), ('has', 'O'), ('named', 'O'), ('senior', 'O'), ('Rashtriya', 'ORG_Accused'), ('Swayamsevak', 'ORG_Accused'), ('Sangh', 'ORG_Accused'), ('leader', 'O'), ('Indresh', 'PER_Accused'), ('Kumar', 'PER_Accused'), ('as', 'O'), ('having', 'O'), ('provided', 'O'), ('guidance', 'O'), ('for', 'O'), ('a', 'O'), ('conspiracy', 'O'), ('hatched', 'O'), ('by', 'O'), ('radical', 'O'), ('elements', 'O'), ('for', 'O'), ('planting', 'O'), ('bombs', 'O'), ('in', 'O'), ('several', 'O'), ('cities', 'O

In [53]:
example_sent = test_data[0]
print(' '.join(sent2words(example_sent)))

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2tags(example_sent)))

The charge sheet filed by the Anti-Terrorism Squad ( ATS ) of the Rajasthan police against five accused in the 2007 Ajmer dargah blast case on Friday has named senior Rashtriya Swayamsevak Sangh leader Indresh Kumar as having provided guidance for a conspiracy hatched by radical elements for planting bombs in several cities across the country . Despite pointing the finger at Mr. Kumar for addressing a secret meeting in Jaipur on October 31 , 2005 , the charge sheet , filed in an Ajmer court , has not arraigned him as an accused . It only says the probe is still continuing , though the RSS has denied Mr. Kumar 's involvement . The ATS has not identified the RSS or the radical outfit Abhinav Bharat as being involved in the conspiracy for the blast , drawing flak from activist groups here . They have alleged that the investigating agencies are trying to pre-empt the conclusions about the Sangh Parivar 's direct role in terrorism . The 806-page charge sheet says the office-bearers of some 

In [54]:
def report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [55]:
x_test = [sent2features(s) for s in test_data]
y_test = [sent2tags(s) for s in test_data]

In [56]:
%%time
y_pred = [tagger.tag(xseq) for xseq in x_test]

CPU times: user 1.18 s, sys: 0 ns, total: 1.18 s
Wall time: 1.18 s


In [57]:
print(report(y_test, y_pred))

             precision    recall  f1-score   support

LOC_Accused       0.18      0.04      0.06       152
  LOC_Event       0.59      0.52      0.56      1390
 LOC_Others       0.52      0.56      0.54      1356
 LOC_Victim       0.80      0.07      0.13        58
  LOC_event       0.00      0.00      0.00         1
ORG_Accused       0.80      0.72      0.76       597
 ORG_Others       0.73      0.67      0.70      1849
 ORG_Victim       0.62      0.19      0.29       106
PER_Accused       0.66      0.53      0.59       658
 PER_Others       0.75      0.75      0.75      1513
 PER_Victim       0.36      0.27      0.31       184

avg / total       0.65      0.60      0.62      7864



  'precision', 'predicted', average, warn_for)


In [58]:
num_test_data = len(y_test)
print (len(y_pred))
print (num_test_data)

215
215


In [59]:
# Test
relevant_tag_count = dict() # TP + FN
relevant_retrieved_tag_count = dict() # TP
retrieved_tag_count = dict() # TP + FP

relevant_tag_count[u'PER_Others'] = 0
relevant_retrieved_tag_count[u'PER_Others'] = 0
retrieved_tag_count[u'PER_Others'] = 0

relevant_tag_count[u'PER_Victim'] = 0
relevant_retrieved_tag_count[u'PER_Victim'] = 0
retrieved_tag_count[u'PER_Victim'] = 0

relevant_tag_count[u'PER_Accused'] = 0
relevant_retrieved_tag_count[u'PER_Accused'] = 0
retrieved_tag_count[u'PER_Accused'] = 0

relevant_tag_count[u'ORG_Victim'] = 0
relevant_retrieved_tag_count[u'ORG_Victim'] = 0
retrieved_tag_count[u'ORG_Victim'] = 0

relevant_tag_count[u'ORG_Accused'] = 0
relevant_retrieved_tag_count[u'ORG_Accused'] = 0
retrieved_tag_count[u'ORG_Accused'] = 0

relevant_tag_count[u'ORG_Others'] = 0
relevant_retrieved_tag_count[u'ORG_Others'] = 0
retrieved_tag_count[u'ORG_Others'] = 0

relevant_tag_count[u'LOC_Accused'] = 0
relevant_retrieved_tag_count[u'LOC_Accused'] = 0
retrieved_tag_count[u'LOC_Accused'] = 0

relevant_tag_count[u'LOC_Others'] = 0
relevant_retrieved_tag_count[u'LOC_Others'] = 0
retrieved_tag_count[u'LOC_Others'] = 0

relevant_tag_count[u'LOC_Event'] = 0
relevant_retrieved_tag_count[u'LOC_Event'] = 0
retrieved_tag_count[u'LOC_Event'] = 0

relevant_tag_count[u'LOC_Victim'] = 0
relevant_retrieved_tag_count[u'LOC_Victim'] = 0
retrieved_tag_count[u'LOC_Victim'] = 0

for i in range(num_test_data):
    y_test_i = y_test[i]
    y_pred_i = y_pred[i]
    test_tag_length = len(y_test[i])
    predicted_tag_length = len(y_pred[i])
    for j in range(test_tag_length):
        if y_test_i[j] in relevant_tag_count.keys():
            relevant_tag_count[y_test_i[j]] = relevant_tag_count[y_test_i[j]] + 1
            
        if y_pred_i[j] in retrieved_tag_count.keys():
                retrieved_tag_count[y_pred_i[j]] = retrieved_tag_count[y_pred_i[j]] + 1
        
        if y_test_i[j] == y_pred_i[j]:
            if y_test_i[j] in relevant_retrieved_tag_count.keys():
                relevant_retrieved_tag_count[y_test_i[j]] = relevant_retrieved_tag_count[y_test_i[j]] + 1

In [60]:
avg_precision = 0
avg_recall = 0

print('======= Precision Class Wise =====================\n')

print('Class    Precision %')
print('--------------------------')

count_keys = 0
for key in retrieved_tag_count.keys():
    if key[4:]!='Others':
        count_keys = count_keys + 1
        if retrieved_tag_count[key] != 0:
            prec = float(relevant_retrieved_tag_count[key])/retrieved_tag_count[key]
            print(key, prec)
            avg_precision = avg_precision + prec
        else:
            print(key, 0.0)
        
        recall = float(relevant_retrieved_tag_count[key])/relevant_tag_count[key]
        avg_recall = avg_recall + recall
        
        
print('\n======= Avergae Precision =====================\n')
print('Average Precision: ',avg_precision/count_keys)
print('Average Recall: ',avg_recall/count_keys)

avg_f1 = (2*avg_precision*avg_recall)/(avg_precision + avg_recall)/count_keys

print('Average f1: ',avg_f1)


Class    Precision %
--------------------------
LOC_Victim 0.8
LOC_Accused 0.17647058823529413
ORG_Accused 0.8026070763500931
ORG_Victim 0.625
LOC_Event 0.593469387755102
PER_Accused 0.6579439252336449
PER_Victim 0.35714285714285715


Average Precision:  0.5732334049595701
Average Recall:  0.3355395165392239
Average f1:  0.4233014761202313
