In [1]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import os
import numpy as np

print(sklearn.__version__)

0.19.1


In [3]:
# Divide in train and test files [80:20] 

# Directory having content

doc_dir = '../../../../Data/input/content'

train_file_list = []
test_file_list = []

for f in os.listdir(doc_dir):
    #Random Sampling
    if np.random.uniform(0,1)< 0.8:
        train_file_list.append(f)
    else:
        test_file_list.append(f)

print (len(train_file_list))
print (len(test_file_list))

863
224


In [7]:
# Build Tagged dataset to be fed in nltk hmm tagger
# It is a list of tagged sentences. 
# training_data = [ [(word, tag), (word, tag).....]
# [(word, tag), (word, tag).....]
#]

# Directory having tags
tag_dir = '../../../../Data/input/new_tags'

training_data = []

for f in train_file_list:
    training_sentences =[]
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    lines_in_word_file = []
    lines_in_tag_file = []
    with open(word_file_path, "rt") as word_file:
        for line in word_file:
            lines_in_word_file.append(line)
    with open(tag_file_path, "rt") as tag_file:
        for line in tag_file:
            lines_in_tag_file.append(line)
    if (len(lines_in_word_file) == len(lines_in_tag_file)) and len(lines_in_word_file) > 0:
        for i in range(len(lines_in_word_file)):
            word_in_file = lines_in_word_file[i].split()
            tag_in_file = lines_in_tag_file[i].split()
            pairs_in_line = []
            length = min(len(word_in_file), len(tag_in_file))
            #Create the word_tag pair
            for j in range(length):
                pairs_in_line.append((word_in_file[j], tag_in_file[j]));
            training_sentences.append(pairs_in_line)
    if len(training_sentences) > 0:
        training_data.extend(training_sentences)

print(training_data[0])

[('Officials', 'O'), ('of', 'O'), ('the', 'O'), ('Special', 'O'), ('Enquiry', 'O'), ('Squad', 'O'), ('of', 'O'), ('Central', 'O'), ('Crime', 'O'), ('Branch', 'O'), ('(', 'O'), ('CCB', 'O'), (')', 'O'), ('interrogated', 'O'), ('Shahzad', 'PER_Accused'), (',', 'O'), ('an', 'O'), ('alleged', 'O'), ('Indian', 'ORG_Others'), ('Mujahideen', 'ORG_Others'), ('militant', 'O'), (',', 'O'), ('in', 'O'), ('connection', 'O'), ('with', 'O'), ('the', 'O'), ('April', 'O'), ('17', 'O'), ('Chinnaswamy', 'LOC_Event'), ('Stadium', 'LOC_Event'), ('blasts', 'O'), (',', 'O'), ('here', 'O'), ('on', 'O'), ('Friday', 'O'), ('.', 'O'), ('Shahzad', 'PER_Accused'), ('and', 'O'), ('his', 'O'), ('associate', 'O'), ('Salman', 'PER_Accused'), ('were', 'O'), ('brought', 'O'), ('here', 'O'), ('from', 'O'), ('Delhi', 'LOC_Others'), ('on', 'O'), ('Thursday', 'O'), ('.', 'O'), ('The', 'O'), ('CCB', 'O'), ('officials', 'O'), ('produced', 'O'), ('the', 'O'), ('two', 'O'), ('in', 'O'), ('the', 'O'), ('first', 'O'), ('ACMM', '

In [8]:
def word2features(sent, i):
    word = sent[i][0]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]
    if i > 0:
        word1 = sent[i-1][0]
        tag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
           # '-1:tag1=' + tag1,
        ])
    else:
        features.append('BOS')
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('EOS')
        
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2tags(sent):
    return [tag for word, tag in sent]

def sent2words(sent):
    return [word for word, tag in sent] 

In [9]:
print (training_data[0])
print (sent2features(training_data[0])[0])
print (sent2tags(training_data[0])[0])
print (sent2words(training_data[0])[0])

[('Officials', 'O'), ('of', 'O'), ('the', 'O'), ('Special', 'O'), ('Enquiry', 'O'), ('Squad', 'O'), ('of', 'O'), ('Central', 'O'), ('Crime', 'O'), ('Branch', 'O'), ('(', 'O'), ('CCB', 'O'), (')', 'O'), ('interrogated', 'O'), ('Shahzad', 'PER_Accused'), (',', 'O'), ('an', 'O'), ('alleged', 'O'), ('Indian', 'ORG_Others'), ('Mujahideen', 'ORG_Others'), ('militant', 'O'), (',', 'O'), ('in', 'O'), ('connection', 'O'), ('with', 'O'), ('the', 'O'), ('April', 'O'), ('17', 'O'), ('Chinnaswamy', 'LOC_Event'), ('Stadium', 'LOC_Event'), ('blasts', 'O'), (',', 'O'), ('here', 'O'), ('on', 'O'), ('Friday', 'O'), ('.', 'O'), ('Shahzad', 'PER_Accused'), ('and', 'O'), ('his', 'O'), ('associate', 'O'), ('Salman', 'PER_Accused'), ('were', 'O'), ('brought', 'O'), ('here', 'O'), ('from', 'O'), ('Delhi', 'LOC_Others'), ('on', 'O'), ('Thursday', 'O'), ('.', 'O'), ('The', 'O'), ('CCB', 'O'), ('officials', 'O'), ('produced', 'O'), ('the', 'O'), ('two', 'O'), ('in', 'O'), ('the', 'O'), ('first', 'O'), ('ACMM', '

In [10]:
x_train = [sent2features(s) for s in training_data]
y_train = [sent2tags(s) for s in training_data]

In [11]:
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(x_train, y_train):
    trainer.append(xseq, yseq)

In [12]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [13]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [14]:
# Train and save the model
trainer.train('osint-bomb-blast.crfsuite')

In [15]:
tagger = pycrfsuite.Tagger()
tagger.open('osint-bomb-blast.crfsuite')

<contextlib.closing at 0x7f6833ef94a8>

In [17]:
# Create test data
test_data = []
for f in test_file_list:
    test_sentences =[]
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    lines_in_word_file = []
    lines_in_tag_file = []
    with open(word_file_path, "rt") as word_file:
        for line in word_file:
            lines_in_word_file.append(line)
    with open(tag_file_path, "rt") as tag_file:
        for line in tag_file:
            lines_in_tag_file.append(line)
    if (len(lines_in_word_file) == len(lines_in_tag_file)) and len(lines_in_word_file) > 0:
        for i in range(len(lines_in_word_file)):
            word_in_file = lines_in_word_file[i].split()
            tag_in_file = lines_in_tag_file[i].split()
            pairs_in_line = []
            length = min(len(word_in_file), len(tag_in_file))
            #Create the word_tag pair
            for j in range(length):
                pairs_in_line.append((word_in_file[j], tag_in_file[j]));
            test_sentences.append(pairs_in_line)
    if len(test_sentences) > 0:
        test_data.extend(test_sentences)

print(test_data[0])

[('The', 'O'), ('Karnataka', 'LOC_Event'), ('Police', 'O'), ('have', 'O'), ('arrested', 'O'), ('Ibrahim', 'PER_Accused'), ('Moulavi', 'PER_Accused'), (',', 'O'), ('an', 'O'), ('accused', 'O'), ('in', 'O'), ('the', 'O'), ('Bangalore', 'LOC_Event'), ('bomb', 'O'), ('blasts', 'O'), ('case', 'O'), (',', 'O'), ('near', 'O'), ('Kasaragod', 'LOC_Event'), ('.', 'O'), ('Page', 'O'), ('4', 'O'), ('OPEN', 'O')]


In [18]:
example_sent = test_data[0]
print(' '.join(sent2words(example_sent)))

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2tags(example_sent)))

The Karnataka Police have arrested Ibrahim Moulavi , an accused in the Bangalore bomb blasts case , near Kasaragod . Page 4 OPEN
Predicted: O ORG_Others ORG_Others O O PER_Others PER_Others O O O O O LOC_Event O O O O O LOC_Others O O O O
Correct:   O LOC_Event O O O PER_Accused PER_Accused O O O O O LOC_Event O O O O O LOC_Event O O O O


In [19]:
def report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [20]:
x_test = [sent2features(s) for s in test_data]
y_test = [sent2tags(s) for s in test_data]

In [21]:
%%time
y_pred = [tagger.tag(xseq) for xseq in x_test]

CPU times: user 752 ms, sys: 0 ns, total: 752 ms
Wall time: 756 ms


In [22]:
print(report(y_test, y_pred))

             precision    recall  f1-score   support

LOC_Accused       0.35      0.09      0.14        80
  LOC_Event       0.57      0.44      0.50      1168
 LOC_Others       0.59      0.62      0.60      1625
 LOC_Victim       0.33      0.05      0.09        41
ORG_Accused       0.82      0.82      0.82       528
 ORG_Others       0.71      0.65      0.68      1957
 ORG_Victim       0.15      0.07      0.09        45
PER_Accused       0.61      0.58      0.59       504
 PER_Others       0.80      0.77      0.78      1669
 PER_Victim       0.53      0.27      0.36       211

avg / total       0.67      0.62      0.64      7828



In [24]:
num_test_data = len(y_test)
print (len(y_pred))
print (num_test_data)

224
224


In [26]:
# Test
actual_tag_count = dict()
matched_tag_count = dict()
matched_tag_count['PER_Others'] = 0
actual_tag_count['PER_Others'] = 0
matched_tag_count['PER_Victim'] = 0
actual_tag_count['PER_Victim'] = 0
matched_tag_count['PER_Accused'] = 0
actual_tag_count['PER_Accused'] = 0
matched_tag_count['ORG_Victim'] = 0
actual_tag_count['ORG_Victim'] = 0
matched_tag_count['ORG_Accused'] = 0
actual_tag_count['ORG_Accused'] = 0
matched_tag_count['ORG_Others'] = 0
actual_tag_count['ORG_Others'] = 0
matched_tag_count['LOC_Accused'] = 0
actual_tag_count['LOC_Accused'] = 0
matched_tag_count['LOC_Others'] = 0
actual_tag_count['LOC_Others'] = 0
matched_tag_count['LOC_Event'] = 0
actual_tag_count['LOC_Event'] = 0
matched_tag_count['LOC_Victim'] = 0
actual_tag_count['LOC_Victim'] = 0

for i in range(num_test_data):
    y_test_i = y_test[i]
    y_pred_i = y_pred[i]
    test_tag_length = len(y_test[i])
    predicted_tag_length = len(y_pred[i])
    for j in range(test_tag_length):
        if y_test_i[j] in actual_tag_count.keys():
            actual_tag_count[y_test_i[j]] = actual_tag_count[y_test_i[j]] + 1
        
        if y_test_i[j] == y_pred_i[j]:
            if y_test_i[j] in matched_tag_count.keys():
                matched_tag_count[y_test_i[j]] = matched_tag_count[y_test_i[j]] + 1

In [27]:
print('======= Accuracy Class Wise =====================\n')

print('Class    Matched Total %')
print('--------------------------')
print ('PER_Others: '+str(matched_tag_count['PER_Others'])+ ' ' + str(actual_tag_count['PER_Others'])+ ' ' + str(matched_tag_count['PER_Others']*100/actual_tag_count['PER_Others'])+'%')
print ('PER_Victim: '+str(matched_tag_count['PER_Victim'])+ ' ' + str(actual_tag_count['PER_Victim'])+ ' '+str(matched_tag_count['PER_Victim']*100/actual_tag_count['PER_Victim'])+'%')
print ('PER_Accused: '+str(matched_tag_count['PER_Accused'])+ ' ' + str(actual_tag_count['PER_Accused'])+ ' '+str(matched_tag_count['PER_Accused']*100/actual_tag_count['PER_Accused'])+'%')
print ('ORG_Victim: '+str(matched_tag_count['ORG_Victim']) + ' '+ str(actual_tag_count['ORG_Victim'])+ ' '+str(matched_tag_count['ORG_Victim']*100/actual_tag_count['ORG_Victim'])+'%')
print ('ORG_Accused: '+str(matched_tag_count['ORG_Accused']) + ' '+ str(actual_tag_count['ORG_Accused'])+ ' '+str(matched_tag_count['ORG_Accused']*100/actual_tag_count['ORG_Accused'])+'%')
print ('ORG_Others: '+str(matched_tag_count['ORG_Others']) + ' '+ str(actual_tag_count['ORG_Others'])+ ' '+str(matched_tag_count['ORG_Others']*100/actual_tag_count['ORG_Others'])+'%')
print ('LOC_Accused: '+str(matched_tag_count['LOC_Accused'])+ ' ' + str(actual_tag_count['LOC_Accused'])+ ' '+str(matched_tag_count['LOC_Accused']*100/actual_tag_count['LOC_Accused'])+'%')
print ('LOC_Others: '+str(matched_tag_count['LOC_Others'])+ ' ' + str(actual_tag_count['LOC_Others'])+ ' '+str(matched_tag_count['LOC_Others']*100/actual_tag_count['LOC_Others'])+'%')
print ('LOC_Event: '+str(matched_tag_count['LOC_Event']) + ' '+ str(actual_tag_count['LOC_Event'])+ ' '+str(matched_tag_count['LOC_Event']*100/actual_tag_count['LOC_Event'])+'%')
print ('LOC_Victim: '+str(matched_tag_count['LOC_Victim']) + ' '+ str(actual_tag_count['LOC_Victim'])+ ' '+str(matched_tag_count['LOC_Victim']*100/actual_tag_count['LOC_Victim'])+'%')


Class    Matched Total %
--------------------------
PER_Others: 1278 1669 76.57279808268424%
PER_Victim: 58 211 27.488151658767773%
PER_Accused: 291 504 57.73809523809524%
ORG_Victim: 3 45 6.666666666666667%
ORG_Accused: 431 528 81.62878787878788%
ORG_Others: 1277 1957 65.2529381706694%
LOC_Accused: 7 80 8.75%
LOC_Others: 1000 1625 61.53846153846154%
LOC_Event: 517 1168 44.263698630136986%
LOC_Victim: 2 41 4.878048780487805%
