In [1]:
import nltk
from nltk.tag import CRFTagger
import os
import numpy as np
import io

In [2]:
# Prepare dataset for NLTK HMM Tagger
# Divide in train and test files [80:20] 

# Directory having content
doc_dir = '../../../../Data/input/content'

train_file_list = []
test_file_list = []

for f in os.listdir(doc_dir):
    #Random Sampling
    if np.random.uniform(0,1)< 0.8:
        train_file_list.append(f)
    else:
        test_file_list.append(f)

print (len(train_file_list))
print (len(test_file_list))

857
230


In [3]:
# Build Tagged dataset to be fed in nltk hmm tagger
# It is a list of tagged sentences. 
# training_data = [ [(word, tag), (word, tag).....]
# [(word, tag), (word, tag).....]
#]

# Directory having tags
tag_dir = '../../../../Data/input/new_tags'

training_data = []

for f in train_file_list:
    training_sentences =[]
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    lines_in_word_file = []
    lines_in_tag_file = []
    with io.open(word_file_path, "rt", encoding="utf-8") as word_file:
        for line in word_file:
            lines_in_word_file.append(line)
    with io.open(tag_file_path, "rt", encoding="utf-8") as tag_file:
        for line in tag_file:
            lines_in_tag_file.append(line)
    if (len(lines_in_word_file) == len(lines_in_tag_file)) and len(lines_in_word_file) > 0:
        for i in range(len(lines_in_word_file)):
            word_in_file = lines_in_word_file[i].split()
            tag_in_file = lines_in_tag_file[i].split()
            pairs_in_line = []
            length = min(len(word_in_file), len(tag_in_file))
            #Create the word_tag pair
            for j in range(length):
                pairs_in_line.append((word_in_file[j], tag_in_file[j]));
            training_sentences.append(pairs_in_line)
    if len(training_sentences) > 0:
        training_data.extend(training_sentences)

#print training_data[0]
            

In [4]:
ct = CRFTagger()
ct.train(training_data,'model.crf.tagger')

In [5]:
for f in test_file_list:
    word_file_path = os.path.join(doc_dir, f)
    with io.open(word_file_path, "rt", encoding="utf-8") as word_file:
        for line in word_file:
            predicted_tags = ct.tag_sents([line.split()])
            print predicted_tags[0]
    break

[(u'Five', 'O'), (u'people', 'O'), (u'were', 'O'), (u'killed', 'O'), (u'and', 'O'), (u'over', 'O'), (u'50', 'O'), (u'injured', 'O'), (u'in', 'O'), (u'three', 'O'), (u'blasts', 'O'), (u'set', 'O'), (u'off', 'O'), (u'by', 'O'), (u'insurgent', 'O'), (u'outfit', 'O'), (u'ULFA', 'ORG_Accused'), (u'here', 'O'), (u'today', 'O'), (u'hours', 'O'), (u'before', 'O'), (u'Union', 'O'), (u'Home', 'O'), (u'Minister', 'O'), (u'P', 'PER_Others'), (u'Chidambaram', 'PER_Others'), (u"'s", 'O'), (u'visit', 'O'), (u'to', 'O'), (u'review', 'O'), (u'law', 'O'), (u'and', 'O'), (u'order', 'O'), (u'situation', 'O'), (u'in', 'O'), (u'the', 'O'), (u'state', 'O'), (u'rocked', 'O'), (u'by', 'O'), (u'deadly', 'O'), (u'blasts', 'O'), (u'that', 'O'), (u'left', 'O'), (u'88', 'O'), (u'dead', 'O'), (u'two', 'O'), (u'months', 'O'), (u'ago', 'O'), (u'.', 'O'), (u'.', 'O'), (u'.', 'O'), (u'Three', 'O'), (u'people', 'O'), (u'were', 'O'), (u'killed', 'O'), (u'and', 'O'), (u'35', 'O'), (u'injured', 'O'), (u',', 'O'), (u'includi

In [5]:
# Test
actual_tag_count = dict()
matched_tag_count = dict()
matched_tag_count[u'PER_Others'] = 0
actual_tag_count[u'PER_Others'] = 0
matched_tag_count[u'PER_Victim'] = 0
actual_tag_count[u'PER_Victim'] = 0
matched_tag_count[u'PER_Accused'] = 0
actual_tag_count[u'PER_Accused'] = 0
matched_tag_count[u'ORG_Victim'] = 0
actual_tag_count[u'ORG_Victim'] = 0
matched_tag_count[u'ORG_Accused'] = 0
actual_tag_count[u'ORG_Accused'] = 0
matched_tag_count[u'ORG_Others'] = 0
actual_tag_count[u'ORG_Others'] = 0
matched_tag_count[u'LOC_Accused'] = 0
actual_tag_count[u'LOC_Accused'] = 0
matched_tag_count[u'LOC_Others'] = 0
actual_tag_count[u'LOC_Others'] = 0
matched_tag_count[u'LOC_Event'] = 0
actual_tag_count[u'LOC_Event'] = 0
matched_tag_count[u'LOC_Victim'] = 0
actual_tag_count[u'LOC_Victim'] = 0

for f in test_file_list:
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    predicted_tags = []
    with io.open(word_file_path, "rt", encoding="utf-8") as word_file:
        for line in word_file:
            predicted_tags = ct.tag_sents([line.split()])
            #print predicted_tags
    if len(predicted_tags) > 0 and len(predicted_tags[0]) > 0:
        actual_tags = []
        with io.open(tag_file_path, "rt", encoding="utf-8") as tag_file:
            for line in tag_file:
                #print(line + '\n')
                actual_tags = line.split()
                #print actual_tags
        result_len = min(len(predicted_tags[0]), len(actual_tags))
        
        for i in range(result_len):
            #print predicted_tags[i][1], actual_tags[i]
            if actual_tags[i] in actual_tag_count.keys():
                actual_tag_count[actual_tags[i]] = actual_tag_count[actual_tags[i]] + 1
                
            if actual_tags[i] == predicted_tags[0][i][1]:
                if actual_tags[i] in matched_tag_count.keys():
                    matched_tag_count[actual_tags[i]] = matched_tag_count[actual_tags[i]] + 1

print(actual_tag_count)

{'LOC_Others': 1743, 'LOC_Accused': 120, 'ORG_Accused': 546, 'LOC_Victim': 27, 'PER_Others': 1702, 'ORG_Others': 2015, 'LOC_Event': 1100, 'PER_Victim': 268, 'ORG_Victim': 81, 'PER_Accused': 599}


In [6]:
print('======= Accuracy Class Wise =====================\n')

print('Class    Matched Total %')
print('--------------------------')
print ('PER_Others: '+str(matched_tag_count['PER_Others'])+ ' ' + str(actual_tag_count['PER_Others'])+ ' ' + str(matched_tag_count['PER_Others']*100/actual_tag_count['PER_Others'])+'%')
print ('PER_Victim: '+str(matched_tag_count['PER_Victim'])+ ' ' + str(actual_tag_count['PER_Victim'])+ ' '+str(matched_tag_count['PER_Victim']*100/actual_tag_count['PER_Victim'])+'%')
print ('PER_Accused: '+str(matched_tag_count['PER_Accused'])+ ' ' + str(actual_tag_count['PER_Accused'])+ ' '+str(matched_tag_count['PER_Accused']*100/actual_tag_count['PER_Accused'])+'%')
print ('ORG_Victim: '+str(matched_tag_count['ORG_Victim']) + ' '+ str(actual_tag_count['ORG_Victim'])+ ' '+str(matched_tag_count['ORG_Victim']*100/actual_tag_count['ORG_Victim'])+'%')
print ('ORG_Accused: '+str(matched_tag_count['ORG_Accused']) + ' '+ str(actual_tag_count['ORG_Accused'])+ ' '+str(matched_tag_count['ORG_Accused']*100/actual_tag_count['ORG_Accused'])+'%')
print ('ORG_Others: '+str(matched_tag_count['ORG_Others']) + ' '+ str(actual_tag_count['ORG_Others'])+ ' '+str(matched_tag_count['ORG_Others']*100/actual_tag_count['ORG_Others'])+'%')
print ('LOC_Accused: '+str(matched_tag_count['LOC_Accused'])+ ' ' + str(actual_tag_count['LOC_Accused'])+ ' '+str(matched_tag_count['LOC_Accused']*100/actual_tag_count['LOC_Accused'])+'%')
print ('LOC_Others: '+str(matched_tag_count['LOC_Others'])+ ' ' + str(actual_tag_count['LOC_Others'])+ ' '+str(matched_tag_count['LOC_Others']*100/actual_tag_count['LOC_Others'])+'%')
print ('LOC_Event: '+str(matched_tag_count['LOC_Event']) + ' '+ str(actual_tag_count['LOC_Event'])+ ' '+str(matched_tag_count['LOC_Event']*100/actual_tag_count['LOC_Event'])+'%')
print ('LOC_Victim: '+str(matched_tag_count['LOC_Victim']) + ' '+ str(actual_tag_count['LOC_Victim'])+ ' '+str(matched_tag_count['LOC_Victim']*100/actual_tag_count['LOC_Victim'])+'%')


Class    Matched Total %
--------------------------
PER_Others: 1257 1702 73.85428907168037%
PER_Victim: 28 268 10.447761194029852%
PER_Accused: 205 599 34.223706176961606%
ORG_Victim: 0 81 0.0%
ORG_Accused: 379 546 69.41391941391942%
ORG_Others: 1289 2015 63.97022332506204%
LOC_Accused: 0 120 0.0%
LOC_Others: 961 1743 55.134825014343086%
LOC_Event: 475 1100 43.18181818181818%
LOC_Victim: 0 27 0.0%
