In [1]:
import nltk
from nltk.tag import CRFTagger
import os
import numpy as np
import io

In [9]:
# Prepare dataset for NLTK HMM Tagger
# Divide in train and test files [80:20] 

# Directory having content
doc_dir = '../../../../Data/input/rectified/content'

train_file_list = []
test_file_list = []

for f in os.listdir(doc_dir):
    #Random Sampling
    if np.random.uniform(0,1)< 0.8:
        train_file_list.append(f)
    else:
        test_file_list.append(f)

print (len(train_file_list))
print (len(test_file_list))

855
230


In [10]:
# Build Tagged dataset to be fed in nltk hmm tagger
# It is a list of tagged sentences. 
# training_data = [ [(word, tag), (word, tag).....]
# [(word, tag), (word, tag).....]
#]

# Directory having tags
tag_dir = '../../../../Data/input/rectified/new_tags'

training_data = []

for f in train_file_list:
    training_sentences =[]
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    lines_in_word_file = []
    lines_in_tag_file = []
    with io.open(word_file_path, "rt", encoding="utf-8") as word_file:
        for line in word_file:
            lines_in_word_file.append(line)
    with io.open(tag_file_path, "rt", encoding="utf-8") as tag_file:
        for line in tag_file:
            lines_in_tag_file.append(line)
    if (len(lines_in_word_file) == len(lines_in_tag_file)) and len(lines_in_word_file) > 0:
        for i in range(len(lines_in_word_file)):
            word_in_file = lines_in_word_file[i].split()
            tag_in_file = lines_in_tag_file[i].split()
            pairs_in_line = []
            length = min(len(word_in_file), len(tag_in_file))
            #Create the word_tag pair
            for j in range(length):
                pairs_in_line.append((word_in_file[j], tag_in_file[j]));
            training_sentences.append(pairs_in_line)
    if len(training_sentences) > 0:
        training_data.extend(training_sentences)

#print training_data[0]
            

In [11]:
ct = CRFTagger()
ct.train(training_data,'model.crf.tagger')

In [6]:
for f in test_file_list:
    word_file_path = os.path.join(doc_dir, f)
    with io.open(word_file_path, "rt", encoding="utf-8") as word_file:
        for line in word_file:
            predicted_tags = ct.tag_sents([line.split()])
            print (predicted_tags[0])
    break

[('The', 'O'), ('death', 'O'), ('toll', 'O'), ('from', 'O'), ('a', 'O'), ('suicide', 'O'), ('bombing', 'O'), ('at', 'O'), ('a', 'O'), ('coffee', 'O'), ('house', 'O'), ('in', 'O'), ('central', 'LOC_Others'), ('Iraq', 'LOC_Others'), ("'s", 'O'), ('restive', 'O'), ('Diyala', 'LOC_Event'), ('province', 'LOC_Event'), ('has', 'O'), ('risen', 'O'), ('to', 'O'), ('30', 'O'), (',', 'O'), ('a', 'O'), ('medical', 'O'), ('official', 'O'), ('said', 'O'), ('on', 'O'), ('Saturday', 'O'), (',', 'O'), ('making', 'O'), ('the', 'O'), ('deadliest', 'O'), ('attack', 'O'), ('in', 'O'), ('October', 'O'), ('.', 'O'), ('Friday', 'O'), ('night', 'O'), ("'s", 'O'), ('bombing', 'O'), ('in', 'O'), ('the', 'O'), ('town', 'O'), ('of', 'O'), ('Balad', 'PER_Accused'), ('Ruz', 'PER_Accused'), ('also', 'O'), ('wounded', 'O'), ('68', 'O'), ('people', 'O'), (',', 'O'), ('according', 'O'), ('to', 'O'), ('Ahmed', 'PER_Accused'), ('Alwan', 'PER_Accused'), (',', 'O'), ('a', 'O'), ('doctor', 'O'), ('at', 'O'), ('the', 'O'), ('

In [12]:
# Test

relevant_tag_count = dict() # TP + FN
relevant_retrieved_tag_count = dict() # TP
retrieved_tag_count = dict() # TP + FP

relevant_tag_count[u'PER_Others'] = 0
relevant_retrieved_tag_count[u'PER_Others'] = 0
retrieved_tag_count[u'PER_Others'] = 0

relevant_tag_count[u'PER_Victim'] = 0
relevant_retrieved_tag_count[u'PER_Victim'] = 0
retrieved_tag_count[u'PER_Victim'] = 0

relevant_tag_count[u'PER_Accused'] = 0
relevant_retrieved_tag_count[u'PER_Accused'] = 0
retrieved_tag_count[u'PER_Accused'] = 0

relevant_tag_count[u'ORG_Victim'] = 0
relevant_retrieved_tag_count[u'ORG_Victim'] = 0
retrieved_tag_count[u'ORG_Victim'] = 0

relevant_tag_count[u'ORG_Accused'] = 0
relevant_retrieved_tag_count[u'ORG_Accused'] = 0
retrieved_tag_count[u'ORG_Accused'] = 0

relevant_tag_count[u'ORG_Others'] = 0
relevant_retrieved_tag_count[u'ORG_Others'] = 0
retrieved_tag_count[u'ORG_Others'] = 0

relevant_tag_count[u'LOC_Accused'] = 0
relevant_retrieved_tag_count[u'LOC_Accused'] = 0
retrieved_tag_count[u'LOC_Accused'] = 0

relevant_tag_count[u'LOC_Others'] = 0
relevant_retrieved_tag_count[u'LOC_Others'] = 0
retrieved_tag_count[u'LOC_Others'] = 0

relevant_tag_count[u'LOC_Event'] = 0
relevant_retrieved_tag_count[u'LOC_Event'] = 0
retrieved_tag_count[u'LOC_Event'] = 0

relevant_tag_count[u'LOC_Victim'] = 0
relevant_retrieved_tag_count[u'LOC_Victim'] = 0
retrieved_tag_count[u'LOC_Victim'] = 0

for f in test_file_list:
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    predicted_tags = []
    with io.open(word_file_path, "rt", encoding="utf-8") as word_file:
        for line in word_file:
            predicted_tags = ct.tag_sents([line.split()])
            #print predicted_tags
    if len(predicted_tags) > 0 and len(predicted_tags[0]) > 0:
        actual_tags = []
        with io.open(tag_file_path, "rt", encoding="utf-8") as tag_file:
            for line in tag_file:
                #print(line + '\n')
                actual_tags = line.split()
                #print actual_tags
        result_len = min(len(predicted_tags[0]), len(actual_tags))
        
        for i in range(result_len):
            #print predicted_tags[i][1], actual_tags[i]
            if actual_tags[i] in relevant_tag_count.keys():
                relevant_tag_count[actual_tags[i]] = relevant_tag_count[actual_tags[i]] + 1
                
            if predicted_tags[0][i][1] in retrieved_tag_count.keys():
                retrieved_tag_count[predicted_tags[0][i][1]] = retrieved_tag_count[predicted_tags[0][i][1]] + 1
                
            if actual_tags[i] == predicted_tags[0][i][1]:
                if actual_tags[i] in relevant_retrieved_tag_count.keys():
                    relevant_retrieved_tag_count[actual_tags[i]] = relevant_retrieved_tag_count[actual_tags[i]] + 1

print(relevant_tag_count)

{'ORG_Victim': 111, 'LOC_Event': 1402, 'PER_Accused': 577, 'LOC_Victim': 36, 'ORG_Accused': 637, 'PER_Victim': 271, 'LOC_Accused': 152, 'PER_Others': 1579, 'LOC_Others': 1486, 'ORG_Others': 1961}


In [13]:
avg_precision = 0

print('======= Precision Class Wise =====================\n')

print('Class    Precision %')
print('--------------------------')

count_keys = 0
for key in retrieved_tag_count.keys():
    if key[4:]!='Others':
        count_keys = count_keys + 1
        if retrieved_tag_count[key] != 0:
            prec = float(relevant_retrieved_tag_count[key]*100)/retrieved_tag_count[key]
            print(key, prec)
            avg_precision = avg_precision + prec
        else:
            print(key, 0.0)
        
print('\n======= Avergae Precision =====================\n')
print('Average Precision: ',avg_precision/count_keys)


Class    Precision %
--------------------------
ORG_Victim 20.0
LOC_Event 57.18725718725719
PER_Accused 53.546910755148744
LOC_Victim 0.0
ORG_Accused 83.87650085763293
PER_Victim 46.478873239436616
LOC_Accused 25.0


Average Precision:  40.86993457706792
