In [1]:
import nltk
from nltk.corpus import treebank
from nltk.tag import hmm
import os
import numpy as np

In [2]:
#train_data = treebank.tagged_sents()[:3000]
#print train_data[0][:10]

In [3]:
# Prepare dataset for NLTK HMM Tagger
# Divide in train and test files [80:20] 

# Directory having content
doc_dir = '../../../../Data/input/rectified/content'

train_file_list = []
test_file_list = []

for f in os.listdir(doc_dir):
    #Random Sampling
    if np.random.uniform(0,1)< 0.8:
        train_file_list.append(f)
    else:
        test_file_list.append(f)

print (len(train_file_list))
print (len(test_file_list))

855
230


In [4]:
# Build Tagged dataset to be fed in nltk hmm tagger
# It is a list of tagged sentences. 
# training_data = [ [(word, tag), (word, tag).....]
# [(word, tag), (word, tag).....]
#]

# Directory having tags
tag_dir = '../../../../Data/input/rectified/new_tags'

training_data = []

for f in train_file_list:
    training_sentences =[]
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    lines_in_word_file = []
    lines_in_tag_file = []
    with open(word_file_path, "rt") as word_file:
        for line in word_file:
            lines_in_word_file.append(line)
    with open(tag_file_path, "rt") as tag_file:
        for line in tag_file:
            lines_in_tag_file.append(line)
    if (len(lines_in_word_file) == len(lines_in_tag_file)) and len(lines_in_word_file) > 0:
        for i in range(len(lines_in_word_file)):
            word_in_file = lines_in_word_file[i].split()
            tag_in_file = lines_in_tag_file[i].split()
            pairs_in_line = []
            length = min(len(word_in_file), len(tag_in_file))
            #Create the word_tag pair
            for j in range(length):
                pairs_in_line.append((word_in_file[j], tag_in_file[j]));
            training_sentences.append(pairs_in_line)
    if len(training_sentences) > 0:
        training_data.extend(training_sentences)

#print training_data[0]
            

In [5]:
trainer = hmm.HiddenMarkovModelTrainer()
tagger =  trainer.train_supervised(training_data)
print(tagger)
print (tagger._states)

<HiddenMarkovModelTagger 14 states and 18316 output symbols>
['O', 'ORG_Others', 'PER_Accused', 'LOC_Event', 'LOC_Accused', 'ORG_Accused', 'LOC_Others', 'PER_Others', 'PER_Victim', 'ORG_Victim', 'LOC_event', 'Org_Others', 'OORG_Others', 'LOC_Victim']


In [6]:
# Test
relevant_tag_count = dict() # TP + FN
relevant_retrieved_tag_count = dict() # TP
retrieved_tag_count = dict() # TP + FP

relevant_tag_count[u'PER_Others'] = 0
relevant_retrieved_tag_count[u'PER_Others'] = 0
retrieved_tag_count[u'PER_Others'] = 0

relevant_tag_count[u'PER_Victim'] = 0
relevant_retrieved_tag_count[u'PER_Victim'] = 0
retrieved_tag_count[u'PER_Victim'] = 0

relevant_tag_count[u'PER_Accused'] = 0
relevant_retrieved_tag_count[u'PER_Accused'] = 0
retrieved_tag_count[u'PER_Accused'] = 0

relevant_tag_count[u'ORG_Victim'] = 0
relevant_retrieved_tag_count[u'ORG_Victim'] = 0
retrieved_tag_count[u'ORG_Victim'] = 0

relevant_tag_count[u'ORG_Accused'] = 0
relevant_retrieved_tag_count[u'ORG_Accused'] = 0
retrieved_tag_count[u'ORG_Accused'] = 0

relevant_tag_count[u'ORG_Others'] = 0
relevant_retrieved_tag_count[u'ORG_Others'] = 0
retrieved_tag_count[u'ORG_Others'] = 0

relevant_tag_count[u'LOC_Accused'] = 0
relevant_retrieved_tag_count[u'LOC_Accused'] = 0
retrieved_tag_count[u'LOC_Accused'] = 0

relevant_tag_count[u'LOC_Others'] = 0
relevant_retrieved_tag_count[u'LOC_Others'] = 0
retrieved_tag_count[u'LOC_Others'] = 0

relevant_tag_count[u'LOC_Event'] = 0
relevant_retrieved_tag_count[u'LOC_Event'] = 0
retrieved_tag_count[u'LOC_Event'] = 0

relevant_tag_count[u'LOC_Victim'] = 0
relevant_retrieved_tag_count[u'LOC_Victim'] = 0
retrieved_tag_count[u'LOC_Victim'] = 0

for f in test_file_list:
    word_file_path = os.path.join(doc_dir, f)
    tag_file_path = os.path.join(tag_dir, f)
    predicted_tags = []
    with open(word_file_path, "rt") as word_file:
        for line in word_file:
            predicted_tags = tagger.tag(line.split())
    if len(predicted_tags) > 0:
        actual_tags = []
        with open(tag_file_path, "rt") as tag_file:
            for line in tag_file:
                actual_tags = line.split()
        result_len = min(len(predicted_tags), len(actual_tags))
        
        for i in range(result_len):
            #print predicted_tags[i][1], actual_tags[i]
            if actual_tags[i] in relevant_tag_count.keys():
                relevant_tag_count[actual_tags[i]] = relevant_tag_count[actual_tags[i]] + 1
            
            if predicted_tags[i][1] in retrieved_tag_count.keys():
                retrieved_tag_count[predicted_tags[i][1]] = retrieved_tag_count[predicted_tags[i][1]] + 1
                
            if actual_tags[i] == predicted_tags[i][1]:
                if actual_tags[i] in relevant_retrieved_tag_count.keys():
                    relevant_retrieved_tag_count[actual_tags[i]] = relevant_retrieved_tag_count[actual_tags[i]] + 1
                    

In [10]:
avg_precision = 0
avg_recall = 0

print('======= Precision Class Wise =====================\n')

print('Class    Precision %')
print('--------------------------')

count_keys = 0
for key in retrieved_tag_count.keys():
    if key[4:]!='Others':
        count_keys = count_keys + 1
        if retrieved_tag_count[key] != 0:
            prec = float(relevant_retrieved_tag_count[key])/retrieved_tag_count[key]
            print(key, prec)
            avg_precision = avg_precision + prec
        else:
            print(key, 0.0)
        
        recall = float(relevant_retrieved_tag_count[key])/relevant_tag_count[key]
        avg_recall = avg_recall + recall
        
print('\n======= Avergae Precision =====================\n')
print('Average Precision: ',avg_precision/count_keys)

avg_f1 = (2*avg_precision*avg_recall)/(avg_precision + avg_recall)/count_keys
print('Average Recall: ',avg_recall)
print('Average f1: ',avg_f1)


Class    Precision %
--------------------------
LOC_Accused 0.0
PER_Accused 0.7625
LOC_Event 0.5928571428571429
ORG_Accused 0.7789473684210526
LOC_Victim 0.0
PER_Victim 0.22857142857142856
ORG_Victim 0.24324324324324326


Average Precision:  0.37230274044183814
Average Recall:  0.5460502081332692
Average f1:  0.12898798489238156


In [8]:
result = tagger.tag("New Delhi : In the wake of Fridays bomb blasts in Kabul".split())
print (result)

[('New', 'LOC_Others'), ('Delhi', 'LOC_Others'), (':', 'O'), ('In', 'O'), ('the', 'O'), ('wake', 'O'), ('of', 'O'), ('Fridays', 'O'), ('bomb', 'O'), ('blasts', 'O'), ('in', 'O'), ('Kabul', 'LOC_Event')]


In [9]:
# Dummy tab for test
#print type("Chicago is the birthplace of Ginny".split())
x = [1,2,3]
x.append(4)
print x
x.append([4,5])
print x
x.exten


SyntaxError: Missing parentheses in call to 'print' (<ipython-input-9-2ca18c6e007a>, line 5)