In [12]:
#Load the dependencies
import nltk
from nltk import word_tokenize, pos_tag
import pprint 
from nltk.corpus.reader.tagged import TaggedCorpusReader
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
import pandas as pd

In [2]:
#Location where your files are saved
# Passed as the root directory to TaggedCorpusReader, which looks up the corpus
# files (train.txt, test.tag) relative to it.
root = './/'

In [3]:
#Read the tagged corpus
# tagged_sents() exposes the file as a sequence of sentences, each one a list of
# (word, tag) tuples.

corpus = TaggedCorpusReader(root,"train.txt")
tagged_sentences = corpus.tagged_sents()

In [4]:
# Preview only the first two sentences; displaying the whole corpus floods the
# notebook output without adding information.
tagged_sentences[:2]

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [5]:

# Split the dataset for training and testing:
# the first 75% of the sentences are used for training, the rest is held out.
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]
 
print(len(training_sentences))   # 7791 in the recorded run
print(len(test_sentences))         # 2597 in the recorded run

7791
2597


In [6]:
def features(sentence, index):
    """ Compute some very basic word features.

        Empty tokens ('') are tolerated: their one-character prefix/suffix
        features are '' instead of raising IndexError.

        Note that 'is_capitalized' only checks that upper-casing the first
        character leaves it unchanged, so it is also True for tokens that
        start with a digit or punctuation mark.

        :param sentence: [w1, w2, ...]
        :type sentence: list
        :param index: the index of the word
        :type index: int
        :return: dict containing features
        :rtype: dict
        :raises IndexError: if index is outside the sentence
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    # Slices ([:1], [-1:]) are used instead of [0] / [-1] so that an empty
    # token yields '' rather than an IndexError.
    first_char = word[:1]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:]
    }
 

# Sanity check: show the feature dict computed for the word at index 2 ('a').
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'sentence',
 'prefix-1': 'a',
 'prefix-2': 'a',
 'prefix-3': 'a',
 'prev_word': 'is',
 'suffix-1': 'a',
 'suffix-2': 'a',
 'suffix-3': 'a',
 'word': 'a'}


In [7]:
def untag(tagged_sentence):
    """
    Strip the tags from a tagged sentence, keeping only the words.
    :param tagged_sentence: a POS tagged sentence, given as (word, tag) pairs
    :type tagged_sentence: list
    :return: the words of the sentence, in their original order
    :rtype: list of strings
    """
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

In [8]:
def transform_to_dataset(tagged_sentences):
    """
    Split tagged sentences to X and y datasets and append some basic features.
    :param tagged_sentences: a list of POS tagged sentences
    :type tagged_sentences: list of list of tuples (term_i, tag_i)
    :return: (X, y) where X holds one feature dict per token and y the
             matching tag, both in corpus order
    :rtype: tuple of (list of dict, list of str)
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence; doing it again for every token
        # made the work quadratic in the sentence length.
        words = untag(tagged)
        for index, (_word, tag) in enumerate(tagged):
            X.append(features(words, index))
            y.append(tag)

    return X, y

In [9]:
# Transform the training sentences into the form used by the model:
# X holds one feature dict per token, y the corresponding tag.
X, y = transform_to_dataset(training_sentences)

In [10]:
len(y)  # number of training records (one per token of the training sentences)

165816

In [13]:
# Rank the training tags by frequency, highest first; tags with equal counts
# fall back to reverse alphabetical order.
tag_frequencies = sorted(
    Counter(y).items(),
    key=lambda pair: (pair[1], pair[0]),
    reverse=True,
)
print(tag_frequencies)
# The top five POS tags are: NN, IN, AT, NP, JJ

[('NN', 21305), ('IN', 17167), ('AT', 14530), ('NP', 8965), ('JJ', 8203), ('NNS', 8187), (',', 8069), ('.', 7615), ('VB', 4655), ('CC', 4606), ('RB', 4097), ('VBN', 3857), ('NN-TL', 3692), ('VBD', 3286), ('CS', 2765), ('CD', 2743), ('VBG', 2327), ('TO', 2232), ('MD', 1935), ('PPS', 1889), ('PP$', 1860), ('AP', 1550), ('BEZ', 1513), ('PPSS', 1294), ('``', 1156), ('DT', 1127), ("''", 1112), ('VBZ', 1107), ('JJ-TL', 1107), ('NP-TL', 1101), ('QL', 1047), ('BEDZ', 1041), ('BE', 959), ('PPO', 897), ('RP', 725), ('WPS', 677), ('WDT', 663), ('BER', 638), ('WRB', 607), ('HVZ', 579), ('*', 576), ('HV', 557), ('NR', 554), ('NNS-TL', 542), ('--', 497), ('NP$', 447), ('OD', 412), ('DTI', 401), ('HVD', 396), ('BEN', 373), ('NN-HL', 372), ('BED', 362), ('ABN', 345), ('NPS', 334), ('NN$', 291), ('DTS', 281), ('NP-HL', 270), ('IN-TL', 267), ('EX', 265), (')', 250), ('JJR', 248), (':', 247), ('(', 247), ('PN', 205), ('IN-HL', 179), ('VBN-TL', 176), ('DO', 175), ('JJT', 169), ('NNS-HL', 142), ('RBR', 135

In [14]:
# The model pipeline is created and then fitted on the training data.
# RANDOM_SEED pins the random forest's sampling so that re-running the notebook
# reproduces the same model and the same scores.
RANDOM_SEED = 42
# Use only the first 25K samples if you're running it multiple times.
# It takes a fair bit :)
TRAIN_SAMPLE_LIMIT = 25000

clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', RandomForestClassifier(n_estimators=20,
                                          criterion='entropy',
                                          random_state=RANDOM_SEED))])

clf.fit(X[:TRAIN_SAMPLE_LIMIT], y[:TRAIN_SAMPLE_LIMIT])

print('Training completed')

# Evaluate on the held-out part of train.txt.
X_test, y_test = transform_to_dataset(test_sentences)

print("Accuracy:", clf.score(X_test, y_test))

Training completed
Accuracy: 0.8469216373146504


In [15]:
# Micro-averaged precision / recall / F-score on the held-out split.
y_true = y_test
y_pred = clf.predict(X_test)

# In the recorded run all three values equal the accuracy printed by the
# previous cell; the fourth element (support) is None.
precision_recall_fscore_support(y_true, y_pred,average = 'micro')

(0.8469216373146504, 0.8469216373146504, 0.8469216373146504, None)

In [16]:
#Function to be used when we are predicting on unseen, untagged data
def pos_tag(sentence):
    """
    Predict a POS tag for every token of an untagged sentence.

    Uses the notebook-level fitted pipeline `clf`. NOTE: this definition
    shadows the `pos_tag` imported from nltk at the top of the notebook.

    :param sentence: the tokens to tag, [w1, w2, ...]
    :type sentence: list of strings
    :return: (sentence, tags) with one predicted tag per token; for an empty
             sentence tags is an empty list
    :rtype: tuple
    """
    # The classifier rejects an input with zero samples, so answer the empty
    # case directly instead of calling predict().
    if len(sentence) == 0:
        return sentence, []
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return sentence, tags

In [17]:
#Read the untagged data
# The whole file is loaded into a single string; no encoding is specified, so
# the platform default is used.
with open('test.txt', 'r') as myfile:
    data = myfile.read()

In [18]:
# Remove all extra spaces: every run of whitespace (newlines included) is
# collapsed into a single space.
data = " ".join(data.split())

In [19]:
# Do the prediction on untagged data using the pos_tag function.
# x_1 holds the tokens produced by word_tokenize, y_1 the predicted tag for each.
# NOTE: the whole file is passed as one token list, so the positional features
# (is_first, is_last, prev_word, next_word) are computed across sentence boundaries.
x_1,y_1 = pos_tag(word_tokenize(data))

In [21]:
# Load the tagged version of the test data.
# a: one feature dict per token, b: the tag recorded for that token in test.tag.
corpus_test = TaggedCorpusReader(root,"test.tag")
a,b = transform_to_dataset(corpus_test.tagged_sents())

In [22]:
len(a)  # number of tokens in the tagged test file

892

In [24]:
len(x_1)
# word_tokenize produced a different number of tokens than the tagged file
# contains (898 vs. 892 in the recorded run), so the predictions above cannot be
# aligned one-to-one with the reference tags. Hence the tokens from the tagged
# file are used for the prediction and for calculating the accuracy score.

898

In [25]:
# Micro-averaged precision / recall / F-score on the tagged test file, using
# its own tokenisation (a) and its recorded tags (b).
y_true = b
y_pred = clf.predict(a)
precision_recall_fscore_support(y_true, y_pred, average='micro')

(0.8890134529147982, 0.8890134529147982, 0.8890134529147982, None)

In [26]:
# Accuracy of the fitted pipeline on the tagged test file.
print ("Accuracy:", clf.score(a, b))

Accuracy: 0.8890134529147982


In [27]:
# Pair every token from word_tokenize with its predicted tag as "word/tag".
Output = [i +'/'+ j for i, j in zip(x_1, list(y_1))]

In [28]:
# Join the "word/tag" items into one space-separated string.
Out_String = " ".join(Output)

In [29]:
# Write the tagged text to test.out. The with-block closes the file even if the
# write fails, and write() is the right call for a single string (writelines()
# would iterate over it character by character).
with open("test.out", "w") as file1:
    file1.write(Out_String)