In [2]:
def gen_feature_set(tags, offset):
    def tags_to_word_pos(tags):
        if tags[0] is None:
            return ''
        else :
            return " ".join((tag[0] + "_" + tag[2] for tag in tags))
    
    def find_FH(tags, offset):
        sentence_length = len(tags)
        # Lemma of headword of following phrase with PoS tag
        temp_tag = None
        for tag in tags[offset+1:sentence_length]:
            if tag[3][0]=='B' and temp_tag is None:
                temp_tag = tag
            if temp_tag is not None and tag[3][0]=='I':
                return tag
            
        if temp_tag is None:
            return ('','','','')
        return temp_tag
    
    def find_FP(tags, offset):
        sentence_length = len(tags)
        # Following phrase including PoS tags
        temp_tag = None
        for tag in tags[offset+1:sentence_length]:
            if tag[3][0]=='B' and temp_tag is None:
                temp_tag = [tag]
            if temp_tag is not None and tag[3][0]=='I':
                temp_tag.append(tag)
                return temp_tag

        if temp_tag is None:
            return [None]
        return temp_tag
    
    def find_PHR_pre(tags, offset):
        sentence_length = len(tags)
        # Preceding phrase type
        temp_tag = ''
        for tag in tags[0:offset]:
            if tag[3][0]=='B':
                temp_tag = tag[3].split('-')[1]
        return temp_tag 

    def find_PV(tags, offset):
        sentence_length = len(tags)
        # Preceding verb lemma with PoS tag
        temp_tag = None
        for tag in tags[0:offset]:
            if tag[2][0]=='V':
                temp_tag = tag
        return temp_tag

        
    def find_FHtag(tags, offset):
        sentence_length = len(tags)
        # PoS tag of headword of the following phrase
        temp_tag = None
        for tag in tags[offset+1:sentence_length]:
            if tag[3][0]=='B' and temp_tag is None:
                temp_tag = tag
            if temp_tag is not None and tag[3][0]=='I':
                return tag

        if temp_tag is None:
            return ('','','','')
        return temp_tag
    
    def find_PVtag(tags, offset):
        sentence_length = len(tags)
        # PoS tag of the preceding verb
        temp_tag = None
        for tag in tags[0:offset]:
            if tag[2][0]=='V':
                temp_tag = tag

        if temp_tag is None:
            return ('','','','')
        return temp_tag
    
    features = {}
    features['TGLR'] = tags_to_word_pos([tags[offset - 1], tags[offset + 1]]) if 0 < offset < len(tags) - 1 else ''
    features['TGL'] = tags_to_word_pos([tags[offset - 2], tags[offset - 1]]) if 1 < offset else ''
    features['TGR'] = tags_to_word_pos([tags[offset + 1], tags[offset + 2]]) if offset < len(tags) - 2 else ''
    features['BGL'] = tags_to_word_pos([tags[offset - 1]]) if 0 < offset else ''
    features['BGR'] = tags_to_word_pos([tags[offset + 1]]) if offset < len(tags) - 1 else ''
    features['FH'] = tags_to_word_pos([find_FH(tags, offset)]) 
    features['FP'] = tags_to_word_pos(find_FP(tags, offset))
    features['FHword'] = find_FH(tags, offset)[0]
    features['PHR_pre'] = find_PHR_pre(tags, offset)
    features['PV'] = tags_to_word_pos([find_PV(tags, offset)]) if 0 < offset else ''
    features['FHtag'] = find_FHtag(tags, offset)[2]
    features['PVtag'] = find_PVtag(tags, offset)[2] if 0 < offset else ''
    return features


In [3]:
gen_feature_set([('For', 'For', 'IN', 'B-PP'), ('example', 'example', 'NN', 'B-NP'), (',', ',', ',', 'O'), ('an', 'an', 'DT', 'B-NP'), ('open-toe', 'open-toe', 'JJ', 'I-NP'), ('sandal', 'sandal', 'NN', 'I-NP'), ('will', 'will', 'MD', 'B-VP'), ('use', 'use', 'VB', 'I-VP'), ('a', 'a', 'DT', 'B-NP'), ('different', 'different', 'JJ', 'I-NP'), ('style', 'style', 'NN', 'I-NP'), ('last', 'last', 'JJ', 'B-ADJP'), ('to', 'to', 'TO', 'B-PP'), ('a', 'a', 'DT', 'B-NP'), ('boot', 'boot', 'NN', 'I-NP'), ('.', '.', '.', 'O')], 12)



{'BGL': 'last_JJ',
 'BGR': 'a_DT',
 'FH': 'boot_NN',
 'FHtag': 'NN',
 'FHword': 'boot',
 'FP': 'a_DT boot_NN',
 'PHR_pre': 'ADJP',
 'PV': 'use_VB',
 'PVtag': 'VB',
 'TGL': 'style_NN last_JJ',
 'TGLR': 'last_JJ a_DT',
 'TGR': 'a_DT boot_NN'}

In [4]:
from __future__ import division
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier

def split_mul(text):
    data_list = text.split("\t")
    return list(zip(data_list[0].split(' '), data_list[1].split(' '), data_list[2].split(' '), data_list[3].split(' ')))

def split_t(text):
    return text.split("\t")
    
if __name__ == '__main__':
    corrections = list(map(split_t, open('./data10k/wiki.prep.corrections.clean.10k')))
    data = list(map(split_mul, open("./data10k/wiki.prep.sents.clean.genia.10k")))
    
    size = len(data)
    train = data[:-(size//10)]
    test = data[-(size//10):]

    # prepare your train and test data
    featuresets = []
    for index, tags in enumerate(data):
        offset = int(corrections[index][0]) - 1
        featuresets.append((gen_feature_set(tags, offset), corrections[index][3]))
    
    split_point = len(featuresets)*9//10
    trainData = featuresets[:split_point]
    testData = featuresets[split_point:]

    # train your classifier
#     classifier = SklearnClassifier(LogisticRegression())
#     classifier.train(trainData)

    # test your classifier
#     # ...
#     correct = 0

#     precision = correct / len(testData)

#     print('precision:', precision)
#     print('recall:')
#     print('f1-measure:')


In [6]:
trainData[:1]

[({'BGL': 'last_JJ',
   'BGR': 'a_DT',
   'FH': 'boot_NN',
   'FHtag': 'NN',
   'FHword': 'boot',
   'FP': 'a_DT boot_NN',
   'PHR_pre': 'ADJP',
   'PV': 'use_VB',
   'PVtag': 'VB',
   'TGL': 'style_NN last_JJ',
   'TGLR': 'last_JJ a_DT',
   'TGR': 'a_DT boot_NN'},
  'from')]

In [10]:
import nltk
from nltk.classify import SklearnClassifier 
from sklearn.linear_model import LogisticRegression
%time sklearn_classifier = SklearnClassifier(LogisticRegression()).train(trainData)

CPU times: user 11.1 s, sys: 149 ms, total: 11.2 s
Wall time: 11.5 s


In [28]:
from nltk.classify import MaxentClassifier
%time nltk_classifier = MaxentClassifier.train(trainData, nltk.classify.MaxentClassifier.ALGORITHMS[0])

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -4.81218        0.001
             2          -0.62498        0.942
             3          -0.24841        0.976
             4          -0.15927        0.985
      Training stopped: keyboard interrupt
         Final          -0.12019        0.989
CPU times: user 5min 56s, sys: 2.11 s, total: 5min 58s
Wall time: 6min 2s


In [11]:
print('== SkLearn MaxEnt ==')
print(sklearn_classifier.classify(gen_feature_set([('For', 'For', 'IN', 'B-PP'), ('example', 'example', 'NN', 'B-NP'), (',', ',', ',', 'O'), ('an', 'an', 'DT', 'B-NP'), ('open-toe', 'open-toe', 'JJ', 'I-NP'), ('sandal', 'sandal', 'NN', 'I-NP'), ('will', 'will', 'MD', 'B-VP'), ('use', 'use', 'VB', 'I-VP'), ('a', 'a', 'DT', 'B-NP'), ('different', 'different', 'JJ', 'I-NP'), ('style', 'style', 'NN', 'I-NP'), ('last', 'last', 'JJ', 'B-ADJP'), ('to', 'to', 'TO', 'B-PP'), ('a', 'a', 'DT', 'B-NP'), ('boot', 'boot', 'NN', 'I-NP'), ('.', '.', '.', 'O')], 12)))
print(nltk.classify.accuracy(sklearn_classifier, testData))
print()
# print('== NLTK MaxEnt ==')
# print(nltk_classifier.classify(gender_features('mark')))
# print(nltk.classify.accuracy(nltk_classifier, test_set))
# print(nltk_classifier.show_most_informative_features(10))

== SkLearn MaxEnt ==
from
0.393

