In [1]:
def gen_feature_set(tags, offset):
    def tags_to_word_pos(tags):
        if tags[0] is None:
            return ''
        else :
            return " ".join((tag[0] + "_" + tag[2] for tag in tags))
    
    def find_FH(tags, offset):
        sentence_length = len(tags)
        # Lemma of headword of following phrase with PoS tag
        temp_tag = None
        for tag in tags[offset+1:sentence_length]:
            if tag[3][0]=='B' and temp_tag is None:
                temp_tag = tag
            if temp_tag is not None and tag[3][0]=='I':
                return tag
            
        if temp_tag is None:
            return ('','','','')
        return temp_tag
    
    def find_FP(tags, offset):
        sentence_length = len(tags)
        # Following phrase including PoS tags
        temp_tag = None
        for tag in tags[offset+1:sentence_length]:
            if tag[3][0]=='B' and temp_tag is None:
                temp_tag = [tag]
            if temp_tag is not None and tag[3][0]=='I':
                temp_tag.append(tag)
                return temp_tag

        if temp_tag is None:
            return [None]
        return temp_tag
    
    def find_PHR_pre(tags, offset):
        sentence_length = len(tags)
        # Preceding phrase type
        temp_tag = ''
        for tag in tags[0:offset]:
            if tag[3][0]=='B':
                temp_tag = tag[3].split('-')[1]
        return temp_tag 

    def find_PV(tags, offset):
        sentence_length = len(tags)
        # Preceding verb lemma with PoS tag
        temp_tag = None
        for tag in tags[0:offset]:
            if tag[2][0]=='V':
                temp_tag = tag
        return temp_tag

        
    def find_FHtag(tags, offset):
        sentence_length = len(tags)
        # PoS tag of headword of the following phrase
        temp_tag = None
        for tag in tags[offset+1:sentence_length]:
            if tag[3][0]=='B' and temp_tag is None:
                temp_tag = tag
            if temp_tag is not None and tag[3][0]=='I':
                return tag

        if temp_tag is None:
            return ('','','','')
        return temp_tag
    
    def find_PVtag(tags, offset):
        sentence_length = len(tags)
        # PoS tag of the preceding verb
        temp_tag = None
        for tag in tags[0:offset]:
            if tag[2][0]=='V':
                temp_tag = tag

        if temp_tag is None:
            return ('','','','')
        return temp_tag
    
    features = {}
    features['TGLR'] = tags_to_word_pos([tags[offset - 1], tags[offset + 1]]) if 0 < offset < len(tags) - 1 else ''
    features['TGL'] = tags_to_word_pos([tags[offset - 2], tags[offset - 1]]) if 1 < offset else ''
    features['TGR'] = tags_to_word_pos([tags[offset + 1], tags[offset + 2]]) if offset < len(tags) - 2 else ''
    features['BGL'] = tags_to_word_pos([tags[offset - 1]]) if 0 < offset else ''
    features['BGR'] = tags_to_word_pos([tags[offset + 1]]) if offset < len(tags) - 1 else ''
    features['FH'] = tags_to_word_pos([find_FH(tags, offset)]) 
    features['FP'] = tags_to_word_pos(find_FP(tags, offset))
    features['FHword'] = find_FH(tags, offset)[0]
    features['PHR_pre'] = find_PHR_pre(tags, offset)
    features['PV'] = tags_to_word_pos([find_PV(tags, offset)]) if 0 < offset else ''
    features['FHtag'] = find_FHtag(tags, offset)[2]
    features['PVtag'] = find_PVtag(tags, offset)[2] if 0 < offset else ''
    return features


In [2]:
feture_test = [('For', 'For', 'IN', 'B-PP'), ('example', 'example', 'NN', 'B-NP'), (',', ',', ',', 'O'), ('an', 'an', 'DT', 'B-NP'), ('open-toe', 'open-toe', 'JJ', 'I-NP'), ('sandal', 'sandal', 'NN', 'I-NP'), ('will', 'will', 'MD', 'B-VP'), ('use', 'use', 'VB', 'I-VP'), ('a', 'a', 'DT', 'B-NP'), ('different', 'different', 'JJ', 'I-NP'), ('style', 'style', 'NN', 'I-NP'), ('last', 'last', 'JJ', 'B-ADJP'), ('to', 'to', 'TO', 'B-PP'), ('a', 'a', 'DT', 'B-NP'), ('boot', 'boot', 'NN', 'I-NP'), ('.', '.', '.', 'O')]
gen_feature_set(feture_test, 12)



{'BGL': 'last_JJ',
 'BGR': 'a_DT',
 'FH': 'boot_NN',
 'FHtag': 'NN',
 'FHword': 'boot',
 'FP': 'a_DT boot_NN',
 'PHR_pre': 'ADJP',
 'PV': 'use_VB',
 'PVtag': 'VB',
 'TGL': 'style_NN last_JJ',
 'TGLR': 'last_JJ a_DT',
 'TGR': 'a_DT boot_NN'}

In [3]:
from __future__ import division
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
import re

def split_mul(text):
    data_list = text.split("\t")
    return list(zip(data_list[0].split(' '), data_list[1].split(' '), data_list[2].split(' '), data_list[3].split(' ')))

def split_t(text):
    return text.split("\t")
    
if __name__ == '__main__':
    corrections = list(map(split_t, open('./data100k/wiki.prep.corrections.clean.100k')))
    data = list(map(split_mul, open("./data100k/wiki.prep.sents.clean.genia.100k")))
    
    size = len(data)
    train = data[:-(size//10)]
    test = data[-(size//10):]

    # prepare your train and test data
    featuresets = []
    for index, tags in enumerate(data):
        offset = int(corrections[index][0]) - 1
        
        answer = re.sub('[^0-9a-zA-Z]+', '', corrections[index][3].lower())
        original_answer = re.sub('[^0-9a-zA-Z]+', '', corrections[index][1].lower())
        featuresets.append(((gen_feature_set(tags, offset), answer), original_answer))
    
    split_point = len(featuresets)*9//10
    train_data_total = featuresets[:split_point]
    test_data_total = featuresets[split_point:]
    
    trainData = list(map(lambda a: a[0], train_data_total))
    testData = list(map(lambda a: a[0], test_data_total))


In [4]:
trainData[:1]

[({'BGL': 'last_JJ',
   'BGR': 'a_DT',
   'FH': 'boot_NN',
   'FHtag': 'NN',
   'FHword': 'boot',
   'FP': 'a_DT boot_NN',
   'PHR_pre': 'ADJP',
   'PV': 'use_VB',
   'PVtag': 'VB',
   'TGL': 'style_NN last_JJ',
   'TGLR': 'last_JJ a_DT',
   'TGR': 'a_DT boot_NN'},
  'from')]

In [5]:
import nltk
from nltk.classify import SklearnClassifier 
from sklearn.linear_model import LogisticRegression
%time sklearn_classifier = SklearnClassifier(LogisticRegression()).train(trainData)

CPU times: user 17min 13s, sys: 52min 31s, total: 1h 9min 45s
Wall time: 4min 26s


In [6]:
print('== SkLearn MaxEnt ==')
print(sklearn_classifier.classify(gen_feature_set(feture_test, 12)))
print(nltk.classify.accuracy(sklearn_classifier, testData))

== SkLearn MaxEnt ==
from
0.4626


In [7]:
testData[:1]

[({'BGL': 'capacity_NN',
   'BGR': 'power_NN',
   'FH': 'generation_NN',
   'FHtag': 'NN',
   'FHword': 'generation',
   'FP': 'power_NN generation_NN',
   'PHR_pre': 'NP',
   'PV': 'installed_VBN',
   'PVtag': 'VBN',
   'TGL': 'installed_VBN capacity_NN',
   'TGLR': 'capacity_NN power_NN',
   'TGR': 'power_NN generation_NN'},
  'for')]

In [8]:
pdist = sklearn_classifier.prob_classify(gen_feature_set(feture_test, 12))
for label in pdist.samples():
    print("%s: %f" % (label, pdist.prob(label)))


behind: 0.000561
afte: 0.000186
over: 0.004591
underneath: 0.000322
up: 0.000651
aboard: 0.000371
alpoog: 0.000186
abou: 0.000272
around: 0.000824
aebove: 0.000359
after: 0.006642
amongst: 0.000542
during: 0.010799
per: 0.000359
asa: 0.000167
onto: 0.043416
that: 0.008059
across: 0.001712
below: 0.000235
fhom: 0.000170
beyond: 0.000717
astride: 0.000428
like: 0.002556
til: 0.000230
na: 0.000182
nearer: 0.000183
though: 0.000423
arond: 0.000186
unlike: 0.000241
byetmn: 0.000236
amidst: 0.000226
albeit: 0.000464
beneath: 0.000460
accross: 0.000233
with: 0.023962
bewteen: 0.000174
although: 0.001244
alongside: 0.000620
than: 0.021757
versus: 0.000183
on: 0.016411
past: 0.000947
notwithstanding: 0.000225
aboards: 0.000170
huggimg: 0.000254
under: 0.001743
en: 0.000179
whilst: 0.000254
ago: 0.000239
wiff: 0.000192
att: 0.000171
since: 0.002180
o: 0.000236
along: 0.001795
befor: 0.000164
betwen: 0.000176
except: 0.000335
whether: 0.004016
before: 0.001428
within: 0.009024
off: 0.002282
ta: 0

In [9]:
# test your classifier without threshold
correct = 0
total_altered_data = len(testData)

for data in test_data_total:
    test_data = data[0]
    original_label = data[1]

    pdist = sklearn_classifier.prob_classify(test_data[0])
    label = pdist.max()
    label_prob = pdist.prob(label)
    if test_data[1] == label:
        correct += 1
    elif test_data[1] == original_label:
        total_altered_data-=1

precision = correct / total_altered_data
recall = correct / len(testData)

print('Amount of all data:', len(testData))
print('Correction:', correct)
print('Amount of data classifier altered:' ,total_altered_data)

print('precision:', precision)
print('recall:', recall)
print('f1-measure:', 2*precision*recall / (precision+recall) )

Amount of all data: 10000
Correction: 4626
Amount of data classifier altered: 9996
precision: 0.46278511404561823
recall: 0.4626
f1-measure: 0.46269253850770153


In [10]:
# recall 要改的當中 我改對的
# precision 我改得當中 我改對的

# test your classifier
correct = 0
threshold = 0.1835
total_altered_data = len(testData)

for data in test_data_total:
    test_data = data[0]
    original_label = data[1]

    pdist = sklearn_classifier.prob_classify(test_data[0])
    label = pdist.max()
    label_prob = pdist.prob(label)
    if label_prob > threshold:
        if test_data[1] == label:
            correct += 1
        elif test_data[1] == original_label:
            total_altered_data-=1
    else:
        total_altered_data-=1

precision = correct / total_altered_data
recall = correct / len(testData)

print('Amount of all data:', len(testData))
print('Correction:', correct)
print('Amount of data classifier altered:' ,total_altered_data)

print('precision:', precision)
print('recall:', recall)
print('f1-measure:', 2*precision*recall / (precision+recall) )

Amount of all data: 10000
Correction: 4573
Amount of data classifier altered: 9550
precision: 0.478848167539267
recall: 0.4573
f1-measure: 0.4678260869565217
