In [1]:
import numpy as np
import pandas as pd
import scipy
from scipy import optimize
from utils import preprocess, score, features

In [2]:
# --- persistence settings (flagged by the author as possibly removable) ---
model_name, model_matrices, model_preprocess = (
    'model.pickle',
    'model_matrices.pickle',
    'model_preprocess.pickle',
)
load_model = load_matrices = load_preprocess = False
verbose = 1

# --- tunable values ---
# limit_common_words and lamda are passed to preprocess.feature_statistics_class,
# threshold to preprocess.feature2id_class (see the statistics cell below).
# `lamda` is presumably spelled this way because `lambda` is a Python keyword.
limit_common_words, threshold, lamda = 5, 0, 0.1
args = None

# --- data files (relative paths) ---
comp1_path, comp2_path = 'data/comp1.words', 'data/comp2.words'
test1_path = 'data/test1.wtag'
train1_path, train2_path = 'data/train1.wtag', 'data/train2.wtag'

In [3]:
%%time
# Build the statistics object for the first training file and populate it.
# get_statistics() must run before the object is handed to feature2id_class below.
train1_statistics = preprocess.feature_statistics_class(train1_path, limit_common_words, lamda)
train1_statistics.get_statistics()
# Disabled: the same two steps for the second training file.
# NOTE(review): if re-enabled, the class has to be referenced as
# preprocess.feature_statistics_class -- only the `preprocess` module is imported.
# train2_statistics = feature_statistics_class(train2_path, limit_common_words, lamda)
# train2_statistics.get_statistics()
# Derive the feature-to-id mapping from the train1 statistics using `threshold`
# (presumably a minimum-count cutoff -- TODO confirm in utils.preprocess).
id_class = preprocess.feature2id_class(train1_statistics, threshold)
id_class.get_features()

CPU times: user 9.92 s, sys: 219 ms, total: 10.1 s
Wall time: 10.2 s


In [4]:
# Registry of feature groups, keyed by group name; filled in by the cells below.
all_features = {}

In [5]:
# De-duplicate the tags recorded by train1_statistics and show them.
all_tags = {*train1_statistics.tags}
print(all_tags)

{',', 'JJ', '-LRB-', 'VBG', 'VBP', 'NNPS', 'WP', 'VB', 'TO', 'CC', 'SYM', '-RRB-', 'PRP$', 'PRP', '#', 'MD', 'WRB', 'VBN', ':', '.', 'JJR', 'EX', 'UH', 'RBR', 'NN', 'POS', 'VBZ', "''", 'WDT', 'NNP', 'JJS', '``', 'IN', '*', 'RB', 'NNS', 'RBS', 'FW', 'VBD', 'DT', 'RP', '$', 'CD', 'PDT', 'WP$'}


## Create FeatureGroup params

In [6]:
# Feature group 'wi_t': (current word, tag) combinations from train1_statistics.TagsPerWord.
group_name = 'wi_t'
hash_rules = ('w[i]', 't')

# Count each (word, tag) combination.
# The tuple is built directly instead of eval()-ing template strings: eval inside a
# list comprehension only finds `word`/`tag` while they happen to be notebook
# globals, and it hides the expression from readers and tooling.
combs_dict = dict()
for word in train1_statistics.TagsPerWord:
    for tag in train1_statistics.TagsPerWord[word]:
        comb = (word, tag)
        combs_dict[comb] = combs_dict.get(comb, 0) + 1

# Only the distinct combinations are passed on; the counts stay in combs_dict.
all_features[group_name] = features.FeatureGroup(hash_rules, set(combs_dict))

In [7]:
# Feature group 'pre3_wi_t': (features.prefix(word, 3), tag) combinations.
group_name = 'pre3_wi_t'
hash_rules = ('prefix(w[i], 3)', 't')

# Count each (prefix, tag) combination.
# features.prefix is called directly instead of through eval() on a template
# string: eval inside a list comprehension only resolves `word`/`tag`/`features`
# while they are notebook globals, and it hides the call from readers and tooling.
combs_dict = dict()
for word in train1_statistics.TagsPerWord:
    for tag in train1_statistics.TagsPerWord[word]:
        comb = (features.prefix(word, 3), tag)
        combs_dict[comb] = combs_dict.get(comb, 0) + 1

# Only the distinct combinations are passed on; the counts stay in combs_dict.
all_features[group_name] = features.FeatureGroup(hash_rules, set(combs_dict))

In [8]:
# Feature group 'pre2_wi_t': (features.prefix(word, 2), tag) combinations.
group_name = 'pre2_wi_t'
hash_rules = ('prefix(w[i], 2)', 't')

# Count each (prefix, tag) combination.
# features.prefix is called directly instead of through eval() on a template
# string: eval inside a list comprehension only resolves `word`/`tag`/`features`
# while they are notebook globals, and it hides the call from readers and tooling.
combs_dict = dict()
for word in train1_statistics.TagsPerWord:
    for tag in train1_statistics.TagsPerWord[word]:
        comb = (features.prefix(word, 2), tag)
        combs_dict[comb] = combs_dict.get(comb, 0) + 1

# Only the distinct combinations are passed on; the counts stay in combs_dict.
all_features[group_name] = features.FeatureGroup(hash_rules, set(combs_dict))

In [9]:
# Feature group 'suf3_wi_t': (features.suffix(word, 3), tag) combinations.
group_name = 'suf3_wi_t'
hash_rules = ('suffix(w[i], 3)', 't')

# Count each (suffix, tag) combination.
# features.suffix is called directly instead of through eval() on a template
# string: eval inside a list comprehension only resolves `word`/`tag`/`features`
# while they are notebook globals, and it hides the call from readers and tooling.
combs_dict = dict()
for word in train1_statistics.TagsPerWord:
    for tag in train1_statistics.TagsPerWord[word]:
        comb = (features.suffix(word, 3), tag)
        combs_dict[comb] = combs_dict.get(comb, 0) + 1

# Only the distinct combinations are passed on; the counts stay in combs_dict.
all_features[group_name] = features.FeatureGroup(hash_rules, set(combs_dict))

In [10]:
# Feature group 'suf2_wi_t': (features.suffix(word, 2), tag) combinations.
group_name = 'suf2_wi_t'
hash_rules = ('suffix(w[i], 2)', 't')

# Count each (suffix, tag) combination.
# features.suffix is called directly instead of through eval() on a template
# string: eval inside a list comprehension only resolves `word`/`tag`/`features`
# while they are notebook globals, and it hides the call from readers and tooling.
combs_dict = dict()
for word in train1_statistics.TagsPerWord:
    for tag in train1_statistics.TagsPerWord[word]:
        comb = (features.suffix(word, 2), tag)
        combs_dict[comb] = combs_dict.get(comb, 0) + 1

# Only the distinct combinations are passed on; the counts stay in combs_dict.
all_features[group_name] = features.FeatureGroup(hash_rules, set(combs_dict))

In [11]:
# Feature group 't1_t': adjacent tag pairs from the training sentences.
group_name = 't1_t'
hash_rules = ('t1', 't')

# Count every window of two consecutive tags.
# The windows come from zip() over shifted slices instead of eval()-ing a code
# string per position, which was slower and hid the expression from tooling.
combs_dict = dict()
for sentence in train1_statistics.sentences_with_only_tag:
    # Prepend two '*' symbols; this builds a new list, the stored sentence is untouched.
    padded = ['*', '*'] + sentence
    # NOTE(review): because of the double padding the first window is ('*', '*').
    # Behavior is kept as-is; confirm that this pair is wanted as a bigram feature.
    for comb in zip(padded, padded[1:]):
        combs_dict[comb] = combs_dict.get(comb, 0) + 1

# Only the distinct combinations are passed on; the counts stay in combs_dict.
all_features[group_name] = features.FeatureGroup(hash_rules, set(combs_dict))

In [12]:
# Feature group 't2_t1_t': tag triples from the training sentences.
group_name = 't2_t1_t'
hash_rules = ('t2', 't1', 't')

# Count every window of three consecutive tags.
# The windows come from zip() over shifted slices instead of eval()-ing a code
# string per position, which was slower and hid the expression from tooling.
combs_dict = dict()
for sentence in train1_statistics.sentences_with_only_tag:
    # Prepend two '*' symbols; this builds a new list, the stored sentence is untouched.
    padded = ['*', '*'] + sentence
    for comb in zip(padded, padded[1:], padded[2:]):
        combs_dict[comb] = combs_dict.get(comb, 0) + 1

# Only the distinct combinations are passed on; the counts stay in combs_dict.
all_features[group_name] = features.FeatureGroup(hash_rules, set(combs_dict))

In [13]:
# Report how many entries each feature group's hash_dict holds, then the overall total.
group_sizes = {name: len(group.hash_dict) for name, group in all_features.items()}
for name, size in group_sizes.items():
    print('feat_group:', name, '| feats:', size)
total_feats = sum(group_sizes.values())
print('feat_groups:', len(all_features), '| total_feats:', total_feats)

feat_group: wi_t | feats: 15415
feat_group: pre3_wi_t | feats: 8061
feat_group: pre2_wi_t | feats: 3009
feat_group: suf3_wi_t | feats: 11844
feat_group: suf2_wi_t | feats: 13612
feat_group: t1_t | feats: 1061
feat_group: t2_t1_t | feats: 8150
feat_groups: 7 | total_feats: 61152


In [15]:
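# NOTE(review): disabled experiment. The listing printed below this cell is left over
# from an earlier run, when the code was still active. `strange_words`, `Feature` and
# `train1_model` are not defined in the visible cells, and un-commenting
# `features = []` would rebind the imported `features` module.
# features = []

# # one-to-one features
# for word in strange_words:
#     features.append(Feature(f'w[i] == "{word}"', t=train1_model.TagsPerWord[word][0]))
#     print(word, train1_model.WordCount[word], train1_model.TagsPerWord[word])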

. 4914 ['.']
`` 838 ['``']
, 6044 [',']
'' 813 ["''"]
$ 773 ['$']
-- 256 [':']
-LRB- 148 ['-LRB-']
-RRB- 148 ['-RRB-']
: 149 [':']
-LCB- 30 ['-LRB-']
-RCB- 30 ['-RRB-']
; 133 [':']
... 19 [':']
C$ 13 ['$']
US$ 15 ['$']
A$ 1 ['$']
? 43 ['.']
! 5 ['.']
` 8 ['``']
- 8 [':']
# 7 ['#']
HK$ 6 ['$']


In [75]:
# Hand-made context for a manual feature check.
# Presumably t2 / t1 / t are the tag two back, the previous tag and the candidate tag,
# mirroring the names used in hash_rules -- TODO confirm.
t2, t1, t = 'VB', 'NN', 'BB'
w = ('preprocess', 'abc', 'being')
i = 1  # position within w

In [76]:
# Evaluate `feat` on the hand-made context and print the result.
# NOTE(review): `feat` is called as a function here, but no callable with that name
# is defined in the visible cells -- the only visible top-level binding is the loop
# variable over the keys of all_features. The execution counter also jumps
# (13 -> 15 -> 75), so this probably relies on a removed cell; confirm it survives
# Restart & Run All.
print(feat(t2, t1, w, i, t))

True
