In [1]:
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *
from io import open

In [2]:
from NoisyNLP.utils import *
from NoisyNLP.features import *

In [3]:
# Every split is read with the same TSV parsing options.
tsv_options = dict(sep="\t", notypes=False)

# BIEOU-tagged splits; the dev set is the concatenation of the two dev files.
train_sequences = load_sequences("./data/cleaned/train.BIEOU.tsv", **tsv_options)
dev_sequences = (
    load_sequences("./data/cleaned/dev.BIEOU.tsv", **tsv_options)
    + load_sequences("./data/cleaned/dev_2015.BIEOU.tsv", **tsv_options)
)
test_sequences = load_sequences("./data/cleaned/test.BIEOU.tsv", **tsv_options)

# Vocabulary file plus the category names found in the training split.
vocab = load_vocab("./vocab.no_extras.txt")
cat_names = get_cat_names(train_sequences)

# One-shot summary of what was loaded: vocabulary size, categories, split sizes.
print("Vocab: %s, cat_names: %s\nTrain: %s, Dev: %s, Test: %s" % (
    len(vocab),
    cat_names,
    len(train_sequences),
    len(dev_sequences),
    len(test_sequences),
))

Vocab: 8023, cat_names: set([u'product', u'facility', u'movie', u'company', u'sportsteam', u'musicartist', u'person', u'other', u'geo-loc', u'tvshow'])
Train: 2394, Dev: 1420, Test: 3856


In [4]:
# For every train and dev sequence keep only the first field of each item
# (presumably the token text; the remaining fields are dropped).
all_sentences = [
    [item[0] for item in sequence]
    for sequence in train_sequences + dev_sequences
]

In [5]:
# Hard-coded evaluation report used as input for parse_results in the next
# cell: a summary line with token/phrase counts, an overall
# accuracy/precision/recall/FB1 line, then one line per entity category ending
# in a count. The text looks like conlleval output -- TODO confirm its origin.
# The exact spacing is part of the fixture, so do not reformat the literal.
results = """processed 23050 tokens with 937 phrases; found: 748 phrases; correct: 343.
accuracy:  94.34%; precision:  45.86%; recall:  36.61%; FB1:  40.71
          company: precision:  46.94%; recall:  31.94%; FB1:  38.02  49
         facility: precision:  10.81%; recall:   8.70%; FB1:   9.64  37
          geo-loc: precision:  45.45%; recall:  58.64%; FB1:  51.21  209
            movie: precision:   0.00%; recall:   0.00%; FB1:   0.00  8
      musicartist: precision:  28.57%; recall:   3.51%; FB1:   6.25  7
            other: precision:  26.45%; recall:  17.88%; FB1:  21.33  121
           person: precision:  61.39%; recall:  65.16%; FB1:  63.22  259
          product: precision:  24.14%; recall:  15.22%; FB1:  18.67  29
       sportsteam: precision:  77.78%; recall:  20.00%; FB1:  31.82  27
           tvshow: precision:   0.00%; recall:   0.00%; FB1:   0.00  2
"""

In [6]:
# Parse the report text into a dict. As the printed value shows, the dict holds
# overall_accuracy (a float), the counts processed_tokens/found/total/correct
# (kept as strings, not ints) and "prfs", a table with one row per category
# (plus an "overall" row whose support is 0.0) and the columns
# precision/recall/F1/support.
r = parse_results(results)
print(r)

{u'overall_accuracy': 94.34, u'processed_tokens': u'23050', u'prfs':        category  precision  recall     F1  support
0       overall      45.86   36.61  40.71      0.0
1       company      46.94   31.94  38.02     49.0
2      facility      10.81    8.70   9.64     37.0
3       geo-loc      45.45   58.64  51.21    209.0
4         movie       0.00    0.00   0.00      8.0
5   musicartist      28.57    3.51   6.25      7.0
6         other      26.45   17.88  21.33    121.0
7        person      61.39   65.16  63.22    259.0
8       product      24.14   15.22  18.67     29.0
9    sportsteam      77.78   20.00  31.82     27.0
10       tvshow       0.00    0.00   0.00      2.0, u'found': u'748', u'total': u'937', u'correct': u'343'}


In [7]:
# Display only the per-category precision/recall/F1/support table.
r["prfs"]

Unnamed: 0,category,precision,recall,F1,support
0,overall,45.86,36.61,40.71,0.0
1,company,46.94,31.94,38.02,49.0
2,facility,10.81,8.7,9.64,37.0
3,geo-loc,45.45,58.64,51.21,209.0
4,movie,0.0,0.0,0.0,8.0
5,musicartist,28.57,3.51,6.25,7.0
6,other,26.45,17.88,21.33,121.0
7,person,61.39,65.16,63.22,259.0
8,product,24.14,15.22,18.67,29.0
9,sportsteam,77.78,20.0,31.82,27.0


In [8]:
# Score a tagged prediction file. Per the command echoed in the output, the
# file is piped through `tr` (tabs -> spaces) and a perl clean-up step into
# NoisyNLP/conlleval.py, and the result has the same dict layout that
# parse_results produced above.
# NOTE(review): the echoed command uses the relative path NoisyNLP/conlleval.py,
# so this presumably has to run from the repository root -- confirm.
get_conll_eval("./test_wv.crf.bieou.tsv")

Running:
cat "./test_wv.crf.bieou.tsv" | tr '\t' ' ' | perl -ne '{chomp;s/\r//g;print $_,"\n";}' | python NoisyNLP/conlleval.py


{u'correct': '839',
 u'found': '1661',
 u'overall_accuracy': 91.84,
 u'prfs':        category  precision  recall     F1  support
 0       overall      50.51   24.05  32.58      0.0
 1       company      63.01   14.79  23.96    146.0
 2      facility      43.33   15.42  22.74     90.0
 3       geo-loc      66.38   52.71  58.76    702.0
 4         movie       0.00    0.00   0.00      9.0
 5   musicartist      20.00    1.04   1.98     10.0
 6         other      29.14    8.63  13.32    175.0
 7        person      40.44   33.88  36.87    408.0
 8       product      18.31    5.28   8.20     71.0
 9    sportsteam      24.39    6.80  10.64     41.0
 10       tvshow      11.11    3.03   4.76      9.0,
 u'processed_tokens': '61908',
 u'total': '3489'}

In [9]:
# Build the regex-based token feature extractor (constructed with no arguments;
# it logs an initialisation message).
regex_features = RegexFeatures()

Initialized RegexFeature


In [10]:
# Smoke test on a mixed letter/digit token: the returned dict flags
# containsDigit and isAlphaNumeric as True.
regex_features.process("ABC123")

{u'containsDigit': True, u'isAlphaNumeric': True}

In [11]:
# Load the lexicon dictionaries from the test fixture directory; the log line
# shows a single dictionary, cap.1000, being read.
dict_features = DictionaryFeatures("./data/test/lexicons")

read dict cap.1000


In [12]:
# Dictionary features for one position in a tokenised sentence. The second
# argument is presumably the token index (1 -> "york") -- TODO confirm.
# The result is a list of "DICT=<lexicon name>" strings.
dict_features.GetDictFeatures(["new", "york", "is", "a", "great", "place", "in", "america"], 1)

[u'DICT=cap.1000']

In [13]:
# Same lookup for a hashtag token; matches come back with a "DICT_HASHTAG="
# prefix. Presumably the leading "#" is stripped before the lookup -- confirm.
dict_features.GetHashtagDictFeatures("#york")

[u'DICT_HASHTAG=cap.1000']

In [14]:
# Two toy sentences, each a tuple of lower-cased tokens.
sequences = [
    ("the", "man", "crossed", "the", "road", "in", "new", "york"),
    ("new", "york", "is", "a", "great", "place", "in", "america"),
]

# Word-vector wrapper built from the toy sentences and the processed
# GloVe Twitter 200d test file.
word2vec = WordVectors(sequences, "./data/test/glove.twitter.200d.txt.processed.txt")

In [15]:
# Sanity check: the model vocabulary must be exactly the set of distinct
# tokens that occur in the toy sentences.
set(word2vec.model.vocab.keys()) == {token for sentence in sequences for token in sentence}

True

In [16]:
# Group the vocabulary words into 3 clusters; the result maps each word to an
# integer cluster id (0-2 in the output shown).
# NOTE(review): no seed is passed here; if the underlying clustering is randomly
# initialised the assignments may differ between runs -- confirm.
word2vec.get_clusters(n_clusters=3)

{u'a': 2,
 u'america': 0,
 u'crossed': 1,
 u'great': 1,
 u'in': 1,
 u'is': 1,
 u'man': 2,
 u'new': 1,
 u'place': 0,
 u'road': 0,
 u'the': 1,
 u'york': 1}

In [17]:
# Inspect the model's syn0norm matrix (a float32 array in the output).
# Presumably these are the length-normalised word vectors, one row per
# vocabulary word -- TODO confirm against the model class.
word2vec.model.syn0norm

array([[-0.01950729, -0.0318368 , -0.03900977, ..., -0.04715571,
         0.06079158, -0.01819291],
       [ 0.0069145 , -0.00876959,  0.06516815, ...,  0.00377096,
         0.0426897 , -0.05388267],
       [ 0.03771846,  0.02122468, -0.05407947, ..., -0.04112153,
         0.03916675, -0.00868494],
       ..., 
       [ 0.01300056, -0.03810946,  0.0111307 , ..., -0.07383171,
         0.05161401,  0.01348415],
       [ 0.06153985,  0.03178791,  0.03036125, ...,  0.08684369,
        -0.0501973 ,  0.00344352],
       [ 0.02337532,  0.10477841, -0.08604416, ...,  0.05988278,
        -0.013699  ,  0.08674569]], dtype=float32)

In [18]:
# Brown clusters: point the reader at the fixture directory, select the
# "paths" file inside it, then load the clusters.
# NOTE(review): the names `cf` and `clusters` are rebound by the Clark cell
# further down, so these values only reflect Brown clusters until that cell runs.
cf = ClusterFeatures("data/test/brown_clusters/", cluster_type="brown")
cf.set_cluster_file_path("data/test/brown_clusters/paths")
clusters = cf.read_clusters()

In [19]:
# Brown result: a dict mapping each word to a bit-string cluster path; in this
# small fixture every word shares the path '00000'.
clusters

{u'15mins': u'00000',
 u'Burrows': u'00000',
 u'Carolina': u'00000',
 u'Mythological': u'00000',
 u'day-to-day': u'00000',
 u'elbows': u'00000',
 u'rearview': u'00000',
 u'sec': u'00000',
 u'sexiness': u'00000',
 u'walls': u'00000'}

In [20]:
# Clark clusters: same three steps as for the Brown clusters, using the
# 32-cluster fixture file.
# NOTE(review): this rebinds `cf` and `clusters`, replacing the Brown objects
# created earlier; distinct names would keep the notebook safe to re-run out
# of order.
cf = ClusterFeatures("data/test/clark_clusters/", cluster_type="clark")
cf.set_cluster_file_path("data/test/clark_clusters/clark_clusters.32.txt")
clusters = cf.read_clusters()

In [21]:
# Clark result: a dict mapping each token to a (cluster id as string, float)
# pair. The float is presumably the token's probability/score within its
# cluster -- TODO confirm against the cluster file format.
clusters

{u'!': (u'1', 0.183724),
 u"'s": (u'30', 0.303898),
 u',': (u'27', 0.553379),
 u'.': (u'1', 0.439693),
 u'...': (u'1', 0.121445),
 u':': (u'13', 0.655373),
 u'I': (u'5', 0.404412),
 u'RT': (u'22', 0.613343),
 u'a': (u'14', 0.273296),
 u'and': (u'27', 0.249616),
 u'at': (u'2', 0.115745),
 u'for': (u'2', 0.146101),
 u'in': (u'2', 0.156466),
 u'it': (u'10', 0.224207),
 u'of': (u'2', 0.127345),
 u'on': (u'2', 0.136476),
 u'the': (u'14', 0.359631),
 u'to': (u'28', 0.925121),
 u'you': (u'19', 0.452206)}