In [1]:
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *
from io import open

import datetime
import os
import time

from NoisyNLP.utils import *
from NoisyNLP.features import *
from NoisyNLP.models import *
from NoisyNLP.experiments import *

In [2]:
# Corpus splits (file names indicate BIEOU-tagged TSV files), the vocabulary
# file and the output directory of the first experiment.
train_files = ["./data/cleaned/train.BIEOU.tsv"]
dev_files = ["./data/cleaned/dev.BIEOU.tsv", "./data/cleaned/dev_2015.BIEOU.tsv"]
test_files = ["./data/cleaned/test.BIEOU.tsv"]
vocab_file = "./vocab.no_extras.txt"
outdir = "./test_exp"

# External resources that live outside the repository. The defaults are the
# original machine-specific locations; set the environment variables to run
# the notebook on another machine without editing this cell.
wordvec_file = os.environ.get(
    "NOISYNLP_WORDVEC_FILE",
    "/home/entity/Downloads/GloVe/glove.twitter.27B.200d.txt.processed.txt")
gimple_twitter_brown_clusters_dir = os.environ.get(
    "NOISYNLP_TWITTER_BROWN_CLUSTERS",
    "/home/entity/Downloads/GloVe/50mpaths2")

# Repository-relative resources: lexicons and the cluster directories handed
# to ClusterFeatures in the cells below.
dictionary_dir="./data/cleaned/custom_lexicons/"
data_brown_cluster_dir="word_clusters/"
data_clark_cluster_dir="clark_clusters/"

In [3]:
exp = Experiment(outdir, train_files, dev_files, test_files, vocab_file)

# Collect the bare tokens (element 0 of every tagged item) of each sentence
# from the train, dev and test splits, in that order.
all_sequences = []
for split_sequences in (exp.train_sequences, exp.dev_sequences, exp.test_sequences):
    for tagged_sentence in split_sequences:
        all_sequences.append([tagged_item[0] for tagged_item in tagged_sentence])

print("Total sequences: ", len(all_sequences))

Total sequences:  7670


In [4]:
# Build the WordVectors wrapper from the token sequences of all three splits
# and the vector file configured in wordvec_file, then request 50 clusters.
# NOTE(review): in the visible cells word2vec_clusters is only pickled and is
# never passed to a feature function — confirm that this is intentional.
wv_model = WordVectors(all_sequences,wordvec_file)
word2vec_clusters = wv_model.get_clusters(n_clusters=50)

In [5]:
# Load the lexicons found under dictionary_dir; the recorded output shows one
# "read dict <name>" line per lexicon that was read.
dict_features = DictionaryFeatures(dictionary_dir)

read dict time.holiday
read dict location.country
read dict award.award
read dict location
read dict movies.txt.results.txt
read dict lastname.5000
read dict sports.sports_team
read dict tv.tv_program
read dict book.newspaper
read dict internet.website
read dict base.events.festival_series
read dict products.txt.results.txt
read dict persons.txt.results.txt
read dict tv.tv_network
read dict cvg.computer_videogame
read dict business.consumer_product
read dict people.family_name
read dict tvshows.txt.results.txt
read dict government.government_agency
read dict venues
read dict broadcast.tv_channel
read dict music_artists.txt.results.txt
read dict automotive.make
read dict product
read dict business.consumer_company
read dict music_artists.txt
read dict venture_capital.venture_funded_company
read dict cvg.cvg_developer
read dict architecture.museum
read dict lower.5000
read dict automotive.model
read dict companynames.txt.results.txt
read dict musicartist_namevariants.unique.txt
read dict

In [6]:
# Brown clusters from the external Twitter resource configured in
# gimple_twitter_brown_clusters_dir.
# Unlike the data_brown_cf / data_clark_cf cells, the path is passed explicitly
# to set_cluster_file_path() here — presumably because the configured value
# already names the cluster file itself; TODO confirm against ClusterFeatures.
gimple_brown_cf = ClusterFeatures(gimple_twitter_brown_clusters_dir, cluster_type="brown")
gimple_brown_cf.set_cluster_file_path(gimple_twitter_brown_clusters_dir)
gimple_brown_clusters = gimple_brown_cf.read_clusters()

In [7]:
# Brown clusters read from data_brown_cluster_dir (presumably clusters trained
# on this corpus — the name suggests it, the generating step is not shown).
# set_cluster_file_path() is called without arguments, so the file location is
# resolved inside ClusterFeatures.
data_brown_cf = ClusterFeatures(data_brown_cluster_dir, cluster_type="brown")
data_brown_cf.set_cluster_file_path()
data_brown_clusters = data_brown_cf.read_clusters()

In [8]:
# Clark clusters read from data_clark_cluster_dir, configured with
# n_clusters=32. As with the Brown clusters above, the cluster file path is
# resolved inside ClusterFeatures because no argument is passed.
data_clark_cf = ClusterFeatures(data_clark_cluster_dir, cluster_type="clark", n_clusters=32)
data_clark_cf.set_cluster_file_path()
data_clark_clusters = data_clark_cf.read_clusters()

In [9]:
import pickle

In [10]:
# Persist the WordVectors wrapper (presumably so it can be reloaded without
# rebuilding it). open() does not create missing parent directories, so make
# sure the target directory exists first, mirroring how Experiment creates its
# own output directory.
if not os.path.isdir("pickled_data"):
    os.makedirs("pickled_data")
# The handle is only written to, so plain "wb" is sufficient.
with open("pickled_data/wv_model.pkl", "wb") as fp:
    pickle.dump(wv_model, fp)

In [11]:
# Persist the 50 word-vector clusters. open() does not create missing parent
# directories, so make sure the target directory exists first, mirroring how
# Experiment creates its own output directory.
if not os.path.isdir("pickled_data"):
    os.makedirs("pickled_data")
# The handle is only written to, so plain "wb" is sufficient.
with open("pickled_data/word2vec_clusters.pkl", "wb") as fp:
    pickle.dump(word2vec_clusters, fp)

In [14]:
# Progress marker: printed once the preceding setup cells have finished.
print("Done")

Done


In [15]:
def get_X_y_exp1(sequences):
    """Build the feature and label lists for the first experiment.

    Each sentence is passed to ``sent2features`` with ``window=2``,
    ``interactions=True``, ``dict_interactions=False`` and ``lowercase=True``,
    together with the notebook-level dictionary features, the word-vector
    model and three cluster vocabularies (``gimple_brown_clusters``,
    ``data_brown_clusters``, ``data_clark_clusters``). Labels come from
    ``sent2labels``.

    Args:
        sequences: sentences to process; traversed twice, first for the
            features and then for the labels.

    Returns:
        Tuple ``(X, y)``: the per-sentence features and per-sentence labels.
    """
    def featurize(sentence):
        # A fresh cluster-vocabulary list is built for every sentence.
        return sent2features(
            sentence, vocab=None,
            dict_features=dict_features, vocab_presence_only=False,
            window=2, interactions=True, dict_interactions=False,
            lowercase=True, dropout=0, word2vec_model=wv_model.model,
            cluster_vocabs=[gimple_brown_clusters,
                            data_brown_clusters,
                            data_clark_clusters])

    features = []
    for sentence in sequences:
        features.append(featurize(sentence))

    labels = []
    for sentence in sequences:
        labels.append(sent2labels(sentence))

    return features, labels

In [16]:
# Generate the train/dev/test model data with the experiment-1 feature
# function; the recorded output reports timings and X/y sizes per split.
exp.gen_model_data(proc_func=get_X_y_exp1)

Train feature generation took: 0:00:32.735653
Dev feature generation took: 0:00:14.997115
Test feature generation took: 0:00:36.278326
Train: 2394, 2394
Dev: 1420, 1420
Test: 3856, 3856


In [17]:
# Fit the model and evaluate it. The recorded output shows the fitting time
# and per-category precision/recall/F1 computed through NoisyNLP/conlleval.py.
exp.fit_evaluate()

Model fitting took: 0:09:09.681945
Evaluating train data
Running:
cat "./test_exp/train.bieou.tsv" | tr '\t' ' ' | perl -ne '{chomp;s/\r//g;print $_,"\n";}' | python NoisyNLP/conlleval.py
Processed 46469 tokens.
Phrases: total=1499, found=1261, correct=1226
Overall accuracy: 99.21
       category  precision  recall     F1  support
0       overall      97.22   81.79  88.84      0.0
1       company      98.50   76.61  86.18    133.0
2      facility      96.59   80.95  88.08     88.0
3       geo-loc      96.12   89.53  92.71    258.0
4         movie     100.00   79.41  88.52     27.0
5   musicartist     100.00   74.55  85.42     41.0
6         other      98.31   77.33  86.57    177.0
7        person      96.38   88.86  92.47    414.0
8       product     100.00   69.39  81.93     68.0
9    sportsteam      94.59   68.63  79.55     37.0
10       tvshow     100.00   52.94  69.23     18.0
Running:
cat "./test_exp/train.notypes.tsv" | tr '\t' ' ' | perl -ne '{chomp;s/\r//g;print $_,"\n";}' | py

In [18]:
# Print a description of the fitted model; the recorded output lists the top
# label transitions followed by weighted (label, feature) pairs.
exp.describe_model()

Top 25 likely transitions:
O      -> O       4.644402
B-person -> E-person 4.411072
I-other -> E-other 4.175164
I-other -> I-other 4.124702
B-other -> I-other 3.928763
B-other -> E-other 3.731435
B-facility -> E-facility 3.701140
B-product -> E-product 3.352413
I-facility -> E-facility 3.131877
B-musicartist -> E-musicartist 3.066101
B-company -> E-company 2.957187
B-facility -> I-facility 2.954348
B-geo-loc -> E-geo-loc 2.939255
I-product -> I-product 2.675727
B-sportsteam -> E-sportsteam 2.524979
B-movie -> E-movie 2.385254
B-tvshow -> E-tvshow 2.330000
I-product -> E-product 2.308091
B-product -> I-product 2.251306
I-person -> E-person 2.223817
U-person -> O       2.219954
O      -> U-person 2.197314
B-movie -> I-movie 2.187085
I-movie -> E-movie 2.111887
E-person -> O       2.062839
3.155129 O        __EOS__
2.953221 O        __BOS__
2.699114 U-geo-loc __BROWN_CLUSTER_0__:11100110101
1.955702 U-person __BROWN_CLUSTER_0__:111001110
1.927359 U-geo-loc __BROWN_CLUSTER_0__:11100110100


## Train + Dev

In [19]:
# Second run: the training set is the train files plus the dev files.
# NOTE(review): dev_files is still passed as the dev split, so dev-set scores
# of this run are measured on data that is also part of the training set.
outdir = "./test_exp_train_dev"
exp = Experiment(outdir, train_files + dev_files, dev_files, test_files, vocab_file)

Directory ./test_exp_train_dev doesn't exist.
Directory ./test_exp_train_dev created.


In [20]:
# Regenerate the model data for the train+dev experiment with the same
# experiment-1 feature function.
exp.gen_model_data(proc_func=get_X_y_exp1)

Train feature generation took: 0:00:41.075128
Dev feature generation took: 0:00:13.437942
Test feature generation took: 0:00:34.648778
Train: 3814, 3814
Dev: 1420, 1420
Test: 3856, 3856


In [22]:
# Fit and evaluate the train+dev model (results are in the recorded output).
exp.fit_evaluate()

Model fitting took: 0:15:40.636907
Evaluating train data
Running:
cat "./test_exp_train_dev/train.bieou.tsv" | tr '\t' ' ' | perl -ne '{chomp;s/\r//g;print $_,"\n";}' | python NoisyNLP/conlleval.py
Processed 69519 tokens.
Phrases: total=2436, found=2033, correct=1949
Overall accuracy: 99.10
       category  precision  recall     F1  support
0       overall      95.87   80.01  87.22      0.0
1       company      97.30   74.07  84.11    185.0
2      facility      95.04   76.16  84.56    121.0
3       geo-loc      93.38   86.79  89.96    408.0
4         movie     100.00   80.77  89.36     42.0
5   musicartist      96.51   74.11  83.84     86.0
6         other      96.56   76.49  85.36    320.0
7        person      95.62   88.17  91.74    639.0
8       product      98.04   69.44  81.30    102.0
9    sportsteam      98.18   69.23  81.20    110.0
10       tvshow     100.00   47.62  64.52     20.0
Running:
cat "./test_exp_train_dev/train.notypes.tsv" | tr '\t' ' ' | perl -ne '{chomp;s/\r//g;p

In [23]:
# Print the strongest transitions and feature weights of the train+dev model.
exp.describe_model()

Top 25 likely transitions:
B-person -> E-person 5.003097
O      -> O       4.906769
I-other -> E-other 4.725323
I-other -> I-other 4.586412
B-other -> I-other 4.405346
B-other -> E-other 4.210546
I-product -> I-product 4.137379
B-facility -> E-facility 3.962764
B-product -> E-product 3.861498
I-facility -> E-facility 3.755069
B-facility -> I-facility 3.638113
B-company -> E-company 3.637893
I-product -> E-product 3.546175
B-musicartist -> E-musicartist 3.532204
B-sportsteam -> E-sportsteam 3.431834
B-geo-loc -> E-geo-loc 3.311323
B-product -> I-product 3.293329
B-movie -> E-movie 3.077283
I-person -> E-person 2.893064
B-tvshow -> E-tvshow 2.783769
I-musicartist -> E-musicartist 2.735140
B-company -> I-company 2.685208
I-geo-loc -> E-geo-loc 2.682392
B-musicartist -> I-musicartist 2.669835
O      -> U-person 2.583387
3.628546 O        __EOS__
3.055463 O        __BOS__
2.542522 U-sportsteam __BROWN_CLUSTER_0__:1111011010
2.536144 U-geo-loc __BROWN_CLUSTER_0__:11100110101
2.019569 U-perso

## Word vectors and resources enriched by test sequences

In [46]:
# Cluster directories for the "test enriched" variant introduced by the
# section heading above (Brown and Clark clusters respectively).
test_enriched_data_brown_cluster_dir="test_enriched/brown_clusters/"
test_enriched_data_clark_cluster_dir="test_enriched/clark_clusters/"

In [47]:
# Presumably the Brown clustering executable and its input text file.
# NOTE(review): neither brown_exec_path nor brown_input_data_path is referenced
# by any other visible cell, and brown_exec_path is a machine-specific absolute
# path. The execution counts jump from 23 to 46, so the cell that generated the
# clusters may have been removed — confirm.
brown_exec_path="/home/entity/Downloads/brown-cluster/wcluster"
brown_input_data_path="test_enriched/all_sequences.brown.txt"
# Unlike data_brown_cf, this ClusterFeatures is configured with n_clusters=100.
test_enriched_data_brown_cf = ClusterFeatures(test_enriched_data_brown_cluster_dir,
                                              cluster_type="brown", n_clusters=100)

In [48]:
# Resolve the cluster file inside ClusterFeatures (no explicit path is given)
# and read the test-enriched Brown clusters.
test_enriched_data_brown_cf.set_cluster_file_path()
test_enriched_data_brown_clusters = test_enriched_data_brown_cf.read_clusters()

In [49]:
# Test-enriched Clark clusters, configured with n_clusters=32 like
# data_clark_cf; the cluster file path is resolved inside ClusterFeatures.
test_enriched_data_clark_cf = ClusterFeatures(test_enriched_data_clark_cluster_dir,
                                              cluster_type="clark", n_clusters=32)
test_enriched_data_clark_cf.set_cluster_file_path()
test_enriched_data_clark_clusters = test_enriched_data_clark_cf.read_clusters()

In [50]:
def get_X_y_exp2(sequences):
    """Build the feature and label lists for the test-enriched experiment.

    Uses the same ``sent2features`` settings as experiment 1 (``window=2``,
    ``interactions=True``, ``dict_interactions=False``, ``lowercase=True``)
    but passes the test-enriched Brown and Clark cluster vocabularies next to
    ``gimple_brown_clusters``. Labels come from ``sent2labels``.

    Args:
        sequences: sentences to process; traversed twice, first for the
            features and then for the labels.

    Returns:
        Tuple ``(X, y)``: the per-sentence features and per-sentence labels.
    """
    def featurize(sentence):
        # A fresh cluster-vocabulary list is built for every sentence.
        return sent2features(
            sentence, vocab=None,
            dict_features=dict_features, vocab_presence_only=False,
            window=2, interactions=True, dict_interactions=False,
            lowercase=True, dropout=0, word2vec_model=wv_model.model,
            cluster_vocabs=[gimple_brown_clusters,
                            test_enriched_data_brown_clusters,
                            test_enriched_data_clark_clusters])

    features = []
    for sentence in sequences:
        features.append(featurize(sentence))

    labels = []
    for sentence in sequences:
        labels.append(sent2labels(sentence))

    return features, labels

In [51]:
# Train+dev experiment that will use the test-enriched cluster features.
# NOTE(review): the dev files are part of the training set and are also passed
# as the dev split, so dev scores are not held-out.
outdir = "./test_exp_train_dev_test_enriched"
exp = Experiment(outdir, train_files + dev_files, dev_files, test_files, vocab_file)

In [52]:
# Generate the model data with the experiment-2 (test-enriched clusters)
# feature function.
exp.gen_model_data(proc_func=get_X_y_exp2)

Train feature generation took: 0:00:40.592333
Dev feature generation took: 0:00:13.204734
Test feature generation took: 0:00:34.990218
Train: 3814, 3814
Dev: 1420, 1420
Test: 3856, 3856


In [53]:
# Fit and evaluate the test-enriched model (results are in the recorded output).
exp.fit_evaluate()

Model fitting took: 0:14:49.150418
Evaluating train data
Running:
cat "./test_exp_train_dev_test_enriched/train.bieou.tsv" | tr '\t' ' ' | perl -ne '{chomp;s/\r//g;print $_,"\n";}' | python NoisyNLP/conlleval.py
Processed 69519 tokens.
Phrases: total=2436, found=2125, correct=2076
Overall accuracy: 99.36
       category  precision  recall     F1  support
0       overall      97.69   85.22  91.03      0.0
1       company      98.98   79.84  88.38    196.0
2      facility      96.85   81.46  88.49    127.0
3       geo-loc      96.87   91.57  94.15    415.0
4         movie     100.00   82.69  90.53     43.0
5   musicartist      97.75   77.68  86.57     89.0
6         other      97.07   81.93  88.86    341.0
7        person      97.54   91.63  94.49    651.0
8       product      99.12   77.78  87.16    113.0
9    sportsteam      99.18   77.56  87.05    122.0
10       tvshow     100.00   66.67  80.00     28.0
Running:
cat "./test_exp_train_dev_test_enriched/train.notypes.tsv" | tr '\t' ' ' 

## Dict interactions and larger window

In [54]:
def get_X_y_exp3(sequences):
    """Build the feature and label lists for the wider-window experiment.

    Differs from experiment 2 only in the ``sent2features`` options
    ``window=4`` and ``dict_interactions=True``; the cluster vocabularies are
    ``gimple_brown_clusters`` plus the test-enriched Brown and Clark clusters.
    Labels come from ``sent2labels``.

    Args:
        sequences: sentences to process; traversed twice, first for the
            features and then for the labels.

    Returns:
        Tuple ``(X, y)``: the per-sentence features and per-sentence labels.
    """
    def featurize(sentence):
        # A fresh cluster-vocabulary list is built for every sentence.
        return sent2features(
            sentence, vocab=None,
            dict_features=dict_features, vocab_presence_only=False,
            window=4, interactions=True, dict_interactions=True,
            lowercase=True, dropout=0, word2vec_model=wv_model.model,
            cluster_vocabs=[gimple_brown_clusters,
                            test_enriched_data_brown_clusters,
                            test_enriched_data_clark_clusters])

    features = []
    for sentence in sequences:
        features.append(featurize(sentence))

    labels = []
    for sentence in sequences:
        labels.append(sent2labels(sentence))

    return features, labels

In [55]:
# Train+dev experiment for the window=4 / dictionary-interaction features.
# NOTE(review): the dev files are part of the training set and are also passed
# as the dev split, so dev scores are not held-out.
outdir = "./test_exp_tdt_enriched_win4_dict_intract"
exp = Experiment(outdir, train_files + dev_files, dev_files, test_files, vocab_file)

Directory ./test_exp_tdt_enriched_win4_dict_intract doesn't exist.
Directory ./test_exp_tdt_enriched_win4_dict_intract created.


In [56]:
# Generate the model data with the experiment-3 feature function
# (window=4, dict_interactions=True).
exp.gen_model_data(proc_func=get_X_y_exp3)

Train feature generation took: 0:00:40.270651
Dev feature generation took: 0:00:13.110668
Test feature generation took: 0:00:33.877503
Train: 3814, 3814
Dev: 1420, 1420
Test: 3856, 3856


In [57]:
# Fit and evaluate the experiment-3 model.
# NOTE(review): the recorded train metrics below are digit-for-digit identical
# to the previous (experiment-2) run, although window and dict_interactions
# changed. Verify that these options actually alter the generated features and
# that the evaluation did not read stale files.
exp.fit_evaluate()

Model fitting took: 0:15:07.623600
Evaluating train data
Running:
cat "./test_exp_tdt_enriched_win4_dict_intract/train.bieou.tsv" | tr '\t' ' ' | perl -ne '{chomp;s/\r//g;print $_,"\n";}' | python NoisyNLP/conlleval.py
Processed 69519 tokens.
Phrases: total=2436, found=2125, correct=2076
Overall accuracy: 99.36
       category  precision  recall     F1  support
0       overall      97.69   85.22  91.03      0.0
1       company      98.98   79.84  88.38    196.0
2      facility      96.85   81.46  88.49    127.0
3       geo-loc      96.87   91.57  94.15    415.0
4         movie     100.00   82.69  90.53     43.0
5   musicartist      97.75   77.68  86.57     89.0
6         other      97.07   81.93  88.86    341.0
7        person      97.54   91.63  94.49    651.0
8       product      99.12   77.78  87.16    113.0
9    sportsteam      99.18   77.56  87.05    122.0
10       tvshow     100.00   66.67  80.00     28.0
Running:
cat "./test_exp_tdt_enriched_win4_dict_intract/train.notypes.tsv" 

## With word features

In [58]:
def get_X_y_exp4(sequences):
    """Build the feature and label lists for the word-feature experiment.

    Same ``sent2features`` configuration as experiment 3 (``window=4``,
    ``dict_interactions=True``, test-enriched clusters) except that
    ``lowercase=False`` is passed. Labels come from ``sent2labels``.

    Args:
        sequences: sentences to process; traversed twice, first for the
            features and then for the labels.

    Returns:
        Tuple ``(X, y)``: the per-sentence features and per-sentence labels.
    """
    def featurize(sentence):
        # A fresh cluster-vocabulary list is built for every sentence.
        return sent2features(
            sentence, vocab=None,
            dict_features=dict_features, vocab_presence_only=False,
            window=4, interactions=True, dict_interactions=True,
            lowercase=False, dropout=0, word2vec_model=wv_model.model,
            cluster_vocabs=[gimple_brown_clusters,
                            test_enriched_data_brown_clusters,
                            test_enriched_data_clark_clusters])

    features = []
    for sentence in sequences:
        features.append(featurize(sentence))

    labels = []
    for sentence in sequences:
        labels.append(sent2labels(sentence))

    return features, labels

In [59]:
# Train+dev experiment for the lowercase=False ("word features") setting.
# NOTE(review): the dev files are part of the training set and are also passed
# as the dev split, so dev scores are not held-out.
outdir = "./test_exp_tdt_enriched_word_Features"
exp = Experiment(outdir, train_files + dev_files, dev_files, test_files, vocab_file)

Directory ./test_exp_tdt_enriched_word_Features doesn't exist.
Directory ./test_exp_tdt_enriched_word_Features created.


In [60]:
# Generate the model data with the experiment-4 feature function
# (lowercase=False).
exp.gen_model_data(proc_func=get_X_y_exp4)

Train feature generation took: 0:00:40.389356
Dev feature generation took: 0:00:12.946482
Test feature generation took: 0:00:34.710360
Train: 3814, 3814
Dev: 1420, 1420
Test: 3856, 3856


In [61]:
# Fit and evaluate the experiment-4 model (results are in the recorded output).
exp.fit_evaluate()

Model fitting took: 0:16:02.068697
Evaluating train data
Running:
cat "./test_exp_tdt_enriched_word_Features/train.bieou.tsv" | tr '\t' ' ' | perl -ne '{chomp;s/\r//g;print $_,"\n";}' | python NoisyNLP/conlleval.py
Processed 69519 tokens.
Phrases: total=2436, found=2167, correct=2129
Overall accuracy: 99.45
       category  precision  recall     F1  support
0       overall      98.25   87.40  92.50      0.0
1       company      98.53   82.72  89.93    204.0
2      facility      96.92   83.44  89.68    130.0
3       geo-loc      97.37   92.94  95.10    419.0
4         movie     100.00   86.54  92.78     45.0
5   musicartist      97.87   82.14  89.32     94.0
6         other      97.97   83.66  90.25    345.0
7        person      98.63   93.22  95.85    655.0
8       product      99.14   79.86  88.46    116.0
9    sportsteam      99.22   81.41  89.44    128.0
10       tvshow     100.00   73.81  84.93     31.0
Running:
cat "./test_exp_tdt_enriched_word_Features/train.notypes.tsv" | tr '\t

## With global features

In [96]:
# Load IPython's autoreload extension so edited project modules can be
# re-imported without restarting the kernel.
%load_ext autoreload

In [108]:
# Import NoisyNLP.experiments and mark it for automatic reloading.
%aimport NoisyNLP.experiments

In [160]:
# Mode 2: reload all modules (except those excluded via %aimport) before each
# execution.
# NOTE(review): the execution counts of the three autoreload cells (96, 108,
# 160) show they were run far apart; consider moving them to the top of the
# notebook so a fresh Run All behaves the same way.
%autoreload 2

In [161]:
outdir = "./test_exp_tdt_enriched_word_Features_global"
exp = Experiment(outdir, train_files + dev_files, dev_files, test_files, vocab_file)

# Distinct entity categories in the training data: drop the two-character
# position prefix of every non-"O" tag and de-duplicate.
# NOTE(review): the order of cat_names is whatever set iteration yields;
# consider sorted(...) if a reproducible category order matters downstream.
cat_names = list({
    tagged_item[1][2:]
    for tagged_sentence in exp.train_sequences
    for tagged_item in tagged_sentence
    if tagged_item[1] != "O"
})
cat_names

[u'product',
 u'facility',
 u'movie',
 u'company',
 u'sportsteam',
 u'musicartist',
 u'person',
 u'other',
 u'geo-loc',
 u'tvshow']

In [162]:
# Build the GlobalFeatures helper from the word-vector model, the Twitter
# Brown clusters and the category names collected above.
# NOTE(review): cluster_vocabs receives a single cluster object here, whereas
# the sent2features calls in this notebook pass a list of them — confirm that
# GlobalFeatures expects this shape.
global_features = GlobalFeatures(word2vec_model=wv_model.model, cluster_vocabs=gimple_brown_clusters, cat_names=cat_names)

In [163]:
# Peek at the first training sentence; the recorded output shows it as a list
# of Tag(token=..., tag=...) items.
exp.train_sequences[:1]

[[Tag(token=u'@SammieLynnsMom', tag=u'O'),
  Tag(token=u'@tg10781', tag=u'O'),
  Tag(token=u'they', tag=u'O'),
  Tag(token=u'will', tag=u'O'),
  Tag(token=u'be', tag=u'O'),
  Tag(token=u'all', tag=u'O'),
  Tag(token=u'done', tag=u'O'),
  Tag(token=u'by', tag=u'O'),
  Tag(token=u'Sunday', tag=u'O'),
  Tag(token=u'trust', tag=u'O'),
  Tag(token=u'me', tag=u'O'),
  Tag(token=u'*wink*', tag=u'O')]]

In [164]:
# Fit the global models; the recorded output shows one classification report
# per category ("Processing: <category>").
# NOTE(review): exp was built with train_files + dev_files as training data, so
# the dev sequences passed as test_sequences are also in exp.train_sequences;
# the reported scores are therefore not held-out.
global_features.fit_model(exp.train_sequences, test_sequences=exp.dev_sequences)

Processing: product
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1377
          1       1.00      0.95      0.98        43

avg / total       1.00      1.00      1.00      1420

Processing: facility
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1375
          1       0.97      0.87      0.92        45

avg / total       0.99      1.00      0.99      1420

Processing: movie
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1406
          1       1.00      0.93      0.96        14

avg / total       1.00      1.00      1.00      1420

Processing: company
             precision    recall  f1-score   support

          0       0.99      1.00      0.99      1360
          1       0.98      0.75      0.85        60

avg / total       0.99      0.99      0.99      1420

Processing: sportsteam
             precision    recall  f1-s

In [165]:
def get_X_y_exp5(sequences):
    """Build the feature and label lists for the global-feature experiment.

    Global predictions are computed once for all ``sequences``; each sentence
    is then passed to ``sent2features`` with the experiment-4 settings
    (``window=4``, ``dict_interactions=True``, ``lowercase=False``,
    test-enriched clusters) plus ``extra_features`` obtained from
    ``global_features.get_global_sequence_features`` for that sentence and its
    prediction. Labels come from ``sent2labels``.

    Args:
        sequences: sentences to process; paired positionally with the global
            predictions via ``zip`` and traversed again for the labels.

    Returns:
        Tuple ``(X, y)``: the per-sentence features and per-sentence labels.
    """
    preds = global_features.get_global_predictions(sequences)

    def featurize(sentence, sentence_preds):
        # A fresh cluster-vocabulary list is built for every sentence, and the
        # sentence-level global features are computed as the last argument.
        return sent2features(
            sentence, vocab=None,
            dict_features=dict_features, vocab_presence_only=False,
            window=4, interactions=True, dict_interactions=True,
            lowercase=False, dropout=0, word2vec_model=wv_model.model,
            cluster_vocabs=[gimple_brown_clusters,
                            test_enriched_data_brown_clusters,
                            test_enriched_data_clark_clusters],
            extra_features=global_features.get_global_sequence_features(
                sentence, predictions=sentence_preds))

    features = []
    for sentence, sentence_preds in zip(sequences, preds):
        features.append(featurize(sentence, sentence_preds))

    labels = []
    for sentence in sequences:
        labels.append(sent2labels(sentence))

    return features, labels

In [166]:
# Generate the model data with the experiment-5 feature function, which adds
# the global-prediction features. The recorded timings are roughly double
# those of the earlier experiments.
exp.gen_model_data(proc_func=get_X_y_exp5)

Train feature generation took: 0:01:25.250826
Dev feature generation took: 0:00:27.527947
Test feature generation took: 0:01:13.002616
Train: 3814, 3814
Dev: 1420, 1420
Test: 3856, 3856


In [167]:
# Fit and evaluate the model with global features (results are in the
# recorded output).
exp.fit_evaluate()

Model fitting took: 0:26:30.177333
Evaluating train data
Running:
cat "./test_exp_tdt_enriched_word_Features_global/train.bieou.tsv" | tr '\t' ' ' | perl -ne '{chomp;s/\r//g;print $_,"\n";}' | python NoisyNLP/conlleval.py
Processed 69519 tokens.
Phrases: total=2436, found=2295, correct=2273
Overall accuracy: 99.71
       category  precision  recall     F1  support
0       overall      99.04   93.31  96.09      0.0
1       company      98.68   92.59  95.54    228.0
2      facility      98.55   90.07  94.12    138.0
3       geo-loc      98.36   95.90  97.12    428.0
4         movie     100.00   96.15  98.04     50.0
5   musicartist      98.11   92.86  95.41    106.0
6         other      99.46   90.35  94.68    367.0
7        person      99.55   95.38  97.42    664.0
8       product      98.48   90.28  94.20    132.0
9    sportsteam      99.33   94.87  97.05    149.0
10       tvshow     100.00   78.57  88.00     33.0
Running:
cat "./test_exp_tdt_enriched_word_Features_global/train.notypes