In [11]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import doc2vec, word2vec
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold, StratifiedShuffleSplit

glove50 = '/Users/nadbordrozd/workspace/duedil_repos/experiments/similarity/data/glove.6B.50d.txt'
TRAIN_PATH = "r8-train-no-stop.txt"
TEST_PATH = "r8-test-no-stop.txt"

# TRAIN_PATH = "r52-train-no-stop.txt"
# TEST_PATH = "r52-test-no-stop.txt"

# TRAIN_PATH = "20ng-train-no-stop.txt"
# TEST_PATH = "20ng-train-no-stop.txt"
WORD_VECTORS_PATH = "glove50"
# WORD_VECTORS_PATH = "glove.840B.300d.txt"

In [12]:
X_train, y_train = [], []
with open(TRAIN_PATH, "rb") as infile:
    for line in infile:
        label, text = line.split("\t")
        X_train.append(text.split())
        y_train.append(label)


X_test, y_test = [], []
with open(TEST_PATH, "rb") as infile:
    for line in infile:
        label, text = line.split("\t")
        X_test.append(text.split())
        y_test.append(label)

X = np.array(X_train + X_test)
y = np.array(y_train + y_test)

X_train = np.array(X_train)
y_train = np.array(y_train)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [13]:
# pretrained word vectors
w2v_pre = {}
with open(WORD_VECTORS_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0]
        nums = map(float, parts[1:])
        w2v_pre[word] = np.array(nums)

In [14]:
model = gensim.models.Word2Vec(X, size=52)
w2v50 = dict(zip(model.index2word, model.syn0))

model = gensim.models.Word2Vec(X, size=100)
w2v100 = dict(zip(model.index2word, model.syn0))

model = gensim.models.Word2Vec(X, size=200)
w2v200 = dict(zip(model.index2word, model.syn0))

In [15]:
def benchmark(clf, X, y, test_size=0.5, splits=5):
    f1 = []
    acc = []
    for train_index, test_index in StratifiedShuffleSplit(
            y, splits, test_size=test_size, random_state=0):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        f1_micro = metrics.f1_score(y_test, pred, average='micro')
        f1.append(f1_micro)
        accuracy = (pred == y_test).mean()
        acc.append(accuracy)
    assert abs(np.mean(f1) - np.mean(acc)) < 0.001
    return np.mean(f1)

In [16]:
class MeanVectorizer(object):
    def __init__(self, w2v):
        self.w2v = w2v
    
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[t] for t in x if t in self.w2v], axis=0)
            for x in X
        ])

In [17]:
extra_pretrained = Pipeline([
        ("pretrained w2v mean vectorizer", MeanVectorizer(w2v_pre)),
        ("extra trees", ExtraTreesClassifier(n_estimators=300))])

extra50 = Pipeline([
        ("w2v50d mean vectorizer", MeanVectorizer(w2v50)),
        ("extra trees", ExtraTreesClassifier(n_estimators=300))])

extra100 = Pipeline([
        ("w2v100d mean vectorizer", MeanVectorizer(w2v100)),
        ("extra trees", ExtraTreesClassifier(n_estimators=300))])

extra200 = Pipeline([
        ("w2v200d mean vectorizer", MeanVectorizer(w2v200)),
        ("extra trees", ExtraTreesClassifier(n_estimators=300))])

mult_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("multinomial nb", MultinomialNB())])
bern_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("bernoulli nb", BernoulliNB())])
svc = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("svc", SVC())])


mult_nb_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("multinomial nb", MultinomialNB())])
bern_nb_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("bernoulli nb", BernoulliNB())])

svc_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("svc", SVC())])

names = ["mult_nb", "mult_nb_tfidf", "bern_nb", "bern_neb_tfidf", 
         "extra_pre", "extra50", "extra100", "extra200"]
models = [mult_nb, mult_nb_tfidf, bern_nb, bern_nb_tfidf,
              extra_pretrained, extra50, extra100, extra200]

In [18]:
rows = [
    names,
    ["0.2"] + [benchmark(model, X, y, test_size=0.2) for model in models],
    ["0.5"] + [benchmark(model, X, y, test_size=0.5) for model in models],
    ["0.8"] + [benchmark(model, X, y, test_size=0.8) for model in models],
    ["0.9"] + [benchmark(model, X, y, test_size=0.9) for model in models],
    ["0.95"] + [benchmark(model, X, y, test_size=0.95) for model in models],
    ["0.99"] + [benchmark(model, X, y, test_size=0.99) for model in models],
]

In [19]:
import tabulate
print len(X) 
print tabulate.tabulate(rows, headers="firstrow", floatfmt=".2f")

7674
        mult_nb    mult_nb_tfidf    bern_nb    bern_neb_tfidf    extra_pre    extra50    extra100    extra200
----  ---------  ---------------  ---------  ----------------  -----------  ---------  ----------  ----------
0.20       0.95             0.87       0.80              0.80         0.93       0.96        0.96        0.96
0.50       0.95             0.85       0.79              0.79         0.92       0.95        0.95        0.95
0.80       0.94             0.80       0.75              0.75         0.91       0.94        0.95        0.95
0.90       0.93             0.76       0.72              0.72         0.89       0.94        0.94        0.94
0.95       0.92             0.72       0.66              0.66         0.88       0.94        0.94        0.94
0.99       0.86             0.58       0.58              0.58         0.82       0.89        0.89        0.90


In [57]:
r99 = ["0.99"] + [benchmark(model, X, y, test_size=0.99) for model in models] 

In [58]:
r99

['0.99',
 0.41273816978262817,
 0.58859468646569457,
 0.35317112442973431,
 0.35317112442973431,
 0.49194024510242418,
 0.59604615797477423,
 0.62545844887735935,
 0.62939440021468829]

In [10]:
len(X)

22586

In [59]:
import multi_multi_nb as mm
reload(mm)

import multi_multi_kernel_nb as mmk
reload(mmk)

<module 'multi_multi_kernel_nb' from 'multi_multi_kernel_nb.pyc'>

In [60]:
%%time
benchmark(mm.MMGNB(w2v_pre, 0.1, 0.1), X, y, test_size=0.99)

CPU times: user 4min 33s, sys: 743 ms, total: 4min 34s
Wall time: 4min 34s


0.58750335450398072

In [61]:
%%time
benchmark(mmk.MMGKNB(w2v_pre, 0.1, 0.1), X, y, test_size=0.99, splits=1)

CPU times: user 6min 57s, sys: 1.97 s, total: 6min 58s
Wall time: 6min 59s


0.072054745504964671

In [None]:
%%time
benchmark(mmk.Better(w2v_pre, 0.1, 0.1), X, y, test_size=0.3, splits=1)

In [None]:
benchmark(mmk.(w2v_pre, 0.1, 0.1), X, y, test_size=0.3, splits=1)

In [189]:
for alpha in [0.001, 0.01, 0.05, 0.1]:
    print alpha, benchmark(mm.MMGNB(w2v, alpha))


0.001 0.494746459571
0.01 0.494746459571
0.05 0.00456829602558
0.1 0.868889904066


In [190]:
benchmark(mm.MMGNB(w2v, 0.2))

0.76838739150296942

In [191]:
for alpha in [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5]:
    clf = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), 
                    ("multinomial nb", MultinomialNB(alpha=alpha))])
    print alpha, benchmark(clf)


0.001 0.881224303335
0.01 0.895386021014
0.05 0.906349931476
0.1 0.909090909091
0.2 0.905893101873
0.5 0.903152124258
1 0.887619917771
2 0.871174052079
5 0.845134764733


In [47]:
Counter(y_train[1000:2000])

Counter({'earn': 464, 'acq': 307, 'crude': 75, 'interest': 44, 'ship': 38, 'trade': 35, 'money-fx': 32, 'grain': 5})

In [84]:
Counter(y_train[:250])

Counter({'earn': 141, 'acq': 67, 'trade': 13, 'crude': 13, 'grain': 5, 'money-fx': 4, 'interest': 4, 'ship': 3})

In [9]:
import multi_multi_kernel_nb as mmk
reload(mmk)

<module 'multi_multi_kernel_nb' from 'multi_multi_kernel_nb.pyc'>

In [1]:
%%time
benchmark(mmk.MMGKNB(w2v_pre, sigma=0.1))

NameError: name 'benchmark' is not defined

In [145]:
for sigma in [0.2, 0.5, 1, 2, 5, 10, 20]:
    clf = mmk.MMGKNB(w2v, sigma=sigma)
    print sigma, benchmark(clf)

0.2 0.808588396528
0.5 0.857012334399
1 0.861123800822
2 0.68935587026
5 0.494746459571
10 0.494746459571
20 0.494746459571


In [175]:
len(w2v)

10570

In [11]:
w2v = {}
with open(WORD_VECTORS_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0]
        nums = map(float, parts[1:])
        w2v[word] = np.array(nums)

In [15]:
X_train1 = [['cat', 'dog', 'hamster'],
            ['paris', 'berlin', 'london']]
y_train1 = ['pets',
           'capitals']
X_test1 = [['parrot', 'goldfish'],
           ['london', 'madrid']]
y_test1 = np.array(['pets', 'capitals'])
print benchmark(mm.MMGNB(w2v, 0.1, 0.1), X_train1, y_train1, X_test1, y_test1)
print benchmark(mmk.MMGKNB(w2v), X_train1, y_train1, X_test1, y_test1)

1.0
1.0


In [27]:
np.abs(w2v['paris'] - w2v['france'] - w2v['berlin'] + w2v['germany']).sum()

13.75019307

In [32]:
clf = mm.MMGNB(w2v, 0.1 ,0.1)

In [37]:
clf.fit(X_train1, y_train1)

In [41]:
clf.predict_one(['cow'])

'pets'

In [45]:
model = gensim.models.Word2Vec(X_train + X_test)
w2valt = dict(zip(model.index2word, model.syn0))

In [46]:
len(model.syn0)

7367

In [47]:
len(model.index2word)

7367

In [48]:
w2valt = dict(zip(model.index2word, model.syn0))

In [41]:
from gensim.models.doc2vec import TaggedDocument

In [42]:
all_docs = [TaggedDocument(words=x, tags=["document" + str(i)]) for i, x in enumerate(X)]

In [43]:
model = gensim.models.Doc2Vec(all_docs, size=100, window=3)

In [44]:
len(model.docvecs)

9100

In [142]:
docvecs = model.docvecs

In [45]:
X_vecs = []
for i, v in enumerate(model.docvecs):
    X_vecs.append(v)
X_vecs = np.array(X_vecs)

In [144]:
len(X_train)

5485

In [46]:
benchmark(RandomForestClassifier(n_estimators=200), X_vecs, y)

0.60558732952045757

In [115]:
benchmark_vec(SVC())

0.49474645957058017

In [118]:
docvecs.doctag_syn0

array([[ -1.60047424e-03,  -1.50309177e-03,   1.91274623e-03, ...,
          3.45259253e-03,  -1.40266877e-03,  -2.33687880e-03],
       [ -5.00012720e-05,  -2.76605692e-03,  -3.37624503e-03, ...,
          4.67358250e-03,   9.84009821e-04,   3.48499510e-03],
       [  3.55318451e-04,  -4.34295693e-03,  -2.04310846e-03, ...,
         -2.74742721e-03,  -9.80215496e-04,  -4.21651546e-03],
       ..., 
       [ -9.97600495e-04,  -4.70627612e-03,  -2.01077177e-03, ...,
         -1.73579657e-03,  -2.84776674e-03,  -3.40009597e-03],
       [  3.12148756e-03,  -4.59358888e-03,  -2.19333800e-03, ...,
          6.43378589e-05,  -4.52710222e-03,   3.19797290e-03],
       [ -4.50575352e-03,  -3.90426372e-03,  -2.71564000e-03, ...,
          4.53540031e-03,  -4.40954603e-03,  -4.32068482e-03]], dtype=float32)

In [124]:
docvecs.index_to_doctag(4444)

4444

In [102]:
import tabulate

In [139]:
tabulate.tabulate?