In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn import metrics


# Dataset files. Only one variant can be active at a time; the "all-terms"
# files are kept as a commented-out alternative instead of being assigned and
# then silently overwritten by the lines below.
# TRAIN_PATH = "r8-train-all-terms.txt"
# TEST_PATH = "r8-test-all-terms.txt"
TRAIN_PATH = "r8-train-no-stop.txt"
TEST_PATH = "r8-test-no-stop.txt"

# Pre-trained word-vector file (one token followed by its numbers per line).
WORD_VECTORS_PATH = "glove.6B.50d.txt"
# WORD_VECTORS_PATH = "glove.840B.300d.txt"

In [2]:
def load_labeled_docs(path):
    """Read a `label<TAB>text` file and return `(docs, labels)`.

    Each non-blank line is split on its first tab: the part before it is the
    label, the part after it is whitespace-tokenised into a list of words.

    Parameters
    ----------
    path : str
        File to read (opened in binary mode, as the rest of the notebook does).

    Returns
    -------
    docs : list of list of str
        One token list per line.
    labels : list of str
        The label of each line, in file order.

    Raises
    ------
    ValueError
        If a non-blank line contains no tab at all.
    """
    docs, labels = [], []
    with open(path, "rb") as infile:
        for line in infile:
            if not line.strip():
                # A blank (e.g. trailing) line carries no document.
                continue
            # maxsplit=1: a tab inside the text must not break the unpacking.
            label, text = line.split("\t", 1)
            docs.append(text.split())
            labels.append(label)
    return docs, labels


# Number of training documents kept for the experiments below.
N_TRAIN = 250

X_train, y_train = load_labeled_docs(TRAIN_PATH)
X_test, y_test = load_labeled_docs(TEST_PATH)

# The vocabulary is collected from the FULL train + test sets, i.e. before the
# training set is truncated on the next lines.
all_words = set(w for words in (X_train + X_test) for w in words)
X_train = X_train[:N_TRAIN]
y_train = y_train[:N_TRAIN]

y_train = np.array(y_train)
y_test = np.array(y_test)

In [3]:
# Load the pre-trained vectors, keeping only words that occur in the corpus
# vocabulary (`all_words`).
w2v = {}
with open(WORD_VECTORS_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        if not parts:
            # Blank line: nothing to index.
            continue
        word = parts[0]
        if word not in all_words:
            # Check membership first so the floats of unused words are never
            # parsed (the vector file is far larger than the vocabulary).
            continue
        # A list comprehension (rather than np.array(map(...))) always hands
        # numpy a real sequence, regardless of what `map` returns.
        w2v[word] = np.array([float(x) for x in parts[1:]])

In [4]:
def mean_embedding(tokens, vectors, dim):
    """Average the vectors of the tokens that have one.

    Parameters
    ----------
    tokens : iterable of str
        Words of one document.
    vectors : dict
        Maps a word to its 1-D numpy vector.
    dim : int
        Length of the vectors; used for the fallback below.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the found vectors, or a zero vector of length
        `dim` when no token is in `vectors`. Without that fallback np.mean of
        an empty list is a scalar NaN, which makes the feature matrix ragged.
    """
    found = [vectors[t] for t in tokens if t in vectors]
    if not found:
        return np.zeros(dim)
    return np.mean(found, axis=0)


# Vector length, read off any loaded vector (0 if nothing was loaded).
EMBEDDING_DIM = len(next(iter(w2v.values()))) if w2v else 0

# One averaged word vector per document.
X_train_vec = np.array([mean_embedding(kws, w2v, EMBEDDING_DIM) for kws in X_train])

X_test_vec = np.array([mean_embedding(kws, w2v, EMBEDDING_DIM) for kws in X_test])

In [5]:
def benchmark(clf, X_train=None, y_train=None, X_test=None, y_test=None):
    """Fit `clf` on a training split and return its accuracy on a test split.

    Parameters
    ----------
    clf : object
        Anything with `fit(X, y)` and `predict(X)`.
    X_train, y_train, X_test, y_test : optional
        The data to use. Any argument left as None is taken from the notebook
        global of the same name *at call time*, so re-running the data-loading
        cell is picked up without having to re-run this definition.

    Returns
    -------
    float
        Fraction of test items whose prediction equals the true label.
    """
    notebook_ns = globals()
    if X_train is None:
        X_train = notebook_ns["X_train"]
    if y_train is None:
        y_train = notebook_ns["y_train"]
    if X_test is None:
        X_test = notebook_ns["X_test"]
    if y_test is None:
        y_test = notebook_ns["y_test"]

    clf.fit(X_train, y_train)
    # Coerce both sides to arrays: comparing two plain lists with == yields a
    # single bool, which has no .mean().
    pred = np.asarray(clf.predict(X_test))
    score = (pred == np.asarray(y_test)).mean()
    return score

def benchmark_vec(clf):
    """Return the test accuracy of `clf` on the averaged word-vector features.

    Fits on (`X_train_vec`, `y_train`) and scores on (`X_test_vec`, `y_test`),
    all read from the notebook globals when the function is called.
    """
    return benchmark(clf, X_train_vec, y_train, X_test_vec, y_test)

In [6]:
def _identity(tokens):
    """Analyzer for already-tokenised documents: hand the token list back as is."""
    return tokens


# Bag-of-words counts -> Naive Bayes.
mult_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=_identity)),
    ("multinomial nb", MultinomialNB()),
])
bern_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=_identity)),
    ("bernoulli nb", BernoulliNB()),
])

# TF-IDF weights -> Naive Bayes.
mult_nb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=_identity)),
    ("multinomial nb", MultinomialNB()),
])
# The variable name is misspelled ("tfifd") but is kept because the
# benchmarking cell refers to it by this name.
bern_nb_tfifd = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=_identity)),
    ("bernoulli nb", BernoulliNB()),
])

In [7]:
# Test accuracy of the four Naive Bayes pipelines, in this order:
# counts + multinomial, counts + Bernoulli, tf-idf + multinomial, tf-idf + Bernoulli.
benchmark(mult_nb), benchmark(bern_nb), benchmark(mult_nb_tfidf), benchmark(bern_nb_tfifd)

(0.88761991777067151,
 0.58611238008222932,
 0.66331658291457285,
 0.58611238008222932)

In [15]:
# random_state pins the randomness of forest construction so re-running the
# cell gives the same score. NOTE: the output recorded below came from an
# unseeded run, so it will not match to the last digit.
benchmark_vec(RandomForestClassifier(n_estimators=200, random_state=0))

0.8423937871174052

In [16]:
# random_state pins the randomness of tree construction so re-running the
# cell gives the same score. NOTE: the output recorded below came from an
# unseeded run, so it will not match to the last digit.
benchmark_vec(ExtraTreesClassifier(n_estimators=200, random_state=0))

0.84878940155322069

In [17]:
# SVC with its default constructor arguments, on the averaged word-vector features.
benchmark_vec(SVC())

0.78529008679762446

In [13]:
import multi_multi_nb as mm
# Re-execute the module so edits made to it since the first import take effect.
# NOTE(review): bare `reload` is a builtin only on Python 2.
reload(mm)

<module 'multi_multi_nb' from 'multi_multi_nb.pyc'>

In [199]:
# NOTE(review): the two 0.1 values are MMGNB constructor arguments defined in
# multi_multi_nb (not visible in this notebook) - confirm what each one controls.
benchmark(mm.MMGNB(w2v, 0.1, 0.1))

0.86888990406578348

In [189]:
for alpha in [0.001, 0.01, 0.05, 0.1]:
    print alpha, benchmark(mm.MMGNB(w2v, alpha))


0.001 0.494746459571
0.01 0.494746459571
0.05 0.00456829602558
0.1 0.868889904066


In [190]:
# Same model with the second constructor argument set to 0.2.
benchmark(mm.MMGNB(w2v, 0.2))

0.76838739150296942

In [191]:
for alpha in [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5]:
    clf = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), 
                    ("multinomial nb", MultinomialNB(alpha=alpha))])
    print alpha, benchmark(clf)


0.001 0.881224303335
0.01 0.895386021014
0.05 0.906349931476
0.1 0.909090909091
0.2 0.905893101873
0.5 0.903152124258
1 0.887619917771
2 0.871174052079
5 0.845134764733


In [47]:
# Label distribution of training documents 1000-1999.
# NOTE(review): the data-loading cell truncates y_train to its first 250
# entries, so on a fresh top-to-bottom run this slice is empty. The recorded
# output (1000 labels) must come from a session where y_train was not
# truncated - confirm and either drop this cell or slice the untruncated labels.
Counter(y_train[1000:2000])

Counter({'earn': 464, 'acq': 307, 'crude': 75, 'interest': 44, 'ship': 38, 'trade': 35, 'money-fx': 32, 'grain': 5})

In [84]:
# Label distribution of the first 250 training documents, i.e. the subset the
# data-loading cell keeps for training.
Counter(y_train[:250])

Counter({'earn': 141, 'acq': 67, 'trade': 13, 'crude': 13, 'grain': 5, 'money-fx': 4, 'interest': 4, 'ship': 3})

In [9]:
import multi_multi_kernel_nb as mmk
# Re-execute the module so edits made to it since the first import take effect.
# NOTE(review): bare `reload` is a builtin only on Python 2.
reload(mmk)

<module 'multi_multi_kernel_nb' from 'multi_multi_kernel_nb.pyc'>

In [192]:
%%time
# Time one fit + evaluation of MMGKNB at sigma=0.1 (the cell magic above must
# stay on the first line of the cell).
benchmark(mmk.MMGKNB(w2v, sigma=0.1))

Wall time: 9min 32s


0.49748743718592964

In [145]:
for sigma in [0.2, 0.5, 1, 2, 5, 10, 20]:
    clf = mmk.MMGKNB(w2v, sigma=sigma)
    print sigma, benchmark(clf)

0.2 0.808588396528
0.5 0.857012334399
1 0.861123800822
2 0.68935587026
5 0.494746459571
10 0.494746459571
20 0.494746459571


In [175]:
# Number of words that currently have a vector in w2v.
len(w2v)

10570

In [11]:
# Reload the word vectors WITHOUT any vocabulary filter: every line of the
# file becomes an entry. This rebinds the global `w2v` used by other cells.
w2v = {}
with open(WORD_VECTORS_PATH, "rb") as vec_file:
    for raw_line in vec_file:
        fields = raw_line.split()
        # First field is the token, the remaining fields are its numbers.
        w2v[fields[0]] = np.array([float(value) for value in fields[1:]])

In [15]:
X_train1 = [['cat', 'dog', 'hamster'],
            ['paris', 'berlin', 'london']]
y_train1 = ['pets',
           'capitals']
X_test1 = [['parrot', 'goldfish'],
           ['london', 'madrid']]
y_test1 = np.array(['pets', 'capitals'])
print benchmark(mm.MMGNB(w2v, 0.1, 0.1), X_train1, y_train1, X_test1, y_test1)
print benchmark(mmk.MMGKNB(w2v), X_train1, y_train1, X_test1, y_test1)

1.0
1.0


In [27]:
# L1 distance between the offsets (paris - france) and (berlin - germany);
# a smaller value means the two offsets are more alike.
np.abs(w2v['paris'] - w2v['france'] - w2v['berlin'] + w2v['germany']).sum()

13.75019307

In [32]:
# Fresh MMGNB instance with the same constructor arguments as the toy benchmark.
clf = mm.MMGNB(w2v, 0.1 ,0.1)

In [37]:
# Train on the two-document toy set (labels 'pets' and 'capitals').
clf.fit(X_train1, y_train1)

In [41]:
# 'cow' does not occur in the toy training documents, so the label cannot come
# from an exact token match - presumably it comes from the word vectors.
clf.predict_one(['cow'])

'pets'