In [13]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import re
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt
import pylab
from glove import Glove, Corpus
%matplotlib inline

In [41]:
# Notebook-wide settings; edit here rather than inside the individual cells.
MIN_COUNT = 5 # the minimal number of occurrences for a particular word in order to be included (NOTE(review): not referenced by any cell visible here)
SIZE = 300 # the size of the embedding; passed to Glove as no_components
TEST_SET = 0.2 # value handed to train_test_split as test_size
T_SNE = False # when True, the t-SNE cell fits a projection and saves it to 'glove.pdf'
K_MEANS = True # NOTE(review): no k-means cell is visible in this notebook section
CLASSIFIER = True # switch for the classifier evaluation (presumably the logistic-regression cell; verify it is honoured)

In [42]:
# Load the 20 Newsgroups posts (scikit-learn downloads them on first use).
# subset='all' requests the predefined train and test subsets together; the notebook
# makes its own split further down.
dataset = fetch_20newsgroups(subset='all', shuffle=True)

In [43]:
def preproc(line):
    """Turn a raw text into a list of lower-cased tokens.

    Steps, in order:
      1. every listed punctuation character, tab and newline becomes a space
         (the apostrophe is not in the set, so "don't" stays one token);
      2. digits are deleted outright, so 'abc123def' becomes 'abcdef';
      3. the text is lower-cased and split on whitespace;
      4. tokens of one or two characters are discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    no_punct = re.sub('[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n]', ' ', line)
    no_digits = re.sub('[0-9]', '', no_punct)
    return [token for token in no_digits.lower().split() if len(token) > 2]

# Hold out a TEST_SET fraction of the documents for evaluation. The fixed random_state
# makes the split itself identical on every "Restart & Run All", so accuracies from
# different runs are measured on the same test documents.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=TEST_SET, random_state=42)
# Tokenise every raw document into a list of cleaned, lower-cased words.
X_train, X_test = [preproc(line) for line in X_train], [preproc(line) for line in X_test]

In [44]:
# Fit the corpus helper on the tokenised training documents; afterwards it exposes the
# co-occurrence matrix (corpus.matrix) and the word -> index mapping (corpus.dictionary)
# used below.
# NOTE(review): MIN_COUNT from the settings cell is not applied anywhere in this cell.
corpus = Corpus()
corpus.fit(X_train)

# GloVe model with SIZE-dimensional word vectors.
glove = Glove(no_components=SIZE, learning_rate=0.05)
 
# Train on the co-occurrence matrix; epoch and thread counts are hard-coded here.
glove.fit(corpus.matrix, epochs=30, no_threads=8, verbose=True)
# Attach the word -> index mapping so vectors can be looked up via glove.dictionary[word].
glove.add_dictionary(corpus.dictionary)
# Alias under which the following cells receive the trained embeddings.
model = glove

Performing 30 training epochs with 8 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [45]:
# Attribute names exposed by the trained GloVe object (useful when exploring its API).
object_methods = dir(glove)
# Cell output: the learned embedding of the word 'only'.
glove.word_vectors[glove.dictionary['only']]

array([ 1.29680462e-01,  2.03904166e-01,  8.89107637e-02, -1.54881783e-01,
        2.51916101e-01,  3.80483315e-02,  3.05416478e-01,  2.28546549e-02,
       -1.26759974e-01,  2.28333708e-01,  4.84659946e-02, -1.84331748e-01,
        1.24411462e-01, -7.67160179e-02,  9.31707164e-02, -1.99920467e-01,
       -5.24743849e-02, -7.93961185e-02, -1.32804040e-01,  8.85677009e-02,
       -2.86782349e-01, -1.42096356e-01, -2.61349180e-01, -1.43584834e-01,
        9.32634025e-03,  1.34729896e-01,  1.20198947e-01,  2.34515197e-01,
        1.21955635e-01,  1.36529456e-01,  1.58619176e-01,  8.48278654e-02,
       -1.85386394e-01,  1.38982882e-01,  1.51485042e-01,  1.00034582e-01,
       -4.31165617e-01, -1.72589170e-01,  1.66632175e-01, -1.15527113e-01,
        1.96738025e-01,  8.24328357e-04, -2.12845243e-01, -4.61287390e-02,
       -4.22331482e-02,  3.78180679e-02,  1.52784823e-01,  1.50386238e-02,
       -3.06157699e-01,  4.03075611e-01, -2.40698052e-01, -1.74673204e-01,
       -1.65147412e-01,  

In [46]:
def sentence2vector(data, glove):
    """Replace the known words of every document by their embedding vectors.

    data:  iterable of tokenised documents (each an iterable of words).
    glove: object exposing `dictionary` (word -> row index) and `word_vectors`
           (indexable by that row index).

    Returns a list with one entry per document: the vectors of those words that
    occur in `glove.dictionary`, in document order. Unknown words are skipped, so
    an entry can be an empty list.
    """
    return [
        [glove.word_vectors[glove.dictionary[token]]
         for token in document
         if token in glove.dictionary]
        for document in data
    ]

In [47]:
# Swap each tokenised document for the list of embeddings of its in-vocabulary words.
# NOTE: X_train/X_test are rebound in place, so this cell is not meant to be run twice
# in a row without re-running the split cell first.
X_train, X_test = (sentence2vector(split, model) for split in (X_train, X_test))
print('X_train: %d\nX_test %d' % (len(X_train), len(X_test)))

X_train: 15076
X_test 3770


In [48]:
def averageVectors(data, labels):
    """Average the word vectors of each document, dropping empty documents.

    data:   sequence of documents, each a sequence of word vectors.
    labels: one label per document, aligned with `data`.

    Returns `(avg, labels_clear)`: the element-wise mean vector of every document
    that has at least one vector, and the labels of exactly those documents, in
    the original order. Documents without any vector are removed from both lists.
    """
    kept = [
        (sum(vectors) / len(vectors), tag)
        for vectors, tag in zip(data, labels)
        if len(vectors) > 0
    ]
    avg = [mean for mean, _ in kept]
    labels_clear = [tag for _, tag in kept]
    return avg, labels_clear

In [49]:
# Collapse every document to the mean of its word vectors; documents left without any
# known word are dropped together with their labels.
# NOTE: the names are rebound in place, so re-run the preceding cells before re-running
# this one.
X_train, y_train = averageVectors(X_train, y_train)
X_test, y_test = averageVectors(X_test, y_test)

In [50]:
if T_SNE:
    # Project the averaged document vectors to 2-D for a visual sanity check.
    tsne = TSNE(verbose=1, n_iter=2500, learning_rate=1000)
    result = tsne.fit_transform(X_train)

    # One fixed, clearly distinguishable colour per class, taken from the qualitative
    # 'tab20' colormap (20 entries). Calling the colormap with an integer returns the
    # entry at that index, so label i gets the same colour on every run.
    palette = plt.get_cmap('tab20')
    colors = [palette(i) for i in range(palette.N)]
    c = [colors[i] for i in y_train]

    # fit_transform returns an (n_samples, 2) array: column 0 is x, column 1 is y.
    x = result[:, 0]
    y = result[:, 1]
    plt.scatter(x, y, c=c)
    plt.savefig('glove.pdf')

In [51]:
# Report how many documents remain in each split, then show the first training vector.
print('X_train: %d\nX_test %d' % (len(X_train), len(X_test)))
print('\nExample:', X_train[0], sep='\n')

X_train: 15076
X_test 3770

Example:
[ 0.19665052  0.13148975  0.11489718  0.06313163  0.11216958 -0.09665706
  0.03054124  0.02772437 -0.1936699   0.08954351 -0.03490513 -0.05817913
  0.04941081 -0.05879615  0.00299012 -0.03500903 -0.08708629 -0.10821586
 -0.02003703  0.05965656 -0.10798991 -0.15392292  0.03046072 -0.00411286
 -0.04510488  0.11960951 -0.00507757  0.04310217  0.0495707   0.09407486
 -0.0197391   0.10784171 -0.08249628  0.03185181  0.03938374 -0.02412637
 -0.0559407  -0.01030827  0.10387594 -0.03923275  0.03887461 -0.03878042
  0.03773385 -0.05610722 -0.06940983  0.03639879  0.15632733 -0.08056015
 -0.07349539  0.01332725 -0.0639918  -0.14213716 -0.14900329  0.12765242
 -0.07522953  0.10097406  0.07609285 -0.02244475 -0.10117409  0.0490825
 -0.01133064  0.11982951  0.10635011  0.09593495  0.17497404 -0.11105407
  0.01850728 -0.10221129  0.0645162   0.05840711  0.06400835 -0.030579
 -0.04097324  0.17057143  0.03826791  0.03836644  0.01143341 -0.06120741
 -0.03280361  0.0

In [52]:
# TODO: pick random words, find the words most similar to them, project them with t-SNE and plot the result

In [53]:
# Honour the CLASSIFIER switch from the settings cell, the same way T_SNE gates its cell.
if CLASSIFIER:
    # Sweep the inverse regularisation strength C of a multinomial logistic regression
    # fitted on the averaged document vectors, and report test-set accuracy for each.
    for reg in [1, 10, 100, 1000, 10000]:
        classifier = LogisticRegression(
            solver='lbfgs', max_iter=3000, multi_class='multinomial', C=reg)
        classifier.fit(X_train, y_train)
        # "Lambda" is printed as the reciprocal of C.
        print("Lambda: %f acc: %.3f " % (1/reg, classifier.score(X_test, y_test)))

Lambda: 1.000000 acc: 0.618 
Lambda: 0.100000 acc: 0.675 
Lambda: 0.010000 acc: 0.707 




Lambda: 0.001000 acc: 0.725 
Lambda: 0.000100 acc: 0.719 


