In [None]:
import random
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pylab
from gensim.models import Word2Vec
from glove import Glove, Corpus
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Notebook-wide settings; the later cells read these instead of hard-coding values.
MIN_COUNT = 5 # minimum number of occurrences a word needs in order to be included (NOTE(review): no cell shown here reads it)
SIZE = 300 # the size of the embedding (passed to Glove as no_components)
TEST_SET = 0.2 # handed to train_test_split as test_size
T_SNE = False # when True, the t-SNE cell computes the 2-D projection and saves the scatter plot
K_MEANS = True # presumably toggles a k-means stage; no cell shown here reads it — TODO confirm
CLASSIFIER = True # switch for the classification stage (NOTE(review): verify the classifier cell actually checks it)

In [None]:
# Load the 20 Newsgroups posts (subset='all') in shuffled order.
# `dataset.data` and `dataset.target` feed the train/test split further down.
dataset = fetch_20newsgroups(subset='all', shuffle=True)

In [None]:
def preproc(line):
    """Turn a raw text string into a list of lower-cased word tokens.

    Punctuation characters, tabs and newlines are replaced by spaces, digits
    are deleted outright (so ``a1b`` becomes ``ab``), and tokens of one or two
    characters are discarded. Apostrophes and backslashes are not part of the
    punctuation class, so they stay inside tokens.

    Args:
        line: the text to tokenise.

    Returns:
        The surviving tokens, in their original order.
    """
    without_punct = re.sub('[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n]', ' ', line)
    without_digits = re.sub('[0-9]', '', without_punct)
    return [token for token in without_digits.lower().split() if len(token) > 2]

# Hold out TEST_SET of the posts for evaluation. The fixed random_state keeps
# the train/test partition identical from run to run, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=TEST_SET, random_state=42)
# Tokenise every raw post into its list of cleaned words.
X_train = [preproc(line) for line in X_train]
X_test = [preproc(line) for line in X_test]

In [None]:
# Fit a Corpus on the tokenised training documents; its `matrix` and
# `dictionary` attributes are consumed below.
corpus = Corpus()
corpus.fit(X_train)

# GloVe model with SIZE-dimensional vectors.
glove = Glove(no_components=SIZE, learning_rate=0.05)

# Train for 30 epochs on 8 threads, with progress output enabled.
glove.fit(corpus.matrix, epochs=30, no_threads=8, verbose=True)
# Attach the word -> row-index mapping so vectors can be looked up by word.
glove.add_dictionary(corpus.dictionary)
# Alias under which the later cells refer to the trained embedding.
model = glove

In [None]:
# Attribute and method names exposed by the trained model object.
object_methods = list(dir(glove))
# Show the embedding stored for the word 'only' as this cell's output.
glove.word_vectors[glove.dictionary['only']]

In [None]:
def sentence2vector(data, glove):
    """Map tokenised documents to lists of word vectors.

    Args:
        data: iterable of documents, each an iterable of word tokens.
        glove: object exposing ``dictionary`` (word -> row index) and
            ``word_vectors`` (indexable by that row index).

    Returns:
        One list per document holding the vectors of the words found in
        ``glove.dictionary``; unknown words are skipped, so a document may
        map to an empty list.
    """
    def embed(tokens):
        # Keep only in-vocabulary tokens, preserving their order.
        return [
            glove.word_vectors[glove.dictionary[token]]
            for token in tokens
            if token in glove.dictionary
        ]

    return [embed(tokens) for tokens in data]

In [None]:
# Replace each tokenised document with the list of embedding vectors of its
# in-vocabulary words.
# NOTE: X_train / X_test are rebound in place, so this cell must run exactly once
# after the preprocessing cell; re-running it alone would pass already-embedded
# data back into sentence2vector.
X_train, X_test = sentence2vector(X_train, model), sentence2vector(X_test, model)
# Report how many documents each split holds.
print('X_train: %d\nX_test %d' % (len(X_train), len(X_test)))

In [None]:
def averageVectors(data, labels):
    """Average the vectors of each sample, dropping samples that have none.

    Args:
        data: iterable of samples, each a sequence of vectors (or numbers).
        labels: labels paired positionally with ``data``.

    Returns:
        ``(avg, labels_clear)`` where ``avg`` holds ``sum(row) / len(row)``
        for every non-empty row and ``labels_clear`` the matching labels.
    """
    # Pair each mean with its label so empty rows vanish from both outputs.
    kept = [
        (sum(row) / len(row), label)
        for row, label in zip(data, labels)
        if len(row) > 0
    ]
    avg = [mean for mean, _ in kept]
    labels_clear = [label for _, label in kept]
    return avg, labels_clear

In [None]:
# Collapse every document to the mean of its word vectors; documents left with
# no vectors are dropped together with their labels.
# NOTE: X_* / y_* are rebound in place — re-running this cell on its own would
# feed the already-averaged vectors back into averageVectors.
X_train, y_train = averageVectors(X_train, y_train)
X_test, y_test = averageVectors(X_test, y_test)

In [None]:
if T_SNE:
    # Project the averaged document vectors to 2-D.
    # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
    # confirm against the installed version before changing it.
    tsne = TSNE(verbose=1, n_iter=2500, learning_rate=1000)
    result = tsne.fit_transform(X_train)

    def _random_palette(n):
        """Return `n` RGB tuples with components in [0, 1).

        Starts from a random base colour; all three channels then advance by
        the same step (256 / n, truncated to an integer after each addition)
        and wrap around at 256.
        """
        channels = [int(random.random() * 256) for _ in range(3)]
        step = 256 / n
        palette = []
        for _ in range(n):
            channels = [int(value + step) % 256 for value in channels]
            palette.append(tuple(value / 256 for value in channels))
        return palette

    # One colour per class label; the helper has its own name so `colors`
    # is only ever the resulting list.
    colors = _random_palette(20)
    # `result` is sliced directly instead of going through np.matrix; the
    # (1, N) shape of x and y is kept as before.
    x = result[:, 0].reshape(1, -1)
    y = result[:, 1].reshape(1, -1)
    # Colour every point by the label of its document.
    c = [colors[i] for i in y_train]
    plt.scatter(x, y, c=c)
    plt.savefig('glove.pdf')

In [None]:
# Sanity check: number of documents per split at this point, followed by the
# first training sample.
print('X_train: %d\nX_test %d' % (len(X_train), len(X_test)))
print('\nExample:')
print(X_train[0])

In [None]:
# TODO: pick random words, find the most similar ones, project them with t-SNE and plot the result

In [None]:
# Honour the CLASSIFIER switch from the configuration cell, the same way the
# t-SNE cell honours T_SNE.
if CLASSIFIER:
    # Sweep the value passed as `C` to LogisticRegression; the printed
    # "Lambda" is its reciprocal, 1/reg.
    for reg in [1, 10, 100, 1000, 10000]:
        classifier = LogisticRegression(
            solver='lbfgs', max_iter=3000, multi_class='multinomial', C=reg)
        classifier.fit(X_train, y_train)
        # Accuracy on the held-out documents.
        print("Lambda: %f acc: %.3f " % (1/reg, classifier.score(X_test, y_test)))