In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
%matplotlib inline

In [None]:
# Experiment configuration: values a reader may want to tune.
MIN_COUNT = 5 # the minimal number of occurrences for a particular word in order to be included
SIZE = 100 # the size (dimensionality) of the document embedding given to Doc2Vec
TEST_SET = 0.2 # passed as `test_size` to train_test_split: fraction of documents held out for testing
# NOTE(review): the three stage flags below are not read by any cell visible in
# this notebook -- confirm whether they are meant to gate the t-SNE, k-means and
# classifier cells.
T_SNE = False
K_MEANS = True
CLASSIFIER = True
DM = 0 # passed as `dm` to Doc2Vec; in gensim dm=1 selects PV-DM, any other value PV-DBOW

In [None]:
# Load the complete 20 Newsgroups corpus (its train and test subsets together);
# the train/test split used for the experiment is made later in this notebook.
dataset = fetch_20newsgroups(
    subset='all',
    shuffle=True,
)

In [None]:
# Characters replaced by a single space before tokenizing. The pattern is a raw
# string with the hyphen, both square brackets and the backslash escaped, so each
# of them is matched literally (an unescaped backslash in front of `]` would only
# escape the bracket and never match a backslash itself).
_PUNCTUATION_RE = re.compile(r'[!"#$%&()*+,\-./:;<=>?@\[\\\]^_`{|}~\t\n]')
# Digits are deleted outright (not replaced by a space), so "abc123def" -> "abcdef".
_DIGIT_RE = re.compile(r'[0-9]')


def preproc(line):
    """Tokenize one raw document into a list of lower-cased words.

    Steps, in order:
      1. replace punctuation, tabs and newlines with a space,
      2. delete ASCII digits,
      3. lower-case and split on whitespace,
      4. keep only tokens longer than two characters.

    Apostrophes are not treated as punctuation, so "don't" stays one token.

    Args:
        line: the document text as a single ``str``.

    Returns:
        A list of ``str`` tokens (possibly empty).
    """
    line = _PUNCTUATION_RE.sub(' ', line)
    line = _DIGIT_RE.sub('', line)
    return [word for word in line.lower().split() if len(word) > 2]

# Hold out a TEST_SET fraction of the raw posts, then tokenize every post and
# wrap it in a TaggedDocument whose single tag is the post's position within
# its own split (train and test are numbered independently, both from 0).
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=TEST_SET,
)
X_train = [
    TaggedDocument(preproc(text), [idx]) for idx, text in enumerate(X_train)
]
X_test = [
    TaggedDocument(preproc(text), [idx]) for idx, text in enumerate(X_test)
]

In [None]:
# Train a Doc2Vec model on the tagged training documents only.
# - `vector_size` is the current name of the embedding-size argument; the older
#   `size` keyword is deprecated in gensim 3.x and no longer accepted in gensim 4.
# - `min_count` comes from the MIN_COUNT configuration constant instead of a
#   duplicated literal, so changing the constant actually takes effect.
model = Doc2Vec(vector_size=SIZE, dm=DM, min_count=MIN_COUNT)
# The vocabulary must be built before training; it also sets model.corpus_count.
model.build_vocab(X_train)
model.train(X_train, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Replace the tagged documents with fixed-length feature vectors:
# - training documents reuse the vectors learned during training, looked up by
#   their integer tag (0 .. len(X_train) - 1);
# - held-out documents get a vector inferred from their token list.
X_train = [model.docvecs[tag] for tag in range(len(X_train))]
X_test = [model.infer_vector(tokens) for tokens, _tags in X_test]

In [None]:
# Sweep the inverse regularisation strength C of a multinomial logistic
# regression and print the held-out accuracy for each setting. The printed
# "Lambda" is 1 / C.
for reg in (1, 10, 100, 1000, 10000):
    classifier = LogisticRegression(
        C=reg,
        solver='lbfgs',
        multi_class='multinomial',
        max_iter=3000,
    )
    classifier.fit(X_train, y_train)
    accuracy = classifier.score(X_test, y_test)
    print("Lambda: %f acc: %.3f " % (1/reg, accuracy))

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score

# Cluster the training vectors with k-means for every k in [2, 25) and record
# the silhouette score (Euclidean metric) of each clustering.
k_values = range(2, 25)
scores = []
for k in k_values:
    # Deliberately not named `model`: that name holds the trained Doc2Vec model,
    # and rebinding it here would break any re-run of the cells that use it.
    kmeans = KMeans(n_clusters=k).fit(X_train)
    labels = kmeans.labels_
    score = silhouette_score(X_train, labels, metric='euclidean')
    scores.append(score)
    print('K = %d\tScore = %f' % (k, score))

# Silhouette score as a function of k, saved next to the notebook.
plt.plot(list(k_values), scores)
plt.xlabel('number of clusters k')
plt.ylabel('silhouette score')
plt.savefig('kmeans-doc2vec-DM.pdf')

In [None]:
# Highest silhouette score observed across all tested cluster counts.
best_score = max(scores)
print(best_score)

In [None]:
# Join the train and test splits back together, vectors and labels in the same
# order (all training items first, then all test items).
X_all = [*X_train, *X_test]
y_all = np.concatenate((y_train, y_test))

In [None]:
# Embed the training document vectors with t-SNE (250 iterations, progress
# logging on); `result` holds one row of coordinates per training document.
tsne = TSNE(
    n_iter=250,
    verbose=1,
)
result = tsne.fit_transform(X_train)

In [None]:
import random


def make_colors(n, seed=None):
    """Build a list of ``n`` RGB colours usable as matplotlib ``c=`` values.

    Each channel starts at a random level in 0..255 and is advanced by
    ``256 / n`` (truncated to an integer, wrapping modulo 256) once per
    colour; the same step is applied to all three channels.

    Args:
        n: number of colours to generate. Values <= 0 yield an empty list
            (previously ``n == 0`` raised ``ZeroDivisionError``).
        seed: optional seed for a private random generator, making the
            palette reproducible. With ``None`` (default) the module-level
            ``random`` state is used, as before.

    Returns:
        A list of ``n`` ``(r, g, b)`` tuples with every component in [0, 1).
    """
    if n <= 0:
        return []
    rng = random if seed is None else random.Random(seed)
    step = 256 / n
    # One random starting level per channel, drawn in r, g, b order.
    channels = [int(rng.random() * 256) for _ in range(3)]
    palette = []
    for _ in range(n):
        # Truncate after adding the step and wrap into the 0..255 range.
        channels = [int(level + step) % 256 for level in channels]
        palette.append(tuple(level / 256 for level in channels))
    return palette


# The generator has its own name so that `colors` is only ever the palette list
# (it is indexed by class label when plotting) and never shadows the function.
colors = make_colors(20)

In [None]:
# Scatter-plot the t-SNE embedding: one point per training document, coloured
# by its class label via the `colors` palette.
# The two coordinate columns are taken with plain array slicing; the previous
# round-trip through np.matrix (a class NumPy discourages) produced the same
# values, and the unused colormap lookup has been dropped.
points = np.asarray(result)
x = points[:, 0]
y = points[:, 1]
c = [colors[i] for i in y_train]
plt.scatter(x, y, c=c)
# NOTE(review): 'doc2ec' looks like a typo for 'doc2vec'; the filename is kept
# unchanged in case something else already refers to this file -- confirm.
plt.savefig('doc2ec-non-def.pdf')