In [106]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import re
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt
import pylab
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score
%matplotlib inline

In [107]:
# --- Word2Vec hyper-parameters -------------------------------------------
SIZE = 300        # dimensionality of every word embedding
MIN_COUNT = 5     # a word must occur at least this many times to be included
SG = 1            # 1 -> train with skip-gram

# --- Evaluation split ----------------------------------------------------
TEST_SET = 0.2    # fraction of the documents held out as the test set

# --- Stage toggles: set to False to skip the corresponding cell ----------
CLASSIFIER = True
K_MEANS = True
T_SNE = False

In [108]:
# Load the whole corpus (train and test portions together); the train/test
# split used by this notebook is made further down.
dataset = fetch_20newsgroups(
    shuffle=True,
    subset='all',
)

In [109]:
def preproc(line):
    """Turn one raw document into a list of lower-case tokens.

    Steps, in order:
      1. every listed punctuation character, tab and newline becomes a space;
      2. digits are deleted outright (so letters on either side of a number
         end up joined into a single token);
      3. the text is lower-cased and split on whitespace;
      4. tokens of one or two characters are discarded.

    Parameters
    ----------
    line : str
        The raw text to tokenise.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    without_punct = re.sub('[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n]', ' ', line)
    without_digits = re.sub('[0-9]', '', without_punct)
    return [token for token in without_digits.lower().split() if len(token) > 2]

# Hold out a TEST_SET fraction of the documents for evaluation.
# random_state is fixed so that the split -- and with it every score reported
# further down -- is the same after a kernel restart; without it each run
# evaluates on a different random partition.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=TEST_SET,
    random_state=42,
)

# Tokenise after splitting. X_train / X_test are rebound here from raw strings
# to lists of tokens.
X_train = [preproc(line) for line in X_train]
X_test = [preproc(line) for line in X_test]

In [110]:
# Train the word embeddings on the tokenised training documents only.
# NOTE(review): `size=` is the keyword accepted by gensim 3.x; gensim 4.0
# renamed it to `vector_size` -- confirm which version this notebook targets.
model = Word2Vec(
    sentences=X_train,
    size=SIZE,
    min_count=MIN_COUNT,
    sg=SG,
)

In [111]:
def sentence2vector(data, model):
    """Replace every in-vocabulary token of every document by its embedding.

    Parameters
    ----------
    data : iterable of iterable of str
        Tokenised documents.
    model : object with a ``wv`` attribute
        ``model.wv`` must support ``word in model.wv`` and ``model.wv[word]``.

    Returns
    -------
    list of list
        One inner list per document, holding the vectors of the tokens found
        in the vocabulary, in document order. Unknown tokens are skipped, so
        an inner list may be empty.
    """
    keyed_vectors = model.wv
    # Membership is tested on the keyed-vectors object itself instead of its
    # `.vocab` attribute: `.vocab` was removed in gensim 4.0, whereas the `in`
    # operator is supported by both the 3.x and the 4.x KeyedVectors.
    return [
        [keyed_vectors[word] for word in line if word in keyed_vectors]
        for line in data
    ]

In [112]:
# Swap every token for its embedding vector. X_train / X_test are rebound in
# place (token lists -> lists of vectors), so this cell is not idempotent: it
# has to run exactly once after the tokenisation cell.
X_train = sentence2vector(X_train, model)
X_test = sentence2vector(X_test, model)
print('X_train: %d\nX_test %d' % (len(X_train), len(X_test)))

X_train: 15076
X_test 3770


In [113]:
def averageVectors(data, labels):
    """Collapse each document to the mean of its word vectors.

    Documents with no vectors at all are dropped, together with their label,
    so the two returned lists stay aligned.

    Parameters
    ----------
    data : iterable of sequences
        One sequence of vectors per document.
    labels : iterable
        One label per document, in the same order as ``data``.

    Returns
    -------
    (list, list)
        The per-document mean vectors and the labels of the documents kept.
    """
    kept = [
        (sum(row) / len(row), label)
        for row, label in zip(data, labels)
        if len(row) > 0
    ]
    avg = [mean for mean, _ in kept]
    labels_clear = [label for _, label in kept]
    return avg, labels_clear

In [114]:
# Reduce every document to a single averaged vector. Documents that ended up
# with no vectors are dropped, so the label lists are rebuilt alongside.
# The names are rebound in place: run this cell exactly once per pipeline run.
(X_train, y_train), (X_test, y_test) = (
    averageVectors(X_train, y_train),
    averageVectors(X_test, y_test),
)

In [115]:
if T_SNE:
    import random

    # Project the averaged document vectors down to 2-D for plotting.
    tsne = TSNE(verbose=1, n_iter=250)
    result = tsne.fit_transform(X_train)

    def _shifted_palette(n):
        """Return ``n`` RGB tuples with channels in [0, 1).

        Starts from one random (r, g, b) triple and advances every channel by
        256 / n per colour, wrapping around at 256.
        """
        palette = []
        r = int(random.random() * 256)
        g = int(random.random() * 256)
        b = int(random.random() * 256)
        step = 256 / n
        for _ in range(n):
            r = int(r + step) % 256
            g = int(g + step) % 256
            b = int(b + step) % 256
            palette.append((r / 256, g / 256, b / 256))
        return palette

    # Class labels index straight into the palette, so it needs one entry per
    # label value up to the largest one. The palette used to be fixed at 6
    # colours, which raises IndexError as soon as a label >= 6 appears -- and
    # the corpus is loaded without a category filter.
    n_classes = int(max(y_train)) + 1
    colors = _shifted_palette(n_classes)

    # Columns of the embedding are the two plot axes.
    x = result[:, 0]
    y = result[:, 1]
    c = [colors[i] for i in y_train]
    plt.scatter(x, y, c=c)
    plt.savefig('plot-skipgram-6.pdf')

In [116]:
if CLASSIFIER:
    # Fit one multinomial logistic regression per candidate value of C and
    # report its accuracy on the held-out documents.
    for c_value in (1, 10, 100, 1000, 10000):
        classifier = LogisticRegression(
            C=c_value,
            solver='lbfgs',
            multi_class='multinomial',
            max_iter=1500,
        )
        classifier.fit(X_train, y_train)
        accuracy = classifier.score(X_test, y_test)
        print("C: %f\tAcc: %f" % (c_value, accuracy))

C: 1.000000	Acc: 0.782759
C: 10.000000	Acc: 0.807427
C: 100.000000	Acc: 0.804775




C: 1000.000000	Acc: 0.789125
C: 10000.000000	Acc: 0.775597




In [117]:
# Silhouette score of K-means on the training vectors, one entry per k.
scores = []
if K_MEANS:
    # The candidate cluster counts are defined once so that the loop and the
    # x-axis of the plot cannot drift apart.
    cluster_counts = range(2, 25)

    for k in cluster_counts:
        # Bound to `kmeans`, not `model`: `model` is the trained Word2Vec
        # instance, and overwriting it here made the vectorisation cell fail
        # whenever it was re-run after this one.
        kmeans = KMeans(n_clusters=k).fit(X_train)
        labels = kmeans.labels_
        score = silhouette_score(X_train, labels, metric='euclidean')
        scores.append(score)
        print('K = %d\tScore = %f' % (k, score))

    plt.plot(list(cluster_counts), scores)
print(scores)

K = 2	Score = 0.162759
K = 3	Score = 0.097403
K = 4	Score = 0.100766
K = 5	Score = 0.091466
K = 6	Score = 0.091732
K = 7	Score = 0.056522
K = 8	Score = 0.058888
K = 9	Score = 0.055094


KeyboardInterrupt: 