In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
from gensim.utils import simple_preprocess
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

english_words = set(nltk.corpus.words.words())
english_stop_words = nltk.corpus.stopwords.words("english")

lemmatizer = WordNetLemmatizer()

def preprocess(text):
    return " ".join(lemmatizer.lemmatize(w) for w in nltk.wordpunct_tokenize(text)
        if w.lower() in english_words and w.lower() not in english_stop_words or not w.isalpha())

def tokenize(document):
    return simple_preprocess(str(document).encode("utf-8"))

In [3]:
import pandas as pd

dname = './data/preprocessed-twitter-tweets'

pos = pd.read_csv('{}/{}.csv'.format(dname, 'processedPositive')).T
neu = pd.read_csv('{}/{}.csv'.format(dname, 'processedNeutral')).T
neg = pd.read_csv('{}/{}.csv'.format(dname, 'processedNegative')).T

documents = pd.concat([pos, neu, neg])
documents['tweets'] = documents.index
documents = documents.sample(frac=1).reset_index(drop=True)

documents.head(15)

Unnamed: 0,tweets
0,it looks like iOS 11 is due to kill it unhappy
1,What results mean for others.
2,it is versus
3,physicist still has his table full
4,convoy attacked on Parimpora-Panthachowk bypas...
5,rocks. next question - where is my astro unha...
6,How Modi won UP
7,keeps key unchanged-- repo at 6.25%
8,i miss minmin crying
9,Thanks for the retweets this week. Much apprec...


In [4]:
training_docs = []
num_docs = float(len(documents.index))
for _, row in documents.iterrows():
    training_docs.append(tokenize(preprocess(row['tweets'])))

training_docs

[['like', 'due', 'kill', 'unhappy'],
 ['mean'],
 ['versus'],
 ['physicist', 'still', 'table', 'full'],
 ['convoy', 'bypass', 'police'],
 ['next', 'question', 'unhappy'],
 [],
 ['key', 'unchanged'],
 ['miss', 'cry'],
 ['thanks', 'week', 'much', 'happy', 'want'],
 ['follow', 'unhappy'],
 ['suddenly', 'cannot', 'enough', 'mother'],
 ['please', 'reading', 'heart', 'unhappy'],
 ['hello', 'getting', 'something', 'unhappy'],
 ['look', 'good', 'current', 'shame', 'never', 'cut', 'unhappy'],
 ['work', 'much', 'better', 'unhappy'],
 ['today', 'congress', 'address'],
 ['please', 'beautiful', 'princess', 'help'],
 ['awesome', 'id', 'love', 'see', 'like'],
 ['happy', 'belated', 'birthday', 'another', 'duet', 'much'],
 ['order', 'ever', 'want', 'eat', 'may', 'great', 'day', 'happy'],
 ['tell', 'unhappy'],
 ['tried', 'everything'],
 ['give', 'trial', 'run'],
 [],
 ['table', 'today'],
 [],
 ['last', 'night', 'dream', 'japan', 'see', 'request', 'unhappy'],
 ['definitely', 'arm', 'unhappy'],
 ['miss', '

In [5]:
from gensim.models import Word2Vec

w2v = Word2Vec(size=150, window=10, min_count=1, sg=1, workers=10)
w2v.build_vocab(training_docs)
w2v.train(sentences=training_docs, total_examples=len(training_docs), epochs=w2v.epochs)

w2v

<gensim.models.word2vec.Word2Vec at 0x1232009e8>

In [6]:
def doc2vec(docs):
    last_indices = []
    testing_docs = []
    tokenized_docs = []

    for _, row in docs.iterrows():
        tokens = tokenize(preprocess(row['tweets']))
        if len(last_indices) > 0:
            last_indices.append(last_indices[len(last_indices) - 1] + len(tokens))
        else:
            last_indices.append(len(tokens))
        testing_docs.extend(tokens)
        tokenized_docs.append(tokens)

    return tokenized_docs, testing_docs, last_indices

from sklearn.cluster import MiniBatchKMeans

kmeans = MiniBatchKMeans(n_clusters=8, batch_size=750)

def stream_cluster(vectors, vector_size, last_indices):
    document_vectors = []
    first_index = 0
    
    for last_index in last_indices:
        min_vector = []
        max_vector = []
        for i in range(vector_size):
            v_list = [v[i] for v in vectors[first_index:last_index]]
            if len(v_list) > 0:
                min_vector.append(min(v_list))
                max_vector.append(max(v_list))
        if len(min_vector) > 0 and len(max_vector) > 0:
            min_vector.extend(max_vector)
            document_vectors.append(min_vector)
        first_index = last_index
    clustering = kmeans.partial_fit(document_vectors)
    
    return clustering, clustering.labels_, document_vectors

In [7]:
last_index = int(len(documents.index) / 5)
documents_test = documents[:last_index]

tokenized_docs, testing_docs, last_indices = doc2vec(documents_test)

tokenized_docs

[['like', 'due', 'kill', 'unhappy'],
 ['mean'],
 ['versus'],
 ['physicist', 'still', 'table', 'full'],
 ['convoy', 'bypass', 'police'],
 ['next', 'question', 'unhappy'],
 [],
 ['key', 'unchanged'],
 ['miss', 'cry'],
 ['thanks', 'week', 'much', 'happy', 'want'],
 ['follow', 'unhappy'],
 ['suddenly', 'cannot', 'enough', 'mother'],
 ['please', 'reading', 'heart', 'unhappy'],
 ['hello', 'getting', 'something', 'unhappy'],
 ['look', 'good', 'current', 'shame', 'never', 'cut', 'unhappy'],
 ['work', 'much', 'better', 'unhappy'],
 ['today', 'congress', 'address'],
 ['please', 'beautiful', 'princess', 'help'],
 ['awesome', 'id', 'love', 'see', 'like'],
 ['happy', 'belated', 'birthday', 'another', 'duet', 'much'],
 ['order', 'ever', 'want', 'eat', 'may', 'great', 'day', 'happy'],
 ['tell', 'unhappy'],
 ['tried', 'everything'],
 ['give', 'trial', 'run'],
 [],
 ['table', 'today'],
 [],
 ['last', 'night', 'dream', 'japan', 'see', 'request', 'unhappy'],
 ['definitely', 'arm', 'unhappy'],
 ['miss', '

In [8]:
vectors = list(map(lambda word: w2v.wv[word], testing_docs))

vectors[:1]

[array([ 0.00238459, -0.03510084, -0.09075089, -0.01923401, -0.07362858,
        -0.0351901 , -0.04651758,  0.02633143, -0.00778141,  0.07152639,
        -0.18311621, -0.02533881, -0.12324546, -0.0891151 , -0.1748734 ,
        -0.0948323 , -0.0228548 , -0.09283423,  0.07747453,  0.11168096,
         0.01514961,  0.00600822, -0.10589401,  0.06043209, -0.01824423,
         0.12368233,  0.07921502, -0.13378091,  0.00426658, -0.11942744,
        -0.12330525, -0.1123973 , -0.1258716 ,  0.06898737, -0.187171  ,
        -0.04723676, -0.02984368, -0.15233803, -0.01727172, -0.1342542 ,
        -0.02071373, -0.09113612,  0.21011426, -0.01502392, -0.06103258,
        -0.05154557,  0.22392108, -0.05037166,  0.19439599,  0.05712034,
        -0.07022048,  0.04029144,  0.04544317, -0.00638363, -0.060417  ,
        -0.10512706,  0.01641648, -0.02564002, -0.00808017,  0.10630167,
        -0.02908321, -0.00245452, -0.09934234,  0.04758047,  0.05943349,
         0.04119796, -0.16798595,  0.04287523, -0.0

In [9]:
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

from sklearn import metrics

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.5548120141029358
Calinski-Harabaz Index: 1024.0996385567587
Davies-Bouldin Index: 0.6553123301704322


In [10]:
first_index = last_index
last_index = int(len(documents.index)/ 5 * 2)
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.5404684543609619
Calinski-Harabaz Index: 1047.129095337243
Davies-Bouldin Index: 0.6600749432326024


In [11]:
first_index = last_index
last_index = int(len(documents.index) / 5 * 3)
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.5429835319519043
Calinski-Harabaz Index: 1088.1915405290952
Davies-Bouldin Index: 0.6466463975469459


In [12]:
first_index = last_index
last_index = int(len(documents.index) / 5 * 4)
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.5231156349182129
Calinski-Harabaz Index: 966.8132630231855
Davies-Bouldin Index: 0.6940759534807669


In [13]:
first_index = last_index
last_index = int(len(documents.index))
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.5257409811019897
Calinski-Harabaz Index: 1004.2611452756448
Davies-Bouldin Index: 0.700394942856913
