In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
from gensim.utils import simple_preprocess
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

english_words = set(nltk.corpus.words.words())
english_stop_words = nltk.corpus.stopwords.words("english")

lemmatizer = WordNetLemmatizer()

def preprocess(text):
    return " ".join(lemmatizer.lemmatize(w) for w in nltk.wordpunct_tokenize(text)
        if w.lower() in english_words and w.lower() not in english_stop_words or not w.isalpha())

def tokenize(document):
    return simple_preprocess(str(document).encode("utf-8"))

In [3]:
import pandas as pd

dname = './data/preprocessed-twitter-tweets'

pos = pd.read_csv('{}/{}.csv'.format(dname, 'processedPositive')).T
neu = pd.read_csv('{}/{}.csv'.format(dname, 'processedNeutral')).T
neg = pd.read_csv('{}/{}.csv'.format(dname, 'processedNegative')).T

documents = pd.concat([pos, neu, neg])
documents['tweets'] = documents.index
documents = documents.sample(frac=1).reset_index(drop=True)

documents.head(15)

Unnamed: 0,tweets
0,If you live only for money
1,i find it so difficult to says no to him unhap...
2,No merit in claims that can be tampered with; ...
3,my favourite lipstick hilang crying
4,I'm scared unhappy I mean. Why the heck did I...
5,Wow awesome GOG! thanks happy
6,Preacher...... unhappy
7,govt sparks traffic chaos with plans for gran...
8,this is damn.... theres people who sincerely w...
9,Supreme Court gives govt a chance to oppose ju...


In [4]:
training_docs = []
num_docs = float(len(documents.index))
for _, row in documents.iterrows():
    training_docs.append(tokenize(preprocess(row['tweets'])))

training_docs

[['live', 'money'],
 ['find', 'difficult', 'unhappy'],
 ['merit', 'wild', 'baseless'],
 ['lipstick', 'cry'],
 ['unhappy', 'mean', 'heck', 'decide', 'laundry', 'wrong', 'mention', 'dark'],
 ['wow', 'awesome', 'gog', 'thanks', 'happy'],
 ['preacher', 'unhappy'],
 ['spark', 'traffic', 'chaos', 'grand', 'gate'],
 ['damn',
  'there',
  'people',
  'sincerely',
  'want',
  'attend',
  'birthday',
  'party',
  'people',
  'try',
  'fun',
  'make',
  'ca'],
 ['supreme',
  'court',
  'chance',
  'oppose',
  'judge',
  'appointment',
  'national',
  'security'],
 ['could', 'whatever'],
 ['short', 'two', 'majority'],
 ['tell', 'idea'],
 ['love', 'unhappy'],
 ['camp', 'meet', 'assembly', 'speaker'],
 ['moment'],
 ['thanks', 'recent', 'follow', 'happy', 'connect', 'happy', 'great'],
 ['thinking', 'know', 'cute', 'happy'],
 ['talking',
  'driver',
  'said',
  'love',
  'go',
  'new',
  'york',
  'since',
  'trump',
  'probably'],
 ['former', 'captain', 'team'],
 ['riot', 'refugee', 'resolve', 'also'

In [5]:
from gensim.models import Word2Vec

w2v = Word2Vec(size=150, window=10, min_count=1, sg=1, workers=10)
w2v.build_vocab(training_docs)
w2v.train(sentences=training_docs, total_examples=len(training_docs), epochs=w2v.epochs)

w2v

<gensim.models.word2vec.Word2Vec at 0x11eca8e80>

In [6]:
def doc2vec(docs):
    last_indices = []
    testing_docs = []
    tokenized_docs = []

    for _, row in docs.iterrows():
        tokens = tokenize(preprocess(row['tweets']))
        if len(last_indices) > 0:
            last_indices.append(last_indices[len(last_indices) - 1] + len(tokens))
        else:
            last_indices.append(len(tokens))
        testing_docs.extend(tokens)
        tokenized_docs.append(tokens)

    return tokenized_docs, testing_docs, last_indices

from sklearn.cluster import MiniBatchKMeans

kmeans = MiniBatchKMeans(n_clusters=8, batch_size=750)

def stream_cluster(vectors, vector_size, last_indices):
    document_vectors = []
    first_index = 0
    
    for last_index in last_indices:
        min_vector = []
        max_vector = []
        for i in range(vector_size):
            v_list = [v[i] for v in vectors[first_index:last_index]]
            if len(v_list) > 0:
                min_vector.append(min(v_list))
                max_vector.append(max(v_list))
        if len(min_vector) > 0 and len(max_vector) > 0:
            min_vector.extend(max_vector)
            document_vectors.append(min_vector)
        first_index = last_index
    clustering = kmeans.partial_fit(document_vectors)
    
    return clustering, clustering.labels_, document_vectors

In [7]:
last_index = int(len(documents.index) / 5)
documents_test = documents[:last_index]

tokenized_docs, testing_docs, last_indices = doc2vec(documents_test)

tokenized_docs

[['live', 'money'],
 ['find', 'difficult', 'unhappy'],
 ['merit', 'wild', 'baseless'],
 ['lipstick', 'cry'],
 ['unhappy', 'mean', 'heck', 'decide', 'laundry', 'wrong', 'mention', 'dark'],
 ['wow', 'awesome', 'gog', 'thanks', 'happy'],
 ['preacher', 'unhappy'],
 ['spark', 'traffic', 'chaos', 'grand', 'gate'],
 ['damn',
  'there',
  'people',
  'sincerely',
  'want',
  'attend',
  'birthday',
  'party',
  'people',
  'try',
  'fun',
  'make',
  'ca'],
 ['supreme',
  'court',
  'chance',
  'oppose',
  'judge',
  'appointment',
  'national',
  'security'],
 ['could', 'whatever'],
 ['short', 'two', 'majority'],
 ['tell', 'idea'],
 ['love', 'unhappy'],
 ['camp', 'meet', 'assembly', 'speaker'],
 ['moment'],
 ['thanks', 'recent', 'follow', 'happy', 'connect', 'happy', 'great'],
 ['thinking', 'know', 'cute', 'happy'],
 ['talking',
  'driver',
  'said',
  'love',
  'go',
  'new',
  'york',
  'since',
  'trump',
  'probably'],
 ['former', 'captain', 'team'],
 ['riot', 'refugee', 'resolve', 'also'

In [8]:
vectors = list(map(lambda word: w2v.wv[word], testing_docs))

vectors[:1]

[array([ 0.05253141, -0.06629163,  0.00603855,  0.00476747, -0.03895578,
         0.06784478, -0.00357799, -0.01466561, -0.09427092, -0.01066834,
        -0.00411893, -0.00658451, -0.0026103 ,  0.07183614,  0.04328703,
        -0.04386939, -0.03975967, -0.01594156,  0.10841969,  0.07769248,
        -0.01955079, -0.03748922, -0.01338697,  0.0285856 , -0.07125536,
        -0.10389573, -0.0880417 , -0.08408411,  0.06863542,  0.03343922,
         0.03264093,  0.07052794,  0.06125285,  0.05636711, -0.02918257,
        -0.06513051, -0.03836258,  0.0494308 ,  0.01825709, -0.02122097,
         0.02782661,  0.01791395,  0.04948552,  0.07387761,  0.06228635,
         0.03170639, -0.03587729,  0.04210365, -0.1042795 , -0.04188482,
        -0.07966202,  0.03555579, -0.02385851, -0.03520425,  0.02959975,
        -0.07321492, -0.00654964, -0.05864341, -0.06704858, -0.08080675,
        -0.02889092, -0.02750096, -0.01574948, -0.00336254, -0.0117588 ,
        -0.03101211, -0.03390565,  0.05239751,  0.0

In [9]:
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

from sklearn import metrics

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.5303376913070679
Calinski-Harabaz Index: 1196.8276732890342
Davies-Bouldin Index: 0.6012048496094309


In [10]:
first_index = last_index
last_index = int(len(documents.index)/ 5 * 2)
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.4881032407283783
Calinski-Harabaz Index: 1003.3864599930846
Davies-Bouldin Index: 0.6746392779376456


In [11]:
first_index = last_index
last_index = int(len(documents.index) / 5 * 3)
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.501458466053009
Calinski-Harabaz Index: 975.732862816094
Davies-Bouldin Index: 0.705786882878441


In [12]:
first_index = last_index
last_index = int(len(documents.index) / 5 * 4)
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.49035030603408813
Calinski-Harabaz Index: 945.366903668589
Davies-Bouldin Index: 0.6812677723299275


In [13]:
first_index = last_index
last_index = int(len(documents.index))
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinski-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.48773515224456787
Calinski-Harabaz Index: 906.1196343477592
Davies-Bouldin Index: 0.7349005074463227
