In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
from gensim.utils import simple_preprocess
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

english_words = set(nltk.corpus.words.words())
english_stop_words = nltk.corpus.stopwords.words("english")

lemmatizer = WordNetLemmatizer()

def preprocess(text):
    return " ".join(lemmatizer.lemmatize(w) for w in nltk.wordpunct_tokenize(text)
        if w.lower() in english_words and w.lower() not in english_stop_words or not w.isalpha())

def tokenize(document):
    return simple_preprocess(str(document).encode("utf-8"))

In [3]:
import pandas as pd

dname = './data/preprocessed-twitter-tweets'

pos = pd.read_csv('{}/{}.csv'.format(dname, 'processedPositive')).T
neu = pd.read_csv('{}/{}.csv'.format(dname, 'processedNeutral')).T
neg = pd.read_csv('{}/{}.csv'.format(dname, 'processedNegative')).T

documents = pd.concat([pos, neu, neg])
documents['tweets'] = documents.index
documents = documents.sample(frac=1).reset_index(drop=True)

documents.head(15)

Unnamed: 0,tweets
0,Follow him happy
1,How became demonetised notes in
2,nachnis of fighting for rights in and more.
3,Hellooo happy PixelSavage (Pixel_Savage_Yo)
4,i love mason and miss mason unhappy
5,4000 migrated in 2015 to shield their capital.
6,Ruby was rehomed for times before she was for ...
7,death on 1st solo trip and more. Also in epap...
8,What meant by 'potato factory' is thriving rea...
9,i hope this little beaand is andot overworkian...


In [4]:
training_docs = []
num_docs = float(len(documents.index))
for _, row in documents.iterrows():
    training_docs.append(tokenize(preprocess(row['tweets'])))

training_docs

[['follow', 'happy'],
 [],
 ['fighting'],
 ['happy', 'pixel_savage_yo'],
 ['love', 'mason', 'miss', 'mason', 'unhappy'],
 ['shield', 'capital'],
 ['ruby', 'time', 'old', 'forever', 'home', 'always', 'consider', 'rescue'],
 ['death', 'st', 'solo', 'trip', 'also'],
 ['meant', 'potato', 'factory', 'thriving', 'reality', 'chain'],
 ['hope', 'little', 'least', 'rest', 'unhappy'],
 ['sad', 'life', 'must'],
 ['would', 'great', 'trick', 'happy'],
 ['slipped', 'budget'],
 ['red', 'velvet', 'unhappy'],
 ['missing', 'cry', 'seaside', 'area'],
 ['list', 'want', 'happy'],
 ['worry', 'choice', 'chief', 'minister'],
 ['end', 'culture'],
 ['flagging', 'campaign'],
 ['win', 'influence', 'people'],
 ['thanks', 'top', 'engaged', 'community', 'week', 'happy', 'want'],
 ['happy', 'ming', 'love', 'love', 'happy'],
 ['assembly', 'secretary', 'confidence', 'motion', 'speaker'],
 ['thank', 'happy'],
 ['love', 'unhappy'],
 ['boo'],
 ['career', 'unhappy'],
 ['flashing', 'light', 'paying', 'parking'],
 [],
 ['hop

In [5]:
from gensim.models import Word2Vec

w2v = Word2Vec(size=150, window=10, min_count=1, sg=1, workers=10)
w2v.build_vocab(training_docs)
w2v.train(sentences=training_docs, total_examples=len(training_docs), epochs=w2v.epochs)

w2v

<gensim.models.word2vec.Word2Vec at 0x11bf4b780>

In [6]:
def doc2vec(docs):
    last_indices = []
    testing_docs = []
    tokenized_docs = []

    for _, row in docs.iterrows():
        tokens = tokenize(preprocess(row['tweets']))
        if len(last_indices) > 0:
            last_indices.append(last_indices[len(last_indices) - 1] + len(tokens))
        else:
            last_indices.append(len(tokens))
        testing_docs.extend(tokens)
        tokenized_docs.append(tokens)

    return tokenized_docs, testing_docs, last_indices

from sklearn.cluster import MiniBatchKMeans

kmeans = MiniBatchKMeans(n_clusters=8, batch_size=750)

def stream_cluster(vectors, vector_size, last_indices):
    document_vectors = []
    first_index = 0
    
    for last_index in last_indices:
        min_vector = []
        max_vector = []
        for i in range(vector_size):
            v_list = [v[i] for v in vectors[first_index:last_index]]
            if len(v_list) > 0:
                min_vector.append(min(v_list))
                max_vector.append(max(v_list))
        if len(min_vector) > 0 and len(max_vector) > 0:
            min_vector.extend(max_vector)
            document_vectors.append(min_vector)
        first_index = last_index
    clustering = kmeans.partial_fit(document_vectors)
    
    return clustering, clustering.labels_, document_vectors

In [7]:
last_index = int(len(documents.index) / 5)
documents_test = documents[:last_index]

tokenized_docs, testing_docs, last_indices = doc2vec(documents_test)

tokenized_docs

[['follow', 'happy'],
 [],
 ['fighting'],
 ['happy', 'pixel_savage_yo'],
 ['love', 'mason', 'miss', 'mason', 'unhappy'],
 ['shield', 'capital'],
 ['ruby', 'time', 'old', 'forever', 'home', 'always', 'consider', 'rescue'],
 ['death', 'st', 'solo', 'trip', 'also'],
 ['meant', 'potato', 'factory', 'thriving', 'reality', 'chain'],
 ['hope', 'little', 'least', 'rest', 'unhappy'],
 ['sad', 'life', 'must'],
 ['would', 'great', 'trick', 'happy'],
 ['slipped', 'budget'],
 ['red', 'velvet', 'unhappy'],
 ['missing', 'cry', 'seaside', 'area'],
 ['list', 'want', 'happy'],
 ['worry', 'choice', 'chief', 'minister'],
 ['end', 'culture'],
 ['flagging', 'campaign'],
 ['win', 'influence', 'people'],
 ['thanks', 'top', 'engaged', 'community', 'week', 'happy', 'want'],
 ['happy', 'ming', 'love', 'love', 'happy'],
 ['assembly', 'secretary', 'confidence', 'motion', 'speaker'],
 ['thank', 'happy'],
 ['love', 'unhappy'],
 ['boo'],
 ['career', 'unhappy'],
 ['flashing', 'light', 'paying', 'parking'],
 [],
 ['hop

In [8]:
vectors = list(map(lambda word: w2v.wv[word], testing_docs))

vectors[:1]

[array([-0.08834181, -0.06055463,  0.06258085, -0.0245199 ,  0.06082386,
         0.00599433,  0.01328864,  0.03933259, -0.07060732,  0.02296185,
        -0.03206814,  0.02855729,  0.02900007, -0.00964998,  0.02986188,
         0.05972818, -0.04019231,  0.0324606 , -0.01360453,  0.0377456 ,
        -0.00349328, -0.01173059, -0.03538696,  0.02079999, -0.02273198,
        -0.0240435 , -0.02361206, -0.02080856,  0.00040294, -0.01269963,
        -0.01324517,  0.02747713,  0.00269448, -0.03933906,  0.01882623,
         0.06795495, -0.03604603, -0.06369546,  0.01837513, -0.00238435,
         0.01799057,  0.02159044, -0.03718265, -0.0005188 , -0.02380586,
         0.0128215 , -0.01987697,  0.00503257,  0.04433121, -0.03344968,
        -0.03675985, -0.02851134,  0.02705334, -0.0340457 , -0.05263006,
         0.04495753, -0.04050985, -0.0283006 , -0.00725221,  0.00035667,
         0.05819903,  0.03475019, -0.01920024,  0.05105542,  0.01063198,
         0.00908202, -0.07329476, -0.067671  ,  0.0

In [9]:
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

from sklearn import metrics

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinksi-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.4774351716041565
Calinksi-Harabaz Index: 1297.2355956284641
Davies-Bouldin Index: 0.7500911614999684


In [10]:
first_index = last_index
last_index = int(len(documents.index)/ 5 * 2)
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinksi-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.4759519100189209
Calinksi-Harabaz Index: 1243.076520135763
Davies-Bouldin Index: 0.7464461641714557


In [11]:
first_index = last_index
last_index = int(len(documents.index) / 5 * 3)
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinksi-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.4573555588722229
Calinksi-Harabaz Index: 1079.6613237010329
Davies-Bouldin Index: 0.7776765581073123


In [12]:
first_index = last_index
last_index = int(len(documents.index) / 5 * 4)
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinksi-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.4785693287849426
Calinksi-Harabaz Index: 1275.2718197354347
Davies-Bouldin Index: 0.7559754844376398


In [13]:
first_index = last_index
last_index = int(len(documents.index))
documents_test = documents[first_index:last_index]

_, testing_docs, last_indices = doc2vec(documents_test)
_, labels, document_vectors = stream_cluster(vectors, w2v.wv[testing_docs[0]].shape[0], last_indices)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(document_vectors, labels)))
print('Calinksi-Harabaz Index: {}'.format(metrics.calinski_harabaz_score(document_vectors, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(document_vectors, labels)))

Silhouette Coefficient: 0.4626476466655731
Calinksi-Harabaz Index: 1070.4605617750144
Davies-Bouldin Index: 0.7888986458392233
