### imports

In [1]:
import math
import lucene
import pickle
import time
import matplotlib.pyplot as plt
import itertools
import numpy as np
from tqdm import tqdm
from java.io import File
import xml.etree.ElementTree as ET
from collections import defaultdict
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import BytesRefIterator
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher, BooleanQuery, BooleanClause, TermQuery, BoostQuery
from org.apache.lucene.search.similarities import BM25Similarity, LMJelinekMercerSimilarity, LMDirichletSimilarity
lucene.initVM()

<jcc.JCCEnv at 0x7f5053e9dc50>

In [2]:
# Filesystem location of the Lucene index this notebook reads from.
index_path = './documents_index/'  # wt100g document index


# Open the index directory and a reader over it. `indexReader` is the handle
# the preprocessing code below uses to fetch stored documents by number.
directory = FSDirectory.open(File(index_path).toPath())
indexReader = DirectoryReader.open(directory)

### document preprocessing

In [5]:
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_short


def alpha(text):
    """Drop every whitespace-separated token that is not purely alphabetic.

    The text is split on whitespace, tokens for which ``str.isalpha`` is
    false (digits, mixed alphanumerics, tokens carrying punctuation) are
    discarded, and the survivors are re-joined with single spaces.
    """
    return ' '.join(filter(str.isalpha, text.split()))

def getprocessdoc(doc_num):
    """Fetch one document from the Lucene index and return its cleaned text.

    The stored 'CONTENTS' field is passed through, in order: punctuation
    stripping, Porter stemming, stop-word removal, removal of non-alphabetic
    tokens, and removal of tokens shorter than four characters.

    Parameters
    ----------
    doc_num : int
        Document number handed to ``indexReader.document``.

    Returns
    -------
    str
        Space-separated processed tokens. An empty string is returned when
        the document has no 'CONTENTS' field, so callers that build one row
        per document number keep their row alignment.
    """
    doc = indexReader.document(doc_num)
    raw_doc = doc.get('CONTENTS')
    if raw_doc is None:
        # A missing field would make the gensim filters below raise; treat
        # the document as empty instead.
        return ''

    stemmer = PorterStemmer()

    no_punc = strip_punctuation(raw_doc)
    stemmed_doc = stemmer.stem_sentence(no_punc)
    # NOTE(review): stop words are removed *after* stemming, so a stop word
    # whose stem differs from its surface form may survive this filter.
    # Confirm this order is intended before changing it; saved vectors
    # depend on it.
    no_stopword = remove_stopwords(stemmed_doc)
    no_nonalpha = alpha(no_stopword)
    pro_doc = strip_short(no_nonalpha, minsize=4)

    return pro_doc


In [37]:
def getdoc(num):
    """Lazily yield the processed text of document numbers 0 .. num - 1.

    Each document is produced on demand by ``getprocessdoc``; the iteration
    is wrapped in a tqdm progress bar.
    """
    yield from map(getprocessdoc, tqdm(range(num)))

### TF-IDF vectorization using scikit-learn

In [71]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# One matrix row is built per document number in [0, total_doc).
# NOTE(review): this uses numDocs(); if the index contains deleted documents
# the valid document numbers may extend past this count. Confirm the index
# has no deletions.
total_doc = indexReader.numDocs()

# min_df / max_df prune the vocabulary by document frequency: terms seen in
# fewer than 10 documents, or in more than 40% of documents, are dropped.
vectorizer = TfidfVectorizer(
    min_df=10,
    max_df=0.4,
    stop_words='english',
)

# The generator streams processed documents so the corpus is never held in
# memory as raw text.
X = vectorizer.fit_transform(getdoc(total_doc))

100%|██████████| 528155/528155 [09:54<00:00, 887.73it/s] 


In [87]:
X

<528155x73464 sparse matrix of type '<class 'numpy.float64'>'
	with 71542455 stored elements in Compressed Sparse Row format>

In [89]:
# Persist the TF-IDF matrix so later cells can reload it instead of
# re-running the vectorization.
with open('cleaned_doc.vec', 'wb') as vec_file:
    pickle.dump(X, vec_file, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# Reload the TF-IDF matrix written by the save cell above.
with open('cleaned_doc.vec', 'rb') as vec_file:
    X_tfidf = pickle.load(vec_file)

In [19]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from time import time
import pickle

# Reload the TF-IDF matrix so this cell can run without the vectorization
# cell having been executed in the current kernel.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook itself wrote.
with open('cleaned_doc.vec', 'rb') as handle:
    X_tfidf = pickle.load(handle)


# LSA: reduce the TF-IDF matrix to 500 dimensions with truncated SVD, then
# normalise each row in place. The SVD solver is randomized, so it is seeded
# (same seed as the MiniBatchKMeans cell below) to make reruns reproducible.
lsa = make_pipeline(
    TruncatedSVD(n_components=500, random_state=1),
    Normalizer(copy=False),
)
t0 = time()
X_lsa = lsa.fit_transform(X_tfidf)
# lsa[0] is the TruncatedSVD step of the pipeline.
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"LSA done in {time() - t0:.3f} s")
print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

# with open('dim-red-vec.pickle', 'wb') as handle:
#     pickle.dump(X_lsa, handle, protocol=pickle.HIGHEST_PROTOCOL)
#     print('doc-vec saved successfully to file')

LSA done in 5.219 s
Explained variance of the SVD step: 1.5%


In [109]:
from sklearn.cluster import KMeans, MiniBatchKMeans
# Target number of clusters. In the visible part of the notebook it is only
# referenced by the commented-out multi-seed KMeans experiment below.
true_k = 100
 
# Disabled experiment: fit full KMeans with several seeds and print the
# resulting cluster sizes.
# for seed in range(5):
#     kmeans = KMeans(
#         n_clusters=true_k,
#         max_iter=100,
#         n_init=1,
#         random_state=seed,
#     ).fit(X)
#     cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
#     print(f"Number of elements assigned to each cluster: {cluster_sizes}")
# print()


In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score
# Sweep the number of clusters, recording the silhouette score (printed) and
# the inertia (collected in `sse` for the plot in the next cell).
k_range = range(2, 20)

sse = []

for k in k_range:
    # Seeded so the sweep is reproducible across reruns.
    km = KMeans(n_clusters=k, random_state=1)
    # fit_predict both fits the model and returns the labels; a separate
    # fit() call beforehand would train the model a second time for nothing.
    preds = km.fit_predict(X_tfidf)
    centers = km.cluster_centers_
    # NOTE(review): the silhouette score is computed over every document
    # here, which may be very slow on the full corpus; silhouette_score
    # accepts sample_size/random_state if an estimate is acceptable.
    score = silhouette_score(X_tfidf, preds)
    print("For n_clusters = {}, silhouette score is {}".format(k, score))
    sse.append(km.inertia_)

In [None]:
# Elbow plot: inertia collected by the sweep above against the cluster count.
plt.plot(k_range, sse)
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.title("K-means inertia by number of clusters");

In [20]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans, MiniBatchKMeans

# Cluster the LSA-reduced document vectors with mini-batch k-means.
# k_range = range(2,20)
k_range = [100]

sse = []

for k in k_range:
    km = MiniBatchKMeans(
        n_clusters=k,
        batch_size=6000,
        max_iter=100,
        n_init=5,
        random_state=1,
        verbose=True,
    )

    # fit_predict trains the model and returns the labels in one pass.
    # Chaining .fit() before it trained the model twice; with a fixed
    # random_state the second run only repeated the first, doubling the
    # runtime and the verbose log.
    preds = km.fit_predict(X_lsa)

    centers = km.cluster_centers_
    doc_labels = km.labels_
    # score = silhouette_score(X, preds)
    # print("For n_clusters = {}, silhouette score is {})".format(k, score))
    sse.append(km.inertia_)

Init 1/5 with method k-means++
Inertia for init 1/5: 0.8131460934990498
Init 2/5 with method k-means++
Inertia for init 2/5: 0.8001849921494194
Init 3/5 with method k-means++
Inertia for init 3/5: 0.808760448652096
Init 4/5 with method k-means++
Inertia for init 4/5: 0.7707530576099133
Init 5/5 with method k-means++
Inertia for init 5/5: 0.7741522624032409
[MiniBatchKMeans] Reassigning 1 cluster centers.
Minibatch step 1/8802: mean batch inertia: 4.1594422121685346e-05
Minibatch step 2/8802: mean batch inertia: 0.0007042091752306175, ewa inertia: 0.0007042091752306175
Minibatch step 3/8802: mean batch inertia: 0.000131847172146701, ewa inertia: 0.000691204790092501
Minibatch step 4/8802: mean batch inertia: 0.00011954861084225251, ewa inertia: 0.0006782164416670302
Minibatch step 5/8802: mean batch inertia: 0.00025887060604714055, ewa inertia: 0.0006686886695174406
Minibatch step 6/8802: mean batch inertia: 0.0001436330654016252, ewa inertia: 0.0006567591122476382
Minibatch step 7/8802