In [1]:
import pickle
import collections
 
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

In [2]:
tokens = pickle.load(open('../Preprocessing/tokens.pkl', 'rb'))
data = pickle.load(open('../Preprocessing/topic-corpus.pkl', 'rb'))

In [3]:
def dummy_fun(tokens):
    return tokens

In [4]:
def cluster_texts(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(analyzer='word',
                            tokenizer=dummy_fun,
                            preprocessor=dummy_fun,
                            max_df=0.9,
                            min_df=0.1,
                            token_pattern=None) 
    X = vectorizer.fit_transform(tokens)
    features = vectorizer.get_feature_names()
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(X)
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering, X, features

In [5]:
NUMBER_OF_CLUSTERS = 7

clusters, X, features = cluster_texts(data, NUMBER_OF_CLUSTERS)

In [6]:
import pandas as pd
items = [[key, i] for key, value in dict(clusters).items() for i in value]
df2 = pd.DataFrame(items, columns=['Cluster', 'Index'])
df2.set_index('Index', inplace=True)
df2 = df2.sort_index()
data = data.reset_index()

In [7]:
conc = pd.concat([data, df2], axis=1)
conc.to_csv('/Users/ajda/Desktop/clusters.csv', index=False)

In [8]:
print("Number of clusters found: {}".format(len(clusters)))

Number of clusters found: 7


In [9]:
from Orange.statistics.util import FDR
from orangecontrib.text.stats import hypergeom_p_values
import numpy as np

filter_p_value = 0.01
filter_fdr_value = 1

def getKey(item):
    return item[1]

for i in range(len(clusters)):
    mask = []
    for cl, ind in items:
        if cl == i:
            mask.append(True)
        else:
            mask.append(False)

    data = X.toarray()
    selected_data = np.compress(mask, data, axis=0)

    p_values = hypergeom_p_values(data, selected_data)
    fdr_values = FDR(p_values)
    
    fp = lambda score: "%0.5f" % score if score > 10e-3 else "%0.1e" % score
    fpt = lambda score: "%0.9f" % score if score > 10e-3 else "%0.5e" % score
    
    print("Enrichment of cluster {}".format(i+1))
    
    result = []
    for word, pval, fval in zip(features, p_values, fdr_values):
        if pval <= filter_p_value and fval <= filter_fdr_value:
            result.append((word, fp(pval), fpt(fval)))
    result = sorted(result, key=getKey)
    for w, p, f in result:
        print("    ", w, p, f)
    print()


Enrichment of cluster 1
     study 1.1e-04 6.73112e-03
     method 1.7e-03 0.051584168
     data 2.6e-03 0.062881410
     finding 2.6e-05 3.12038e-03
     related 4.3e-03 0.084390748
     value 6.1e-03 0.101753235
     using 8.4e-03 0.101753235
     analysis 8.7e-04 0.034381829
     two 9.6e-03 0.101753235

Enrichment of cluster 2
     aim 1.0e-04 0.012132163
     change 5.3e-04 0.031539104

Enrichment of cluster 3
     service 6.7e-03 0.702804380

Enrichment of cluster 4
     use 2.3e-03 0.269778679
     first 4.5e-03 0.269778679

Enrichment of cluster 5
     research 1.3e-03 0.159902055
     however 4.6e-03 0.203626194
     activity 5.1e-03 0.203626194

Enrichment of cluster 6
     business 7.0e-03 0.539877605

Enrichment of cluster 7

