## Weighted k-Means

This is the same script as topic-modeling, but it takes per-document weights into account when clustering. Currently, the weights are taken from the 'CitedBy' column, which holds the number of times each document is cited as a source according to Scopus.

In [None]:
import pickle
import collections
import pandas as pd
import numpy as np
 
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

In [None]:
# Load the preprocessed inputs. `with` guarantees the file handles are closed
# even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# produced by the trusted preprocessing step.
with open('../Preprocessing/tokens.pkl', 'rb') as tokens_file:
    tokens = pickle.load(tokens_file)
with open('../Preprocessing/topic-corpus.pkl', 'rb') as corpus_file:
    data = pickle.load(corpus_file)

In [None]:
def dummy_fun(tokens):
    """Return *tokens* untouched.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that input which is already tokenised is used as-is.
    """
    unchanged = tokens
    return unchanged

In [None]:
# Per-document sample weights for K-Means, taken from the 'CitedBy' column.
cited_by = data['CitedBy'].to_numpy()
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the builtin directly.
weights = cited_by.astype(float)

In [None]:
def cluster_texts(texts, clusters=3, weight=weights):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(analyzer='word',
                            tokenizer=dummy_fun,
                            preprocessor=dummy_fun,
                            max_df=0.9,
                            min_df=0.1,
                            token_pattern=None) 
    X = vectorizer.fit_transform(tokens)
    features = vectorizer.get_feature_names()
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(X, sample_weight=weights)
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering, X, features

In [None]:
NUMBER_OF_CLUSTERS = 7

clusters, X, features = cluster_texts(data, NUMBER_OF_CLUSTERS, weights)

In [None]:
import pandas as pd

# Flatten the {cluster label: [row indices]} mapping into [label, row index]
# pairs; `items` is reused later when computing cluster enrichment.
items = [[label, row] for label, rows in clusters.items() for row in rows]

# One row per document, indexed by its row position and sorted by that index.
df2 = (
    pd.DataFrame(items, columns=['Cluster', 'Index'])
    .set_index('Index')
    .sort_index()
)

# Replace the corpus index with a fresh positional one (the old index is kept
# as a column by reset_index).
data = data.reset_index()

In [None]:
# Put the cluster labels next to the corpus columns and write the result out.
# NOTE(review): the output path is specific to one machine.
conc = pd.concat(objs=[data, df2], axis='columns')
conc.to_csv(path_or_buf='/Users/ajda/Desktop/weighted-clusters.csv', index=False)

In [None]:
# Report how many distinct cluster labels are present as keys of `clusters`.
print("Number of clusters found: {}".format(len(clusters)))

In [None]:
from Orange.statistics.util import FDR
from orangecontrib.text.stats import hypergeom_p_values
import numpy as np

# A feature is reported only if its p-value is at most this threshold.
filter_p_value = 0.01
# Upper bound on the FDR value of reported features.
# NOTE(review): with a bound of 1 this presumably filters nothing -- confirm.
filter_fdr_value = 1

def getKey(item):
    """Sort key: the element at position 1 of *item*."""
    second = item[1]
    return second

def fp(score):
    """Format a p-value: fixed-point above 0.01, short scientific notation otherwise."""
    return "%0.5f" % score if score > 10e-3 else "%0.1e" % score


def fpt(score):
    """Format an FDR value: fixed-point above 0.01, scientific notation otherwise."""
    return "%0.9f" % score if score > 10e-3 else "%0.5e" % score


# Densify the Tf-Idf matrix once; it does not change between clusters.
# NOTE: this rebinds `data` (previously the corpus DataFrame), exactly as the
# original loop body did.
data = X.toarray()

# Iterate over the labels that actually occur instead of assuming they are
# exactly 0..len(clusters)-1.
for i in sorted(clusters):
    # Mark the rows of `data` that belong to cluster i. The mask must be
    # indexed by document row; building it in the order of `items` (which is
    # grouped by cluster) would select the wrong rows.
    mask = np.zeros(data.shape[0], dtype=bool)
    mask[clusters[i]] = True
    selected_data = np.compress(mask, data, axis=0)

    # Per-feature p-values of the cluster's rows against the whole corpus,
    # followed by an FDR computation on those p-values.
    p_values = hypergeom_p_values(data, selected_data)
    fdr_values = FDR(p_values)

    print("Enrichment of cluster {}".format(i+1))

    result = [
        (word, pval, fval)
        for word, pval, fval in zip(features, p_values, fdr_values)
        if pval <= filter_p_value and fval <= filter_fdr_value
    ]
    # Sort on the numeric p-value; sorting the formatted strings would order
    # e.g. "1.2e-05" before "3.4e-10".
    result = sorted(result, key=getKey)
    # Format only after sorting, for display.
    result = [(word, fp(pval), fpt(fval)) for word, pval, fval in result]
    for w, p, f in result:
        print("    ", w, p, f)
    print()
