# Visualizing and analyzing results

In [1]:
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import os
import string
import numpy as np

import joblib

[nltk_data] Downloading package punkt to /home/pkmandke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pkmandke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Custom defined imports

from agglo_clus import Agglo_clus as ag
from pre_process import Doc2vec_wrapper, extract_mapped_doc2vecs

# Some helper functions

In [19]:
def setify(depts):
    """Return the distinct items of ``depts`` in first-seen order.

    Despite the name, the result is a list rather than a set, so the order
    in which values first appear is preserved.

    Parameters
    ----------
    depts : iterable
        Values to de-duplicate.

    Returns
    -------
    list
        One entry per distinct value, ordered by first occurrence.
    """
    # Materialise once so the fallback below can iterate the values again
    # even when the caller hands in a one-shot iterator.
    items = list(depts)
    try:
        # dicts keep insertion order, which gives an O(n) ordered
        # de-duplication instead of a list scan per element.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys: fall back to
        # the equality-based scan.
        unq = []
        for st in items:
            if st not in unq:
                unq.append(st)
        return unq

def get_unique_field(df, field):
    """Return the distinct values stored under ``field`` in ``df``.

    Looks up ``df[field]`` and de-duplicates the result with ``setify``.
    """
    values = df[field]
    unique_values = setify(values)
    return unique_values

def find_docs_in_cluster(df, idx, clust):
    """Select the rows of ``df`` whose URI belongs to the given documents.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``'identifier-uri'`` column.
    idx : iterable
        Positions into ``clust.doc_names`` identifying the documents.
    clust : object
        Object exposing an indexable ``doc_names`` attribute.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose ``'identifier-uri'`` is one of the looked-up
        document names.
    """
    member_names = []
    for position in idx:
        member_names.append(clust.doc_names[position])

    is_member = df['identifier-uri'].isin(member_names)
    return df.loc[is_member]

def get_index_of_docs(clust_object, clust_num=8):
    """Return the positions in ``clust_object.predictions`` equal to ``clust_num``.

    Parameters
    ----------
    clust_object : object
        Object exposing an iterable ``predictions`` attribute.
    clust_num : optional
        Label to look for (default 8).

    Returns
    -------
    list of int
        Zero-based positions whose prediction equals ``clust_num``.
    """
    positions = []
    for position, label in enumerate(clust_object.predictions):
        if label == clust_num:
            positions.append(position)
    return positions
    
def read_docvec_model(path):
    """Load a saved gensim Doc2Vec model from ``path`` and return it."""
    doc2vec_cls = gensim.models.doc2vec.Doc2Vec
    return doc2vec_cls.load(path)

def get_docs_per_cluster(model, num_classes):
    """Count the predictions that fall on each label ``0 .. num_classes - 1``.

    Parameters
    ----------
    model : object
        Object exposing ``predictions``; it is compared element-wise with
        each label and indexed with the resulting mask, so it is expected
        to be array-like (e.g. a NumPy array).
    num_classes : int
        Number of consecutive labels, starting at 0, to count.

    Returns
    -------
    list of int
        ``num_classes`` counts, where entry ``i`` is the number of
        predictions equal to ``i``.
    """
    sizes = []
    for cluster_id in range(num_classes):
        members = model.predictions[model.predictions == cluster_id]
        sizes.append(len(members))
    return sizes

# Helpers for kmeans

def get_kmeans_map_dict(km_model):
    """Count how many samples carry each label in ``km_model.km.labels_``.

    Parameters
    ----------
    km_model : object
        Wrapper exposing the label sequence as ``km_model.km.labels_``.

    Returns
    -------
    dict
        Maps every distinct label to its number of occurrences; empty when
        there are no labels.
    """
    labels = km_model.km.labels_

    # Tally in a single pass rather than rescanning the whole label
    # sequence once per distinct label.
    tally = {}
    for label in labels:
        tally[label] = tally.get(label, 0) + 1

    # Keys are emitted in the iteration order of set(labels).
    kmap_dict = {label: tally[label] for label in set(labels)}

    return kmap_dict

def kmeans_get_docs_with_idx(km_model, idx):
    """Return the documents whose k-means label equals ``idx``.

    Walks ``km_model.km.labels_`` and, for every position whose label
    equals ``idx``, picks the entry at the same position in
    ``km_model.doc_list``.

    Returns
    -------
    list
        Matching ``doc_list`` entries, in label order.
    """
    return [
        km_model.doc_list[position]
        for position, label in enumerate(km_model.km.labels_)
        if label == idx
    ]

# Compute hierarchical clustering

In [4]:
# Paths

# Location of the saved Doc2Vec model that read_docvec_model() loads.
DOCVEC_MODEL = '../obj/doc2vec/abstracts_etd_doc2vec_all_docs30961_docs'
# Cluster count: passed as num_clus to the agglomerative model and as
# num_classes when counting documents per cluster.
NUM_CLUS = 500

In [5]:
# Load the Doc2Vec model and extract the document vectors together with
# their keys (the keys are later handed to the clustering wrapper as doc_names).
doc_vectors, keys = extract_mapped_doc2vecs(read_docvec_model(DOCVEC_MODEL))

# Sanity check: number of document vectors extracted.
print(len(doc_vectors))

23140


In [5]:
# NOTE(review): this repeats the extraction from the preceding cell, so the
# Doc2Vec model is loaded from disk a second time.
doc_vectors, keys = extract_mapped_doc2vecs(read_docvec_model(DOCVEC_MODEL))

# Agglomerative clustering of the document vectors into NUM_CLUS clusters
# with Ward linkage and Euclidean affinity.
# NOTE(review): iter_ is passed as the string '1' — presumably a run
# identifier; confirm against Agglo_clus.
ag_model = ag(doc_vectors, doc_names = keys, num_clus = NUM_CLUS, linkage='ward', affinity='euclidean', iter_='1')

# Fit the clustering (reports progress and the elapsed training time).
ag_model.clusterize()

Starting clustering...
Done training in 0:00:28.063638s


In [8]:
# Persist the fitted model.
# NOTE(review): the model is later loaded from
# '../obj/agglo_clus/iter_1/ag_clus_full_data_trial.sav' — presumably save()
# resolves this bare file name under that directory; confirm in Agglo_clus.
ag_model.save('ag_clus_full_data_trial.sav')

# Analyze results

In [5]:
# Reload the saved agglomerative clustering model from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
ag_model = joblib.load('../obj/agglo_clus/iter_1/ag_clus_full_data_trial.sav')

In [22]:
# Read JSON metadata

# The file is read as line-delimited JSON (lines=True), i.e. one record per
# line, so the record orientation is spelled out explicitly; the built-in
# `str` type is not one of the documented `orient` values.
# Only the three columns used by the analysis are kept.
dframe = pd.read_json(
    '../data/30Kmetadata.json',
    orient='records',
    lines=True,
)[['identifier-uri', 'contributor-department', 'searchTitle']]

# Allow up to 40000 rows to be shown before pandas truncates the display.
pd.options.display.max_rows = 40000

In [7]:
max(get_docs_per_cluster(ag_model, num_classes=NUM_CLUS))  # Size of the largest cluster (max over the per-cluster document counts)

911

In [8]:
idx = get_index_of_docs(ag_model, clust_num=10) # Positions of the documents assigned to cluster 10

doc_urls = find_docs_in_cluster(dframe, idx, ag_model) # Metadata rows whose URI matches those documents

# Distinct departments in the cluster, shown with the number of matched rows.
unique_field = get_unique_field(doc_urls, 'contributor-department')
unique_field, len(doc_urls)

(['Electrical and Computer Engineering ', 'Electrical Engineering '], 32)

In [9]:
# NOTE(review): same steps as the cluster-10 cell with a different cluster
# number; a small helper taking the cluster number would avoid the copy.
idx = get_index_of_docs(ag_model, clust_num=20) # Positions of the documents assigned to cluster 20

doc_urls = find_docs_in_cluster(dframe, idx, ag_model) # Metadata rows whose URI matches those documents

# Distinct departments in the cluster, shown with the number of matched rows.
unique_field = get_unique_field(doc_urls, 'contributor-department')
unique_field, len(doc_urls)

(['Mechanical Engineering ',
  'Civil Engineering ',
  'Engineering Science and Mechanics ',
  'Aerospace and Ocean Engineering ',
  'Aerospace Engineering ',
  'Engineering Mechanics ',
  'Civil and Environmental Engineering '],
 82)

In [10]:
# Display the metadata rows of the most recently selected cluster.
doc_urls

Unnamed: 0,identifier-uri,contributor-department,searchTitle
565,http://hdl.handle.net/10919/19257,Mechanical Engineering,aerodynamics of endwall contouring with discre...
747,http://hdl.handle.net/10919/23272,Mechanical Engineering,effect of blitype inlet distortion on turbofan...
879,http://hdl.handle.net/10919/24827,Mechanical Engineering,experimental study of the heat transfer on a s...
901,http://hdl.handle.net/10919/25288,Mechanical Engineering,the influence of development and fanscreen int...
991,http://hdl.handle.net/10919/30847,Mechanical Engineering,flow control optimization for improvement of f...
998,http://hdl.handle.net/10919/30854,Mechanical Engineering,measurements and predictions of heat transfer ...
1128,http://hdl.handle.net/10919/30992,Mechanical Engineering,flow field computations of combustorturbine in...
1147,http://hdl.handle.net/10919/31011,Mechanical Engineering,effects of flow control on the aerodynamics of...
1411,http://hdl.handle.net/10919/31297,Mechanical Engineering,aerodynamic performance of a flow controlled c...
1494,http://hdl.handle.net/10919/31385,Mechanical Engineering,noise reduction in an axisymmetric supersonic ...


# Analyze kmeans results

In [4]:
# Load the saved k-means wrapper from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
km_model = joblib.load('../obj/kmeans/iter_2/abstracts_etd_doc2vec_all_docs30961_docs_kmeans.sav')

In [5]:
# Number of label assignments held by the k-means model.
len(km_model.km.labels_)

23140

In [6]:
# NOTE(review): interactive exploration aid — it prints the full KMeans help
# text, which is long; consider dropping this cell from the final notebook.
help(km_model.km)

Help on KMeans in module sklearn.cluster.k_means_ object:

class KMeans(sklearn.base.BaseEstimator, sklearn.base.ClusterMixin, sklearn.base.TransformerMixin)
 |  K-Means clustering
 |  
 |  Read more in the :ref:`User Guide <k_means>`.
 |  
 |  Parameters
 |  ----------
 |  
 |  n_clusters : int, optional, default: 8
 |      The number of clusters to form as well as the number of
 |      centroids to generate.
 |  
 |  init : {'k-means++', 'random' or an ndarray}
 |      Method for initialization, defaults to 'k-means++':
 |  
 |      'k-means++' : selects initial cluster centers for k-mean
 |      clustering in a smart way to speed up convergence. See section
 |      Notes in k_init for more details.
 |  
 |      'random': choose k observations (rows) at random from data for
 |      the initial centroids.
 |  
 |      If an ndarray is passed, it should be of shape (n_clusters, n_features)
 |      and gives the initial centers.
 |  
 |  n_init : int, default: 10
 |      Number of time 

In [16]:
# Number of documents per k-means label, displayed as a dict.
kmap_dict = get_kmeans_map_dict(km_model)
kmap_dict

{0: 1,
 1: 101,
 2: 99,
 3: 41,
 4: 135,
 5: 49,
 6: 69,
 7: 94,
 8: 287,
 9: 98,
 10: 207,
 11: 78,
 12: 66,
 13: 19,
 14: 45,
 15: 38,
 16: 169,
 17: 33,
 18: 19,
 19: 26,
 20: 23,
 21: 56,
 22: 23,
 23: 27,
 24: 47,
 25: 1,
 26: 141,
 27: 40,
 28: 63,
 29: 19,
 30: 82,
 31: 1,
 32: 5,
 33: 25,
 34: 2,
 35: 2,
 36: 71,
 37: 171,
 38: 1,
 39: 37,
 40: 39,
 41: 108,
 42: 70,
 43: 9,
 44: 26,
 45: 52,
 46: 32,
 47: 89,
 48: 41,
 49: 64,
 50: 68,
 51: 35,
 52: 104,
 53: 15,
 54: 140,
 55: 41,
 56: 15,
 57: 33,
 58: 50,
 59: 98,
 60: 60,
 61: 46,
 62: 8,
 63: 46,
 64: 34,
 65: 110,
 66: 28,
 67: 2,
 68: 23,
 69: 18,
 70: 3,
 71: 108,
 72: 29,
 73: 25,
 74: 33,
 75: 80,
 76: 95,
 77: 132,
 78: 42,
 79: 49,
 80: 252,
 81: 14,
 82: 97,
 83: 3,
 84: 66,
 85: 17,
 86: 12,
 87: 96,
 88: 37,
 89: 2,
 90: 3,
 91: 1,
 92: 1,
 93: 91,
 94: 46,
 95: 53,
 96: 20,
 97: 7,
 98: 11,
 99: 19,
 100: 25,
 101: 53,
 102: 146,
 103: 489,
 104: 41,
 105: 16,
 106: 31,
 107: 16,
 108: 63,
 109: 35,
 110: 15,
 

In [25]:
# Departments of the documents that k-means assigned to label 7.
doc_l = kmeans_get_docs_with_idx(km_model, 7)

# Row mask and column picked in a single .loc lookup.
dframe.loc[dframe['identifier-uri'].isin(doc_l), 'contributor-department']

261                           Marriage and Family Therapy 
415                           Marriage and Family Therapy 
580       Forest Resources and Environmental Conservation 
685                                     Human Development 
1851                                    Human Development 
2079                                             Forestry 
2274                                           Psychology 
2484                                    Human Development 
2589                                    Human Development 
2629                                    Human Development 
2727                                            Marketing 
2954                               Landscape Architecture 
3061                                    Human Development 
3069                                    Human Development 
3384                                    Human Development 
4074                                    Human Development 
4109                                    Human Developmen

In [26]:
# Departments of the documents that k-means assigned to label 6.
doc_l = kmeans_get_docs_with_idx(km_model, 6)

# Row mask and column picked in a single .loc lookup.
dframe.loc[dframe['identifier-uri'].isin(doc_l), 'contributor-department']

374                                               Biology 
564       Forest Resources and Environmental Conservation 
670                  Crop and Soil Environmental Sciences 
919       Forest Resources and Environmental Conservation 
924                        Biological Systems Engineering 
1416                            Environmental Engineering 
1653                                         Horticulture 
1931                 Crop and Soil Environmental Sciences 
1970                 Crop and Soil Environmental Sciences 
2210                 Crop and Soil Environmental Sciences 
2246                 Crop and Soil Environmental Sciences 
3133                                  Geological Sciences 
3382                                         Horticulture 
3618                                           Entomology 
4207                                             Forestry 
4364                                             Forestry 
4546                 Crop and Soil Environmental Science