# Visualizing and analyzing results

In [1]:
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import os
import string
import numpy as np

import joblib

[nltk_data] Downloading package punkt to /home/pkmandke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pkmandke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Custom defined imports

from agglo_clus import Agglo_clus as ag
from pre_process import Doc2vec_wrapper, extract_mapped_doc2vecs

# Some helper functions

In [3]:
def setify(depts):
    """Return the distinct items of ``depts`` in order of first appearance.

    Hashable items are tracked in a set, so the common case (a column of
    strings) runs in linear time instead of rescanning the result list for
    every element. Unhashable items (e.g. lists) fall back to a linear scan,
    so the result is the same as a plain "append if not already present" loop.

    Parameters
    ----------
    depts : iterable
        Values to de-duplicate; iterated exactly once.

    Returns
    -------
    list
        The first occurrence of every distinct value, in input order.
    """
    seen = set()           # hashable values already kept
    unhashable_kept = []   # unhashable values already kept (normally empty)
    unq = []
    for st in depts:
        try:
            hash(st)
        except TypeError:
            # Unhashable value: compare against everything kept so far.
            if st in unq:
                continue
            unhashable_kept.append(st)
        else:
            # A hashable value may still equal an unhashable one kept earlier
            # (e.g. frozenset vs. set), hence the second membership check.
            if st in seen or st in unhashable_kept:
                continue
            seen.add(st)
        unq.append(st)
    return unq

def get_unique_field(df, field):
    """Return the distinct values stored under ``df[field]``.

    Parameters
    ----------
    df : object indexable by column key (e.g. a pandas DataFrame)
    field : key of the column to inspect

    Returns
    -------
    list
        Whatever ``setify`` produces for that column: its distinct values in
        order of first appearance.
    """
    column_values = df[field]
    return setify(column_values)

def find_docs_in_cluster(df, idx, clust):
    """Select the rows of ``df`` that describe the documents at positions ``idx``.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata frame with an 'identifier-uri' column.
    idx : iterable
        Positions used to index ``clust.doc_names``.
    clust : object
        Clustering object exposing an indexable ``doc_names`` attribute.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose 'identifier-uri' is one of the looked-up
        document names, in the frame's own row order.
    """
    wanted_uris = [clust.doc_names[position] for position in idx]
    row_mask = df['identifier-uri'].isin(wanted_uris)
    return df.loc[row_mask]

def get_index_of_docs(clust_object, clust_num=8):
    """Return the positions in ``clust_object.predictions`` equal to ``clust_num``.

    Parameters
    ----------
    clust_object : object
        Clustering object exposing an iterable ``predictions`` attribute.
    clust_num : cluster label to look for (default 8)

    Returns
    -------
    list of int
        Zero-based positions, in ascending order, whose prediction matches.
    """
    positions = []
    for position, label in enumerate(clust_object.predictions):
        if label == clust_num:
            positions.append(position)
    return positions
    
def read_docvec_model(path):
    """Load a saved gensim Doc2Vec model.

    Parameters
    ----------
    path : location of the saved model, passed straight to ``Doc2Vec.load``

    Returns
    -------
    The object returned by ``gensim.models.doc2vec.Doc2Vec.load``.
    """
    doc2vec_cls = gensim.models.doc2vec.Doc2Vec
    return doc2vec_cls.load(path)

def get_docs_per_cluster(model, num_classes):
    """Count how many predictions fall into each cluster ``0 .. num_classes - 1``.

    The predictions are coerced to a NumPy array first, so a plain list works
    as well as an ndarray (boolean-mask indexing alone would fail on a list).

    Parameters
    ----------
    model : object
        Clustering object exposing a ``predictions`` attribute
        (array-like of cluster labels).
    num_classes : int
        Number of cluster labels to count, starting from 0.

    Returns
    -------
    list of int
        ``result[i]`` is the number of predictions equal to ``i``.
    """
    labels = np.asarray(model.predictions)
    return [int(np.count_nonzero(labels == cluster_id)) for cluster_id in range(num_classes)]


# Compute hierarchical clustering

In [4]:
# Paths and clustering configuration

DOCVEC_MODEL = '../obj/doc2vec/abstracts_etd_doc2vec_all_docs30961_docs'  # saved Doc2Vec model read by read_docvec_model
NUM_CLUS = 500  # cluster count: passed to the clustering model and used when counting docs per cluster

In [5]:
# Load the saved Doc2Vec model and extract the document vectors together with their keys
doc_vectors, keys = extract_mapped_doc2vecs(read_docvec_model(DOCVEC_MODEL))

# Build the project's agglomerative-clustering wrapper (Ward linkage, Euclidean affinity).
# NOTE(review): iter_ is passed as the string '1'; presumably a run/iteration tag — confirm against agglo_clus.
ag_model = ag(doc_vectors, doc_names = keys, num_clus = NUM_CLUS, linkage='ward', affinity='euclidean', iter_='1')

# Run the clustering; the wrapper prints its own progress and timing
ag_model.clusterize()

Starting clustering...
Done training in 0:00:28.063638s


In [8]:
# Persist the clustering object.
# NOTE(review): only a bare filename is given here, while the load cell reads
# '../obj/agglo_clus/iter_1/ag_clus_full_data_trial.sav'; presumably save()
# prepends that directory — confirm in agglo_clus.
ag_model.save('ag_clus_full_data_trial.sav')

# Analyze results

In [5]:
# Load a saved clustering object from disk (replaces any in-memory ag_model).
# joblib.load unpickles the file, so only load files from a trusted source.
ag_model = joblib.load('../obj/agglo_clus/iter_1/ag_clus_full_data_trial.sav')

In [6]:
# Read the line-delimited JSON metadata and keep only the columns used in the analysis below.
# `orient` must be one of pandas' format names; the original passed the builtin type `str`,
# which is not a valid orient value. With lines=True each line is one record.
dframe = pd.read_json('../data/30Kmetadata.json', orient='records', lines=True)[
    ['identifier-uri', 'contributor-department', 'searchTitle']
]

# Raise the row display limit so per-cluster frames are shown in full.
# NOTE: with this setting, displaying a large frame prints every row — prefer .head() for big frames.
pd.options.display.max_rows = 40000

In [7]:
max(get_docs_per_cluster(ag_model, num_classes=NUM_CLUS))  # Size of the largest cluster (max of the per-cluster document counts)

911

In [8]:
# Inspect cluster 10: which departments do its documents come from, and how many documents are there?
idx = get_index_of_docs(ag_model, clust_num=10) # Positions in ag_model.predictions labelled with this cluster

doc_urls = find_docs_in_cluster(dframe, idx, ag_model) # Metadata rows of the documents at those positions

# Distinct departments in the cluster, in order of first appearance
unique_field = get_unique_field(doc_urls, 'contributor-department')
# Displayed as a (departments, number of metadata rows) tuple
unique_field, len(doc_urls)

(['Electrical and Computer Engineering ', 'Electrical Engineering '], 32)

In [9]:
# Inspect cluster 20: which departments do its documents come from, and how many documents are there?
idx = get_index_of_docs(ag_model, clust_num=20) # Positions in ag_model.predictions labelled with this cluster

doc_urls = find_docs_in_cluster(dframe, idx, ag_model) # Metadata rows of the documents at those positions

# Distinct departments in the cluster, in order of first appearance
unique_field = get_unique_field(doc_urls, 'contributor-department')
# Displayed as a (departments, number of metadata rows) tuple
unique_field, len(doc_urls)

(['Mechanical Engineering ',
  'Civil Engineering ',
  'Engineering Science and Mechanics ',
  'Aerospace and Ocean Engineering ',
  'Aerospace Engineering ',
  'Engineering Mechanics ',
  'Civil and Environmental Engineering '],
 82)

In [10]:
# Show the metadata rows currently held in doc_urls (reassigned by each cluster-inspection cell)
doc_urls

Unnamed: 0,identifier-uri,contributor-department,searchTitle
565,http://hdl.handle.net/10919/19257,Mechanical Engineering,aerodynamics of endwall contouring with discre...
747,http://hdl.handle.net/10919/23272,Mechanical Engineering,effect of blitype inlet distortion on turbofan...
879,http://hdl.handle.net/10919/24827,Mechanical Engineering,experimental study of the heat transfer on a s...
901,http://hdl.handle.net/10919/25288,Mechanical Engineering,the influence of development and fanscreen int...
991,http://hdl.handle.net/10919/30847,Mechanical Engineering,flow control optimization for improvement of f...
998,http://hdl.handle.net/10919/30854,Mechanical Engineering,measurements and predictions of heat transfer ...
1128,http://hdl.handle.net/10919/30992,Mechanical Engineering,flow field computations of combustorturbine in...
1147,http://hdl.handle.net/10919/31011,Mechanical Engineering,effects of flow control on the aerodynamics of...
1411,http://hdl.handle.net/10919/31297,Mechanical Engineering,aerodynamic performance of a flow controlled c...
1494,http://hdl.handle.net/10919/31385,Mechanical Engineering,noise reduction in an axisymmetric supersonic ...
