In [None]:
####################################################
####################################################
# coding: utf-8       
# Copyright 2020 IBM All Rights Reserved.   
#   
# Licensed under the Apache License, Version 2.0 (the "License");   
# you may not use this file except in compliance with the License.   
# You may obtain a copy of the License at   
#   
# http://www.apache.org/licenses/LICENSE-2.0   
#   
# Unless required by applicable law or agreed to in writing, software   
# distributed under the License is distributed on an "AS IS" BASIS,   
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   
# See the License for the specific language governing permissions and   
# limitations under the License.
####################################################
####################################################
#
# The data used by this notebook has been generated from various sources including content from the
# COVID-19 Open Research Dataset (CORD-19)  (https://pages.semanticscholar.org/coronavirus-research)
#
####################################################
#
# This notebook requires the WordCloud package. If it is not installed, install it first
# (for example with: pip install wordcloud). For background on generating word clouds, see:
#           https://www.geeksforgeeks.org/generating-word-cloud-python/
#
####################################################
import sys
import os
from os import walk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import json

In [None]:
def word_cloud(words_for_cloud):
    """Build a word cloud from a block of text and display it.

    words_for_cloud -- one string holding the space-delimited words to plot.

    The image is 800x800 pixels on a white background with a minimum font
    size of 10. No custom stopword list is passed to WordCloud.
    """
    cloud_builder = WordCloud(
        width=800,
        height=800,
        background_color='white',
        min_font_size=10,
    )
    cloud_image = cloud_builder.generate(words_for_cloud)

    # Show the image on a square figure with the axes hidden and no padding.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
def initialize():
    """Set the notebook's global paths and load the master index.

    Two module-level globals are assigned:
      raw_files_path -- root directory that holds the raw data files
      master_index   -- DataFrame read from the master index CSV, restricted to
                        the docId, name and preferredName columns, with docId
                        read as a string
    """
    global raw_files_path, master_index

    # !!!!!!!!!!!!!!!!!!!!!
    # TO DO:
    # !!!!!!!!!!!!!!!!!!!!!
    #
    # Point the two values below at your own csv file and raw data files folder.
    # Example:
    # csv_path = "/Users/myname/folder1/folder2/xxxxxx.csv"
    # raw_files_path = "/Users/myname/raw_files_place"

    raw_files_path = "<.......your raw files root directory here.......>"
    csv_path = "<.......your csv file path here.......>"

    wanted_columns = ["docId", "name", "preferredName"]
    # Tip: add nrows=<n> to this call to load only the first n rows of a very large index.
    master_index = pd.read_csv(
        csv_path,
        usecols=wanted_columns,
        dtype={"docId": "str"},
    )
    

In [None]:
def get_first_data_file(dirname):
    """Return the path of the first ``.json`` file found under ``dirname``.

    The folder tree is traversed with os.walk and the first match in traversal
    order is returned. Callers use it as a sample file from which to list the
    available data element names.

    Returns the literal string "no_datafile_found" when the tree holds no
    ``.json`` file.
    """
    json_paths = (
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in walk(dirname)
        for filename in filenames
        if filename.endswith(".json")
    )
    # The generator is lazy, so the walk stops as soon as one match is produced.
    return next(json_paths, "no_datafile_found")


In [None]:
def list_data_types():
    """Print the data types available in the raw data files.

    An arbitrary raw json file (the first one found under the global
    raw_files_path) is used as a sample. For every key of
    result -> unstructured[0] -> data in that file, the key name and the
    Python type of its value are printed.

    If no ``.json`` file exists under raw_files_path, a message is printed and
    the function returns without attempting to read anything.
    """
    print("=============================")
    print(" ACD raw data - data types")
    print("=============================")
    targetJsonFile = get_first_data_file(raw_files_path)
    if targetJsonFile == "no_datafile_found":
        # get_first_data_file returns this sentinel instead of a path; passing it
        # on to pd.read_json would only produce a confusing parse/IO error.
        print("No .json data file was found under", raw_files_path)
        return
    # read in json file as a dataframe
    json_dataframe = pd.read_json(targetJsonFile)
    xresult = json_dataframe.get(key="result")
    xunstruc = xresult.get(key="unstructured")
    xdata = xunstruc[0]["data"]
    for i in xdata:
        print(f'{i:30}', type(xdata[i]))

In [None]:
def list_data_type_fields(data_type):
    """Print the field names (and value types) of one data type.

    data_type -- a key of result -> unstructured[0] -> data, e.g. one of the
                 names printed by list_data_types().

    An arbitrary raw json file (the first one found under the global
    raw_files_path) is used as a sample, and the fields of the FIRST entry of
    the requested data type in that file are listed.

    Instead of raising, a message is printed and the function returns when:
      * no ``.json`` file exists under raw_files_path,
      * the sample file has no such data type, or
      * the data type is present but has no entries.
    """
    print("=============================")
    print(" ACD raw data - ",data_type,"field names")
    print("=============================")
    targetJsonFile = get_first_data_file(raw_files_path)
    if targetJsonFile == "no_datafile_found":
        # Sentinel returned by get_first_data_file when nothing was found.
        print("No .json data file was found under", raw_files_path)
        return
    # read in json file as a dataframe
    json_dataframe = pd.read_json(targetJsonFile)
    xresult = json_dataframe.get(key="result")
    xunstruc = xresult.get(key="unstructured")
    xdata = xunstruc[0]["data"]
    if data_type not in xdata:
        print("Data type", data_type, "is not present in", targetJsonFile)
        return
    this_data_type = xdata[data_type]
    if not this_data_type:
        print("Data type", data_type, "has no entries in", targetJsonFile)
        return
    tdtzero = this_data_type[0]
    for i in tdtzero:
        print(f'{i:30}', type(tdtzero[i]))

In [None]:
def get_top_names(topnamedepth):
    """Return the most frequent values of the master index "name" column.

    topnamedepth -- maximum number of names to return.

    Names are ranked by how many rows of the global master_index carry them,
    most frequent first. When fewer distinct names exist than requested, all
    of them are returned. A header banner is printed as a side effect.
    """
    print("\n\n=============================")
    print("Top attributeValue Names in Ranked Order of Occurrence")
    print("=============================")
    ranked_names = master_index["name"].value_counts().index
    limit = min(topnamedepth, ranked_names.size)
    return [ranked_names[position] for position in range(limit)]


In [None]:
def get_top_name_selection(top_name_list):
    """Show a numbered menu of names and return the user's choice.

    top_name_list -- the names to offer; they are listed as entries
                     1..len(top_name_list), with entry 0 meaning "Exit".

    Returns the zero-based index into top_name_list of the chosen name, or -1
    when the user chose "Exit".

    The prompt is repeated until the user enters a whole number within the
    menu range; non-numeric input no longer raises ValueError.
    """
    list_size = len(top_name_list)
    print("\n\n")
    print(0, "Exit")
    for ct, x in enumerate(top_name_list, start=1):
        print(ct, x)
    while True:
        top_name_index = input("\nEnter number of desired name: ")
        try:
            top_name_index_int = int(top_name_index)
        except ValueError:
            # Anything that is not an integer is treated like an out-of-range
            # number: tell the user and ask again.
            print("Please enter a whole number between 0 and", list_size)
            continue
        if 0 <= top_name_index_int <= list_size:
            return top_name_index_int - 1              # allow for zero-based index

In [None]:
def get_top_preferred_names(topprefnamedepth,df_top_name):
    """Return the most frequent "preferredName" values of a DataFrame.

    topprefnamedepth -- maximum number of preferred names to return.
    df_top_name      -- DataFrame with a "preferredName" column (the caller
                        passes the master index rows for one selected name).

    Preferred names are ranked by how many rows carry them, most frequent
    first. When fewer distinct values exist than requested, all of them are
    returned. A header banner is printed as a side effect.
    """
    print("\n\n=============================")
    print("Top attributeValue Preferred Names in Ranked Order of Occurrence")
    print("=============================")
    ranked_preferred = df_top_name["preferredName"].value_counts().index
    how_many = topprefnamedepth
    if ranked_preferred.size < how_many:
        how_many = ranked_preferred.size
    return [ranked_preferred[rank] for rank in range(how_many)]


In [None]:
def get_top_preferred_name_selection(top_preferred_name_list):
    """Show a numbered menu of preferred names and return the user's choice.

    top_preferred_name_list -- the preferred names to offer; they are listed as
                               entries 1..len(list), with entry 0 meaning "Exit".

    Returns the zero-based index into top_preferred_name_list of the chosen
    entry, or -1 when the user chose "Exit".

    The prompt is repeated until the user enters a whole number within the
    menu range; non-numeric input no longer raises ValueError.
    """
    list_size = len(top_preferred_name_list)
    print("\n\n")
    print(0, "Exit")
    for ct, x in enumerate(top_preferred_name_list, start=1):
        print(ct, x)
    top_preferred_name_index_int = -1
    while top_preferred_name_index_int < 0 or top_preferred_name_index_int > list_size:
        top_preferred_name_index = input("\nEnter number of desired name: ")
        try:
            top_preferred_name_index_int = int(top_preferred_name_index)
        except ValueError:
            # Not an integer: keep the sentinel so the loop prompts again.
            print("Please enter a whole number between 0 and", list_size)
            top_preferred_name_index_int = -1
    return top_preferred_name_index_int - 1              # allow for zero-based index

In [None]:
def get_document_count():
    """Print the number of distinct docId values in the global master_index."""
    occurrences_per_doc = master_index["docId"].value_counts()
    print(len(occurrences_per_doc))

In [None]:
#################################
# Run this cell first to perform all initialization: it sets the data
# paths and loads the master index CSV into the global master_index.
#################################
initialize()

In [None]:
#################################
# Run this cell to print the number of distinct documents (docId values)
# defined in the master index file
#################################
get_document_count()

In [None]:
#################################
# Run this cell to list out the data types that are available in the raw files
# (taken from the first raw json file found)
#################################
list_data_types()
    

In [None]:
#################################
# Run this cell to list the field names and types for a
# given data type (one of those listed by list_data_types())
#################################
list_data_type_fields("attributeValues")    

In [None]:
    ################################################
    ## main module
    ##
    ## Interactive drill-down: the user picks a name, then a preferred name,
    ## then how many documents to analyze. The coveredText of every
    ## attributeValues entry in the matching raw json files is collected and
    ## shown as a word cloud. Choosing "Exit" (0) in a menu goes back one level.
    ################################################
    list_depth=20
    max_doc_count=5000      # hard upper bound on documents processed per selection

    #############################
    # build the list of raw json files once; the same list is reused for
    # every selection instead of re-walking the folder tree each time
    #############################
    flist=[]
    for (pth, _subdirs, fn) in walk(raw_files_path):
        for fnn in fn:
            if fnn.endswith(".json"):
                flist.append(os.path.join(pth,fnn))

    #############################
    # load top names from the ACD Enrichment Result CSV
    #############################
    top_name_list=get_top_names(list_depth)
    top_name_index=get_top_name_selection(top_name_list)
    while top_name_index > -1:
        df_top_name = master_index.loc[master_index['name']==top_name_list[top_name_index]]
        #############################
        # load top preferred names from the ACD Enrichment Result CSV
        #############################
        top_preferred_name_list=get_top_preferred_names(list_depth,df_top_name)
        top_preferred_name_index=get_top_preferred_name_selection(top_preferred_name_list)
        while top_preferred_name_index > -1:
            df_top_pref_name = df_top_name.loc[df_top_name['preferredName']==top_preferred_name_list[top_preferred_name_index]]
            #############################
            # get top documents for preferred names
            # (docList is indexed by docId, most occurrences first)
            #############################
            docList=df_top_pref_name["docId"].value_counts()
            print("\n========================================================\n")
            print(docList.size,"documents were found matching your selection of",top_name_list[top_name_index],"and",top_preferred_name_list[top_preferred_name_index])
            print("\nHow many documents do you want to include in your analysis?")
            print("Note: Documents will be included in descending order of occurrences per document of your selection.")
            print("It is recommended that you choose 500 documents or less, unless you want to wait a long time.")
            # keep asking until the answer is a whole number; a non-numeric
            # answer used to raise ValueError and end the session
            doc_count=None
            while doc_count is None:
                try:
                    doc_count=int(input())
                except ValueError:
                    print("Please enter a whole number.")
            # clamp to [0, min(max_doc_count, documents available)]; a negative
            # answer is treated as 0 rather than silently processing everything
            doc_count=max(0,min(doc_count,max_doc_count,docList.size))
            print("Will process",doc_count,"files.")
            if doc_count == 0:
                break
            current_doc_count=0
            words=[]
            found_atleast_one_doc=False
            for doc_id in docList.index:
                #sometimes doc_id can be all numbers, so let's make sure it's a string type
                doc_id_str=str(doc_id)+"_body"
                found_doc=False
                for fname in flist:
                    # substring match against the full path, as before
                    if doc_id_str not in fname:
                        continue
                    found_doc=True
                    found_atleast_one_doc=True
                    # read in json file as a dataframe
                    json_dataframe = pd.read_json(fname)
                    xresult=json_dataframe.get(key="result")
                    xunstruc=xresult.get(key="unstructured")
                    # files without a non-empty "unstructured" list still count
                    # as found, they just contribute no words
                    if isinstance(xunstruc, list) and xunstruc:
                        xzero=xunstruc[0]
                        if "data" in xzero and "attributeValues" in xzero["data"]:
                            for oneattrv in xzero["data"]["attributeValues"]:
                                if "coveredText" in oneattrv:
                                    # join multi-word text with "_" so it stays one token in the cloud
                                    covt=oneattrv["coveredText"].replace(" ","_")
                                    if covt:
                                        words.append(covt)
                    # only the first matching file is used for each document
                    break
                if found_doc:
                    current_doc_count += 1
                if current_doc_count == doc_count:
                    break
            if words:
                wordlist=" ".join(words)
            elif found_atleast_one_doc:
                wordlist="no_words"
            else:
                wordlist="no_matching_documents"
            word_cloud(wordlist)
            top_preferred_name_index=get_top_preferred_name_selection(top_preferred_name_list)
        top_name_index=get_top_name_selection(top_name_list)
