# **Amazon - Energy Efficient Communication Systems ENGINE Project**

Bibliographic Network Visualization by Kevin Um, following [this guide](https://medium.com/mbf-data-science/bibliographic-network-visualization-for-academic-literature-reviews-e1cd2c357526)

This notebook contains all citation data exported from Web of Science. 
> Each excel file contains a list of nodes, with each node a paper either in the corpus or cited by the corpus, and edges indicating where one paper cites another.



## Loading Data

In [58]:
! gdown --id 1g7njLEyFn5h49_jU-H8zsq3Tw6SZ3D4D

Downloading...
From: https://drive.google.com/uc?id=1g7njLEyFn5h49_jU-H8zsq3Tw6SZ3D4D
To: /content/for_visualization.zip
100% 25.0k/25.0k [00:00<00:00, 33.0MB/s]


In [59]:
! mkdir for_visualization
! unzip for_visualization.zip -d ./for_visualization

mkdir: cannot create directory ‘for_visualization’: File exists
Archive:  for_visualization.zip
replace ./for_visualization/savedrecs (1).xls? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ./for_visualization/savedrecs.xls? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [60]:
!pip3 install metaknowledge
!pip3 install --upgrade xlrd



In [61]:
# To save the best model and get data files.
import os
import glob
import copy

import metaknowledge as mk
import pandas as pd

In [62]:
# Load the csv files over here into pandas data frame
# YOUR CODE HERE
# 0.1
def load_files(xls):
  # Input: xls_file_name
  # Output: pandas data frame
  pandas_xls = pd.read_excel(xls)
  return pandas_xls

def load_all_files(directory):
  xls_files = glob.glob(directory + '*.xls')
  raw_data = {}

  for xls_file in xls_files:
    data_id = os.path.splitext(os.path.basename(xls_file))[0]
    
    xls_data = load_files(directory + data_id + '.xls')

    raw_data[data_id] = {'data': xls_data}

  return raw_data

In [63]:
raw_data = load_all_files('/content/for_visualization/')

In [64]:
def get_usable_subjects(raw_data):
  usable_subjects = []
  keys = sorted(raw_data.keys())

  for subject in keys:
    usable_subjects.append(subject)

  return usable_subjects

def preprocess(raw_data):

  usable_subjects = []
  usable_subjects = get_usable_subjects(raw_data)

  count = 0
  columns = [raw_data['savedrecs (1)']['data'].columns]
  sources = pd.DataFrame(columns=columns)

  for subject in usable_subjects:
    print('\tProcessing subject', count+1, ':', subject, '...')
    sources = sources.append(pd.DataFrame(raw_data[subject]['data'].values, columns=columns), ignore_index = True)
    count += 1
 
  return sources

In [65]:
sources = preprocess(raw_data)

	Processing subject 1 : savedrecs ...
	Processing subject 2 : savedrecs (1) ...


In [66]:
sources

Unnamed: 0,Publication Type,Authors,Book Authors,Group Authors,Book Group Authors,Researcher Ids,ORCIDs,Book Editors,Author - Arabic,Article Title,...,"Times Cited, SCIELO","Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,ISSN,eISSN,ISBN,UT (Unique ID),Pubmed Id,Unnamed: 57
0,J,"Kim, Hak-Jin; Kim, Marie S.; Han, Seung-Jae",,,,,,,,Collision-free optimal packet scheduling algor...,...,0,0,,,1389-1286,1872-7069,,WOS:000773705300005,,
1,J,"Cena, Gianluca; Scanzio, Stefano; Valenzano, A...",,,,"Cena, Gianluca/C-3904-2015","Cena, Gianluca/0000-0003-0084-5321",,,Ultra-Low Power Wireless Sensor Networks Based...,...,0,1,,,,2079-9292,,WOS:000754400600001,,
2,J,"Urke, Andreas Ramstad; Kure, Oivind; ovsthus, ...",,,,,"Ovsthus, Knut/0000-0001-6849-0913; Urke, Andre...",,,A Survey of 802.15.4 TSCH Schedulers for a Sta...,...,0,0,,,,1424-8220,,WOS:000751079200001,35009558.0,
3,J,"Javan, Nastooh Taheri; Sabaei, Masoud; Hakami,...",,,,"Hakami, Vesal/T-1426-2018","Hakami, Vesal/0000-0002-0798-3981; Taheri Java...",,,Adaptive Channel Hopping for IEEE 802.15.4 TSC...,...,0,1,,,1530-437X,1558-1748,,WOS:000709128900154,,
4,J,"Farahmandand, Mohammad; Nabi, Majid",,,,,"Farahmand, Mohammad/0000-0001-9529-0802",,,Channel Quality Prediction for TSCH Blacklisti...,...,0,1,,,1530-437X,1558-1748,,WOS:000703056000146,,
5,J,"Daneels, Glenn; Delgado, Carmen; Elsas, Robbe;...",,,,"Famaey, Jeroen/AAB-6171-2022; Pinillos, Carmen...","Famaey, Jeroen/0000-0002-3587-1354; Pinillos, ...",,,Slot Bonding for Adaptive Modulations in IEEE ...,...,0,1,,,2327-4662,,,WOS:000665207100039,,
6,J,"Solimini, Domenico; Tuset-Peiro, Pere; Boquet,...",,,,"Vázquez-Gallego, Francisco/AAS-9955-2021; Gome...","Vilajosana, Xavi/0000-0002-3020-427X; Boquet P...",,,Towards Reliable IEEE 802.15.4g SUN with Re-tr...,...,0,0,,,1939-8018,1939-8115,,WOS:000650522400001,,
7,J,"Amezcua Valdovinos, Ismael; Figueroa Millan, P...",,,,,"Vargas-Rosales, Cesar/0000-0003-1770-471X; Ame...",,,Distributed Channel Ranking Scheduling Functio...,...,0,2,,,,1424-8220,,WOS:000628665700001,33668770.0,
8,J,"Scanzio, Stefano; Wisniewski, Lukasz; Gaj, Piotr",,,,"Wisniewski, Lukasz/AAE-4436-2022; Scanzio, Ste...","Wisniewski, Lukasz/0000-0001-6537-4511; Scanzi...",,,Heterogeneous and dependable networks in indus...,...,0,9,,,0166-3615,1872-6194,,WOS:000615986900003,,
9,J,"Queiroz, Diego V.; Gomes, Ruan D.; Fonseca, Ig...",,,,"Gomes, R./F-5212-2013; de Alencar, Marcelo Sam...","de Alencar, Marcelo Sampaio/0000-0002-2849-164...",,,Channel assignment in TSCH-based wireless sens...,...,0,1,,,1868-5137,1868-5145,,WOS:000604526300014,,


In [67]:
# removes columns with 30% of NaN values
sources.dropna(axis=1, thresh=0.7*len(sources), inplace=True)

In [None]:
sources

Unnamed: 0,Publication Type,Authors,Researcher Ids,ORCIDs,Article Title,Source Title,Volume,DOI,Publication Date,Publication Year,Abstract,"Times Cited, WoS Core","Times Cited, CSCD","Times Cited, RSCI","Times Cited, ARCI","Times Cited, BCI","Times Cited, SCIELO","Times Cited, All Databases",ISSN,UT (Unique ID)
0,J,"Kim, Hak-Jin; Kim, Marie S.; Han, Seung-Jae",,,Collision-free optimal packet scheduling algor...,COMPUTER NETWORKS,206.0,10.1016/j.comnet.2022.108816,APR 7 2022,2022,One of the key challenges for the IoT (Interne...,0,0,0,0,0,0,0,1389-1286,WOS:000773705300005
1,J,"Cena, Gianluca; Scanzio, Stefano; Valenzano, A...","Cena, Gianluca/C-3904-2015","Cena, Gianluca/0000-0003-0084-5321",Ultra-Low Power Wireless Sensor Networks Based...,ELECTRONICS,11.0,10.3390/electronics11030304,FEB 2022,2022,Devices in wireless sensor networks are typica...,1,0,0,0,0,0,1,,WOS:000754400600001
2,J,"Urke, Andreas Ramstad; Kure, Oivind; ovsthus, ...",,"Ovsthus, Knut/0000-0001-6849-0913; Urke, Andre...",A Survey of 802.15.4 TSCH Schedulers for a Sta...,SENSORS,22.0,10.3390/s22010015,JAN 2022,2022,Concepts such as Industry 4.0 and Cyber-Physic...,0,0,0,0,0,0,0,,WOS:000751079200001
3,J,"Javan, Nastooh Taheri; Sabaei, Masoud; Hakami,...","Hakami, Vesal/T-1426-2018","Hakami, Vesal/0000-0002-0798-3981; Taheri Java...",Adaptive Channel Hopping for IEEE 802.15.4 TSC...,IEEE SENSORS JOURNAL,21.0,10.1109/JSEN.2021.3110720,OCT 15 2021,2021,In IEEE 802.15.4 standard for low-power low-ra...,1,0,0,0,0,0,1,1530-437X,WOS:000709128900154
4,J,"Farahmandand, Mohammad; Nabi, Majid",,"Farahmand, Mohammad/0000-0001-9529-0802",Channel Quality Prediction for TSCH Blacklisti...,IEEE SENSORS JOURNAL,21.0,10.1109/JSEN.2021.3093424,SEP 15 2021,2021,Cross-Technology Interference (CTI) is a sever...,1,0,0,0,0,0,1,1530-437X,WOS:000703056000146
5,J,"Daneels, Glenn; Delgado, Carmen; Elsas, Robbe;...","Famaey, Jeroen/AAB-6171-2022; Pinillos, Carmen...","Famaey, Jeroen/0000-0002-3587-1354; Pinillos, ...",Slot Bonding for Adaptive Modulations in IEEE ...,IEEE INTERNET OF THINGS JOURNAL,8.0,10.1109/JIOT.2021.3050274,JUL 1 2021,2021,The numerous applications of industrial automa...,1,0,0,0,0,0,1,2327-4662,WOS:000665207100039
6,J,"Solimini, Domenico; Tuset-Peiro, Pere; Boquet,...","Vázquez-Gallego, Francisco/AAS-9955-2021; Gome...","Vilajosana, Xavi/0000-0002-3020-427X; Boquet P...",Towards Reliable IEEE 802.15.4g SUN with Re-tr...,JOURNAL OF SIGNAL PROCESSING SYSTEMS FOR SIGNA...,93.0,10.1007/s11265-021-01665-z,SEP 2021,2021,"In this paper, we propose and evaluate two mec...",0,0,0,0,0,0,0,1939-8018,WOS:000650522400001
7,J,"Amezcua Valdovinos, Ismael; Figueroa Millan, P...",,"Vargas-Rosales, Cesar/0000-0003-1770-471X; Ame...",Distributed Channel Ranking Scheduling Functio...,SENSORS,21.0,10.3390/s21051593,MAR 2021,2021,The Industrial Internet of Things (IIoT) is co...,2,0,0,0,0,0,2,,WOS:000628665700001
8,J,"Scanzio, Stefano; Wisniewski, Lukasz; Gaj, Piotr","Wisniewski, Lukasz/AAE-4436-2022; Scanzio, Ste...","Wisniewski, Lukasz/0000-0001-6537-4511; Scanzi...",Heterogeneous and dependable networks in indus...,COMPUTERS IN INDUSTRY,125.0,10.1016/j.compind.2020.103388,FEB 2021,2021,The real and effective ground of all new conce...,9,0,0,0,0,0,9,0166-3615,WOS:000615986900003
9,J,"Queiroz, Diego V.; Gomes, Ruan D.; Fonseca, Ig...","Gomes, R./F-5212-2013; de Alencar, Marcelo Sam...","de Alencar, Marcelo Sampaio/0000-0002-2849-164...",Channel assignment in TSCH-based wireless sens...,JOURNAL OF AMBIENT INTELLIGENCE AND HUMANIZED ...,,10.1007/s12652-020-02741-1,,2021,"Recent advances in wireless sensor networks, e...",1,0,0,0,0,0,1,1868-5137,WOS:000604526300014


## Visualizing Data (following tutorial)

In [None]:
#Bibliographic Record Visualization Version 0.1
#writtern by Michael Burnam-Fink
#2018
#Licensed under CC BY-NC 4.0

#INSTRUCTIONS:
#Save to a directory containing WoS records and run

#this scipt processes Web of Science record collections to produce a bibliographic network for visualization with Kumu.io
#for full details, see my website

title = str(input('Input the name of your project: '))

RC = mk.RecordCollection(".")

Nodes = pd.DataFrame()

NodesDict = {}
EdgeList = []

#this loop ensures that papers in your corpus are properly identified in the network
#node format is (UID, Source or Cite, text to be displayed, a link to a google search, and the abstract)
#UID is a unique identifier, DOI if it exists, or string similar to a cited record if not
for R in RC :
    author = R['AU'][0]
    pubyear = str(R['PY'])
    pub = R['J9']
    try:
        volume = 'V'+str(R['VL'])
    except:
        volume =''
    try:
        page = 'P'+str(R['BP'])
    except:
        page =''
    try:
        uid = 'DOI '+str(R['DI'])
    except:
        uid='UID ' + ', '.join([author.replace(',',''),pubyear,pub])
    sourcedisplay = ', '.join([author,pubyear,R['TI']])
    searchlink = 'https://www.google.com/search?q='+str(uid)
    try :
        abstract = R['AB']
    except :
        abstract = 'No Abstract Found'
    NodesDict[uid] = (uid, 'Source', sourcedisplay, searchlink, abstract)


#this loop generates the edges, and adds nodes for cited references
#cited reference format is similar to source node format
for R in RC :
    author = R['AU'][0]
    pubyear = str(R['PY'])
    pub = R['J9']
    try:
        volume = 'V'+str(R['VL'])
    except:
        volume =''
    try:
        page = 'P'+str(R['BP'])
    except:
        page =''
    try:
        uid = 'DOI '+str(R['DI'])
    except:
        uid = 'UID ' + ', '.join([author.replace(',', ''), pubyear, pub])
    for cite in R['CR'] :
        cite = str(cite)
        citelist = cite.split(', ')
        try :
            citeid = citelist[5]
            search = citeid
        except :
            citeid = ', '.join(citelist[0:3])
            search = '+'.join(citelist[0:3])
        citedisplay = ', '.join(citelist[0:3])

        searchlink = 'https://www.google.com/search?q=' + search
        EdgeList = EdgeList + [(uid,citeid)]
        if citeid not in NodesDict:
            NodesDict[citeid] = (citeid, 'Cite', citedisplay, searchlink, '')

NodesList =[]
for node in NodesDict:
    NodesList = NodesList + [NodesDict[node]]


Nodes = pd.DataFrame.from_records(NodesList, columns = ['ID','Type','Label', 'Search','Abstract'], index='ID')
Edges = pd.DataFrame.from_records(EdgeList, columns=['From','To'])

#calculates indegree, since this code is faster than Kumu.io
Nodes['indegree'] = 0
ToEdges = {}
for row in Edges.itertuples():
    if row[2] in ToEdges :
        ToEdges[row[2]] =ToEdges[row[2]] + 1
    else :
        ToEdges[row[2]] = 1

for row in Nodes.itertuples():
    target = row[0]
    try :
        count = ToEdges[target]
    except:
        count = 0
    Nodes.at[target,'indegree'] = count

#the next few lines create a CoCitation Network
Nodes['Keep']=0

for row in Nodes.itertuples():
    if row[1]== 'Source' :
        Nodes.at[row[0],'Keep'] = 1
    if row[5] > 1 :
        Nodes.at[row[0], 'Keep'] = 1

CoNodes = Nodes[Nodes.Keep == 1]
Edges['Keep']=Edges['From'].isin(CoNodes.index.values)*1 + Edges['To'].isin(CoNodes.index.values)*1
CoEdges = Edges[Edges.Keep == 2]

#and writes a file to disk
writer = pd.ExcelWriter(title+'Cites.xlsx')
Nodes.to_excel(writer,'Elements')
Edges.to_excel(writer,'Connections', index=False)
writer.save()

writer = pd.ExcelWriter(title+'CoCites.xlsx')
CoNodes.to_excel(writer,'Elements')
CoEdges.to_excel(writer,'Connections', index=False)
writer.save()

Input the name of your project: Amazon
