# Data and Network

In [1]:
import pandas as pd
import networkx
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from bokeh.io import output_notebook, show, save
output_notebook()

# Crossref

In [3]:
import json
import urllib.request
from urllib.error import HTTPError
from urllib.parse import quote_plus, urlencode
from urllib.request import CacheFTPHandler, urlopen, Request
from Levenshtein import ratio, matching_blocks, editops

In [4]:
def crossref_query_title(title):
    """Look up a work on Crossref by title and return the closest match.

    Sends ``title`` as a ``query.bibliographic`` search to the Crossref REST
    API (5 rows requested) and keeps the returned item whose title has the
    highest Levenshtein ``ratio`` against the query, compared case-insensitively.

    Parameters
    ----------
    title : str
        Title text to search for.

    Returns
    -------
    dict
        On a completed request: ``{"success": True, "result": best}`` where
        ``best`` has the keys ``crossref_title``, ``similarity`` and ``doi``.
        If no returned item carries a title, ``best`` is the empty placeholder
        (empty strings, similarity 0).
        On an ``HTTPError``: ``{"success": False, "result": <placeholder>,
        "exception": <the HTTPError>}``.
    """
    EMPTY_RESULT = {
        "crossref_title": "",
        "similarity": 0,
        "doi": ""
    }
    api_url = "https://api.crossref.org/works?"
    params = {"rows": "5", "query.bibliographic": title}
    url = api_url + urlencode(params, quote_via=quote_plus)
    request = Request(url)
    request.add_header(
        "User-Agent", "DOI Importer; mailto:example@gmail.com")
    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urlopen(request) as response:
            data = json.loads(response.read())
    except HTTPError as httpe:
        return {"success": False, "result": EMPTY_RESULT, "exception": httpe}

    most_similar = EMPTY_RESULT
    for item in data["message"]["items"]:
        # Skip items with a missing *or empty* title list.
        candidates = item.get("title")
        if not candidates:
            continue
        # Take the last entry without mutating the payload; a distinct name
        # keeps the `title` parameter intact for the comparison.
        candidate_title = candidates[-1]
        result = {
            "crossref_title": candidate_title,
            "similarity": ratio(candidate_title.lower(), title.lower()),
            "doi": item.get("DOI", "")
        }
        # Strict "<" keeps the earliest item when similarities tie.
        if most_similar["similarity"] < result["similarity"]:
            most_similar = result
    return {"success": True, "result": most_similar}

In [5]:
from crossref.restful import Works
works = Works()

In [6]:
def doi_info(doi):
    """Fetch basic Crossref metadata and the referenced DOIs for one work.

    Parameters
    ----------
    doi : str
        DOI passed to the module-level ``works.doi`` lookup.

    Returns
    -------
    tuple
        ``(title, year, journal, first_author, ref_dois)`` where ``title`` and
        ``journal`` are the first entries of ``title`` / ``container-title``,
        ``year`` is the first number in ``created.date-parts``,
        ``first_author`` is "given family" of the first author, and
        ``ref_dois`` is the list of upper-cased DOIs found in ``reference``.

    Raises
    ------
    ValueError
        If the lookup yields no record for ``doi``.
    """
    w1 = works.doi(doi)
    if w1 is None:
        # works.doi may hand back None (e.g. for an unknown DOI); fail with a
        # clear message instead of an opaque TypeError on the first subscript.
        raise ValueError(f"No Crossref record found for DOI: {doi!r}")

    w1_title = w1['title'][0]
    w1_year = w1['created']['date-parts'][0][0]
    w1_journal = w1['container-title'][0]
    first_author = w1['author'][0]
    # Join whichever name parts are present; identical to "given family" when
    # both exist.
    w1_1au = " ".join(
        part for part in (first_author.get('given'), first_author.get('family')) if part
    )

    ref_lst = []
    # Iterate over the references actually present rather than trusting
    # 'reference-count', which can disagree with (or exist without) the list.
    for reference in w1.get('reference', []):
        if 'DOI' in reference:
            ref_lst.append(reference['DOI'].upper())
        else:
            print("No DOI found for the reference:")
            print(reference)

    return w1_title, w1_year, w1_journal, w1_1au, ref_lst

In [7]:
# https://scipython.com/blog/doi-to-bibtex/
def doi_bib_dx(doi):
    """Return the BibTeX entry for ``doi`` using DOI content negotiation.

    Parameters
    ----------
    doi : str
        Raw (not yet percent-encoded) DOI, e.g. ``'10.1126/science.1141243'``.

    Returns
    -------
    str
        The BibTeX text on success, ``'DOI not found.'`` on HTTP 404, and
        ``'Service unavailable.'`` on any other HTTP error or connection failure.
    """
    from urllib.error import URLError
    from urllib.parse import quote

    BASE_URL = 'https://doi.org/'
    # Percent-encode the DOI so characters such as '#', '?', '<' or '>' cannot
    # truncate or corrupt the request URL; '/' stays literal.
    url = BASE_URL + quote(doi, safe='/')
    req = urllib.request.Request(url)
    # 'application/x-bibtex' can give a better formatted entry than
    # 'text/bibliography; style=bibtex'.
    req.add_header('Accept', 'application/x-bibtex')
    try:
        with urllib.request.urlopen(req) as f:
            return f.read().decode()
    except HTTPError as e:
        # HTTPError subclasses URLError, so it must be handled first.
        if e.code == 404:
            return 'DOI not found.'
        return 'Service unavailable.'
    except URLError:
        # No HTTP response at all (DNS failure, refused connection, ...).
        return 'Service unavailable.'

# SemanticScholar

In [10]:
from semanticscholar import SemanticScholar

In [11]:
sch = SemanticScholar(timeout=2)

In [12]:
paper = sch.paper('10.1126/SCIENCE.1141243')

In [13]:
paper.keys()

dict_keys(['abstract', 'arxivId', 'authors', 'citationVelocity', 'citations', 'corpusId', 'doi', 'fieldsOfStudy', 'influentialCitationCount', 'isOpenAccess', 'isPublisherLicensed', 'is_open_access', 'is_publisher_licensed', 'numCitedBy', 'numCiting', 'paperId', 'references', 'title', 'topics', 'url', 'venue', 'year'])

In [25]:
paper['year']

2007

In [79]:
# Empty frame with one column per Semantic Scholar paper field to be stored.
papel = pd.DataFrame(
    columns=[
        'abstract',
        'arxivId',
        'authors',
        'citations',
        'corpusId',
        'doi',
        'fieldsOfStudy',
        'influentialCitationCount',
        'numCitedBy',
        'numCiting',
        'paperId',
        'references',
        'title',
        'topics',
        'url',
        'venue',
        'year',
    ]
)

# Fetch DOI from a file

In [77]:
import re, time

In [65]:
# Read the whole BibTeX file first; the handle is only needed for the read.
with open('./refs.bib') as f:
    read_data = f.read()

# DOI pattern taken from
# https://stackoverflow.com/questions/27910/finding-a-doi-in-a-document-or-page
doi_pattern = re.compile(r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+)\b')
obj_dois = doi_pattern.findall(read_data)
print(len(obj_dois))

127


In [76]:
sample_dois = obj_dois[:10]

In [80]:
# Fetch Semantic Scholar metadata for every sampled DOI and add one row per
# paper to `papel`. Rows are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the whole frame on every iteration.
paper_fields = [
    'abstract', 'arxivId', 'authors', 'citations', 'corpusId', 'doi',
    'fieldsOfStudy', 'influentialCitationCount', 'numCitedBy', 'numCiting',
    'paperId', 'references', 'title', 'topics', 'url', 'venue', 'year',
]
paper_rows = []
for one_doi in sample_dois:
    print(one_doi, "\n")
    paper = sch.paper(one_doi)
    paper_rows.append({field: paper[field] for field in paper_fields})
    time.sleep(1)  # pause between API calls (presumably to respect rate limits)
    print(paper['title'], "\n")
    print(paper['doi'], "\n")
    print(" === \n")

papel = pd.concat([papel, pd.DataFrame(paper_rows)], ignore_index=True)

10.1126/science.1141243 

An On-Demand Coherent Single-Electron Source 

10.1126/science.1141243 

 === 

10.1103/PhysRevLett.127.096803 

Electron Waiting Times in a Strongly Interacting Quantum Dot: Interaction Effects and Higher-Order Tunneling Processes. 

10.1103/PhysRevLett.127.096803 

 === 

10.1103/PhysRevResearch.1.033163 

Waiting time distributions in a two-level fluctuator coupled to a superconducting charge detector 

10.1103/PhysRevResearch.1.033163 

 === 

10.1103/PhysRevB.93.245409 

Quantum theory of an electron waiting time clock 

10.1103/PhysRevB.93.245409 

 === 

10.1103/PhysRevLett.108.186806 

Electron waiting times in mesoscopic conductors. 

10.1103/PhysRevLett.108.186806 

 === 

10.1002/qute.201900014 

Time‐Domain Spectroscopy of Mesoscopic Conductors Using Voltage Pulses 

10.1002/qute.201900014 

 === 

10.1103/PhysRevB.104.165304 

Synchronized coherent charge oscillations in coupled double quantum dots 

10.1103/PhysRevB.104.165304 

 === 

10.1103/Ph

In [91]:
papel.loc[0]['references']

[{'arxivId': None,
  'authors': [{'authorId': '1861837498', 'name': 'G. G. Stokes'}],
  'doi': '10.1177/001452469000101110',
  'intent': [],
  'isInfluential': False,
  'paperId': '90006064cafcb0a9ad8a30cffeb56efe7e14129b',
  'title': '"J."',
  'url': 'https://www.semanticscholar.org/paper/90006064cafcb0a9ad8a30cffeb56efe7e14129b',
  'venue': '',
  'year': 1890},
 {'arxivId': None,
  'authors': [{'authorId': '30044468', 'name': 'van Wees BJ'},
   {'authorId': '29991537', 'name': 'van Houten H'},
   {'authorId': '30179013', 'name': 'Beenakker'},
   {'authorId': '47688973', 'name': 'Williamson'},
   {'authorId': '117440855', 'name': 'Kouwenhoven'},
   {'authorId': '30144914', 'name': 'van der Marel D'},
   {'authorId': '30178041', 'name': 'Foxon'}],
  'doi': '10.1103/PHYSREVLETT.60.848',
  'intent': [],
  'isInfluential': False,
  'paperId': '379b3476dd8b93547af8f062b6e710298692a403',
  'title': 'Quantized conductance of point contacts in a two-dimensional electron gas.',
  'url': 'https

# Test: get bibtex from doi

In [8]:
item_bib = doi_bib_dx('10.1126/SCIENCE.1141243')

In [9]:
print(item_bib)

@article{Fe_ve_2007,
	doi = {10.1126/science.1141243},
	url = {https://doi.org/10.1126%2Fscience.1141243},
	year = 2007,
	month = {may},
	publisher = {American Association for the Advancement of Science ({AAAS})},
	volume = {316},
	number = {5828},
	pages = {1169--1172},
	author = {G. Fe{\`}ve and A. Mahe{\'} and J.-M. Berroir and T. Kontos and B. Plac{\c}ais and D. C. Glattli and A. Cavanna and B. Etienne and Y. Jin},
	title = {An On-Demand Coherent Single-Electron Source},
	journal = {Science}
}


# Simple test for citation diagram

## First, get the reference list of one paper

In [None]:
# Resolve the seed paper's title to a DOI via Crossref, then pull its metadata
# and reference list.
recv = crossref_query_title("An On-Demand Coherent Single-Electron Source")  # fetch doi from title
if not recv['success'] or not recv['result']['doi']:
    # Stop with a clear message here; otherwise doi_info would be called with
    # an empty DOI and fail further downstream.
    raise RuntimeError("Crossref title lookup failed or returned no match") from recv.get('exception')
formattedDOI = recv['result']['doi'].upper()  # format the doi to upper case
Ref_title, Ref_year, Ref_journal, Ref_1au, Ref_dois = doi_info(formattedDOI)  # get the refs for the doi

In [None]:
# Empty table of papers: one row per work, keyed by its upper-cased DOI.
papel = pd.DataFrame(
    columns=[
        'Title',
        'Author_1st',
        'Journal',
        'Year',
        'DOI',
    ]
)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add a row.
new_row = pd.DataFrame([{
    'Title': Ref_title,
    'Author_1st': Ref_1au,
    'Journal': Ref_journal,
    'Year': Ref_year,
    'DOI': formattedDOI,
}])
papel = pd.concat([papel, new_row], ignore_index=True)

In [None]:
papel

In [None]:
formattedDOI = '10.1103/REVMODPHYS.74.145'.upper()

In [None]:
Ref_title, Ref_year, Ref_journal, Ref_1au, Ref_dois = doi_info(formattedDOI)   

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add a row.
new_row = pd.DataFrame([{
    'Title': Ref_title,
    'Author_1st': Ref_1au,
    'Journal': Ref_journal,
    'Year': Ref_year,
    'DOI': formattedDOI,
}])
papel = pd.concat([papel, new_row], ignore_index=True)
papel

In [None]:
# Look up one more DOI and add it to the paper table.
formattedDOI = '10.1103/PHYSREVLETT.72.210'.upper()
Ref_title, Ref_year, Ref_journal, Ref_1au, Ref_dois = doi_info(formattedDOI)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add a row.
new_row = pd.DataFrame([{
    'Title': Ref_title,
    'Author_1st': Ref_1au,
    'Journal': Ref_journal,
    'Year': Ref_year,
    'DOI': formattedDOI,
}])
papel = pd.concat([papel, new_row], ignore_index=True)
papel

In [None]:
# Look up one more DOI and add it to the paper table.
formattedDOI = '10.1103/PHYSREVLETT.93.126804'.upper()
Ref_title, Ref_year, Ref_journal, Ref_1au, Ref_dois = doi_info(formattedDOI)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add a row.
new_row = pd.DataFrame([{
    'Title': Ref_title,
    'Author_1st': Ref_1au,
    'Journal': Ref_journal,
    'Year': Ref_year,
    'DOI': formattedDOI,
}])
papel = pd.concat([papel, new_row], ignore_index=True)
papel

In [None]:
# Add this DOI only if the paper table does not already contain it.
formattedDOI = '10.1103/PHYSREVB.46.12485'.upper()
if papel.query(' DOI == @formattedDOI ').empty:
    Ref_title, Ref_year, Ref_journal, Ref_1au, Ref_dois = doi_info(formattedDOI)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add a row.
    new_row = pd.DataFrame([{
        'Title': Ref_title,
        'Author_1st': Ref_1au,
        'Journal': Ref_journal,
        'Year': Ref_year,
        'DOI': formattedDOI,
    }])
    papel = pd.concat([papel, new_row], ignore_index=True)
papel.head()

In [None]:
# Edge list for the citation diagram: one row per (citing paper, referenced DOI).
# The reference list returned by doi_info is bound to `Ref_dois`; the previous
# name `dois_Ref` is never assigned anywhere in this notebook.
# The frame is built in one go instead of row-by-row DataFrame.append, which
# was removed in pandas 2.0.
cit_vis = pd.DataFrame(
    [{'cit': formattedDOI, 'ref': item_ref} for item_ref in Ref_dois],
    columns=['cit', 'ref'],
)

In [None]:
cit_vis

## Simple citation diagram

In [None]:
Gcit = networkx.from_pandas_edgelist(cit_vis, 'cit', 'ref')

In [None]:
networkx.draw(Gcit)

In [None]:
plt.figure(figsize=(8,8))
networkx.draw(Gcit, with_labels=True, node_color='skyblue', width=.3, font_size=8)

## Find the most cited paper in the list

In [None]:
networkx.degree(Gcit)

In [None]:
degrees = dict(networkx.degree(Gcit))
networkx.set_node_attributes(Gcit, name='degree', values=degrees)

In [None]:
# Table of (node, degree) pairs, highest degree first.
degree_df = (
    pd.DataFrame(Gcit.nodes(data='degree'), columns=['node', 'degree'])
    .sort_values(by='degree', ascending=False)
)
degree_df

In [None]:
num_nodes_to_inspect = 10
degree_df[:num_nodes_to_inspect].plot(x='node', y='degree', kind='barh').invert_yaxis()

## Interactive Network visualization with Bokeh

In [None]:
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, EdgesAndLinkedNodes, NodesAndLinkedEdges
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import Blues8, Reds8, Purples8, Oranges8, Viridis8, Spectral8
from bokeh.transform import linear_cmap
from networkx.algorithms import community

In [None]:
# Calculate degree for each node and add as node attribute
degrees = dict(networkx.degree(Gcit))
networkx.set_node_attributes(Gcit, name='degree', values=degrees)

In [None]:
# Pad every node's degree by a constant so that nodes with very small degrees
# are still visible when the degree is used as marker size.
number_to_adjust_by = 5
adjusted_node_size = {
    node: degree + number_to_adjust_by
    for node, degree in networkx.degree(Gcit)
}
networkx.set_node_attributes(Gcit, name='adjusted_node_size', values=adjusted_node_size)

In [None]:
communities = community.greedy_modularity_communities(Gcit)

In [None]:
# Map every node to its community index and to a colour for that community.
modularity_class = {}
modularity_color = {}
# The loop variable is deliberately NOT called `community`: that name is the
# imported networkx.algorithms.community module, and rebinding it would break
# the greedy_modularity_communities cell on re-run.
for community_number, members in enumerate(communities):
    # Wrap around the palette so more communities than colours cannot raise
    # an IndexError.
    palette_color = Spectral8[community_number % len(Spectral8)]
    for name in members:
        modularity_class[name] = community_number
        modularity_color[name] = palette_color

In [None]:
# Add modularity class and color as attributes from the network above
networkx.set_node_attributes(Gcit, modularity_class, 'modularity_class')
networkx.set_node_attributes(Gcit, modularity_color, 'modularity_color')

In [None]:
from bokeh.models import EdgesAndLinkedNodes, NodesAndLinkedEdges

# --- Appearance settings -----------------------------------------------------
# Highlight colours used when a node/edge is hovered or selected.
node_highlight_color = 'white'
edge_highlight_color = 'black'

# Node attributes of Gcit that drive marker size and fill colour
# (a fixed size such as 10 or a colour such as 'skyblue' is also allowed).
size_by_this_attribute = 'adjusted_node_size'
color_by_this_attribute = 'modularity_color'

# Palette choice — Blues8, Reds8, Purples8, Oranges8, Viridis8
color_palette = Blues8

# Plot title
title = 'Citation Network'

# Fields shown in the hover tooltip of each node
HOVER_TOOLTIPS = [
    ("DOI", "@index"),
    ("Degree", "@degree"),
    ("Modularity Class", "@modularity_class"),
    ("Modularity Color", "$color[swatch]:modularity_color"),
]

# --- Figure --------------------------------------------------------------------
axis_limit = 10.1
plot = figure(
    tooltips=HOVER_TOOLTIPS,
    tools="pan,wheel_zoom,save,reset",
    active_scroll='wheel_zoom',
    x_range=Range1d(-axis_limit, axis_limit),
    y_range=Range1d(-axis_limit, axis_limit),
    title=title,
)

# --- Graph renderer ------------------------------------------------------------
# Spring layout, see
# https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.drawing.layout.spring_layout.html
network_graph = from_networkx(Gcit, networkx.spring_layout, scale=10, center=(0, 0))

# Nodes: sized and coloured from node attributes; hover and selection share one look.
node_renderer = network_graph.node_renderer
highlighted_node_style = dict(
    size=size_by_this_attribute,
    fill_color=node_highlight_color,
    line_width=2,
)
node_renderer.glyph = Circle(size=size_by_this_attribute, fill_color=color_by_this_attribute)
node_renderer.hover_glyph = Circle(**highlighted_node_style)
node_renderer.selection_glyph = Circle(**highlighted_node_style)

# Edges: faint by default, solid highlight colour when selected or hovered.
edge_renderer = network_graph.edge_renderer
highlighted_edge_style = dict(line_color=edge_highlight_color, line_width=2)
edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=1)
edge_renderer.selection_glyph = MultiLine(**highlighted_edge_style)
edge_renderer.hover_glyph = MultiLine(**highlighted_edge_style)

# Selecting or inspecting a node also highlights the edges attached to it.
network_graph.selection_policy = NodesAndLinkedEdges()
network_graph.inspection_policy = NodesAndLinkedEdges()

plot.renderers.append(network_graph)

show(plot)
#save(plot, filename=f"{title}.html")