In [1]:
import pandas as pd

# Graph Visualization 

In [2]:
merged_df = pd.read_csv("cit_ntwk_raw.csv",index_col = 0)

In [3]:
df = merged_df[['citing_paper','cited_paper','Date']]

In [4]:
df

Unnamed: 0,citing_paper,cited_paper,Date
0,dissecting mechanisms of financial crises: int...,Delayed crises and slow recoveries,February 2024
1,public liquidity and financial crises,Delayed crises and slow recoveries,February 2024
2,inefficient credit cycles,Delayed crises and slow recoveries,February 2024
3,learning and the capital age premium,Learning about the consumption risk exposure o...,February 2024
4,"investment, uncertainty, and u-shaped return v...",Learning about the consumption risk exposure o...,February 2024
...,...,...,...
410809,kobi̇'lerin finansman sorunları ve çözüm öneri...,Financing patterns around the world: Are small...,September 2008
410810,finanțarea întreprinderilor mici și mijlocii d...,Financing patterns around the world: Are small...,September 2008
410811,relação entre estrutura de financiamento e açõ...,Financing patterns around the world: Are small...,September 2008
410812,中国商业银行综合融资能力测度及影响因素分析,Financing patterns around the world: Are small...,September 2008


In [5]:
# Set to collect unique nodes
unique_nodes = set(df['citing_paper']) | set(df['cited_paper'])

In [6]:
paper_nodes = pd.DataFrame(unique_nodes, columns = ['Title'])
paper_nodes['name'] = ["node"+str(i) for i in range(len(paper_nodes))]
paper_nodes

Unnamed: 0,Title,name
0,an international study of the value implicatio...,node0
1,structure of banking industry and firms' risk-...,node1
2,forecasting bond risk premia with machine lear...,node2
3,financial statement anomalies in the bond market,node3
4,the effect of ceo hiring source on total cash ...,node4
...,...,...
186539,The creation and evolution of entrepreneurial ...,node186539
186540,research on historical phase division of terro...,node186540
186541,the effects of brexit on credit spreads: evide...,node186541
186542,"momentum, reversals, and investor clientele",node186542


In [7]:
# Mapping titles to IDs
title_to_id = paper_nodes.reset_index().set_index('Title')['index'].to_dict()

In [8]:
# Replacing titles with IDs in df1
df['citing_paper'] = df['citing_paper'].map(title_to_id)
df['cited_paper'] = df['cited_paper'].map(title_to_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['citing_paper'] = df['citing_paper'].map(title_to_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cited_paper'] = df['cited_paper'].map(title_to_id)


In [9]:
df

Unnamed: 0,citing_paper,cited_paper,Date
0,21388,114306,February 2024
1,46949,114306,February 2024
2,72291,114306,February 2024
3,61857,9241,February 2024
4,124939,9241,February 2024
...,...,...,...
410809,121093,133292,September 2008
410810,30241,133292,September 2008
410811,166535,133292,September 2008
410812,15020,133292,September 2008


In [10]:
edges = df[['citing_paper','cited_paper']].rename(columns = {'citing_paper':'source','cited_paper':'target'})

In [11]:
edges

Unnamed: 0,source,target
0,21388,114306
1,46949,114306
2,72291,114306
3,61857,9241
4,124939,9241
...,...,...
410809,121093,133292
410810,30241,133292
410811,166535,133292
410812,15020,133292


## DataShader Method 

reference: https://datashader.org/user_guide/Networks.html#pcap-computer-network-data

In [12]:
import datashader as ds
import datashader.transfer_functions as tf
from datashader.layout import random_layout, circular_layout, forceatlas2_layout

In [None]:
circular  = circular_layout(paper_nodes[['name']], uniform=False)
randomloc = random_layout(paper_nodes[['name']])
randomloc.tail()

In [28]:
cvsopts = dict(plot_height=10000, plot_width=10000)

def nodesplot(nodes, name=None, canvas=None, cat=None):
    canvas = ds.Canvas(**cvsopts) if canvas is None else canvas
    aggregator=None if cat is None else ds.count_cat(cat)
    agg=canvas.points(nodes,'x','y',aggregator)
    return tf.spread(tf.shade(agg, cmap=["#FF3333"]), px=3, name=name)

In [30]:
tf.Images(nodesplot(randomloc,"Random layout"),
          nodesplot(circular, "Circular layout"))

0,1
Random layout,Circular layout


Clearly overplotting. 

reference for overplotting: 
* https://holoviz.org/tutorial/Plotting.html
* https://datashader.org/user_guide/Plotting_Pitfalls.html

## HoloViz Method 

reference: https://examples.holoviz.org/gallery/uk_researchers/uk_researchers.html

We use HoloViz to avoid this overplotting pitfall. 

We use the circular, random, and forcedirected layout functions from DataShader since it is useful in putting our nodes into x and y coordinates. 

In [14]:
import holoviews as hv
from holoviews import opts

from colorcet import fire
from datashader.bundling import directly_connect_edges, hammer_bundle

from holoviews.operation.datashader import datashade, dynspread
from holoviews.operation import decimate

In [15]:
from dask.distributed import Client
client = Client()

In [16]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 5
Total threads: 20,Total memory: 31.44 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:50640,Workers: 5
Dashboard: http://127.0.0.1:8787/status,Total threads: 20
Started: Just now,Total memory: 31.44 GiB

0,1
Comm: tcp://127.0.0.1:50664,Total threads: 4
Dashboard: http://127.0.0.1:50669/status,Memory: 6.29 GiB
Nanny: tcp://127.0.0.1:50643,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-ywfy6o3z,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-ywfy6o3z

0,1
Comm: tcp://127.0.0.1:50665,Total threads: 4
Dashboard: http://127.0.0.1:50672/status,Memory: 6.29 GiB
Nanny: tcp://127.0.0.1:50645,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-wfarmsg5,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-wfarmsg5

0,1
Comm: tcp://127.0.0.1:50663,Total threads: 4
Dashboard: http://127.0.0.1:50667/status,Memory: 6.29 GiB
Nanny: tcp://127.0.0.1:50647,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-0ie1bevx,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-0ie1bevx

0,1
Comm: tcp://127.0.0.1:50666,Total threads: 4
Dashboard: http://127.0.0.1:50674/status,Memory: 6.29 GiB
Nanny: tcp://127.0.0.1:50649,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-x09ogxr3,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-x09ogxr3

0,1
Comm: tcp://127.0.0.1:50671,Total threads: 4
Dashboard: http://127.0.0.1:50676/status,Memory: 6.29 GiB
Nanny: tcp://127.0.0.1:50651,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-rclsqxz9,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-rclsqxz9


In [17]:
hv.notebook_extension('bokeh','matplotlib')

decimate.max_samples=20000
dynspread.threshold=0.01
datashade.cmap=fire[40:]
sz = dict(width=150,height=150)

opts.defaults(
    opts.RGB(width=800, height=800, xaxis=None, yaxis=None, show_grid=False, bgcolor="black"))

In [18]:
edges

Unnamed: 0,source,target
0,21388,114306
1,46949,114306
2,72291,114306
3,61857,9241
4,124939,9241
...,...,...
410809,121093,133292
410810,30241,133292
410811,166535,133292
410812,15020,133292


### 1. With random layout

In [19]:
r_nodes_df = randomloc.iloc[:,1:]
r_edges_df = edges

NameError: name 'randomloc' is not defined

In [None]:
r_nodes_df

In [None]:
r_edges_df

In [None]:
r_nodes = hv.Points(r_nodes_df, label="Nodes")
r_edges = hv.Curve(r_edges_df, label="Edges")
len(r_nodes), len(r_edges)

In [None]:
r_direct = hv.Curve(directly_connect_edges(r_nodes.data, r_edges.data),label="Direct")

In [None]:
dynspread(datashade(r_nodes,cmap=["cyan"])) + datashade(r_direct)

### 2. Forcedirected

In [25]:
%time 
forcedirected = forceatlas2_layout(paper_nodes, edges)
tf.Images(nodesplot(forcedirected, "ForceAtlas2 layout"))

CPU times: total: 0 ns
Wall time: 0 ns


KeyboardInterrupt: 