In [1]:
import pandas as pd

# Graph Visualization 

In [24]:
merged_df = pd.read_csv("cit_ntwk_raw.csv",index_col = 0)

In [25]:
df = merged_df[['citing_paper','cited_paper','Date']]

In [26]:
df

Unnamed: 0,citing_paper,cited_paper,Date
0,dissecting mechanisms of financial crises: int...,Delayed crises and slow recoveries,February 2024
1,public liquidity and financial crises,Delayed crises and slow recoveries,February 2024
2,inefficient credit cycles,Delayed crises and slow recoveries,February 2024
3,learning and the capital age premium,Learning about the consumption risk exposure o...,February 2024
4,"investment, uncertainty, and u-shaped return v...",Learning about the consumption risk exposure o...,February 2024
...,...,...,...
410809,kobi̇'lerin finansman sorunları ve çözüm öneri...,Financing patterns around the world: Are small...,September 2008
410810,finanțarea întreprinderilor mici și mijlocii d...,Financing patterns around the world: Are small...,September 2008
410811,relação entre estrutura de financiamento e açõ...,Financing patterns around the world: Are small...,September 2008
410812,中国商业银行综合融资能力测度及影响因素分析,Financing patterns around the world: Are small...,September 2008


In [27]:
# Set to collect unique nodes
unique_nodes = set(df['citing_paper']) | set(df['cited_paper'])

In [28]:
paper_nodes = pd.DataFrame(unique_nodes, columns = ['Title'])
paper_nodes['name'] = ["node"+str(i) for i in range(len(paper_nodes))]
paper_nodes

Unnamed: 0,Title,name
0,Financial transaction taxes and the informatio...,node0
1,characterizing the tail-risk of factor mimicki...,node1
2,seeking the roots of entrepreneurship: insight...,node2
3,surging sovereign spreads: the impact of risin...,node3
4,melting down: systemic financial instability a...,node4
...,...,...
186539,fatores de influência na compra de criptomoeda...,node186539
186540,impact of economic policy uncertainty on the d...,node186540
186541,auditor exits and firm performance: is there a...,node186541
186542,"social media, top managers' characteristics, a...",node186542


In [29]:
# Mapping titles to IDs
title_to_id = paper_nodes.reset_index().set_index('Title')['index'].to_dict()

In [30]:
# Replacing titles with IDs in df1
df['citing_paper'] = df['citing_paper'].map(title_to_id)
df['cited_paper'] = df['cited_paper'].map(title_to_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['citing_paper'] = df['citing_paper'].map(title_to_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cited_paper'] = df['cited_paper'].map(title_to_id)


In [31]:
df

Unnamed: 0,citing_paper,cited_paper,Date
0,43974,27533,February 2024
1,120209,27533,February 2024
2,54320,27533,February 2024
3,136323,148800,February 2024
4,153509,148800,February 2024
...,...,...,...
410809,10368,132685,September 2008
410810,141965,132685,September 2008
410811,80166,132685,September 2008
410812,62411,132685,September 2008


In [32]:
edges = df[['citing_paper','cited_paper']].rename(columns = {'citing_paper':'source','cited_paper':'target'})

In [33]:
edges

Unnamed: 0,source,target
0,43974,27533
1,120209,27533
2,54320,27533
3,136323,148800
4,153509,148800
...,...,...
410809,10368,132685
410810,141965,132685
410811,80166,132685
410812,62411,132685


## DataShader Method 

reference: https://datashader.org/user_guide/Networks.html#pcap-computer-network-data

In [34]:
import datashader as ds
import datashader.transfer_functions as tf
from datashader.layout import random_layout, circular_layout, forceatlas2_layout

In [35]:
circular  = circular_layout(paper_nodes[['name']], uniform=False)
randomloc = random_layout(paper_nodes[['name']])
randomloc.tail()

Unnamed: 0,name,x,y
186539,node186539,0.125416,0.950485
186540,node186540,0.652583,0.405317
186541,node186541,0.724169,0.487971
186542,node186542,0.716962,0.515436
186543,node186543,0.122925,0.713586


In [15]:
cvsopts = dict(plot_height=10000, plot_width=10000)

def nodesplot(nodes, name=None, canvas=None, cat=None):
    canvas = ds.Canvas(**cvsopts) if canvas is None else canvas
    aggregator=None if cat is None else ds.count_cat(cat)
    agg=canvas.points(nodes,'x','y',aggregator)
    return tf.spread(tf.shade(agg, cmap=["#FF3333"]), px=3, name=name)

In [16]:
tf.Images(nodesplot(randomloc,"Random layout"),
          nodesplot(circular, "Circular layout"))

0,1
Random layout,Circular layout


Clearly overplotting. 

reference for overplotting: 
* https://holoviz.org/tutorial/Plotting.html
* https://datashader.org/user_guide/Plotting_Pitfalls.html

## HoloViz Method 

reference: https://examples.holoviz.org/gallery/uk_researchers/uk_researchers.html

We use HoloViz to avoid this overplotting pitfall. 

We use the circular, random, and forcedirected layout functions from DataShader since it is useful in putting our nodes into x and y coordinates. 

In [36]:
import holoviews as hv
from holoviews import opts

from colorcet import fire
from datashader.bundling import directly_connect_edges, hammer_bundle

from holoviews.operation.datashader import datashade, dynspread
from holoviews.operation import decimate

In [15]:
from dask.distributed import Client
client = Client(n_workers=5, threads_per_worker=3, memory_limit="15GB")

In [16]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 5
Total threads: 15,Total memory: 69.85 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:49225,Workers: 5
Dashboard: http://127.0.0.1:8787/status,Total threads: 15
Started: Just now,Total memory: 69.85 GiB

0,1
Comm: tcp://127.0.0.1:49251,Total threads: 3
Dashboard: http://127.0.0.1:49258/status,Memory: 13.97 GiB
Nanny: tcp://127.0.0.1:49228,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-qc9xgsxc,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-qc9xgsxc

0,1
Comm: tcp://127.0.0.1:49250,Total threads: 3
Dashboard: http://127.0.0.1:49256/status,Memory: 13.97 GiB
Nanny: tcp://127.0.0.1:49230,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-ki7pwktk,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-ki7pwktk

0,1
Comm: tcp://127.0.0.1:49260,Total threads: 3
Dashboard: http://127.0.0.1:49261/status,Memory: 13.97 GiB
Nanny: tcp://127.0.0.1:49232,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-iuny1sbs,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-iuny1sbs

0,1
Comm: tcp://127.0.0.1:49248,Total threads: 3
Dashboard: http://127.0.0.1:49252/status,Memory: 13.97 GiB
Nanny: tcp://127.0.0.1:49234,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-n8fwo_4n,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-n8fwo_4n

0,1
Comm: tcp://127.0.0.1:49249,Total threads: 3
Dashboard: http://127.0.0.1:49254/status,Memory: 13.97 GiB
Nanny: tcp://127.0.0.1:49236,
Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-on9eia08,Local directory: C:\Users\ewp\AppData\Local\Temp\dask-scratch-space\worker-on9eia08


In [37]:
hv.notebook_extension('bokeh','matplotlib')

decimate.max_samples=20000
dynspread.threshold=0.01
datashade.cmap=fire[40:]
sz = dict(width=150,height=150)

opts.defaults(
    opts.RGB(width=500, height=500, xaxis=None, yaxis=None, show_grid=False, bgcolor="black"))

In [18]:
edges

Unnamed: 0,source,target
0,43974,27533
1,120209,27533
2,54320,27533
3,136323,148800
4,153509,148800
...,...,...
410809,10368,132685
410810,141965,132685
410811,80166,132685
410812,62411,132685


### 1. With random layout

In [44]:
r_nodes_df = randomloc.iloc[:,1:]
r_edges_df = edges

In [20]:
r_nodes_df

Unnamed: 0,x,y
0,90.738435,78.989307
1,98.600950,61.745111
2,23.608261,7.532646
3,53.605930,99.869803
4,0.335067,55.778789
...,...,...
186539,6.727890,24.949561
186540,6.942172,75.416991
186541,61.194178,98.730795
186542,90.340795,20.459853


In [21]:
r_edges_df

Unnamed: 0,source,target
0,43974,27533
1,120209,27533
2,54320,27533
3,136323,148800
4,153509,148800
...,...,...
410809,10368,132685
410810,141965,132685
410811,80166,132685
410812,62411,132685


In [39]:
r_nodes = hv.Points(r_nodes_df, label="Nodes")
r_edges = hv.Curve(r_edges_df, label="Edges")
len(r_nodes), len(r_edges)

(186544, 410772)

In [42]:
r_direct = hv.Curve(directly_connect_edges(r_nodes.data, r_edges.data),label="Direct")

In [43]:
dynspread(datashade(r_nodes,cmap=["cyan"])) + datashade(r_direct)

In [67]:
r_bundled = hv.Curve(hammer_bundle(r_nodes.data, r_edges.data),label="Bundled")

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
2024-04-17 21:03:05,249 - distributed.protocol.core - CRITICAL - Failed to deserialize
Traceback (most recent call last):
  File "C:\Users\ewp\anaconda3\Lib\site-packages\distributed\protocol\core.py", line 160, in loads
    return msgpack.loads(
           ^^^^^^^^^^^^^^
  File "C:\Users\ewp\anaconda3\Lib\site-packages\msgpack\fallback.py", line 128, in unpackb
    ret = unpacker._unpack()
          ^^^^^^^^^^^^^^^^^^
  File "C:\Users\ewp\anaconda3\Lib\site-packages\msgpack\fallback.py", line 565, in _unpack
    ret.append(self._unpack(EX_CONSTRUCT))
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ewp\anaconda3\Lib\site-packages\msgpack\fallback.py", line 592, in _unpack
    ret[key] = self._unpack(EX_CONSTRUCT)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ewp\anaconda3\Lib\site-packages\msgpack\fallback.py", line 565, in _unpack
    ret.append(self._unpack(EX_CONSTRUCT))
 

CancelledError: resample_edges-4fb8e408-0e6e-4035-91ba-7459a7646a0d

In [None]:
dynspread(datashade(r_nodes,cmap=["cyan"])) + datashade(r_bundled)

### 2. Forcedirected

In [45]:
%time 
forcedirected = forceatlas2_layout(paper_nodes, edges)
tf.Images(nodesplot(forcedirected, "ForceAtlas2 layout"))

CPU times: total: 0 ns
Wall time: 0 ns




KeyboardInterrupt: 