In [1]:
# Imports
import os
import sqlite3

import graphistry
import pandas as pd
from sqlalchemy import create_engine
# Read the Graphistry API key from the environment instead of embedding it in
# the notebook, so the credential is never saved or shared with the file.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed in this file and should be revoked/rotated.
GRAPHISTRY_API_KEY = os.environ.get('GRAPHISTRY_API_KEY')
if not GRAPHISTRY_API_KEY:
    raise RuntimeError(
        'Set the GRAPHISTRY_API_KEY environment variable before running this notebook'
    )
graphistry.register(key=GRAPHISTRY_API_KEY)

A new version of PyGraphistry is available (installed=0.9.43 latest=0.9.49).


In [2]:
# Configuration
# File name of the SQLite database; the 'sqlite:///' URL prefix is added where
# the engine is created, so this is a plain path despite the name.
DB_URL = "arxiv_raw.sqlite"

In [3]:
# Open a SQLAlchemy engine on the SQLite file named by DB_URL
conn = create_engine('sqlite:///' + DB_URL)

In [4]:
# Load the whole 'Papers' table (one row of metadata per paper) into a DataFrame
arxiv_metadata = pd.read_sql_table(table_name='Papers', con=conn)

In [5]:
# Load the 'Citations' table and rename its columns to the edge vocabulary used
# in the rest of the notebook:
#   paper_id -> source, reference_id -> target, weight -> label
citations = (
    pd.read_sql_table('Citations', conn)
    .rename(columns={
        'paper_id': 'source',
        'reference_id': 'target',
        'weight': 'label',
    })
)

In [6]:
# Load the whole 'Authors' table into a DataFrame
authors = pd.read_sql_table(table_name='Authors', con=conn)

In [7]:
# Collapse repeated citation edges: for every (source, target) pair keep only
# the first row encountered
citations = citations.drop_duplicates(
    subset=['source', 'target'],
    keep='first',
)

In [8]:
# Load the whole 'Publications' table into a DataFrame
publications = pd.read_sql_table(table_name='Publications', con=conn)

In [9]:
# Clean Citations IDs
# Strip stray leading/trailing '.' characters from both endpoint IDs. .assign()
# builds a new frame instead of writing columns into the frame returned by the
# earlier drop_duplicates(), so re-running this cell is safe.
citations = citations.assign(
    source=citations['source'].astype(str).str.strip('.'),
    target=citations['target'].str.strip('.'),
)
# The earlier de-duplication ran on the raw IDs, so pairs that differed only by
# a stray '.' become identical here; drop them so each edge appears once.
citations = citations.drop_duplicates(subset=['source', 'target'])

In [10]:
# Give every distinct primary subject an integer colour code (0, 1, 2, ...),
# numbered in the order unique() returns the subjects, and store the code of
# each paper's subject in a new 'color' column
subjects = arxiv_metadata['primary_subject'].unique()
subject_colors = {subject: code for code, subject in enumerate(subjects)}
arxiv_metadata['color'] = arxiv_metadata['primary_subject'].map(subject_colors.__getitem__)

In [11]:
# Attach paper metadata to both ends of every citation edge.
# The inner call joins the citing paper's metadata (source == id); the outer
# call joins the cited paper's metadata (target == id). Columns that then exist
# on both sides get the '_from' (citing) / '_to' (cited) suffixes. Both joins
# use merge's default join type.
metadata_merge = pd.merge(
    pd.merge(citations, arxiv_metadata, left_on='source', right_on='id'),
    arxiv_metadata,
    left_on='target',
    right_on='id',
    suffixes=('_from', '_to'),
)

In [12]:
# Build an igraph graph from the metadata-enriched edge list so that vertex
# attributes and graph metrics can be attached before plotting
plotter = graphistry.bind(source="source", destination="target")
print('Creating iGraph...')
ig = plotter.pandas2igraph(metadata_merge)

# Look up the paper metadata for every vertex ID and copy the columns of
# interest onto the graph as vertex attributes.
# NOTE(review): assumes 'id' is unique in arxiv_metadata; duplicate ids would
# add rows to the left join and misalign the attributes - confirm.
vertex_ids = pd.DataFrame(ig.vs['__nodeid__'], columns=['id'])
vertex_metadata = vertex_ids.merge(arxiv_metadata, how='left', on='id')
for column in ('primary_subject', 'color', 'title', 'year', 'month', 'category'):
    ig.vs[column] = vertex_metadata[column]

# Graph metrics, stored as vertex attributes
print('Calculating PageRank...')
ig.vs['pagerank'] = ig.pagerank()
print('Calculating Communities...')
communities = ig.community_infomap()
ig.vs['community'] = communities.membership
print('Calculating Degrees...')
ig.vs['in_degree'] = ig.indegree()



Calculating PageRank...
Calculating Communities...
Calculating Degrees...


In [13]:
# Upload the graph to Graphistry and render it.
# Bindings: edges run source -> target and are weighted by 'label'; points are
# sized by 'pagerank', coloured by 'color' and titled with 'title'.
plotter = (
    graphistry
    .bind(source="source", destination="target")
    .bind(edge_weight="label")
    .bind(point_size='pagerank', point_color='color', point_title="title")
)
print('Plotting...')
plotter.plot(ig)

Plotting...




Uploading 5869 kB. This may take a while...
