In [1]:
from bs4 import BeautifulSoup
from bs4 import NavigableString
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from scipy.spatial.distance import pdist, squareform

# Read and parse the existing converted SVG

In [2]:
# Read the source SVG as raw bytes and let Beautiful Soup work out the text
# encoding itself, instead of decoding with the platform's default encoding
# (which text-mode open() would do).
# NOTE: `data` therefore holds bytes rather than str.
file = 'MOD2.subclass_classcol.constellation.svg'
with open(file, 'rb') as fp:
    data = fp.read()

# Parse with the XML parser; the file handle is no longer needed at this point.
soup = BeautifulSoup( data, 'xml' )

Find circles and put attributes into a dataframe

In [3]:
# Collect every <path> element of the parsed SVG (the circles are drawn as paths)
# and report how many were found.
hlist = soup.find_all(name='path')
n_paths = len(hlist)
print(n_paths)

676


In [4]:
# Convert the non-filled circle paths into a pandas dataframe.
# Each kept row combines the path's own attributes with the attributes of the
# <rect> found under the parent's <defs> (x, y, width, height).
# NOTE(review): x/y look like the bounding-box origin rather than the centre — confirm.
circle_records = []
for helem in hlist :
    # .get() so that a path without any `fill` attribute is skipped instead of
    # raising KeyError; only outline paths (fill="none") are kept.
    if helem.attrs.get('fill') != 'none' :
        continue
    # Path attributes first, then rect attributes, matching the original column
    # order. If both define the same attribute name the rect value wins.
    circle_records.append( {**helem.attrs, **helem.parent.defs.rect.attrs} )

# Build the frame once from the collected records rather than growing it with
# pd.concat on every iteration (which is quadratic in the number of rows).
circles = pd.DataFrame( circle_records )

# Attribute values arrive as strings; make the geometry columns numeric.
cols = ['x','y','width','height']
circles = circles.astype( {cc: float for cc in cols} )
print(len(circles))

338


In [5]:
# Show the first five rows of the circles table.
circles.head(5)

Unnamed: 0,clip-path,fill,stroke,stroke-width,stroke-linecap,stroke-linejoin,stroke-miterlimit,d,id,x,y,width,height
0,url(#SVGID_1024_),none,#FA0087,0.71,round,round,10,"M430.53,566.23c0-3.221,2.64-5.86,5.8...",SVGID_1023_,430.175,560.015,12.44,12.44
1,url(#SVGID_1042_),none,#FA0087,0.71,round,round,10,"M410.46,481.9c0-3.091,2.521-5.61,5.6...",SVGID_1041_,410.105,475.935,11.92,11.921
2,url(#SVGID_1060_),none,#FA0087,0.71,round,round,10,"M481.57,489.23c0-2.591,2.13-4.721,4....",SVGID_1059_,481.215,484.155,10.16,10.149
3,url(#SVGID_1078_),none,#FA0087,0.71,round,round,10,"M442.89,520.96c0-4.38,3.58-7.96,7.96...",SVGID_1077_,442.535,512.645,16.63,16.631
4,url(#SVGID_1096_),none,#FA0087,0.71,round,round,10,"M501.49,557.24c0-3.891,3.189-7.08,7....",SVGID_1095_,501.135,549.805,14.87,14.87


Find text and put attributes into a dataframe

In [6]:
# Collect every <text> element (the node labels) and report how many were found.
tlist = soup.find_all(name='text')
n_labels = len(tlist)
print(n_labels)

338


In [7]:
# Convert all text/label objects into a pandas dataframe.
# Each row holds the element's attributes, its text content (`node`) and the
# x/y position taken from the 5th and 6th numbers of the `transform` attribute
# (e.g. "matrix(1 0 0 1 173.0117 103.3154)" -> x=173.0117, y=103.3154).

# Numeric token: optional sign, digits with optional fraction (or a leading-dot
# fraction), optional exponent. The previous pattern dropped signs and split
# exponent/leading-dot numbers, which yields wrong coordinates for such inputs.
number_pattern = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'

annotation_records = []
for telem in tlist :
    numbers = re.findall(number_pattern, telem.attrs['transform'])
    record = dict( telem.attrs )
    record['node'] = telem.text
    record['x'] = numbers[4]
    record['y'] = numbers[5]
    annotation_records.append( record )

# Build the frame once instead of concatenating inside the loop (quadratic).
annotations = pd.DataFrame( annotation_records )

# x/y were captured as strings; make them numeric.
cols = ['x','y']
annotations = annotations.astype( {cc: float for cc in cols} )
print(len(annotations))

338


In [8]:
# Show the first five rows of the annotations (label) table.
annotations.head(5)

Unnamed: 0,transform,font-family,font-size,node,x,y
0,matrix(1 0 0 1 173.0117 103.3154),'ArialMT',8,337,173.0117,103.3154
1,matrix(1 0 0 1 165.8916 96.5474),'ArialMT',8,338,165.8916,96.5474
2,matrix(1 0 0 1 433.3398 570.1797),'ArialMT',8,1,433.3398,570.1797
3,matrix(1 0 0 1 413.0117 485.8516),'ArialMT',8,2,413.0117,485.8516
4,matrix(1 0 0 1 483.2441 493.1797),'ArialMT',8,3,483.2441,493.1797


Find all the connection arcs and put attributes into a dataframe

In [9]:
# Collect every <polygon> element (the connection arcs) and report how many were found.
plist = soup.find_all(name='polygon')
n_polygons = len(plist)
print(n_polygons)

503


In [10]:
# Convert all connection arc objects into a pandas dataframe.
# Each row combines the polygon's own attributes with the attributes of the
# <rect> found under the parent's <defs> (x, y, width, height).
connection_records = []
for pelem in plist :
    # Polygon attributes first, then rect attributes, matching the original
    # column order. If both define the same attribute name the rect value wins.
    connection_records.append( {**pelem.attrs, **pelem.parent.defs.rect.attrs} )

# Build the frame once instead of concatenating inside the loop (quadratic).
connections = pd.DataFrame( connection_records )

# Attribute values arrive as strings; make the geometry columns numeric.
cols = ['x','y','width','height']
connections = connections.astype( {cc: float for cc in cols} )

# Corner coordinates of each rect: (x, y) is treated as the top-left corner,
# with width extending to the right and height extending downwards.
left = connections['x']
top = connections['y']
right = connections['x'] + connections['width']
bottom = connections['y'] + connections['height']

connections['top_left_x'] = left
connections['top_left_y'] = top

connections['top_right_x'] = right
connections['top_right_y'] = top

connections['bottom_left_x'] = left
connections['bottom_left_y'] = bottom

connections['bottom_right_x'] = right
connections['bottom_right_y'] = bottom

print(len(connections))

503


In [11]:
# Show the first five rows of the connections table, including the derived corner columns.
connections.head(5)

Unnamed: 0,clip-path,fill,points,id,x,y,width,height,top_left_x,top_left_y,top_right_x,top_right_y,bottom_left_x,bottom_left_y,bottom_right_x,bottom_right_y
0,url(#SVGID_6_),#333333,"416.15,481.891 415.99,481.91 416.01,482.129 41...",SVGID_5_,415.99,481.89,20.81,84.54,415.99,481.89,436.8,481.89,415.99,566.43,436.8,566.43
1,url(#SVGID_8_),#333333,"450.34,520.88 450.33,521 450.31,521.12 450.29,...",SVGID_7_,436.26,520.88,15.09,45.44,436.26,520.88,451.35,520.88,436.26,566.32,451.35,566.32
2,url(#SVGID_10_),#333333,"564.721,326.67 564.6,326.97 564.49,327.28 564....",SVGID_9_,477.51,326.67,87.87,114.71,477.51,326.67,565.38,326.67,477.51,441.38,565.38,441.38
3,url(#SVGID_12_),#333333,"309.7,298.291 309.58,298.339 309.471,298.39 30...",SVGID_11_,265.84,298.29,44.07,40.03,265.84,298.29,309.91,298.29,265.84,338.32,309.91,338.32
4,url(#SVGID_14_),#333333,"266.12,337.89 265.99,337.9 265.86,337.9 265.74...",SVGID_13_,218.15,337.89,48.1,1.57,218.15,337.89,266.25,337.89,218.15,339.46,266.25,339.46


Link each circle to a node. Matching to the closest text label proved unreliable, so circles are assumed to be listed in node order.

In [12]:
# Link up circles and node ids.
# The coordinate arrays are kept because the corner-matching step further down
# reuses `circle_coord`.
circle_coord = circles[['x','y']].values
anno_coord = annotations[['x','y']].values

joined_coord = np.concatenate([circle_coord, anno_coord])

# Disabled approach: take pairwise euclidean distances over `joined_coord`,
# keep the circle-vs-label quadrant and give each circle the `node` of its
# nearest label (argmin per circle row). Closest-distance matching was not
# working properly, so it is not used.
# NOTE(review): possibly because the circle x/y and the label x/y are different
# anchor points — confirm before re-enabling.

# Instead assume the circles appear in node order and number them 1..N.
# N is taken from the table itself rather than hard-coding 338, so the cell
# keeps working when the input SVG has a different number of circles.
circles['node'] = np.arange(1, len(circles) + 1)
circles['node'] = circles['node'].astype('string')

In [13]:
# Show the first five circles together with their newly assigned node ids.
circles.head(5)

Unnamed: 0,clip-path,fill,stroke,stroke-width,stroke-linecap,stroke-linejoin,stroke-miterlimit,d,id,x,y,width,height,node
0,url(#SVGID_1024_),none,#FA0087,0.71,round,round,10,"M430.53,566.23c0-3.221,2.64-5.86,5.8...",SVGID_1023_,430.175,560.015,12.44,12.44,1
1,url(#SVGID_1042_),none,#FA0087,0.71,round,round,10,"M410.46,481.9c0-3.091,2.521-5.61,5.6...",SVGID_1041_,410.105,475.935,11.92,11.921,2
2,url(#SVGID_1060_),none,#FA0087,0.71,round,round,10,"M481.57,489.23c0-2.591,2.13-4.721,4....",SVGID_1059_,481.215,484.155,10.16,10.149,3
3,url(#SVGID_1078_),none,#FA0087,0.71,round,round,10,"M442.89,520.96c0-4.38,3.58-7.96,7.96...",SVGID_1077_,442.535,512.645,16.63,16.631,4
4,url(#SVGID_1096_),none,#FA0087,0.71,round,round,10,"M501.49,557.24c0-3.891,3.189-7.08,7....",SVGID_1095_,501.135,549.805,14.87,14.87,5


In [14]:
# Show the last five circles together with their assigned node ids.
circles.tail(5)

Unnamed: 0,clip-path,fill,stroke,stroke-width,stroke-linecap,stroke-linejoin,stroke-miterlimit,d,id,x,y,width,height,node
333,url(#SVGID_7018_),none,#825F45,0.71,round,round,10,"M205.3,81.49c0-4.14,3.38-7.53,7.52-7...",SVGID_7017_,204.945,73.605,15.76,15.76,334
334,url(#SVGID_7036_),none,#825F45,0.71,round,round,10,"M203.15,100.67c0-2.19,1.79-3.98,3.98...",SVGID_7035_,202.795,96.335,8.67,8.66,335
335,url(#SVGID_7054_),none,#825F45,0.71,round,round,10,"M194.18,105.03c0-0.74,0.61-1.35,1.36...",SVGID_7053_,193.825,103.325,3.42,3.42,336
336,url(#SVGID_7072_),none,#825F45,0.71,round,round,10,"M180.07,99.28c0-1.14,0.93-2.07,2.07-...",SVGID_7071_,179.715,96.855,4.86,4.85,337
337,url(#SVGID_7090_),none,#825F45,0.71,round,round,10,"M172.79,92.52c0-1.23,1-2.24,2.23-2.2...",SVGID_7089_,172.435,89.925,5.17,5.18,338


In [15]:
# Match each of the four rect corners of every connection to its nearest circle,
# recording both the matched node id and the distance to it.
nlist = ['top_left', 'top_right','bottom_left','bottom_right']

for corner in nlist :
    corner_xy = connections[['%s_x' % corner, '%s_y' % corner]].astype(float).values
    n_corners = len(corner_xy)
    # Pairwise distances over corners stacked on top of circles; only the
    # corner-rows x circle-columns quadrant is of interest.
    stacked = np.concatenate([corner_xy, circle_coord])
    all_dist = squareform(pdist(stacked,'euclidean'))
    corner_to_circle = all_dist[0:n_corners, n_corners:]
    nearest = np.argmin(corner_to_circle,axis=1)
    connections['%s_node' % corner] = circles.loc[nearest,'node'].values
    connections['%s_dist' % corner] = np.min(corner_to_circle,axis=1)

# Total corner-to-circle distance along each diagonal of the rect: the smaller
# total decides whether the connection runs right-to-left or left-to-right.
connections['r2l_dist'] = connections['top_right_dist'].add(connections['bottom_left_dist'])
connections['l2r_dist'] = connections['top_left_dist'].add(connections['bottom_right_dist'])

is_r2l = connections['r2l_dist'] < connections['l2r_dist']

# Left-to-right is the default (top-left -> bottom-right); rows where the other
# diagonal fits better take their endpoints from top-right -> bottom-left.
connections['start_node'] = connections['top_left_node'].mask(is_r2l, connections['top_right_node'])
connections['end_node'] = connections['bottom_right_node'].mask(is_r2l, connections['bottom_left_node'])
connections['direction'] = 'left-to-right'
connections.loc[is_r2l,'direction'] = 'right-to-left'

In [16]:
# Show the first ten connections with their matched corner nodes, distances and direction.
connections.head(10)

Unnamed: 0,clip-path,fill,points,id,x,y,width,height,top_left_x,top_left_y,...,top_right_dist,bottom_left_node,bottom_left_dist,bottom_right_node,bottom_right_dist,r2l_dist,l2r_dist,start_node,end_node,direction
0,url(#SVGID_6_),#333333,"416.15,481.891 415.99,481.91 416.01,482.129 41...",SVGID_5_,415.99,481.89,20.81,84.54,415.99,481.89,...,27.351143,1,15.568123,1,9.221868,42.919266,17.594159,2,1,left-to-right
1,url(#SVGID_8_),#333333,"450.34,520.88 450.33,521 450.31,521.12 450.29,...",SVGID_7_,436.26,520.88,15.09,45.44,436.26,520.88,...,12.063144,1,8.762434,1,22.093747,20.825578,32.447048,4,1,right-to-left
2,url(#SVGID_10_),#333333,"564.721,326.67 564.6,326.97 564.49,327.28 564....",SVGID_9_,477.51,326.67,87.87,114.71,477.51,326.67,...,8.576039,10,9.023106,7,17.127488,17.599146,32.119628,22,10,right-to-left
3,url(#SVGID_12_),#333333,"309.7,298.291 309.58,298.339 309.471,298.39 30...",SVGID_11_,265.84,298.29,44.07,40.03,265.84,298.29,...,4.8884,100,5.132373,99,5.600808,10.020772,15.335809,88,100,right-to-left
4,url(#SVGID_14_),#333333,"266.12,337.89 265.99,337.9 265.86,337.9 265.74...",SVGID_13_,218.15,337.89,48.1,1.57,218.15,337.89,...,5.100201,101,7.517024,100,6.283522,12.617225,12.771438,100,101,right-to-left
5,url(#SVGID_16_),#333333,"228.89,316.74 228.86,316.78 228.831,316.809 22...",SVGID_15_,217.76,316.74,12.84,22.7,217.76,316.74,...,5.567643,101,7.234933,101,18.497109,12.802576,27.129968,103,101,right-to-left
6,url(#SVGID_18_),#333333,"197.34,316.47 196.64,317.811 196.7,317.84 196....",SVGID_17_,196.64,316.47,22.36,23.27,196.64,316.47,...,7.377076,196,16.902989,101,8.321661,24.280065,13.615361,109,101,left-to-right
7,url(#SVGID_20_),#333333,"229.899,317.51 229.589,317.8 229.67,317.88 229...",SVGID_19_,229.59,317.51,29.68,60.561,229.59,317.51,...,7.630233,214,13.192977,102,6.606436,20.82321,11.86024,103,102,left-to-right
8,url(#SVGID_22_),#333333,"266.19,365.73 266.169,365.75 266.15,365.76 266...",SVGID_21_,258.78,365.73,7.85,12.35,258.78,365.73,...,6.27454,102,6.2688,102,12.962941,12.54334,18.216021,107,102,right-to-left
9,url(#SVGID_24_),#333333,"197,316.899 196.99,317.381 197.03,317.381 197....",SVGID_23_,196.99,316.9,32.75,0.85,196.99,316.9,...,4.957746,109,6.456644,103,5.529941,11.41439,11.372907,109,103,left-to-right


# Load subclass information

In [17]:
# Load the annotation-term table and split out the rows belonging to the
# 'subclass' and 'class' term sets, printing the size of each subset.
clusters = pd.read_csv('cluster_annotation_term.csv')
term_set_name = clusters['cluster_annotation_term_set_name']

subclasses = clusters.loc[term_set_name == 'subclass'].copy()
print(len(subclasses))

classes = clusters.loc[term_set_name == 'class'].copy()
print(len(classes))

338
34


In [18]:
# Key the subclass table by node id: the id is the integer after the last '_'
# of each label, stored as a string without leading zeros
# (e.g. 'CS20230722_SUBC_001' -> '1').
node_ids = [str(int(label.rsplit('_', 1)[-1])) for label in subclasses['label']]
subclasses = subclasses.assign(node=node_ids).set_index('node')
subclasses.head(5)

Unnamed: 0_level_0,label,name,cluster_annotation_term_set_label,parent_term_label,parent_term_set_label,term_set_order,term_order,cluster_annotation_term_set_name,color_hex_triplet
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,CS20230722_SUBC_001,001 CLA-EPd-CTX Car3 Glut,CCN20230722_SUBC,CS20230722_CLAS_01,CCN20230722_CLAS,2,0,subclass,#64c2fc
2,CS20230722_SUBC_002,002 IT EP-CLA Glut,CCN20230722_SUBC,CS20230722_CLAS_01,CCN20230722_CLAS,2,1,subclass,#1F665D
3,CS20230722_SUBC_003,003 L5/6 IT TPE-ENT Glut,CCN20230722_SUBC,CS20230722_CLAS_01,CCN20230722_CLAS,2,2,subclass,#C400FF
4,CS20230722_SUBC_004,004 L6 IT CTX Glut,CCN20230722_SUBC,CS20230722_CLAS_01,CCN20230722_CLAS,2,3,subclass,#C0FF4D
5,CS20230722_SUBC_005,005 L5 IT CTX Glut,CCN20230722_SUBC,CS20230722_CLAS_01,CCN20230722_CLAS,2,4,subclass,#660F38


# Create reorganized and labeled SVG

In [19]:
# Start an empty XML document and give it a root <svg> element that carries a
# copy of the attributes of the source document's <svg> element.
newSoup = BeautifulSoup(features='xml')
newSvg = newSoup.new_tag('svg')
newSoup.append(newSvg)
newSvg.attrs = dict(soup.svg.attrs)

In [20]:
# Base URL (scheme, host and port) that is prefixed to every
# `display_entity` link written into the output SVG.
pocHost = 'http://35.92.115.7:8883'

In [21]:
# Build the <g id="connections"> group: one light-grey polygon per connection,
# each with a <title> naming its start and end subclasses.
newGroup = newSoup.new_tag('g', id='connections')
newSvg.append(newGroup)

connection_fields = zip(connections['points'], connections['start_node'], connections['end_node'])
for points, start_node, end_node in connection_fields :

    newPolygon = newSoup.new_tag('polygon')
    newPolygon.attrs['points'] = points
    newPolygon.attrs['fill'] = '#BBBBBB'

    # Title text is "<start name>:: <end name>", looked up by node id.
    start_name = subclasses.loc[start_node,'name']
    end_name = subclasses.loc[end_node,'name']
    newTitle = newSoup.new_tag('title')
    newTitle.append( NavigableString( start_name + ':: ' + end_name ) )
    newPolygon.append( newTitle )

    newGroup.append(newPolygon)


In [22]:
# Build the <g id="nodes"> group: one linked, filled circle path per node.
# The fill colour is the original outline (stroke) colour, and the <title>
# carries the subclass name looked up by node id.
newGroup = newSoup.new_tag('g', id='nodes')
newSvg.append(newGroup)

circle_fields = zip(circles['node'], circles['id'], circles['d'], circles['stroke'])
for node, path_id, path_d, stroke in circle_fields :

    # Link target: the entity page for this subclass (node id zero-padded to 3 digits).
    href = '%s/display_entity?entity_id=CS20230722_SUBC_%03d' % (pocHost,int(node))
    newLink = newSoup.new_tag('a', href=href)

    newPath = newSoup.new_tag('path', id=path_id, d=path_d, fill=stroke)

    newTitle = newSoup.new_tag('title')
    newTitle.append( NavigableString( subclasses.loc[node,'name'] ) )
    newPath.append(newTitle)

    newLink.append(newPath)
    newGroup.append(newLink)

In [23]:
# Build the <g id="labels"> group: one linked <text> label per annotation,
# placed with the original transform and given a <title> with the subclass name.
newGroup = newSoup.new_tag('g', id='labels')
newSvg.append(newGroup)

label_fields = zip(annotations['node'], annotations['transform'])
for node, transform in label_fields :

    # Link target: the entity page for this subclass (node id zero-padded to 3 digits).
    href = '%s/display_entity?entity_id=CS20230722_SUBC_%03d' % (pocHost,int(node))
    newLink = newSoup.new_tag('a', href=href)

    # The visible label is the node id itself.
    newText = newSoup.new_tag('text')
    newText.append( NavigableString( node ) )

    # Same attribute order as before: transform, font-size, font-family, fill.
    newText.attrs.update({
        'transform': transform,
        'font-size': 7,
        'font-family': "Arial",
        'fill': '#444444',
    })

    newTitle = newSoup.new_tag('title')
    newTitle.append( NavigableString( subclasses.loc[node,'name'] ) )
    newText.append(newTitle)

    newLink.append(newText)
    newGroup.append(newLink)


In [24]:
# Serialize the rebuilt document and write it out.
# The encoding is fixed to UTF-8 rather than left to the platform default, so
# that non-ASCII subclass names are written consistently on every system.
# NOTE(review): Beautiful Soup's XML output is expected to declare utf-8 in its
# XML prolog, which this matches — confirm.
with open("WMB_reprocessed.svg", "w", encoding="utf-8") as f:
    f.write(str(newSoup))