# Import Libraries

In [1]:
import os
import subprocess

import numpy as np
import pandas as pd

# Config

In [2]:
# Folder roots used throughout the notebook (relative to the working directory)
data_dir = 'data'      # input CSVs are read from <data_dir>/<year>/
image_dir = 'images'   # DOT sources and rendered graphs are written here

# Get Raw Data

In [3]:
# 2020 data: keep only the three columns that describe an edge
edges_path = f"{data_dir}/2020/edges.csv"
edge_columns = ['person', 'referent', 'relation']
edges = pd.read_csv(edges_path)[edge_columns]

In [4]:
# Inspect the edge list as loaded: one row per (person, referent, relation) mention
edges

Unnamed: 0,person,referent,relation
0,Colley Cibber,Alexander Pope,Attacked
1,Colley Cibber,Alexander Pope,Attacked
2,Colley Cibber,John Dennis,Akin to
3,Colley Cibber,Lewis Theobald,Akin to
4,Colley Cibber,Stephen Duck,Akin to
...,...,...,...
341,John Henley,John Toland,Akin to
342,John Henley,Matthew Tindal,Akin to
343,John Henley,Thomas Woolston,Akin to
344,John Henley,Thomas Sherlock,Akin to


In [5]:
# New data -- Seems to be missing Eliza Haywood (Heywood?)
# excel_file = "data/word_docs/ALL.xls"
# sheets = []
# keep_cols = ["character", "person_referred_to", "relation"]
# for i in range(5):
#     sheet = pd.read_excel(excel_file, sheet_name=i) 
#     sheet.columns = [col.strip().lower().replace(' ', '_') for col in sheet.columns]
#     sheets.append(sheet[keep_cols])
# edges = pd.concat(sheets)

## Clean up columns

In [6]:
# For new data only
# edges = edges.rename(columns={'character':'person', 'person_referred_to':'referent'})
# edges = edges.dropna(subset=['person','referent']).copy()

In [7]:
# Re-inspect edges; the column clean-up cell above is fully commented out, so nothing has changed since loading
edges

Unnamed: 0,person,referent,relation
0,Colley Cibber,Alexander Pope,Attacked
1,Colley Cibber,Alexander Pope,Attacked
2,Colley Cibber,John Dennis,Akin to
3,Colley Cibber,Lewis Theobald,Akin to
4,Colley Cibber,Stephen Duck,Akin to
...,...,...,...
341,John Henley,John Toland,Akin to
342,John Henley,Matthew Tindal,Akin to
343,John Henley,Thomas Woolston,Akin to
344,John Henley,Thomas Sherlock,Akin to


## Clean up values

In [8]:
# For new data only
# for col in edges.columns:
#     edges[col] = edges[col].str.strip()

In [9]:
# Derive a machine-friendly relation key: lower-case, spaces replaced by underscores
relation_lower = edges['relation'].str.lower()
edges['relation_id'] = relation_lower.str.replace(' ', '_')

In [10]:
# Check the derived relation_id column next to the original relation text
edges

Unnamed: 0,person,referent,relation,relation_id
0,Colley Cibber,Alexander Pope,Attacked,attacked
1,Colley Cibber,Alexander Pope,Attacked,attacked
2,Colley Cibber,John Dennis,Akin to,akin_to
3,Colley Cibber,Lewis Theobald,Akin to,akin_to
4,Colley Cibber,Stephen Duck,Akin to,akin_to
...,...,...,...,...
341,John Henley,John Toland,Akin to,akin_to
342,John Henley,Matthew Tindal,Akin to,akin_to
343,John Henley,Thomas Woolston,Akin to,akin_to
344,John Henley,Thomas Sherlock,Akin to,akin_to


## Normalize names

In [11]:
# For new data only
# edges.loc[edges.person.str.contains(','), 'person'] = edges\
#     .loc[edges.person.str.contains(',')]\
#     .person.apply(lambda x: f"{x.split(',')[1].strip()} {x.split(',')[0]}")
# edges.loc[edges.person == 'Edmund Curl', 'person'] = 'Edmund Curll' 
# edges.loc[edges.person == 'Curll Edmund', 'person'] = 'Edmund Curll' 

In [12]:
# Number of edge rows recorded for each source person
edges['person'].value_counts()

person
Edmund Curll     124
John Dennis      107
Colley Cibber     74
Giles Jacob       20
John Henley       12
Eliza Haywood      9
Name: count, dtype: int64

In [13]:
# Re-inspect edges; the person-name normalization cell is commented out, so no values have changed
edges

Unnamed: 0,person,referent,relation,relation_id
0,Colley Cibber,Alexander Pope,Attacked,attacked
1,Colley Cibber,Alexander Pope,Attacked,attacked
2,Colley Cibber,John Dennis,Akin to,akin_to
3,Colley Cibber,Lewis Theobald,Akin to,akin_to
4,Colley Cibber,Stephen Duck,Akin to,akin_to
...,...,...,...,...
341,John Henley,John Toland,Akin to,akin_to
342,John Henley,Matthew Tindal,Akin to,akin_to
343,John Henley,Thomas Woolston,Akin to,akin_to
344,John Henley,Thomas Sherlock,Akin to,akin_to


In [14]:
# edges.loc[edges.referent.str.contains(','), 'referent'] = edges.loc[edges.referent.str.contains(',')].referent.apply(lambda x: f"{x.split(',')[1].strip()} {x.split(',')[0]}")
# edges.loc[edges.referent == 'Edmund Curl', 'referent'] = 'Edmund Curll' 

# Print every distinct referent name in alphabetical order to eyeball spelling variants.
# NOTE(review): the listing shows near-duplicates such as 'George Ducket'/'George Duckett'
# and 'Isaac Barrow'/'Issac Barrow' -- presumably the same people; confirm and normalize,
# otherwise each spelling becomes its own node.
referent_names = edges['referent'].value_counts().sort_index().index
print(*referent_names, sep='\n')

A Boyer
A Moore
Abel Evans
Abel Roper
Alexander Pope
Ambrose Phillips
Anne Tanneguy-Le Fevre Dacier
B.B.
Barton Booth
Bavius
Benjamin Hoadly
Bernard Lintot
Besaleel Morris
Caius Gabriel Cibber
Camillo Querno
Cardell Goodman
Charles Fleetwood
Charles Gildon
Cloacina
Colley Cibber
Daniel Defoe
Dulness
Edmund Curll
Edmund Gibson
Edward Ward
Edward Young
Elisha Kirkall
Eliza Haywood
Elizabeth Thomas
Elkanah Settle
Eustace Budgell
Frances Hare
Francis Atterbury
George Ducket
George Duckett
George Etherege
George I
George II
George Ridpath
Giles Jacob
Henry Cromwell
Henry Curll
Henry Janssen
Hilkiah Bedford
Isaac Barrow
Isaac Newton
Issac Barrow
James Moore Smythe
James Ralph
James-Moore Smith
Jeremy Collier
John Anstis
John Arbuthnot
John Atterbury
John Banks
John Dennis
John Dryden
John Dunton
John Durant Breval
John Fletcher
John Garth
John Gay
John Henley
John Hervey
John Locke
John Oldmixon
John Ozell
John Toland
John Tutchin
Jonathan Swift
Joseph Addison
Joseph Gay
Joseph Gay (aka John

# Extract Relations

In [15]:
# Tally edge rows per relation type (value_counts sorts most frequent first)
R = edges.value_counts(subset='relation_id').to_frame(name='n')

In [16]:
# Edge color per relation type, keyed by relation_id.
# Colors are looked up by name rather than assigned by row position, so a change
# in relation frequencies (which reorders R) can no longer swap the colors.
relation_colors = {
    'akin_to': 'green',
    'attacked': 'red',
    'dissimilar': 'orange',
    'defended': 'blue',
}

# Fail loudly if the data contains a relation that has no color yet
unmapped_relations = [rel for rel in R.index if rel not in relation_colors]
if unmapped_relations:
    raise ValueError(f"No color configured for relation(s): {unmapped_relations}")

R['color'] = [relation_colors[rel] for rel in R.index]
# Human-readable label: the relation_id with underscores turned back into spaces
R['label'] = R.index.str.replace('_', ' ')
R

Unnamed: 0_level_0,n,color,label
relation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
akin_to,189,green,akin to
attacked,113,red,attacked
dissimilar,25,orange,dissimilar
defended,19,blue,defended


# Extract Nodes

In [17]:
# Count how often each name occurs, as either the source person or the referent
name_counts = pd.concat([edges.person, edges.referent]).value_counts()
N = name_counts.to_frame('n').reset_index().rename(columns={'index': 'full_name'})

# Node id: spaces -> underscores, upper-cased, then every remaining non-word character removed
node_ids = (
    N['full_name']
    .str.replace(' ', '_')
    .str.upper()
    .str.replace(r"\W", "", regex=True)
)
N = N.set_index(node_ids.rename('index'))

In [18]:
# Create label by splitting the name at its middle space with an HTML line break
name_parts = N['full_name'].str.split(' ')

# Number of words placed before the break. np.round rounds halves to the nearest
# even integer, so a one-word name gets 0 and a three-word name gets 2.
split_at = np.round(name_parts.apply(len) / 2).astype('int')

raw_labels = pd.Series(
    [' '.join(words[:k] + ["<br/>"] + words[k:]) for words, k in zip(name_parts, split_at)],
    index=N.index,
    dtype='object',
)
labels = (
    raw_labels
    # A break in front of the first word (one-word names) is dropped entirely
    .str.replace(r"^\s*<br/>\s*", "", regex=True)
    # Remove the spaces that ' '.join put around the remaining break
    .str.replace(r"\s*<br/>\s*", "<br/>", regex=True)
)

# Assign positionally (labels were built row by row from N). Unlike the former
# N.join(...) wrapped in try/except ValueError, re-running this cell refreshes the
# column instead of silently keeping a stale one.
N['label'] = labels.to_numpy()

In [19]:
# Review the generated labels in alphabetical order
N['label'].sort_values()

index
A_BOYER                        A<br/>Boyer
A_MOORE                        A<br/>Moore
ABEL_EVANS                  Abel<br/>Evans
ABEL_ROPER                  Abel<br/>Roper
ALEXANDER_POPE          Alexander<br/>Pope
                             ...          
WILLIAM_LAW                William<br/>Law
WILLIAM_MEARS            William<br/>Mears
WILLIAM_TEMPLE          William<br/>Temple
WILLIAM_WILKINS        William<br/>Wilkins
WILLIAM_WYCHERLEY    William<br/>Wycherley
Name: label, Length: 112, dtype: object

In [20]:
# Alternative labelling (disabled): break at every space instead of only the middle one
# N['label'] = N['full_name'].str.replace(' ', '<br/>')

# Graphviz node statement: the node id followed by its label wrapped in < >
N['dot'] = [
    f"{node_id} [label=< {node_label} >]"
    for node_id, node_label in zip(N.index, N['label'])
]

In [21]:
# Preview the node table, including the label and the generated DOT statement
N.head()

Unnamed: 0_level_0,full_name,n,label,dot
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EDMUND_CURLL,Edmund Curll,131,Edmund<br/>Curll,EDMUND_CURLL [label=< Edmund<br/>Curll >]
JOHN_DENNIS,John Dennis,122,John<br/>Dennis,JOHN_DENNIS [label=< John<br/>Dennis >]
ALEXANDER_POPE,Alexander Pope,83,Alexander<br/>Pope,ALEXANDER_POPE [label=< Alexander<br/>Pope >]
COLLEY_CIBBER,Colley Cibber,78,Colley<br/>Cibber,COLLEY_CIBBER [label=< Colley<br/>Cibber >]
GILES_JACOB,Giles Jacob,24,Giles<br/>Jacob,GILES_JACOB [label=< Giles<br/>Jacob >]


# Extract Edges

In [22]:
# Translate display names into node ids using one shared full_name -> id lookup
name_to_id = N.reset_index().set_index('full_name')['index']
edges['person_id'] = edges['person'].map(name_to_id)
edges['referent_id'] = edges['referent'].map(name_to_id)
# edges['relation_id'] = edges.relation.str.lower().str.replace(' ', '_', regex=True)

In [23]:
# Collapse repeated (person, referent, relation) rows into one edge; n is the number of rows merged
edge_keys = ['person_id', 'referent_id', 'relation_id']
E = edges.groupby(edge_keys).size().to_frame('n')
E.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
person_id,referent_id,relation_id,Unnamed: 3_level_1
COLLEY_CIBBER,ALEXANDER_POPE,akin_to,1
COLLEY_CIBBER,ALEXANDER_POPE,attacked,9
COLLEY_CIBBER,BARTON_BOOTH,akin_to,1
COLLEY_CIBBER,CAIUS_GABRIEL_CIBBER,akin_to,2
COLLEY_CIBBER,CAMILLO_QUERNO,akin_to,2


In [24]:
# Graphviz edge statement: colored by relation type, weighted by the squared row count
E['dot'] = [
    f"{person_id} -> {referent_id} [color={R.loc[relation_id, 'color']} weight={count**2}]"
    for (person_id, referent_id, relation_id), count in zip(E.index, E['n'])
]

In [25]:
# Inspect the aggregated edge table together with its generated DOT statements
E

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,dot
person_id,referent_id,relation_id,Unnamed: 3_level_1,Unnamed: 4_level_1
COLLEY_CIBBER,ALEXANDER_POPE,akin_to,1,COLLEY_CIBBER -> ALEXANDER_POPE [color=green w...
COLLEY_CIBBER,ALEXANDER_POPE,attacked,9,COLLEY_CIBBER -> ALEXANDER_POPE [color=red wei...
COLLEY_CIBBER,BARTON_BOOTH,akin_to,1,COLLEY_CIBBER -> BARTON_BOOTH [color=green wei...
COLLEY_CIBBER,CAIUS_GABRIEL_CIBBER,akin_to,2,COLLEY_CIBBER -> CAIUS_GABRIEL_CIBBER [color=g...
COLLEY_CIBBER,CAMILLO_QUERNO,akin_to,2,COLLEY_CIBBER -> CAMILLO_QUERNO [color=green w...
...,...,...,...,...
JOHN_HENLEY,MATTHEW_TINDAL,akin_to,1,JOHN_HENLEY -> MATTHEW_TINDAL [color=green wei...
JOHN_HENLEY,RICHARD_BLACKMORE,akin_to,1,JOHN_HENLEY -> RICHARD_BLACKMORE [color=green ...
JOHN_HENLEY,RICHARD_FLECKNOE,akin_to,1,JOHN_HENLEY -> RICHARD_FLECKNOE [color=green w...
JOHN_HENLEY,THOMAS_SHERLOCK,akin_to,1,JOHN_HENLEY -> THOMAS_SHERLOCK [color=green we...


# Draw Graphs

In [26]:
def create_graph(persons=None, engines=('sfdp', 'circo'), verbose=True, width=2000):
    """Write a Graphviz DOT file for a sub-network and render it to SVG and PNG.

    Reads the notebook-level tables ``E`` (edges, indexed by person_id,
    referent_id, relation_id) and ``N`` (nodes, indexed by node id); both must
    already carry their ``dot`` column. All files are written under ``image_dir``.

    Parameters
    ----------
    persons : list of str, optional
        Node ids of the source persons whose outgoing edges are drawn.
        ``None`` or an empty list draws the whole network (saved as ``all.dot``).
    engines : iterable of str
        Graphviz layout programs to run on the DOT file.
    verbose : bool
        If true, print the path of the DOT file.
    width : int
        Pixel width passed to ``rsvg-convert -w`` for the PNG.

    Returns
    -------
    None
        Output is the files ``<name>.dot``, ``<name>.dot_<engine>.svg`` and
        ``<name>.dot_<engine>_w<width>.png``.
    """
    persons = list(persons) if persons is not None else []

    # Create subsets of N and E
    E1 = E.loc[persons] if persons else E
    # Declare every endpoint of the selected edges. Taking the sources from E1
    # too means the whole-network case also gets labelled nodes for persons
    # who never occur as a referent.
    node_ids = set(persons)
    node_ids.update(E1.index.get_level_values('person_id'))
    node_ids.update(E1.index.get_level_values('referent_id'))
    N1 = N.loc[sorted(node_ids)]

    # Convert to Graphviz
    graph_lines = [
        "digraph G {",
        "rankdir=LR",
        "node [shape=rect overlap=false fontname=arial]",
        *N1['dot'].values,
        *E1['dot'].values,
        "}",
    ]
    graph_text = '\n'.join(graph_lines)

    # Print source to file (image_dir is prepended exactly once)
    os.makedirs(image_dir, exist_ok=True)
    base_name = '_'.join(persons) if persons else "all"
    file_name = f"{image_dir}/{base_name}.dot"
    if verbose:
        print(file_name)
    with open(file_name, "w", encoding="utf-8") as outfile:
        outfile.write(graph_text)

    # Generate images
    for engine in engines:
        svg_name = f"{file_name}_{engine}.svg"
        png_name = f"{file_name}_{engine}_w{width}.png"
        commands = [
            # Convert to SVG
            [engine, "-Tsvg", file_name, "-o", svg_name],
            # Convert to PNG
            ["rsvg-convert", "-w", str(width), svg_name, "-o", png_name],
        ]
        for command in commands:
            # Argument lists avoid shell quoting problems with unusual file names
            try:
                subprocess.run(command, check=True)
            except (OSError, subprocess.CalledProcessError) as err:
                # Rendering stays best-effort: report and move on to the next engine
                print(f"WARNING: {command[0]} failed for {file_name}: {err}")
                break

In [27]:
# Source persons are the values of the first level of E's index
persons = list(E.index.levels[0])

In [28]:
# First one combined graph for all source persons, then one graph per person
graph_subsets = [persons] + [[name] for name in persons]
for subset in graph_subsets:
    create_graph(subset)

images/COLLEY_CIBBER_EDMUND_CURLL_ELIZA_HAYWOOD_GILES_JACOB_JOHN_DENNIS_JOHN_HENLEY.dot
images/COLLEY_CIBBER.dot
images/EDMUND_CURLL.dot
images/ELIZA_HAYWOOD.dot
images/GILES_JACOB.dot
images/JOHN_DENNIS.dot
images/JOHN_HENLEY.dot
