<a href="https://colab.research.google.com/github/maltevogl/IMPRS_DH_WS_2023/blob/main/From_ontology_to_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

This notebook walks through a complete cycle: loading an ontology, filling it with data, querying it, and thereby creating networks. The resulting networks are then analyzed in Cytoscape.

## Setup

In [1]:
!pip install owlready2
!pip install igraph

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting owlready2
  Downloading Owlready2-0.40.tar.gz (27.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: owlready2
  Building wheel for owlready2 (setup.py) ... [?25l[?25hdone
  Created wheel for owlready2: filename=Owlready2-0.40-cp38-cp38-linux_x86_64.whl size=24403298 sha256=d03cc96541573db01bad8cb594ebbc4a18e6974e4b79ae8b2523af45941a8034
  Stored in directory: /root/.cache/pip/wheels/f3/08/26/6ced2768b44339c0107185034531eefba1ff1abe2b22e1b08c
Successfully built owlready2
Installing collected packages: owlready2
Successfully installed owlready2-0.40
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting igraph
  Downloading igraph-0.10.4-cp38-cp38-man

In [2]:
from owlready2 import *
import pandas as pd
import igraph as ig
import uuid
from uuid import uuid4

In [3]:
# Mount Google Drive into the Colab runtime; the data files are read from
# (and the results written to) a folder below /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


At this point you need to adapt the path below so that it points to the folder on your Google Drive that contains the unzipped data.

In [4]:
# Data folder on the mounted Drive. Keep the trailing "/": file names are
# appended to this value by plain string concatenation.
basepath = "/content/drive/MyDrive/Colab Notebooks/IMPRS_DH_WS/data/"

## Load the ontology

In [5]:
# Add the current directory to owlready2's ontology search path.
onto_path.append(".")

# Load both OWL files from the data folder, in this order:
# first the CRM ontology, then the conference ontology.
crm, conferences = (
    get_ontology(basepath + owl_file).load()
    for owl_file in ("ecrm_160714.owl", "conference_ontology.owl")
)

What classes do we have?

In [7]:
# Print every class of the conference ontology, one per line.
for onto_class in conferences.classes():
    print(onto_class)

conference_ontology.Source
conference_ontology.Activity
conference_ontology.Person
conference_ontology.ConferenceEvent
conference_ontology.Conference
conference_ontology.Datum
conference_ontology.Title
conference_ontology.Language
conference_ontology.LastName
conference_ontology.Place
conference_ontology.Participation
conference_ontology.Role
conference_ontology.ConferenceClasses
conference_ontology.ConferenceSeries
conference_ontology.FirstName
conference_ontology.Name
conference_ontology.Patron
conference_ontology.Primary
conference_ontology.Secondary
conference_ontology.besucherin
conference_ontology.forscherin
conference_ontology.wissenschatfler


In [8]:
# Print every property of the conference ontology, one per line.
for onto_property in conferences.properties():
    print(onto_property)

conference_ontology.conference_properties
conference_ontology.documents
conference_ontology.has_actor
conference_ontology.has_conference
conference_ontology.has_datum
conference_ontology.has_german_location
conference_ontology.has_location
conference_ontology.has_language
conference_ontology.has_lastName
conference_ontology.has_participant
conference_ontology.has_role
conference_ontology.has_title
conference_ontology.is_part_of
conference_ontology.is_realisation_of


## Load the data

In [9]:
# Participation table: tab-separated and without a header row, so the columns
# are addressed by their position (0, 1, 2, ...) in the rest of the notebook.
data = pd.read_csv(basepath + "conf_pers.tsv",sep="\t", header=None)

### First identify the persons

In [10]:
# Preview of the table. Columns used further down: 3 = person identifier,
# 4 = year, 5 = conference identifier, 6 = last name.
data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1964,GRG Bulletin 19,,A018,1964.0,C0008,ALPHER,"2nd Texas, Austin"
1,1967,GRG Bulletin 19,,A018,1967.0,C0012,ALPHER,"3d Texas, New York"
2,1968,GRG Bulletin 19,,A018,1968.0,C0015,ALPHER,"4th Texas, Dallas"
3,1957,GRG Bulletin 19,,001,1957.0,C0002,ANDERSON,Chapel Hill
4,1959,GRG Bulletin 19,,001,1959.0,C0003,ANDERSON,Royaumont
5,1962,GRG Bulletin 19,,001,1962.0,C0004,ANDERSON,Warsaw
6,1963,GRG Bulletin 19,,001,1963.0,C0005,ANDERSON,"First Texas, Dallas"
7,1964,GRG Bulletin 19,,001,1964.0,C0008,ANDERSON,"2nd Texas, Austin"
8,1965,GRG Bulletin 19,,001,1965.0,C0010,ANDERSON,London
9,1967,GRG Bulletin 19,,001,1967.0,C0012,ANDERSON,"3d Texas, New York"


In [11]:
# Number of rows in the participation table (identifiers repeat across rows).
len(data[3])

1813

In [12]:
# Distinct person identifiers found in column 3.
persons = set(data[3].tolist())

In [13]:
# Number of distinct person identifiers.
len(persons)

909

### Create instances of conference_ontology.Person

In [14]:
# Bookkeeping: person identifier -> Person individual. Calling the class with
# the identifier creates the individual under that name in the ontology.
ident2instances = {
    person_id: conferences.Person(person_id)
    for person_id in persons
}

In [15]:
# Spot check: the full IRI of one of the newly created individuals.
ident2instances["A097"].iri

'http://www.semanticweb.org/dwinter/ontologies/2021/4/untitled-ontology-83#A097'

See what happened: search the ontology for everything that is a `Person`.

In [16]:
# Collect all search hits for "is a Person". As the output shows, the
# Person class itself is part of the result, not only the individuals.
resultsSearch = list(conferences.search(is_a = conferences.Person))
resultsSearch[:10]

[conference_ontology.Person,
 conference_ontology.OUT055,
 conference_ontology.A217,
 conference_ontology.A197,
 conference_ontology.OUT427,
 conference_ontology.OUT195,
 conference_ontology.A011,
 conference_ontology.A059,
 conference_ontology.072,
 conference_ontology.210]

### Create the names and attach them to the person

In [33]:
# Person identifier (column 3) -> last name (column 6). If an identifier
# occurs in several rows, the last row wins.
id2names = dict(zip(data[3], data[6]))

In [34]:
# Bookkeeping: last name -> LastName individual, so that a last name shared by
# several persons is represented by a single individual.
names2instances = {}
for pid, lastname in id2names.items():
    person_instance = ident2instances[pid]

    name_instance = names2instances.get(lastname)
    if name_instance is None:
        # Name the individual with a string (URN form of a random UUID), the
        # same scheme the Title, Participation and Datum individuals use,
        # instead of handing owlready2 a raw UUID object.
        name_instance = conferences.LastName(uuid4().urn)
        name_instance.label = lastname
        names2instances[lastname] = name_instance
    person_instance.has_lastName.append(name_instance)

Save the ontology together with the data added so far as an OWL file.

In [35]:
# Write the ontology, including the individuals created so far, to the data folder.
conferences.save(basepath + "persons_names.owl")

## Add the conferences

In [36]:
# Conference table: tab-separated, no header row; columns are addressed by position.
data_konf = pd.read_csv(basepath + "Konferenzen.tsv",sep="\t", header=None)

In [37]:
# Preview of the table. Columns used below: 1 = conference identifier, 9 = title.
data_konf.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,GR0,C0001,,,GRG Bulletin 19,,,Bern,,Jubilee Conference Bern,1955.0
1,GR1,C0002,,,GRG Bulletin 19,,,Chapel Hill,,Chapel Hill,1957.0
2,GR2,C0003,,,GRG Bulletin 19,,,Royaumont,,Royaumont,1959.0
3,GR3,C0004,,,GRG Bulletin 19,,,Warsaw,,Warsaw,1962.0
4,Te1,C0005,,,GRG Bulletin 19,,,Dallas,,"First Texas, Dallas",1963.0
5,Flo1,C0006,,,GRG Bulletin 19,,Meeting on GR: Problems of Energy and Grvitati...,Florence,,Florence (Galilei Quadricentenary),1964.0
6,Ky,C0007,,,GRG Bulletin 19,,,Kyoto,,Kyoto,1964.0
7,Te2,C0008,,,GRG Bulletin 19,,,Austin,,"2nd Texas, Austin",1964.0
8,BL,C0009,,,GRG Bulletin 19,,,Berlin,,Berlin (Jubilee GRG),1965.0
9,GR4,C0010,,,GRG Bulletin 19,,,London,,London,1965.0


In [38]:
# Bookkeeping: conference identifier -> ConferenceEvent individual (filled below).
cid2instances = {}

In [39]:
# Distinct conference identifiers found in column 1.
confs = set(data_konf[1].tolist())

In [40]:
# Create one ConferenceEvent individual per identifier and register it in the
# bookkeeping dictionary.
cid2instances.update(
    (conf_id, conferences.ConferenceEvent(conf_id)) for conf_id in confs
)

In [41]:
# Show the identifier -> individual mapping that was just filled.
cid2instances

{'C0030': conference_ontology.C0030,
 'C0028': conference_ontology.C0028,
 'C0008': conference_ontology.C0008,
 'C0024': conference_ontology.C0024,
 'C0013': conference_ontology.C0013,
 'C0005': conference_ontology.C0005,
 'C0018': conference_ontology.C0018,
 'C0009': conference_ontology.C0009,
 'C0016': conference_ontology.C0016,
 'C0015': conference_ontology.C0015,
 'C0020': conference_ontology.C0020,
 'C0011': conference_ontology.C0011,
 'C0004': conference_ontology.C0004,
 'C0006': conference_ontology.C0006,
 'C0025': conference_ontology.C0025,
 'C0017': conference_ontology.C0017,
 'C0022': conference_ontology.C0022,
 'C0027': conference_ontology.C0027,
 'C0026': conference_ontology.C0026,
 'C0014': conference_ontology.C0014,
 'C0031': conference_ontology.C0031,
 'C0012': conference_ontology.C0012,
 'C0019': conference_ontology.C0019,
 'C0032': conference_ontology.C0032,
 'C0021': conference_ontology.C0021,
 'C0033': conference_ontology.C0033,
 'C0029': conference_ontology.C0029,
 

In [42]:
# Conference identifier (column 1) -> conference title (column 9). If an
# identifier occurs in several rows, the last row wins.
id2titles = dict(zip(data_konf[1], data_konf[9]))

In [43]:
# Bookkeeping: title text -> Title individual, so that identical titles share
# one individual. `title_instance` stays bound after the loop and is
# inspected in the next cell.
titles2instances = {}
for cid, title in id2titles.items():
    conf_instance = cid2instances[cid]
    if title in titles2instances:
        title_instance = titles2instances[title]
    else:
        # First occurrence of this title: create the individual, named by a random UUID URN.
        title_instance = conferences.Title(uuid4().urn)
        title_instance.label = title
        titles2instances[title] = title_instance
    conf_instance.has_title.append(title_instance)

In [44]:
# `title_instance` is left over from the last iteration of the loop above;
# displaying it shows how a UUID-named individual looks.
title_instance

conference_ontology.urn:uuid:6f780ae7-9d1c-4634-a843-acfa9d9fe1e0

## Persons -- conferences

In [45]:
# Reminder of the participation table: each row links a person (column 3)
# to a conference (column 5) in a given year (column 4).
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1964,GRG Bulletin 19,,A018,1964.0,C0008,ALPHER,"2nd Texas, Austin"
1,1967,GRG Bulletin 19,,A018,1967.0,C0012,ALPHER,"3d Texas, New York"
2,1968,GRG Bulletin 19,,A018,1968.0,C0015,ALPHER,"4th Texas, Dallas"
3,1957,GRG Bulletin 19,,001,1957.0,C0002,ANDERSON,Chapel Hill
4,1959,GRG Bulletin 19,,001,1959.0,C0003,ANDERSON,Royaumont


In [46]:
# One Participation individual per row of the participation table. It links
# the person (column 3) to the conference event (column 5) and carries the
# year (column 4) as the label of a Datum individual.
for _row_idx, row in data.iterrows():
    conf_instance = cid2instances.get(row[5])  # there are a few entries where this is not filled yet
    if conf_instance is None:
        continue
    pers_instance = ident2instances[row[3]]

    # Create the individual only after the guard above, so that skipped rows do
    # not leave an empty Participation behind in the (saved) ontology.
    part_event = conferences.Participation(uuid4().urn)
    part_event.is_part_of.append(conf_instance)
    part_event.has_participant.append(pers_instance)

    date = conferences.Datum(uuid4().urn)
    date.label = row[4]
    part_event.has_datum.append(date)

In [47]:
# Write the ontology, now including conferences and participations, to the data folder.
conferences.save(basepath + "persons_names_conferences.owl")

# Creating a first network

## Bipartite

In [48]:
# Query for the bipartite network: for every participation, return the title of
# the conference it is part of and the last name of its participant.
# Each distinct (title, last name) combination is returned once.
sparql = """
PREFIX conf: <http://www.semanticweb.org/dwinter/ontologies/2021/4/untitled-ontology-83#>

select distinct ?title ?ln where
{
?part conf:is_part_of ?conf;
       conf:has_participant ?person.

?person conf:has_lastName/rdfs:label ?ln.

?conf conf:has_title/rdfs:label ?title.
       }
"""

In [49]:
# Run the query stored in `sparql` against owlready2's default world.
res = default_world.sparql(sparql)

In [50]:
# Materialise the result; each element is one (title, last name) row.
edges = list(res)

In [51]:
# Start from a new, empty igraph graph.
graph = ig.Graph()

In [52]:
# The two node sets of the bipartite graph. Note that `confs` is re-bound here:
# it now holds conference titles, not conference identifiers.
confs = {row[0] for row in edges}
pers = {row[1] for row in edges}

# One type label per node: all conferences first, then all persons.
types = ["conference"] * len(confs) + ["person"] * len(pers)

In [53]:
# Vertex names: conference titles first, then last names, in the same order
# that `types` was built in.
edges_names = [*confs, *pers]

In [54]:
# Add one vertex per name (conference titles and last names).
graph.add_vertices(edges_names)

In [55]:
# Add one edge per query row; the endpoints are given by vertex name
# (conference title, last name).
graph.add_edges(edges)

In [56]:
# Vertex attribute "typ": "conference" or "person". This relies on `types`
# having the same order as the vertex names added above.
graph.vs["typ"] = types

In [57]:
# Export the bipartite graph as GraphML into the data folder.
graph.write_graphml(basepath + "bipartite.graphml")

## Persons

In [58]:
# Query for the person-person network: pairs the last names of the participants
# of two participations that belong to the same conference and carry the same
# date label. Each row is (title, ln1, ln2, date1).
# NOTE(review): nothing in the query orders ?ln1/?ln2 or excludes
# ?part = ?part2, so self-pairs are returned (see ('ALPHER', 'ALPHER') in the
# printed edge list) and every pair appears in both orientations.
sparql = """
PREFIX conf: <http://www.semanticweb.org/dwinter/ontologies/2021/4/untitled-ontology-83#>
select distinct ?title ?ln1 ?ln2 ?date1 where
{
?part conf:is_part_of ?conf;
       conf:has_participant ?person.
?person conf:has_lastName/rdfs:label ?ln1.

?part conf:has_datum/rdfs:label ?date1.  
  
?conf conf:has_title/rdfs:label ?title.
?part2 conf:is_part_of ?conf;
       conf:has_participant ?person2.
  
?part2 conf:has_datum/rdfs:label ?date2.  
?person2 conf:has_lastName/rdfs:label ?ln2.

      FILTER(?date1 = ?date2)

        } 
"""

In [59]:
# Run the co-attendance query stored in `sparql` against owlready2's default world.
res = default_world.sparql(sparql)

In [None]:
# Materialise the result; each element is one (title, ln1, ln2, date1) row.
# This re-binds `edges`, which held the bipartite rows before.
edges = list(res)

In [None]:
# Look at the first 100 result rows.
edges[:100]

In [None]:
# All last names that occur on either side of a co-attendance row.
pers = {row[1] for row in edges} | {row[2] for row in edges}

In [None]:
# Rebuilds `pers` from `edges` once more (same result as the cell before):
# every last name found in position 1 or 2 of a row.
pers = {name for row in edges for name in (row[1], row[2])}

In [None]:
# Turn the set into a list so the names can be handed to add_vertices below.
pers = list(pers)

In [None]:
# New, empty graph for the person-person network (replaces the bipartite graph object).
graph = ig.Graph()

In [None]:
# One vertex per last name.
graph.add_vertices(pers)

In [None]:
# Keep only the two last names (positions 1 and 2) of every row as an edge tuple.
edgs = [(row[1], row[2]) for row in edges]

In [None]:
# Look at the first ten edge tuples.
edgs[:10]

[('ALPHER', 'ALPHER'),
 ('ALPHER', 'ANDERSON'),
 ('ALPHER', 'BAHCALL'),
 ('ALPHER', 'BAZANSKI'),
 ('ALPHER', 'BELINFANTE'),
 ('ALPHER', 'BERGMANN P.'),
 ('ALPHER', 'BOYER'),
 ('ALPHER', 'BREHME'),
 ('ALPHER', 'BRILL'),
 ('ALPHER', 'BURBIDGE EM')]

In [None]:
# Date of every row (position 3), in the same order as `edgs`.
date_list = [row[3] for row in edges]

In [None]:
# Look at the first five dates.
date_list[:5]

[1964.0, 1964.0, 1964.0, 1964.0, 1964.0]

In [None]:
# Conference title of every row (position 0), in the same order as `edgs`.
conf_list = [row[0] for row in edges]

In [None]:
# One edge per co-attendance row; the endpoints are given as last names.
graph.add_edges(edgs)

In [None]:
# Edge attribute "date". `date_list` is built from `edges` in the same order as `edgs`.
graph.es["date"] = date_list

In [None]:
# Edge attribute "conf" (conference title). `conf_list` is built from `edges` in the same order as `edgs`.
graph.es["conf"] =  conf_list

In [None]:
# Remove self-loops (loops=True) but keep parallel edges (multiple=False).
graph = graph.simplify(loops=True, multiple = False)

In [None]:
# Number of edges left after simplification.
graph.ecount()

256466

In [None]:
# Export the person-person graph as GraphML into the data folder on the mounted
# Drive, like the bipartite export (a bare file name would be written to the
# current working directory instead).
graph.write_graphml(basepath + "pers_pers.graphml")