#### Connecting to AllegroGraph Server

On a localhost.

In [3]:
import os

AGRAPH_HOST = os.environ.get('AGRAPH_HOST', 'localhost')
AGRAPH_PORT = int(os.environ.get('AGRAPH_PORT', '10035'))
AGRAPH_USER = os.environ.get('AGRAPH_USER', 'daanaea')
AGRAPH_PASSWORD = os.environ.get('AGRAPH_PASSWORD', '0101')
print(AGRAPH_HOST, AGRAPH_PORT, AGRAPH_USER, AGRAPH_PASSWORD)
# pip install agraph-python pycurl

from franz.openrdf.connect import ag_connect
from franz.openrdf.sail.allegrographserver import AllegroGraphServer

print("Connecting to AllegroGraph server --",
      "host:'%s' port:%s" % (AGRAPH_HOST, AGRAPH_PORT))

server = AllegroGraphServer(AGRAPH_HOST, AGRAPH_PORT,
                            AGRAPH_USER, AGRAPH_PASSWORD)
print("Available catalogs:")
for cat_name in server.listCatalogs():
    if cat_name is None:
        print('  - <root catalog>')
    else:
        print('  - ' + str(cat_name))
        
print(server.listCatalogs())
catalog = server.openCatalog('')

print("Available repositories in catalog '%s':" % catalog.getName()) # default root catalog

for repo_name in catalog.listRepositories():
    print('  - ' + repo_name)

localhost 10035 daanaea 0101
Connecting to AllegroGraph server -- host:'localhost' port:10035
Available catalogs:
  - <root catalog>
  - fedshard
  - system
[None, 'fedshard', 'system']
Available repositories in catalog 'None':
  - drmo


In [4]:
# Create a connection object and bind to conn. The conn object is used to connect with an AllegroGraph repository
conn = ag_connect(repo='drmo', host='localhost', port='10035', user='daanaea', password='0101')

#### Parser

In [5]:
from franz.openrdf.vocabulary import RDF
import uuid

In [6]:
creator_property = conn.createURI("http://purl.org/dc/terms/creator")
person_class = conn.createURI("http://www.w3.org/ns/prov#Person")
domain_ont_str = "http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#"
rdfs_label_prop = conn.createURI("http://www.w3.org/2000/01/rdf-schema#label")
has_author_prop = conn.createURI("http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#hasAuthor")
document_class = conn.createURI("http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#Document")
first_name_prop = conn.createURI("http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#firstName")
last_name_prop = conn.createURI("http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#lastName")

In [7]:
author_string_0 = "Zhang, Wengang; Xiang, Jiaying; Huang, Ruijie; Liu, Hanlong"
author_string_1 = "Sadeghyar, A; Watts, DC; Schedle, A; DeBellis, M"
author_string_2 = "Simila, HO; DeBellis, M"
author_string_3 = "Makrgeorgou, A"

In [12]:
conn.getStatements(None, RDF.TYPE, document_class)

<franz.openrdf.query.repositoryresult.RepositoryResult at 0x10512ded0>

In [8]:
def add_authors():
    statements = conn.getStatements(None, RDF.TYPE, document_class)
    with statements:
        for statement in statements:
            document = statement.getSubject()
            author_objects = conn.getStatements(document, has_author_prop, None)
            if len(author_objects) == 0:
                author_statements = conn.getStatements(document, creator_property, None)
                for author_statement in author_statements:
                    author_string = str(author_statement[2])
                    # Line below is required to strip out extra " symbols that result from transforming AG Literal to Python string
                    author_string = author_string[1:len(author_string) - 1]
                    process_authors(document, author_string)

In [9]:
# Takes string for first and last name (first name can be initials) and returns an author object if one exists
# If one doesn't exist it is created and the appropriate properties are set
# It would be nice to make the test more flexible. E.g., so that "DeWaal", "De Waal", "deWaal", "Dewaal", and "De-Waal" are all considered the same
# To make this flexible might want to change the line below from getStatements to a SPARQL query with a regex in the query
def find_or_make_author_object(first_name, last_name):
    first_name = first_name.strip()
    last_name = last_name.strip()
    if first_name != "":
        author_label = first_name + " " + last_name
    else:
        author_label = last_name
    author_statements = conn.getStatements(None, rdfs_label_prop, author_label) # This tests if any existing objects have the name of the current author
    if len(author_statements) > 0:
        for author_statement in author_statements:
            print("Found author: ", author_label)
            return author_statement.getSubject()
    else:
        print("Author label:", author_label)
        author_iri = conn.createURI(domain_ont_str + str(uuid.uuid4()))  # Creates a UUID for the IRI for a new instance of Person
        conn.add(author_iri, RDF.TYPE, person_class)
        conn.add(author_iri, rdfs_label_prop, author_label)
        conn.add(author_iri, first_name_prop, first_name)
        conn.add(author_iri, last_name_prop, last_name)
        return author_iri

In [10]:
# For testing use this ontology: https://github.com/mdebellis/DrMO_Docs/commits/main/Ontologies/drmo_na_3_29_24.ttl
# It has documents but no authors
def process_authors(document, author_string):
    print(author_string)
    author = None
    author_list = []
    if ";" in author_string:  # First test is if there is a semi-colon to delimit names of authors
        author_list_un_stripped = author_string.split(";")
        for author_string in author_list_un_stripped:  # Need to test for blank space as a delimiter so want to strip out leading and trailing blanks
            author_list.append(author_string.strip())
        print("Stripped string list:", author_list)
        if len(author_list) == 2 and " " not in author_list[0]: # In hindsight don't think this code is needed. This is to test for one author but
            last_name = author_list[0]                          # if only one author there wouldn't be a semi-colon anyway
            first_name = author_list[1]
            author = find_or_make_author_object(first_name, last_name)
            conn.add(document, has_author_prop, author)
        elif len(author_list) < 2:                       # Don't think this is needed either. To test for single author with one name
            last_name = author_list[0]                   # as above if that's the case, there won't be a semi-colon to begin with
            first_name = ""
            author = find_or_make_author_object(first_name, last_name)
            conn.add(document, has_author_prop, author)
        else:
            for author in author_list:
                if "," in author:                    # For author first and last delimited by comma. e.g., "Chomsky, Noam"
                    name_list = author.split(",")
                    last_name = name_list[0]
                    first_name = name_list[1]
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)
                elif " " in author:                  # For author first and last delimited by space. e.g., "Chomsky Noam"
                    name_list = author.split(" ")
                    last_name = name_list[0]
                    first_name = name_list[1]
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)
                else:                               # When there is only a last name (no delimiter) e.g., "Turing, Alan; Hauser; Chomsky, Noam"
                    last_name = author[0]
                    first_name = ""
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)
    elif "," in author_string:   # Next test is if a comma is used to delimit authors
        author_list_un_stripped = author_string.split(",")
        for author_string in author_list_un_stripped:
            author_list.append(author_string.strip())  #Need to test for blank space as a delimiter so want to strip out leading and trailing blanks
        print("Stripped string list:", author_list)
        if len(author_list) == 2 and " " not in author_list[0]:   # This was to test when there is only one author but won't always work because there still may be
            last_name = author_list[0]                            # a blank. Will work if the entire string is "Chomsky,Noam" but not if it is "Chomsky, Noam"
            first_name = author_list[1]
            author = find_or_make_author_object(first_name, last_name)
            conn.add(document, has_author_prop, author)
        elif len(author_list) < 2:
            last_name = author_list[0]
            first_name = ""
            author = find_or_make_author_object(first_name, last_name)
            conn.add(document, has_author_prop, author)
        else:
            for author_string in author_list:                   # Standard case where both full names and first, last are delimited by commas
                if "," in author_string:                        # e.g., "Turing, Alan, Chomsky, Noam"
                    name_list = author_string.split(",")
                    last_name = name_list[0]
                    first_name = name_list[1]
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)
                else:                                           # Where full names delimited by commas and last-first by spaces
                    name_list = author_string.split(" ")        # e.g., "Turing Alan, Chomsky Noam"
                    last_name = name_list[0]
                    first_name = name_list[1]
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)

In [11]:
# Cases to add: 1) When there is just one name delimited by comma or string
# Was trying to check for those in code above but don't think it is correct. e.g.,  when complete string is "Chomsky, Noam"
# 2) When complete string is just a last name. E.g., "Chomsky"

add_authors()

Demo

In [1]:
import re

def parse_names_last_names(input_string):
    # Regular expression to match names and last names
    pattern = r'([A-Za-z]+(?: [A-Za-z]+)?), ([A-Za-z]+(?: [A-Za-z]+)?)'
    
    # Find all matches in the input string
    matches = re.findall(pattern, input_string)
    
    # Extract names and last names from matches
    parsed_names_last_names = [(name.strip(), last_name.strip()) for last_name, name in matches]
    
    return parsed_names_last_names

# Test the function with example strings
strings = [
    "Chomsky, Noam; Turing, Alan; Dawkins, Richard",
    "Chomsky, Noam; Turing, Alan; Dawkins, Richard; de Waal, Frans",
    "Chomsky, Noam; de Waal, Frans",
    "de Waal, Frans; Chomsky, Noam; Turing; Dawkins, Richard",
    "Chomsky; Turing, Alan; Dawkins, Richard",
    "Chomsky, Noam; Turing; Dawkins, Richard",
    "Chomsky, Noam; Turing, Alan; Dawkins",
    "Chomsky, Noam; Turing, Alan; Dawkins; de Waal, Frans", # a bug
    "Chomsky, Noam, Turing, Alan, Dawkins, Richard",
    "Chomsky, Noam, Turing, Alan, Dawkins, Richard, de Waal, Frans",
    "Chomsky, Turing, Alan, Dawkins, Richard",
    "Chomsky Noam, Turing Alan, Dawkins Richard",
    "Turing, Alan",
    "Dawkins Richard",
    "de Waal, Frans",
    "de Waal Frans"
]

for string in strings:
    print(parse_names_last_names(string))


[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Richard', 'Dawkins')]
[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Richard', 'Dawkins'), ('Frans', 'de Waal')]
[('Noam', 'Chomsky'), ('Frans', 'de Waal')]
[('Frans', 'de Waal'), ('Noam', 'Chomsky'), ('Richard', 'Dawkins')]
[('Alan', 'Turing'), ('Richard', 'Dawkins')]
[('Noam', 'Chomsky'), ('Richard', 'Dawkins')]
[('Noam', 'Chomsky'), ('Alan', 'Turing')]
[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Frans', 'de Waal')]
[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Richard', 'Dawkins')]
[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Richard', 'Dawkins'), ('Frans', 'de Waal')]
[('Turing', 'Chomsky'), ('Dawkins', 'Alan')]
[('Turing Alan', 'Chomsky Noam')]
[('Alan', 'Turing')]
[]
[('Frans', 'de Waal')]
[]
