#### Connecting to AllegroGraph Server

On a localhost.

In [3]:
import os

AGRAPH_HOST = os.environ.get('AGRAPH_HOST', 'localhost')
AGRAPH_PORT = int(os.environ.get('AGRAPH_PORT', '10035'))
AGRAPH_USER = os.environ.get('AGRAPH_USER', 'daanaea')
AGRAPH_PASSWORD = os.environ.get('AGRAPH_PASSWORD', '0101')
print(AGRAPH_HOST, AGRAPH_PORT, AGRAPH_USER, AGRAPH_PASSWORD)
# pip install agraph-python pycurl

from franz.openrdf.connect import ag_connect
from franz.openrdf.sail.allegrographserver import AllegroGraphServer

print("Connecting to AllegroGraph server --",
      "host:'%s' port:%s" % (AGRAPH_HOST, AGRAPH_PORT))

server = AllegroGraphServer(AGRAPH_HOST, AGRAPH_PORT,
                            AGRAPH_USER, AGRAPH_PASSWORD)
print("Available catalogs:")
for cat_name in server.listCatalogs():
    if cat_name is None:
        print('  - <root catalog>')
    else:
        print('  - ' + str(cat_name))
        
print(server.listCatalogs())
catalog = server.openCatalog('')

print("Available repositories in catalog '%s':" % catalog.getName()) # default root catalog

for repo_name in catalog.listRepositories():
    print('  - ' + repo_name)

localhost 10035 daanaea 0101
Connecting to AllegroGraph server -- host:'localhost' port:10035
Available catalogs:
  - <root catalog>
  - fedshard
  - system
[None, 'fedshard', 'system']
Available repositories in catalog 'None':
  - drmo


In [4]:
# Create a connection object and bind to conn. The conn object is used to connect with an AllegroGraph repository
conn = ag_connect(repo='drmo', host='localhost', port='10035', user='daanaea', password='0101')

#### Parser

In [5]:
from franz.openrdf.vocabulary import RDF
import uuid

In [6]:
creator_property = conn.createURI("http://purl.org/dc/terms/creator")
person_class = conn.createURI("http://www.w3.org/ns/prov#Person")
domain_ont_str = "http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#"
rdfs_label_prop = conn.createURI("http://www.w3.org/2000/01/rdf-schema#label")
has_author_prop = conn.createURI("http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#hasAuthor")
document_class = conn.createURI("http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#Document")
first_name_prop = conn.createURI("http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#firstName")
last_name_prop = conn.createURI("http://www.semanticweb.org/ontologies/2022/titutuli/nivedita/drmo#lastName")

In [7]:
author_string_0 = "Zhang, Wengang; Xiang, Jiaying; Huang, Ruijie; Liu, Hanlong"
author_string_1 = "Sadeghyar, A; Watts, DC; Schedle, A; DeBellis, M"
author_string_2 = "Simila, HO; DeBellis, M"
author_string_3 = "Makrgeorgou, A"

In [12]:
conn.getStatements(None, RDF.TYPE, document_class)

<franz.openrdf.query.repositoryresult.RepositoryResult at 0x10512ded0>

In [8]:
def add_authors():
    statements = conn.getStatements(None, RDF.TYPE, document_class)
    with statements:
        for statement in statements:
            document = statement.getSubject()
            author_objects = conn.getStatements(document, has_author_prop, None)
            if len(author_objects) == 0:
                author_statements = conn.getStatements(document, creator_property, None)
                for author_statement in author_statements:
                    author_string = str(author_statement[2])
                    # Line below is required to strip out extra " symbols that result from transforming AG Literal to Python string
                    author_string = author_string[1:len(author_string) - 1]
                    process_authors(document, author_string)

In [9]:
# Takes string for first and last name (first name can be initials) and returns an author object if one exists
# If one doesn't exist it is created and the appropriate properties are set
# It would be nice to make the test more flexible. E.g., so that "DeWaal", "De Waal", "deWaal", "Dewaal", and "De-Waal" are all considered the same
# To make this flexible might want to change the line below from getStatements to a SPARQL query with a regex in the query
def find_or_make_author_object(first_name, last_name):
    first_name = first_name.strip()
    last_name = last_name.strip()
    if first_name != "":
        author_label = first_name + " " + last_name
    else:
        author_label = last_name
    author_statements = conn.getStatements(None, rdfs_label_prop, author_label) # This tests if any existing objects have the name of the current author
    if len(author_statements) > 0:
        for author_statement in author_statements:
            print("Found author: ", author_label)
            return author_statement.getSubject()
    else:
        print("Author label:", author_label)
        author_iri = conn.createURI(domain_ont_str + str(uuid.uuid4()))  # Creates a UUID for the IRI for a new instance of Person
        conn.add(author_iri, RDF.TYPE, person_class)
        conn.add(author_iri, rdfs_label_prop, author_label)
        conn.add(author_iri, first_name_prop, first_name)
        conn.add(author_iri, last_name_prop, last_name)
        return author_iri

In [10]:
# For testing use this ontology: https://github.com/mdebellis/DrMO_Docs/commits/main/Ontologies/drmo_na_3_29_24.ttl
# It has documents but no authors
def process_authors(document, author_string):
    print(author_string)
    author = None
    author_list = []
    if ";" in author_string:  # First test is if there is a semi-colon to delimit names of authors
        author_list_un_stripped = author_string.split(";")
        for author_string in author_list_un_stripped:  # Need to test for blank space as a delimiter so want to strip out leading and trailing blanks
            author_list.append(author_string.strip())
        print("Stripped string list:", author_list)
        if len(author_list) == 2 and " " not in author_list[0]: # In hindsight don't think this code is needed. This is to test for one author but
            last_name = author_list[0]                          # if only one author there wouldn't be a semi-colon anyway
            first_name = author_list[1]
            author = find_or_make_author_object(first_name, last_name)
            conn.add(document, has_author_prop, author)
        elif len(author_list) < 2:                       # Don't think this is needed either. To test for single author with one name
            last_name = author_list[0]                   # as above if that's the case, there won't be a semi-colon to begin with
            first_name = ""
            author = find_or_make_author_object(first_name, last_name)
            conn.add(document, has_author_prop, author)
        else:
            for author in author_list:
                if "," in author:                    # For author first and last delimited by comma. e.g., "Chomsky, Noam"
                    name_list = author.split(",")
                    last_name = name_list[0]
                    first_name = name_list[1]
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)
                elif " " in author:                  # For author first and last delimited by space. e.g., "Chomsky Noam"
                    name_list = author.split(" ")
                    last_name = name_list[0]
                    first_name = name_list[1]
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)
                else:                               # When there is only a last name (no delimiter) e.g., "Turing, Alan; Hauser; Chomsky, Noam"
                    last_name = author[0]
                    first_name = ""
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)
    elif "," in author_string:   # Next test is if a comma is used to delimit authors
        author_list_un_stripped = author_string.split(",")
        for author_string in author_list_un_stripped:
            author_list.append(author_string.strip())  #Need to test for blank space as a delimiter so want to strip out leading and trailing blanks
        print("Stripped string list:", author_list)
        if len(author_list) == 2 and " " not in author_list[0]:   # This was to test when there is only one author but won't always work because there still may be
            last_name = author_list[0]                            # a blank. Will work if the entire string is "Chomsky,Noam" but not if it is "Chomsky, Noam"
            first_name = author_list[1]
            author = find_or_make_author_object(first_name, last_name)
            conn.add(document, has_author_prop, author)
        elif len(author_list) < 2:
            last_name = author_list[0]
            first_name = ""
            author = find_or_make_author_object(first_name, last_name)
            conn.add(document, has_author_prop, author)
        else:
            for author_string in author_list:                   # Standard case where both full names and first, last are delimited by commas
                if "," in author_string:                        # e.g., "Turing, Alan, Chomsky, Noam"
                    name_list = author_string.split(",")
                    last_name = name_list[0]
                    first_name = name_list[1]
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)
                else:                                           # Where full names delimited by commas and last-first by spaces
                    name_list = author_string.split(" ")        # e.g., "Turing Alan, Chomsky Noam"
                    last_name = name_list[0]
                    first_name = name_list[1]
                    author = find_or_make_author_object(first_name, last_name)
                    conn.add(document, has_author_prop, author)

In [11]:
# Cases to add: 1) When there is just one name delimited by comma or string
# Was trying to check for those in code above but don't think it is correct. e.g.,  when complete string is "Chomsky, Noam"
# 2) When complete string is just a last name. E.g., "Chomsky"

add_authors()

Demo

In [2]:
import re

def parse_names_last_names(input_string):
    # Regular expression to match names and last names
    pattern = r'([A-Za-z]+(?: [A-Za-z]+)?), ([A-Za-z]+(?: [A-Za-z]+)?)'
    
    # Find all matches in the input string
    matches = re.findall(pattern, input_string)
    
    # Extract names and last names from matches
    parsed_names_last_names = [(name.strip(), last_name.strip()) for last_name, name in matches]
    
    return parsed_names_last_names

# Test the function with example strings
strings = [
    "Chomsky, Noam; Turing, Alan; Dawkins, Richard",
    "Chomsky, Noam; Turing, Alan; Dawkins, Richard; de Waal, Frans",
    "Chomsky, Noam; de Waal, Frans",
    "de Waal, Frans; Chomsky, Noam; Turing; Dawkins, Richard",
    "Chomsky; Turing, Alan; Dawkins, Richard",
    "Chomsky, Noam; Turing; Dawkins, Richard",
    "Chomsky, Noam; Turing, Alan; Dawkins",
    "Chomsky, Noam; Turing, Alan; Dawkins; de Waal, Frans", # a bug
    "Chomsky, Noam, Turing, Alan, Dawkins, Richard",
    "Chomsky, Noam, Turing, Alan, Dawkins, Richard, de Waal, Frans",
    "Chomsky, Turing, Alan, Dawkins, Richard",
    "Chomsky Noam, Turing Alan, Dawkins Richard",
    "Turing, Alan",
    "Dawkins Richard",
    "de Waal, Frans",
    "de Waal Frans"
]

for string in strings:
    print(parse_names_last_names(string))


[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Richard', 'Dawkins')]
[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Richard', 'Dawkins'), ('Frans', 'de Waal')]
[('Noam', 'Chomsky'), ('Frans', 'de Waal')]
[('Frans', 'de Waal'), ('Noam', 'Chomsky'), ('Richard', 'Dawkins')]
[('Alan', 'Turing'), ('Richard', 'Dawkins')]
[('Noam', 'Chomsky'), ('Richard', 'Dawkins')]
[('Noam', 'Chomsky'), ('Alan', 'Turing')]
[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Frans', 'de Waal')]
[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Richard', 'Dawkins')]
[('Noam', 'Chomsky'), ('Alan', 'Turing'), ('Richard', 'Dawkins'), ('Frans', 'de Waal')]
[('Turing', 'Chomsky'), ('Dawkins', 'Alan')]
[('Turing Alan', 'Chomsky Noam')]
[('Alan', 'Turing')]
[]
[('Frans', 'de Waal')]
[]


Write RAG for retreiving names!

Teach on a huge database of names.

## Building RAG model with Claude 3 Sonnet

### Importing libraries

In [2]:
!pip install langchain
!pip install langchain-core
!pip install langchain-community
!pip install langchain-anthropic

Collecting langsmith<0.1.0,>=0.0.70 (from langchain)
  Obtaining dependency information for langsmith<0.1.0,>=0.0.70 from https://files.pythonhosted.org/packages/97/cd/1c618f89d3fcbb375c99a3ea950bffba8a01862cc0f0ab5032dfb95e8d1e/langsmith-0.0.92-py3-none-any.whl.metadata
  Using cached langsmith-0.0.92-py3-none-any.whl.metadata (9.9 kB)
INFO: pip is looking at multiple versions of langchain-core to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-core<0.2,>=0.1 (from langchain)
  Obtaining dependency information for langchain-core<0.2,>=0.1 from https://files.pythonhosted.org/packages/e9/ba/ee09e7613ff2162e15cb4b5b9181b4dae398298cbfe91bb6f431de59643e/langchain_core-0.1.44-py3-none-any.whl.metadata
  Using cached langchain_core-0.1.44-py3-none-any.whl.metadata (5.9 kB)
  Obtaining dependency information for langchain-core<0.2,>=0.1 from https://files.pythonhosted.org/packages/4a/76/420c6449e20b6a7ac65c7f58689f11399ada2c0919b74a

### Loading data

Using data provided by [Name Census](https://census.name).

In [9]:
import pandas as pd

first = pd.read_csv('first-name-database.csv', sep=';') # first names
last = pd.read_csv('surname-database.csv', sep=';') # last names

In [12]:
first['Type'] = 'Name'

print('Number of names: ', first.shape[0])
first.head()

Number of names:  27800


Unnamed: 0,Name,Name ASCII,Country code,Country name,Continent,Official,Gender,Unisex,Country Rank,Frequency,Type
0,Michael,michael,US,United States,North America,y,m,n,1,50350,Name
1,John,john,US,United States,North America,y,m,n,2,45057,Name
2,David,david,US,United States,North America,y,m,n,3,44352,Name
3,Chris,chris,US,United States,North America,y,m,n,4,41645,Name
4,James,james,GB,United Kingdom,Europe,y,m,n,1,31802,Name


In [13]:
last['Type'] = 'Surname'

print('Number of surnames: ', last.shape[0])
last.head()

Number of surnames:  13900


Unnamed: 0,Name,Name ASCII,Country code,Country name,Continent,Official,Gender,Country Rank,Frequency,Type
0,Kumar,kumar,IN,India,Asia,n,,1,41307,Surname
1,Smith,smith,US,United States,North America,y,,1,40236,Surname
2,Singh,singh,IN,India,Asia,y,,2,35097,Surname
3,Smith,smith,GB,United Kingdom,Europe,y,,1,28135,Surname
4,Johnson,johnson,US,United States,North America,y,,2,27834,Surname


In [15]:
import numpy as np

In [23]:
df = pd.DataFrame(
    np.vstack((first.drop(['Unisex'], axis=1), last)), # removing extra `Unisex` column
    columns=last.columns # assigning columns after concatenation
)
df.head()

Unnamed: 0,Name,Name ASCII,Country code,Country name,Continent,Official,Gender,Country Rank,Frequency,Type
0,Michael,michael,US,United States,North America,y,m,1,50350,Name
1,John,john,US,United States,North America,y,m,2,45057,Name
2,David,david,US,United States,North America,y,m,3,44352,Name
3,Chris,chris,US,United States,North America,y,m,4,41645,Name
4,James,james,GB,United Kingdom,Europe,y,m,1,31802,Name


In [25]:
from langchain_community.document_loaders.dataframe import DataFrameLoader

loader = DataFrameLoader(df, page_content_column='Type') # loading data to make more convenient to iterate over
data = loader.load()

In [26]:
data

[Document(page_content='Name', metadata={'Name': 'Michael', 'Name ASCII': 'michael', 'Country code': 'US', 'Country name': 'United States', 'Continent': 'North America', 'Official': 'y', 'Gender': 'm', 'Country Rank': 1, 'Frequency': 50350}),
 Document(page_content='Name', metadata={'Name': 'John', 'Name ASCII': 'john', 'Country code': 'US', 'Country name': 'United States', 'Continent': 'North America', 'Official': 'y', 'Gender': 'm', 'Country Rank': 2, 'Frequency': 45057}),
 Document(page_content='Name', metadata={'Name': 'David', 'Name ASCII': 'david', 'Country code': 'US', 'Country name': 'United States', 'Continent': 'North America', 'Official': 'y', 'Gender': 'm', 'Country Rank': 3, 'Frequency': 44352}),
 Document(page_content='Name', metadata={'Name': 'Chris', 'Name ASCII': 'chris', 'Country code': 'US', 'Country name': 'United States', 'Continent': 'North America', 'Official': 'y', 'Gender': 'm', 'Country Rank': 4, 'Frequency': 41645}),
 Document(page_content='Name', metadata={'

### Splitting data

Splitting data into small, more manageable chunks (good for models with limited contexts).

In [27]:
from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=30)
docs = splitter.split_documents(data)

### Embeddings

Creating vector representations for semantic search.

In [28]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Creating embeddings
embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5')

# Creating a FAISS index object for each embedding for similarity search
# and nearest neighbor retrieval
vector_db = FAISS.from_documents(docs, embeddings) 

  torch.utils._pytree._register_pytree_node(


[**FAISS**](https://python.langchain.com/docs/integrations/vectorstores/faiss/) (Facebook AI Similarity Search) is a a library for quick similarity search and clustering of dense vectors.

### Initialize LLM model

In this notebook, we used [**Claude 3 Sonnet**](https://www.anthropic.com/news/claude-3-family) model by Anthropic.

General architecture of Claude 3:

- sparse transformer: adds sparsity into self-attention
- reversible transformer layers: efficient backprogating during training

Resulted in reduced memory usage and larger scaling of the model.

Notes:

- can use multimodal data as well! possibly put in work in future
- has very good commonsense reasoning and near-perfect recall!
- fast!

In [29]:
from langchain_anthropic import ChatAnthropic

# To run the notebook locally, insert API key for Anthropic
ANTHROPIC_API_KEY = 'YOUR-KEY'
llm = ChatAnthropic(anthropic_api_key=ANTHROPIC_API_KEY, model="claude-3-sonnet-20240229", temperature=0.2, max_tokens=512)

- `temperature`: controls the randomness/creativity of the model (0.2 for more deterministic; 1.0 more diverse and exploratory output)
- `max_tokens`: maximum number of tokens that the model can generate in s single response

### Retrieve response

In [102]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

prompt_template ="""
    <|system|>
    [INST]
    Given string data given as a distinct string, a list of strings, or in a table
    (.csv, pandas DataFrame, or any any other table format),
    split it into the name and surname based on your database.

    Yield a result in a DataFrame format table with two columns called `name` and `surname`.
    Use as many rows as needed.
    {context}
    </s>
    <|user|>
    {question}
    </s>
    [/INST]
    <|assistant|>
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

In [103]:
from langchain_core.runnables import RunnablePassthrough

# Converting the vector database into a Retriever class
retriever = vector_db.as_retriever()

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [104]:
lst = ["Chomsky, Noam; Turing, Alan; Dawkins, Richard",
    "Chomsky, Noam; Turing, Alan; Dawkins, Richard; de Waal, Frans",
    "Chomsky, Noam; de Waal, Frans",
    "de Waal, Frans; Chomsky, Noam; Turing; Dawkins, Richard",
    "Chomsky; Turing, Alan; Dawkins, Richard",
    "Chomsky, Noam; Turing; Dawkins, Richard",
    "Chomsky, Noam; Turing, Alan; Dawkins",
    "Chomsky, Noam; Turing, Alan; Dawkins; de Waal, Frans", # a bug
    "Chomsky, Noam, Turing, Alan, Dawkins, Richard",
    "Chomsky, Noam, Turing, Alan, Dawkins, Richard, de Waal, Frans",
    "Chomsky, Turing, Alan, Dawkins, Richard",
    "Chomsky Noam, Turing Alan, Dawkins Richard",
    "Turing, Alan",
    "Dawkins Richard",
    "de Waal, Frans",
    "de Waal Frans"]

question = ' '.join(lst)

# Run similarity search and generate a response
answer = rag_chain.invoke(question)

In [105]:
print(answer)

Here is the DataFrame with the names and surnames split from the given strings:

| name | surname |
|------|---------|
| Noam | Chomsky |
| Alan | Turing  |
| Richard | Dawkins |
| Frans | de Waal |
| Noam | Chomsky |
| Alan | Turing  |
| Richard | Dawkins |
| Frans | de Waal |
| Noam | Chomsky |
| Alan | Turing  |
| Richard | Dawkins |
| Noam | Chomsky |
| Alan | Turing  |
| Richard | Dawkins |
| Noam | Chomsky |
| Alan | Turing  |
| Richard | Dawkins |
| Frans | de Waal |
| Noam | Chomsky |
| Alan | Turing  |
| Richard | Dawkins |
| Frans | de Waal |
| Noam | Chomsky |
| Alan | Turing  |
| Richard | Dawkins |
| Noam | Chomsky |
| Alan | Turing  |
| Richard | Dawkins |
| Alan | Turing  |
| Richard | Dawkins |
| Frans | de Waal |
| Frans | de Waal |


In [106]:
answer

'Here is the DataFrame with the names and surnames split from the given strings:\n\n| name | surname |\n|------|---------|\n| Noam | Chomsky |\n| Alan | Turing  |\n| Richard | Dawkins |\n| Frans | de Waal |\n| Noam | Chomsky |\n| Alan | Turing  |\n| Richard | Dawkins |\n| Frans | de Waal |\n| Noam | Chomsky |\n| Alan | Turing  |\n| Richard | Dawkins |\n| Noam | Chomsky |\n| Alan | Turing  |\n| Richard | Dawkins |\n| Noam | Chomsky |\n| Alan | Turing  |\n| Richard | Dawkins |\n| Frans | de Waal |\n| Noam | Chomsky |\n| Alan | Turing  |\n| Richard | Dawkins |\n| Frans | de Waal |\n| Noam | Chomsky |\n| Alan | Turing  |\n| Richard | Dawkins |\n| Noam | Chomsky |\n| Alan | Turing  |\n| Richard | Dawkins |\n| Alan | Turing  |\n| Richard | Dawkins |\n| Frans | de Waal |\n| Frans | de Waal |'

Testing on an actual dataset:

In [107]:
data = pd.read_csv('authors-example.csv') # 109 rows, 1 column

data.head()

Unnamed: 0,creator
0,"Worthington, HV; Khangura, S; Seal, K; Mierzwi..."
1,"Osborn, DA; Sinn, JKH"
2,"Nankervis, H; Pynn, EV; Boyle, RJ; Rushton, L;..."
3,"Innes, NPT; Ricketts, D; Chong, LY; Keightley,..."
4,"Apfelbacher, CJ; van Zuuren, EJ; Fedorowicz, Z..."


In [108]:
question = data.to_string()

In [109]:
question

"                                                                                                                                                                                                                                                                                                                                                creator\n0                                                                                                                                                                                           Worthington, HV; Khangura, S; Seal, K; Mierzwinski-Urban, M; Veitz-Keenan, A; Sahrmann, P; Schmidlin, PR; Davis, D; Iheozor-Ejiofor, Z; Rasines Alcaraz, MG\n1                                                                                                                                                                                                                                                                                                                    

In [110]:
# Run similarity search and generate a response
answer = rag_chain.invoke(question)

In [111]:
print(answer)

Here is the data split into name and surname columns in a DataFrame format:

| name | surname |
|------|---------|
| HV   | Worthington |
| S    | Khangura |
| K    | Seal |
| M    | Mierzwinski-Urban |
| A    | Veitz-Keenan |
| P    | Sahrmann |
| PR   | Schmidlin |
| D    | Davis |
| Z    | Iheozor-Ejiofor |
| MG   | Rasines Alcaraz |
| DA   | Osborn |
| JKH  | Sinn |
| H    | Nankervis |
| EV   | Pynn |
| RJ   | Boyle |
| L    | Rushton |
| HC   | Williams |
| DM   | Hewson |
| T    | Platts‐Mills |
| NPT  | Innes |
| D    | Ricketts |
| LY   | Chong |
| AJ   | Keightley |
| T    | Lamont |
| RM   | Santamaria |
| CJ   | Apfelbacher |
| EJ   | van Zuuren |
| Z    | Fedorowicz |
| A    | Jupiter |
| U    | Matterne |
| E    | Weisshaar |
| MM   | Kelleher |
| R    | Phillips |
| SJ   | Brown |
| S    | Cro |
| V    | Cornelius |
| KC   | Carlsen |
| Lødrup | Lødrup |
| HO   | Skjerven |
| EM   | Rehbinder |
| AJ   | Lowe |
| E    | Dissanayake |
| N    | Shimojo |
| K    | Yonezawa-H

In [112]:

answer_chunked

['Here is the data split into name and surname columns in a DataFrame format:',
 '',
 '| name | surname |',
 '|------|---------|',
 '| HV   | Worthington |',
 '| S    | Khangura |',
 '| K    | Seal |',
 '| M    | Mierzwinski-Urban |',
 '| A    | Veitz-Keenan |',
 '| P    | Sahrmann |',
 '| PR   | Schmidlin |',
 '| D    | Davis |',
 '| Z    | Iheozor-Ejiofor |',
 '| MG   | Rasines Alcaraz |',
 '| DA   | Osborn |',
 '| JKH  | Sinn |',
 '| H    | Nankervis |',
 '| EV   | Pynn |',
 '| RJ   | Boyle |',
 '| L    | Rushton |',
 '| HC   | Williams |',
 '| DM   | Hewson |',
 '| T    | Platts‐Mills |',
 '| NPT  | Innes |',
 '| D    | Ricketts |',
 '| LY   | Chong |',
 '| AJ   | Keightley |',
 '| T    | Lamont |',
 '| RM   | Santamaria |',
 '| CJ   | Apfelbacher |',
 '| EJ   | van Zuuren |',
 '| Z    | Fedorowicz |',
 '| A    | Jupiter |',
 '| U    | Matterne |',
 '| E    | Weisshaar |',
 '| MM   | Kelleher |',
 '| R    | Phillips |',
 '| SJ   | Brown |',
 '| S    | Cro |',
 '| V    | Cornelius |

In [154]:
result = pd.DataFrame(answer.split('\n')[2:])
result = result.drop([0, 1], axis=0)

splitted = result[0].str.split('|')

result['name'] = [x[1].split(' ')[1] for x in splitted]
result['surname'] = [x[2].split(' ')[1] for x in splitted]

In [155]:
result

Unnamed: 0,0,name,surname
2,| HV | Worthington |,HV,Worthington
3,| S | Khangura |,S,Khangura
4,| K | Seal |,K,Seal
5,| M | Mierzwinski-Urban |,M,Mierzwinski-Urban
6,| A | Veitz-Keenan |,A,Veitz-Keenan
7,| P | Sahrmann |,P,Sahrmann
8,| PR | Schmidlin |,PR,Schmidlin
9,| D | Davis |,D,Davis
10,| Z | Iheozor-Ejiofor |,Z,Iheozor-Ejiofor
11,| MG | Rasines Alcaraz |,MG,Rasines


Looks valid!