In [2]:
import json
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from xml.dom import minidom
import pandas as pd
import time

# pretty print xml
def pretty_print_xml(xml):
    """Parse the XML text `xml` with minidom and print its pretty-printed form."""
    document = minidom.parseString(xml)
    print(document.toprettyxml())

In [2]:
# Fetch one FRIS project by UUID through the SOAP ProjectService and dump the raw response.
url = "https://frisr4.researchportal.be/ws/ProjectService?wsdl"

payload = """
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
    <soap:Body>
        <ns1:getProjects xmlns:ns1="http://fris.ewi.be/">
            <projectCriteria xmlns="http://fris.ewi.be/criteria">
                <uuids>
                    <identifier>d833bf9c-e2b4-4843-a897-1eaa0baeffa6</identifier>
                </uuids>
            </projectCriteria>
        </ns1:getProjects>
    </soap:Body>
</soap:Envelope>
"""

# requests applies no timeout by default; without one a stalled connection
# would block the kernel indefinitely.
response = requests.post(url, data=payload, timeout=60)

pretty_print_xml(response.text)

<?xml version="1.0" ?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
	<soap:Body>
		<ns1:getProjectsResponse xmlns:ns1="http://fris.ewi.be/">
			<queryResult xmlns="http://fris.ewi.be/response">
				<totalResults>1</totalResults>
				<pageSize>10</pageSize>
				<pageNumber>0</pageNumber>
				<CERIF xmlns="urn:xmlns:org:eurocris:cerif-1.5-1-FRIS" release="1.5" date="2024-06-14Z" sourceDatabase="fris">
					<frAssignment>
						<frAssignmentId>142790052</frAssignmentId>
						<cfOrgUnitId>c67d5ec0-4ab8-48f8-a774-ee86f23cc2c8</cfOrgUnitId>
						<cfClassId xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">Member</cfClassId>
						<cfClassSchemeId xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">Assignment Role</cfClassSchemeId>
						<cfStartDate xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">2018-10-01T00:00:00.000Z</cfStartDate>
						<cfEndDate xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">2024-09-30T23:59:59.999Z</cfEndDate>
						<cfPersId>ea35050e-067b-4d6d-8c10-93b0dfa71139</c

In [3]:
# Project abstract: take the first cfAbstr element of the response and treat
# its text as HTML so that only the plain text is printed.
soup = BeautifulSoup(response.text, "xml")
abstract = soup.find("cfAbstr")
abstract_html = BeautifulSoup(abstract.text, "html.parser")
print(abstract_html.get_text())

Viruses have evolved to hijack key cellular components of their natural host. The VirEOS project will analyze how medically relevant viruses such as human respiratory syncytial, hepatitis E, yellow fever, Zika or Kaposi sarcoma-associated herpes virus, interfere with RNA sensing and RNA homeostasis in their host cells. The impact of identified factors on the pathogenicity and on the immune responses will then be analyzed in vivo, using infectious models. Understanding how viruses manipulate cellular RNA should not only provide new targets to the development of antiviral drugs but also help to identify important cellular hubs in RNA physiology.


In [4]:
# Fetch the research output associated with one project (first page, 100 results per page).
url = "https://frisr4.researchportal.be/ws/ResearchOutputService?wsdl"

payload = """
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
    <soap:Body>
        <getResearchOutput xmlns="http://fris.ewi.be/">
            <researchOutputCriteria xmlns="http://fris.ewi.be/criteria">
                <window>
                    <pageSize>100</pageSize>
                    <pageNumber>0</pageNumber>
                </window>
                <associatedProjects>
                    <identifier>d833bf9c-e2b4-4843-a897-1eaa0baeffa6</identifier>
                </associatedProjects>
            </researchOutputCriteria>
        </getResearchOutput>
    </soap:Body>
</soap:Envelope>
"""

# requests applies no timeout by default; without one a stalled connection
# would block the kernel indefinitely.
response = requests.post(url, data=payload, timeout=60)
pretty_print_xml(response.text)

<?xml version="1.0" ?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
	<soap:Body>
		<ns1:getResearchOutputResponse xmlns:ns1="http://fris.ewi.be/">
			<queryResult xmlns="http://fris.ewi.be/response">
				<totalResults>8</totalResults>
				<pageSize>100</pageSize>
				<pageNumber>0</pageNumber>
				<CERIF xmlns="urn:xmlns:org:eurocris:cerif-1.5-1-FRIS" release="1.5" date="2024-06-14Z" sourceDatabase="fris">
					<frAssignment>
						<frAssignmentId>1108430649</frAssignmentId>
						<cfOrgUnitId>c67d5ec0-4ab8-48f8-a774-ee86f23cc2c8</cfOrgUnitId>
						<cfClassId xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">Member</cfClassId>
						<cfClassSchemeId xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">Assignment Role</cfClassSchemeId>
						<cfStartDate xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">2018-10-01T00:00:00.000Z</cfStartDate>
						<cfEndDate xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">2024-12-31T23:59:59.999Z</cfEndDate>
						<cfPersId>7aeb2d1c-8e9b-41cf-bec8-6a39527

In [5]:
# Publication titles for the project: print the cfTitle elements found at
# even positions (0, 2, 4, ...) of the match list and skip the odd ones.
soup = BeautifulSoup(response.text, "xml")
titles = soup.find_all("cfTitle")
for i, title in enumerate(titles):
    if i % 2 == 0:
        print(title.text)

Deep mutational scanning of proteins in mammalian cells
Novel prime-boost immune-based therapy inhibiting both hepatitis B and D virus infections
How RSV proteins join forces to overcome the host innate immune response
Cov²MS : an automated and quantitative matrix-independent assay for mass spectrometric measurement of SARS-CoV-2 nucleocapsid protein
Human hepatocyte PNPLA3-148M exacerbates rapid non-alcoholic fatty liver disease development in chimeric mice
An unexpected encounter : respiratory syncytial virus nonstructural protein 1 interacts with mediator subunit MED25
Viral interference of hepatitis C and E virus replication in novel experimental co-infection systems
Study of hepatitis E virus-4 infection in human liver-chimeric, immunodeficient, and immunocompetent mice


In [6]:
# Publication IDs for the project: keep the cfResPublId matches found at even
# positions, then print how many were kept followed by one ID per line.
soup = BeautifulSoup(response.text, "xml")
results = soup.find_all("cfResPublId")
pub_list = [result.text for result in results[::2]]
print(len(pub_list))
for pub in pub_list:
    print(pub)


8
a32ac085-a975-4ed8-921f-f6f83cc1355d
55975b31-3cc2-4534-a097-85b1eab6cd53
bbaf834d-f8e1-4187-84b4-a35484625c09
50ad741e-6c36-4fbb-9b48-0fa901e9f63b
383f820f-0e0c-4aee-bfb4-6ebd5f6c2b63
702dd3d4-ebb1-4be2-87a1-03a9190e7608
ca3fcf40-66cf-44cd-a51e-3cff9e189cf5
d0399103-8325-4b32-9476-3d3729793829


In [7]:
# Fetch a single research output (publication) by its UUID.
url = "https://frisr4.researchportal.be/ws/ResearchOutputService?wsdl"

payload = """
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
    <soap:Body>
        <getResearchOutput xmlns="http://fris.ewi.be/">
            <researchOutputCriteria xmlns="http://fris.ewi.be/criteria">
                <window>
                    <pageSize>100</pageSize>
                    <pageNumber>0</pageNumber>
                </window>
                <uuids>
                    <identifier>a32ac085-a975-4ed8-921f-f6f83cc1355d</identifier>
                </uuids>
            </researchOutputCriteria>
        </getResearchOutput>
    </soap:Body>
</soap:Envelope>
"""

# requests applies no timeout by default; without one a stalled connection
# would block the kernel indefinitely.
response = requests.post(url, data=payload, timeout=60)
pretty_print_xml(response.text)

<?xml version="1.0" ?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
	<soap:Body>
		<ns1:getResearchOutputResponse xmlns:ns1="http://fris.ewi.be/">
			<queryResult xmlns="http://fris.ewi.be/response">
				<totalResults>1</totalResults>
				<pageSize>100</pageSize>
				<pageNumber>0</pageNumber>
				<CERIF xmlns="urn:xmlns:org:eurocris:cerif-1.5-1-FRIS" release="1.5" date="2024-06-14Z" sourceDatabase="fris">
					<frAssignment>
						<frAssignmentId>1108430649</frAssignmentId>
						<cfOrgUnitId>c67d5ec0-4ab8-48f8-a774-ee86f23cc2c8</cfOrgUnitId>
						<cfClassId xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">Member</cfClassId>
						<cfClassSchemeId xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">Assignment Role</cfClassSchemeId>
						<cfStartDate xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">2018-10-01T00:00:00.000Z</cfStartDate>
						<cfEndDate xmlns="urn:xmlns:org:eurocris:cerif-1.5-1">2024-12-31T23:59:59.999Z</cfEndDate>
						<cfPersId>7aeb2d1c-8e9b-41cf-bec8-6a39527

In [8]:
# Publication abstract: first cfAbstr element whose cfLangCode attribute is "en".
soup = BeautifulSoup(response.text, "xml")
abstract = soup.find("cfAbstr", attrs={"cfLangCode": "en"})
print(abstract.text)

Protein mutagenesis is essential for unveiling the molecular mechanisms underlying protein function in health, disease, and evolution. In the past decade, deep mutational scanning methods have evolved to support the functional analysis of nearly all possible single-amino acid changes in a protein of interest. While historically these methods were developed in lower organisms such as E. coli and yeast, recent technological advancements have resulted in the increased use of mammalian cells, particularly for studying proteins involved in human disease. These advancements will aid significantly in the classification and interpretation of variants of unknown significance, which are being discovered at large scale due to the current surge in the use of whole-genome sequencing in clinical contexts. Here, we explore the experimental aspects of deep mutational scanning studies in mammalian cells and report the different methods used in each step of the workflow, ultimately providing a useful gu

In [9]:
def get_pub_ids(uuid, timeout=60):
    '''Retrieves all publication IDs for a project given its UUID.

    Sends one getResearchOutput SOAP request restricted to the research output
    associated with the project and extracts the cfResPublId values from the
    XML response.

    Args:
        uuid: project UUID placed in the associatedProjects criterion.
        timeout: seconds handed to requests as the request timeout.

    Returns:
        A list of publication ID strings. An ID that contains ":" is reduced
        to the segment that follows the first colon.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    '''
    url = "https://frisr4.researchportal.be/ws/ResearchOutputService?wsdl"

    # Only page 0 with a page size of 100 is requested.
    # NOTE(review): a project with more than 100 results would presumably be
    # truncated here - confirm against the service and add paging if needed.
    payload = f"""
    <soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
        <soap:Body>
            <getResearchOutput xmlns="http://fris.ewi.be/">
                <researchOutputCriteria xmlns="http://fris.ewi.be/criteria">
                    <window>
                        <pageSize>100</pageSize>
                        <pageNumber>0</pageNumber>
                    </window>
                    <associatedProjects>
                        <identifier>{uuid}</identifier>
                    </associatedProjects>
                </researchOutputCriteria>
            </getResearchOutput>
        </soap:Body>
    </soap:Envelope>
    """

    # requests has no default timeout; this function runs inside a long batch
    # loop, where a single stalled connection would otherwise hang the run.
    response = requests.post(url, data=payload, timeout=timeout)

    soup = BeautifulSoup(response.text, "xml")
    results = soup.find_all("cfResPublId")

    pub_ids = []
    # Only the matches at even positions are used.
    # NOTE(review): presumably every publication ID occurs twice in the
    # response - confirm against the CERIF payload.
    for result in results[::2]:
        pub = result.text
        if ":" in pub:
            pub = pub.split(":")[1]
        pub_ids.append(pub)
    return pub_ids

In [10]:
def get_project(uuid, verbose=False, timeout=60):
    '''Retrieves project information given its UUID.

    Sends a getProjects SOAP request for the UUID and extracts a handful of
    fields from the XML response.

    Args:
        uuid: project UUID placed in the uuids criterion.
        verbose: when true, pretty-print the raw XML response.
        timeout: seconds handed to requests as the request timeout.

    Returns:
        A tuple (abstract, disciplines, start_date, organizations, authors,
        funding_ids):
          abstract      - text of the first cfAbstr element with markup stripped
          disciplines   - sorted unique 4-character prefixes of the cfClassId of
                          every cfProj_Class in the "Flemish Research
                          Disciplines" scheme
          start_date    - cfStartDate of the first cfProj, cut at the first "Z"
          organizations - sorted unique cfOrgUnitId texts
          authors       - sorted unique "first last" names
          funding_ids   - sorted unique cfFundId texts

    Raises:
        ValueError: if the response has no cfAbstr element or no
            cfProj/cfStartDate element.
        requests.exceptions.RequestException: on connection failure or timeout.
    '''
    url = "https://frisr4.researchportal.be/ws/ProjectService?wsdl"

    payload = f"""
    <soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
        <soap:Body>
            <ns1:getProjects xmlns:ns1="http://fris.ewi.be/">
                <projectCriteria xmlns="http://fris.ewi.be/criteria">
                    <uuids>
                        <identifier>{uuid}</identifier>
                    </uuids>
                </projectCriteria>
            </ns1:getProjects>
        </soap:Body>
    </soap:Envelope>
    """

    # requests has no default timeout; a stalled connection would otherwise
    # block the calling loop forever.
    response = requests.post(url, data=payload, timeout=timeout)

    if verbose:
        pretty_print_xml(response.text)

    soup = BeautifulSoup(response.text, "xml")

    # abstract: the element text is parsed as HTML so only plain text remains
    abstract_soup = soup.find("cfAbstr")
    if abstract_soup is None:
        # Fail with a readable message instead of "'NoneType' has no attribute 'text'".
        raise ValueError(f"no cfAbstr element in the response for project {uuid}")
    abstract = BeautifulSoup(abstract_soup.text, "html.parser").get_text()

    # start date, format: 2018-10-01T00:00:00.000Z (everything before the "Z" is kept)
    project_soup = soup.find("cfProj")
    start_date_soup = project_soup.find("cfStartDate") if project_soup is not None else None
    if start_date_soup is None:
        raise ValueError(f"no cfProj/cfStartDate element in the response for project {uuid}")
    start_date = start_date_soup.text.split("Z")[0]

    # organization
    # Sets are sorted throughout so the returned lists (and the JSON written
    # from them) do not depend on set iteration order.
    organizations = sorted({organization.text for organization in soup.find_all("cfOrgUnitId")})

    # disciplines
    disciplines = sorted({
        pro_class.find("cfClassId").text[:4]
        for pro_class in soup.find_all("cfProj_Class")
        if pro_class.find("cfClassSchemeId").text == "Flemish Research Disciplines"
    })

    # authors
    # Family and first names are collected separately and paired by position.
    # NOTE(review): this presumably relies on every person carrying both
    # elements - confirm, otherwise names get misaligned.
    last_name_soup = soup.find_all("cfFamilyNames")
    first_name_soup = soup.find_all("cfFirstNames")
    authors = sorted({
        f"{first_name.text} {last_name.text}"
        for last_name, first_name in zip(last_name_soup, first_name_soup)
    })

    # funding (find_all returns an empty result when nothing matches)
    funding_ids = sorted({funding.text for funding in soup.find_all("cfFundId")})

    return abstract, disciplines, start_date, organizations, authors, funding_ids

In [11]:
def get_publication(pub_id, vods_data: dict, verbose=False, timeout=60):
    '''Retrieves publication information given its ID.

    Sends a getResearchOutput SOAP request for the ID and extracts a handful
    of fields from the XML response.

    Args:
        pub_id: publication UUID placed in the uuids criterion.
        vods_data: mapping from WoS id to a comma-separated string of
            discipline codes.
        verbose: when true, pretty-print the raw XML response.
        timeout: seconds handed to requests as the request timeout.

    Returns:
        A tuple (abstract, disciplines, date, organizations, authors,
        funding_ids):
          abstract      - text of the first cfAbstr element with markup stripped
          disciplines   - sorted codes looked up in vods_data for the
                          publication's WoS id, or [] when it has no WoS id
          date          - cfResPublDate text cut at the first "Z"
          organizations - sorted unique cfOrgUnitId texts
          authors       - sorted unique "first last" names
          funding_ids   - sorted unique cfFundId texts

    Raises:
        ValueError: if the response has no cfAbstr or no cfResPublDate element.
        KeyError: if the publication has a WoS id that is absent from vods_data.
        requests.exceptions.RequestException: on connection failure or timeout.
    '''
    url = "https://frisr4.researchportal.be/ws/ResearchOutputService?wsdl"

    payload = f"""
    <soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
        <soap:Body>
            <getResearchOutput xmlns="http://fris.ewi.be/">
                <researchOutputCriteria xmlns="http://fris.ewi.be/criteria">
                    <uuids>
                        <identifier>{pub_id}</identifier>
                    </uuids>
                </researchOutputCriteria>
            </getResearchOutput>
        </soap:Body>
    </soap:Envelope>
    """

    # requests has no default timeout; a stalled connection would otherwise
    # block the calling loop forever.
    response = requests.post(url, data=payload, timeout=timeout)

    if verbose:
        pretty_print_xml(response.text)

    soup = BeautifulSoup(response.text, "xml")

    # abstract: the element text is parsed as HTML so only plain text remains
    abstract_soup = soup.find("cfAbstr")
    if abstract_soup is None:
        # Fail with a readable message instead of "'NoneType' has no attribute 'text'".
        raise ValueError(f"no cfAbstr element in the response for publication {pub_id}")
    abstract = BeautifulSoup(abstract_soup.text, "html.parser").get_text()

    # disciplines
    # Each cfFedId match is inspected for a nested cfClassId and a nested
    # cfFedId; matches lacking either are skipped (previously done through a
    # bare except). The last match classified as 'WoS Id' wins.
    wos_id = None
    for fed in soup.find_all('cfFedId'):
        class_id = fed.find('cfClassId')
        nested_id = fed.find('cfFedId')
        if class_id is None or nested_id is None:
            continue
        if class_id.text == 'WoS Id':
            wos_id = nested_id.text

    if wos_id:
        # NOTE(review): a WoS id missing from vods_data raises KeyError, while
        # a publication without any WoS id gets [] - confirm this asymmetry is wanted.
        disciplines = sorted(vods_data[wos_id].split(","))
    else:
        disciplines = []

    # organizations
    # Sets are sorted throughout so the returned lists (and the JSON written
    # from them) do not depend on set iteration order.
    organizations = sorted({organization.text for organization in soup.find_all('cfOrgUnitId')})

    # date, format: 2018-10-01Z (everything before the "Z" is kept)
    date_soup = soup.find('cfResPublDate')
    if date_soup is None:
        raise ValueError(f"no cfResPublDate element in the response for publication {pub_id}")
    date = date_soup.text.split("Z")[0]

    # authors
    # First and family names are collected separately and paired by position.
    # NOTE(review): this presumably relies on every person carrying both
    # elements - confirm, otherwise names get misaligned.
    first_name_soup = soup.find_all("cfFirstNames")
    last_name_soup = soup.find_all("cfFamilyNames")
    authors = sorted({
        f"{first_name.text} {last_name.text}"
        for first_name, last_name in zip(first_name_soup, last_name_soup)
    })

    # funding (find_all returns an empty result when nothing matches)
    funding_ids = sorted({funding.text for funding in soup.find_all('cfFundId')})

    return abstract, disciplines, date, organizations, authors, funding_ids

In [12]:
# create json file with FRIS project and publication IDs
df = pd.read_csv("FRIS_projects_pubs.csv")
uuids = df["pro_id"]

json_dict = {}
for i, uuid in enumerate(uuids):
    try:
        pub_ids = get_pub_ids(uuid)
    except Exception as e:
        # `except Exception` rather than a bare except: KeyboardInterrupt must
        # still be able to stop this long-running loop.
        pub_ids = []
        print(i+1, uuid, "error", e)
    else:
        print(i+1, uuid, pub_ids)
    json_dict[uuid] = pub_ids

    # Pause between requests on every iteration, including after a failure,
    # so a struggling service is not hit again immediately.
    time.sleep(5)

with open("FRIS_project_pub_ids.json", "w") as f:
    json.dump(json_dict, f, indent=4)

1 94ddc20f-5e93-4140-bb03-a9c27447d4e2 ['cae434a4-8451-4cf5-bcf5-bd5461c44115', 'dc8d154e-7f2a-4f51-b746-983de655ac02', '791ffbf9-a610-47ce-a116-6ba1c6152887', '3ccca422-f012-4fd9-a1e2-246193bde4f3']
2 d833bf9c-e2b4-4843-a897-1eaa0baeffa6 ['a32ac085-a975-4ed8-921f-f6f83cc1355d', '55975b31-3cc2-4534-a097-85b1eab6cd53', 'bbaf834d-f8e1-4187-84b4-a35484625c09', '50ad741e-6c36-4fbb-9b48-0fa901e9f63b', '383f820f-0e0c-4aee-bfb4-6ebd5f6c2b63', '702dd3d4-ebb1-4be2-87a1-03a9190e7608', 'ca3fcf40-66cf-44cd-a51e-3cff9e189cf5', 'd0399103-8325-4b32-9476-3d3729793829']
3 79de26a7-e00c-40a8-ac03-94a60f7b8110 ['64d2c2b3-e381-4eb5-8de1-7bf659de1708', '69fa8bc4-2514-4e70-8119-56926da2b8ac', '316c80cd-3872-4112-9290-a0af40bb1746', '3e03f4bf-4611-4696-9509-292854c98fc6', 'e5bd239a-29ce-406c-9fb7-96fbb785e7f8', '00bf1a70-b38a-47d7-b45d-b052e7c9460e', '65853d3e-e6ff-42e5-babe-de8687d9109a', 'e706eceb-dc6c-4025-b59c-f0f5b0a5ebda', '674e771c-3398-476a-8685-39b130b579c9']
4 9cc34d7b-2607-4aa5-89ea-4e33cf69f92d [

In [98]:
# create FRIS dataset with all the project and publication information
with open("Concordantie_VODS_2018_WOS_ID.txt") as file:
    vods_data = json.load(file)

with open("FRIS_project_pub_ids.json") as file:
    id_data = json.load(file)

json_dict = {}

# enumerate provides the counter `i` used in the progress lines below; the
# loop previously read `i` without ever assigning it in this cell.
for i, (uuid, pub_ids) in enumerate(id_data.items()):
    try:
        project_abstract, disciplines, start_date, organizations, authors, project_funding = get_project(uuid)
        pub_dict = {}
        pub_list = []
        if pub_ids:
            for pub_id in pub_ids:
                try:
                    pub_abstract, pub_disciplines, pub_date, pub_organizations, pub_authors, pub_funding = get_publication(pub_id, vods_data)
                except Exception:
                    # Publications that cannot be fetched or parsed are left
                    # out of the dataset; pub_list shows which ones were kept.
                    continue
                pub_dict[pub_id] = {
                    "abstract": pub_abstract,
                    "disciplines": pub_disciplines,
                    "date": pub_date,
                    "organizations": pub_organizations,
                    "authors": pub_authors,
                    "funding": pub_funding
                }
                pub_list.append(pub_id)
        else:
            print(i+1, uuid, "no pubs")

        json_dict[uuid] = {
            "abstract": project_abstract,
            "disciplines": disciplines,
            "start_date": start_date,
            "organizations": organizations,
            "authors": authors,
            "funding": project_funding,
            "publications": pub_dict
        }
        print(i+1, uuid, pub_list)

    except Exception as e:
        # A project whose own record cannot be retrieved is reported and skipped.
        print(i+1, uuid, e)

# dict to json
with open("FRIS_data.json", "w") as file:
    json.dump(json_dict, file, indent=4)

1 94ddc20f-5e93-4140-bb03-a9c27447d4e2 ['3ccca422-f012-4fd9-a1e2-246193bde4f3']
2 d833bf9c-e2b4-4843-a897-1eaa0baeffa6 ['a32ac085-a975-4ed8-921f-f6f83cc1355d', '55975b31-3cc2-4534-a097-85b1eab6cd53', 'bbaf834d-f8e1-4187-84b4-a35484625c09', '50ad741e-6c36-4fbb-9b48-0fa901e9f63b', '383f820f-0e0c-4aee-bfb4-6ebd5f6c2b63', 'ca3fcf40-66cf-44cd-a51e-3cff9e189cf5', 'd0399103-8325-4b32-9476-3d3729793829']
3 79de26a7-e00c-40a8-ac03-94a60f7b8110 ['69fa8bc4-2514-4e70-8119-56926da2b8ac', '316c80cd-3872-4112-9290-a0af40bb1746']
4 9cc34d7b-2607-4aa5-89ea-4e33cf69f92d ['ae73aff3-91db-470d-bc96-ea6d2e9fe137']
5 1c7f82a2-5610-4813-9bdb-766bb98b8482 []
6 e94c1597-1a58-4165-8fdb-d9a7f62a03ae ['d640f97c-94eb-4330-9772-f19d23865d19']
7 4c845bc0-4c79-4105-a2c9-09fbe5275fbd ['db848229-540e-455a-baf4-55a73676b869']
8 616a043a-1c18-4b5f-83a1-c73b35b74fb8 no pubs
8 616a043a-1c18-4b5f-83a1-c73b35b74fb8 []
9 87709c06-002a-4b0b-99b6-eb8dd1be9606 ['8a4dc512-1865-45f1-90d6-aeb015495ac1', 'cf5824dc-bba4-4994-b74f-4b90



255 4545c102-be28-4d52-bf2a-80e1d1bcac86 ['1f1f39c1-e48a-4b5a-abfe-4cdb681c05d3', 'fb773c05-6a37-42b0-b788-ee17d1c9693f', '0e11a69f-bb73-41ae-bdb5-8e3ee6434d5f']
256 dcdb2ac3-baca-4a3d-b0f8-717c118664fb ['2e8fa7de-1d9e-4eb8-9738-85b66aa02103', 'b7292919-eb62-4969-b140-9399b0e327ff']
257 cab44891-63c4-4192-a994-1b55a498eb23 no pubs
257 cab44891-63c4-4192-a994-1b55a498eb23 []
258 e102bdee-cfba-4066-ae3c-6f8a00a64124 []
259 4b9ea739-2077-4e49-82ef-3690daff4ec0 []
260 2d9b2885-30bc-4ff6-96bf-a1d123fd643b ['5dd6478b-ec5d-4250-a483-70fda690ced9']
261 0b3d50e7-dc0d-425e-937a-5d2d674578a6 ['85f8ef7c-01d8-4013-b603-6d5a6d6d7110', '1eaf8921-b413-45de-bc0e-52808dfb42ab']
262 48d929e5-5760-4360-9d7b-7ce46f91f2d9 ['511ad211-209a-4922-bd33-c193828ea2fb']
263 953c5021-65c5-4b2a-9a48-2b0e782af4ee []
264 a1a0a387-97cb-4290-ba4f-464c26963ff9 []
265 a09284cf-9966-4730-aa9d-9a4b294a4a37 ['c17971c5-8a37-404e-bdce-5558062127c0']
266 5c1f7fa6-ce50-4302-a1b7-a7897ef695d4 ['cafc5556-cb49-4dad-8489-9b03ab41d3e6

In [None]:
# create Dimensions dataset with all the project and publication information
pro_df = pd.read_csv('Dimensions_grants.csv')
dim_data = {}

# `grant_id` instead of `id`, so the builtin id() is not shadowed for the
# rest of the notebook.
for grant_id in pro_df['id']:
    # Rows carrying this id, selected once instead of once per field; the
    # first matching row supplies the values.
    grant_rows = pro_df[pro_df['id'] == grant_id]
    # cat_for_l2 is split on single quotes and the odd-indexed pieces (the
    # quoted substrings) are kept, each cut to its first 4 characters.
    dis_list = grant_rows['cat_for_l2'].values[0].split("'")
    dis_list = [dis[:4] for dis in dis_list[1::2]]
    # The first 6 characters of the id are dropped from the key.
    # NOTE(review): presumably a fixed prefix such as "grant." - confirm.
    dim_data[grant_id[6:]] = {'title': grant_rows['title'].values[0],
                              'abstract': grant_rows['abstract'].values[0],
                              'disciplines': dis_list,
                              'publications': {}}

pub_df = pd.read_csv('Dimensions_pubs.csv')

for doi in pub_df['doi']:
    pub_rows = pub_df[pub_df['doi'] == doi]
    try:
        dis_list = pub_rows['cat_for_l2'].values[0].split("'")
    except (IndexError, AttributeError):
        # IndexError: no row matches this doi (e.g. a missing doi never equals itself).
        # AttributeError: cat_for_l2 is not a string (e.g. a missing value).
        # Such publications are skipped; other errors now surface.
        continue
    dis_list = [dis[:4] for dis in dis_list[1::2]]
    pro_ids = pub_rows['supporting_grant_ids'].values[0].split(",")
    pro_ids = [pro_id[6:] for pro_id in pro_ids]
    for pro_id in pro_ids:
        # Dict membership replaces the linear scan over a list of keys.
        if pro_id in dim_data:
            dim_data[pro_id]['publications'][doi] = {'title': pub_rows['title'].values[0],
                                                    'abstract': pub_rows['abstract'].values[0],
                                                    'disciplines': dis_list}

# dict to json
with open('Dimensions_data.json', 'w') as f:
    json.dump(dim_data, f, indent=4)

In [4]:
# create a small Dimensions data sample with all the discipline codes
# Step 1: load the Dimensions dataset and print the number of projects.
with open('Dimensions_data.json') as f:
    data = json.load(f)
print(len(data))

# Step 2: keep only projects that have at least one publication, at least one
# discipline and an abstract that is a non-empty string.
# NOTE(review): the type check presumably filters out NaN abstracts coming
# from missing CSV values - confirm.
# clean_data stores the very same dict objects as `data`, so the in-place
# edits in step 3 are visible through `data` as well.
clean_data = {}
for pro_id in data.keys():
    if data[pro_id]['publications'] and \
          data[pro_id]['disciplines'] and \
            type(data[pro_id]['abstract']) == str and \
                data[pro_id]['abstract'] != '':
        clean_data[pro_id] = data[pro_id]
print(len(clean_data))

# Step 3: within each kept project, keep only publications whose abstract is a
# non-empty string; a project left without publications is removed.
# The keys are copied into a list first because entries are deleted from
# clean_data while looping.
pro_id_list = list(clean_data.keys())
for pro_id in pro_id_list:
    pubs = {}
    for pub_id in clean_data[pro_id]['publications'].keys():
        if type(clean_data[pro_id]['publications'][pub_id]['abstract']) == str and \
            clean_data[pro_id]['publications'][pub_id]['abstract'] != '':
            pubs[pub_id] = clean_data[pro_id]['publications'][pub_id]
    if pubs:
        clean_data[pro_id]['publications'] = pubs
    else:
        del clean_data[pro_id]

# Step 4: flatten all remaining publications into one dict keyed by publication
# id (a publication listed under several projects is stored once), then print
# the number of projects and of distinct publications.
pub_data = {}
for pro_id in clean_data.keys():
    for pub_id in clean_data[pro_id]['publications'].keys():
        pub_data[pub_id] = clean_data[pro_id]['publications'][pub_id]
print(len(clean_data), len(pub_data))

def get_unique_dis(data):
    """Return the set of all values found in the 'disciplines' entry of every item of `data`."""
    return set().union(*(entry['disciplines'] for entry in data.values()))
# Number of distinct disciplines across the cleaned projects.
print(len(get_unique_dis(clean_data)))

# Index: discipline -> ids of the cleaned projects that list it.
dis_dict = {}
for dis in get_unique_dis(clean_data):
    dis_dict[dis] = []
for pro_id in clean_data.keys():
    for dis in clean_data[pro_id]['disciplines']:
        dis_dict[dis].append(pro_id)

# Initial sample: the first project recorded for every discipline, so each
# project-level discipline is represented by at least one project.
pro_id_list = [dis_dict[dis][0] for dis in dis_dict.keys()]
sample_data = {pro_id: clean_data[pro_id] for pro_id in pro_id_list}
print(len(get_unique_dis(sample_data)))

# Disciplines covered by the publications of the sampled projects.
pub_data = {}
for pro_id in sample_data.keys():
    for pub_id in sample_data[pro_id]['publications'].keys():
        pub_data[pub_id] = sample_data[pro_id]['publications'][pub_id]
print(len(get_unique_dis(pub_data)))

# Projects that are not part of the sample yet.
rest = set(clean_data.keys()) - set(sample_data.keys())
rest_data = {pro_id: clean_data[pro_id] for pro_id in rest}

# For every discipline that occurs among the sampled projects but not among
# their publications, collect the remaining projects that have at least one
# publication in that discipline.
dis_dict = {}
for dis in get_unique_dis(sample_data) - get_unique_dis(pub_data):
    dis_dict[dis] = []
for pro_id in rest_data.keys():
    dis_set = set()
    for pub in rest_data[pro_id]['publications'].values():
        dis_set = dis_set.union(set(pub['disciplines']))
    for dis in get_unique_dis(sample_data) - get_unique_dis(pub_data):
        if dis in dis_set:
            dis_dict[dis].append(pro_id)

# Extend the sample with the first such project per missing discipline.
# NOTE(review): dis_dict[dis][0] raises IndexError when no remaining project
# has a publication in that discipline.
pro_id_list = [dis_dict[dis][0] for dis in dis_dict.keys()]
sample_data.update({pro_id: clean_data[pro_id] for pro_id in pro_id_list})

# Recompute publication-level discipline coverage for the extended sample and
# print it together with the sample size.
pub_data = {}
for pro_id in sample_data.keys():
    for pub_id in sample_data[pro_id]['publications'].keys():
        pub_data[pub_id] = sample_data[pro_id]['publications'][pub_id]
print(len(get_unique_dis(pub_data)))
print(len(sample_data))

# Train/test split: the discipline-covering sample becomes the training set;
# the test set takes num_test (25% of the sample size) of the projects left over.
# NOTE(review): `rest` is a set, so which projects land in the test set
# depends on set iteration order - sort the keys if the split must be reproducible.
rest = set(clean_data.keys()) - set(sample_data.keys())
rest_data = {pro_id: clean_data[pro_id] for pro_id in rest}
num_test = int(0.25 * len(sample_data))

train_data = sample_data
test_data = {pro_id: clean_data[pro_id] for pro_id in list(rest_data.keys())[:num_test]}

# Training views: project id -> (abstract, disciplines), plus a list of
# (publication id, abstract, disciplines) tuples.
train_project_data = {}
train_pub_data = []
for pro_id in train_data.keys():
    train_project_data[pro_id] = (train_data[pro_id]['abstract'], train_data[pro_id]['disciplines'])
    for pub_id in train_data[pro_id]['publications'].keys():
        train_pub_data.append((pub_id, train_data[pro_id]['publications'][pub_id]['abstract'], train_data[pro_id]['publications'][pub_id]['disciplines']))

# Test views, built the same way as the training views.
test_project_data = {}
test_pub_data = []
for pro_id in test_data.keys():
    test_project_data[pro_id] = (test_data[pro_id]['abstract'], test_data[pro_id]['disciplines'])
    for pub_id in test_data[pro_id]['publications'].keys():
        test_pub_data.append((pub_id, test_data[pro_id]['publications'][pub_id]['abstract'], test_data[pro_id]['publications'][pub_id]['disciplines']))

# Sizes of both splits, then the number of distinct disciplines at project
# level and at publication level within the training set.
print(len(train_project_data), len(train_pub_data))
print(len(test_project_data), len(test_pub_data))
print(len(get_unique_dis(train_data)), len(get_unique_dis({pub[0]: {'disciplines': pub[2]} for pub in train_pub_data})))

# Disabled: writing the combined train and test projects to dim_sample.json.
# all_data = {**train_data, **test_data}
# # dict to json
# with open('dim_sample.json', 'w') as f:
#     json.dump(all_data, f, indent=4)

10488
10004
9875 29181
170
170
130
170
182
182 483
45 145
170 170
