# ETL thesis
This notebook gets thesis data from Pure and creates several visualisations.

In [None]:
import requests
from requests.auth import HTTPBasicAuth
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load API token from secrets file (.env)
%load_ext dotenv
%dotenv
import os

api_token = os.getenv("API_TOKEN")

### Config

In [None]:
# Config: request settings shared by the API calls in this notebook

# Number of results requested per page
page_size = 500

# Publication-year range passed to the API as year_begin / year_end
min_year, max_year = 2010, 2020

# Ask for XML responses and pass the API token in the UBKey header
headers = {
    "Accept": "application/xml",
    "UBKey": api_token,
}


### Start ETL

Make an initial API call to determine the total number of results and the number of pages to fetch

In [None]:
# Initial call to determine amount of results and number of pages (iterations)
first_page_response = requests.get('https://crisapi.library.maastrichtuniversity.nl/api/ResearchOutput?page=1&types=/dk/atira/pure/researchoutput/researchoutputtypes/thesis/doc&size={}&year_begin={}&year_end={}&extra=personAssociations,publicationStatuses'.format(page_size, min_year, max_year), headers=headers)

# Raise on HTTP error statuses here, so a failed request is reported as such
# instead of as an XML parsing or indexing error further down.
first_page_response.raise_for_status()

# ET.fromstring already returns the root element; no ElementTree wrapper needed.
first_page_root = ET.fromstring(first_page_response.content)

# The text of the root's first child is read as the total number of results.
result_count = int(first_page_root[0].text)

# Round up so that a partially filled last page still gets its own request.
iterations = int(np.ceil(result_count / page_size))


Perform API calls, parse XML and populate data frame

In [None]:
# Initialize cols and rows for data frame
df_cols = ["uuid", "title", "author", "author_uuid", "year", "language", "org_unit_uuid", "org_unit_extid", "org_unit_name", "doi"]
rows = []

# XPath prefixes shared by several lookups below
person_assoc_path = "./personAssociations/personAssociation"
org_unit_path = person_assoc_path + "/organisationalUnits/organisationalUnit"


def _text_or_nan(node, path):
    """Return the text of the first element below `node` that matches `path`.

    Returns NaN when no element matches, so a missing element ends up as a
    missing value in the data frame instead of raising AttributeError.
    """
    element = node.find(path)
    return np.nan if element is None else element.text


def _attr_or_nan(node, path, attr):
    """Return attribute `attr` of the first element below `node` matching `path`.

    Returns NaN when either the element or the attribute is absent.
    """
    element = node.find(path)
    if element is None:
        return np.nan
    value = element.attrib.get(attr)
    return np.nan if value is None else value


# Loop over pages until there are no more results left.
for i in range(1, iterations + 1):
    out = requests.get('https://crisapi.library.maastrichtuniversity.nl/api/ResearchOutput?page={}&types=/dk/atira/pure/researchoutput/researchoutputtypes/thesis/doc&size={}&year_begin={}&year_end={}&extra=personAssociations,publicationStatuses'.format(i, page_size, min_year, max_year), headers=headers)
    # Stop on HTTP errors instead of trying to parse an error response as results
    out.raise_for_status()

    # Parse the response; ET.fromstring returns the root element directly
    root = ET.fromstring(out.content)
    # The third child of the root is searched for the <thesis> elements
    items = root[2]

    for node in items.findall(".//thesis"):
        pub_uuid = node.attrib.get("uuid")
        pub_title = _text_or_nan(node, "title")

        # Author: look for an internal person first, then an external person;
        # stays NaN when neither has a name.
        pub_author = np.nan
        pub_author_uuid = np.nan
        for person_tag in ("person", "externalPerson"):
            person_path = "{}/{}".format(person_assoc_path, person_tag)
            name_element = node.find(person_path + "/name/text")
            if name_element is not None:
                pub_author = name_element.text
                pub_author_uuid = _attr_or_nan(node, person_path, "uuid")
                break

        pub_year = _text_or_nan(node, "./publicationStatuses/publicationStatus/publicationDate/year")
        pub_lang = _text_or_nan(node, "./language/term/text")

        # Organisational unit: all three fields are reset for every thesis, so
        # a thesis without a unit can never inherit the external id of the
        # previous thesis (or hit an undefined variable on the first one).
        org_name_element = node.find(org_unit_path + "/name/text")
        if org_name_element is not None:
            pub_org_unit_uuid = _attr_or_nan(node, org_unit_path, "uuid")
            pub_org_unit_extid = _attr_or_nan(node, org_unit_path, "externalId")
            pub_org_unit_name = org_name_element.text
        else:
            pub_org_unit_uuid = np.nan
            pub_org_unit_extid = np.nan
            pub_org_unit_name = np.nan

        pub_doi = _text_or_nan(node, "./electronicVersions/electronicVersion/doi")

        rows.append({"uuid": pub_uuid,
                     "title": pub_title,
                     "author": pub_author,
                     "author_uuid": pub_author_uuid,
                     "year": pub_year,
                     "language": pub_lang,
                     "org_unit_uuid": pub_org_unit_uuid,
                     "org_unit_extid": pub_org_unit_extid,
                     "org_unit_name": pub_org_unit_name,
                     "doi": pub_doi
                    })

# Build the frame once from the collected records
pub_df = pd.DataFrame(rows, columns=df_cols)

### Data inspection

In [None]:
# Summary statistics of the thesis data frame (pandas chooses which columns
# to summarise based on their dtypes)
pub_df.describe()

In [None]:
# Preview the first ten rows as a quick sanity check of the parsed fields
pub_df.head(10)

### Visualisations

In [None]:
# The 'year' column holds text taken from the XML, so convert it to integers
# before counting. A histogram of the raw strings would treat the years as
# unordered categories and squeeze them into 10 default bins, which merges
# years once there are more than ten of them.
year_values = pd.to_numeric(pub_df['year'], errors='coerce').dropna().astype(int)
theses_per_year = year_values.value_counts().sort_index()

# One bar per year, in chronological order
plt.bar(x=theses_per_year.index, height=theses_per_year.values, width=0.85)
plt.xticks(theses_per_year.index)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.title('Amount of theses per year')
plt.show()

In [None]:
# Count theses per language explicitly and draw one bar per language.
# A histogram of the raw strings uses 10 default bins that do not line up with
# the language categories, so bars and tick labels can disagree.
# value_counts() also skips missing language values.
theses_per_language = pub_df['language'].value_counts()

# log=True puts the y-axis on a log scale so rare languages stay visible
plt.bar(x=theses_per_language.index, height=theses_per_language.values, log=True)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Language')
plt.ylabel('Frequency (log)')
plt.title('Amount of theses per language')
plt.show()

In [None]:
# Number of theses per organisational unit name
theses_per_department = pub_df.groupby('org_unit_name').size()

# Keep the ten units with the most theses, largest first
top10_departments_name = theses_per_department.sort_values(ascending=False).iloc[:10]

In [None]:
# Bar chart of the ten departments with the most theses
plt.bar(x=top10_departments_name.keys(), height=top10_departments_name)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Department')
plt.ylabel('Amount')
# Department names are long, so rotate the tick labels to keep them readable
plt.xticks(rotation='vertical')
# Build the year range from the config values so the title cannot go stale
# when min_year / max_year are changed
plt.title('Top 10 theses-producing departments ({} - {})'.format(min_year, max_year))
plt.show()