In [2]:
from uplink import Consumer, get, post, Body, returns, Field, headers
from uplink import json as uplink_json
import json


In [3]:
# Root URL of the web service; handed to the API client below as `base_url`,
# so the relative endpoint paths declared on the client are resolved against it.
api_url = 'https://polon.nauka.gov.pl/opi-ws/'


In [4]:
class PolonAPI(Consumer):
    """Declarative uplink client for the institution endpoints of the service.

    Each method describes one HTTP GET request; ``@returns.json`` makes the
    call return the decoded JSON body. The method bodies are intentionally
    empty because the request is described entirely by the decorators.
    """

    @returns.json
    @get("api/academicInstitutions")
    def academic_institutions(self):
        """GET ``api/academicInstitutions`` and return the decoded JSON."""

    @returns.json
    @get("api/scientificInstitutions")
    def scientific_institutions(self):
        """GET ``api/scientificInstitutions`` and return the decoded JSON."""

    @returns.json
    @get("api/institutions/{uid}")
    def institution(self, uid: str):
        """GET ``api/institutions/{uid}`` for one institution and return the decoded JSON."""


# Client instance shared by the rest of the notebook.
polon_api = PolonAPI(base_url=api_url)


In [5]:
# Download both institution listings up front. Each call issues one GET request
# through the client and yields the decoded JSON payload.
academic_institutions = polon_api.academic_institutions()
scientific_institutions = polon_api.scientific_institutions()


In [6]:
def clean_institution_name(iname):
    """Return ``iname`` with a fixed set of boilerplate fragments removed.

    The fragments are deleted one after another in the order listed below
    (every occurrence of each), and leading/trailing whitespace is trimmed
    after each deletion.
    """
    removable_fragments = (
        ' w likwidacji',
        ' Polskiej Akademii Nauk',
        ' - Państwowy Instytut Badawczy',
        ' - PIB',
        ' - Instytut Badawczy',
        'Sieć Badawcza Łukasiewicz - ',
    )
    cleaned = iname
    for fragment in removable_fragments:
        cleaned = cleaned.replace(fragment, '').strip()
    return cleaned



def clean_inst_detail_data(inst_detail_data):
    """Normalise one institution record in place.

    The dict is modified as follows:
      * the keys 'results', 'version' and 'mainUid' are removed if present;
      * the nested 'address' mapping is removed and its entries are merged
        into the top level (an address entry overwrites a top-level key of
        the same name);
      * 'name_clean' is added, holding the cleaned form of 'name'.

    Args:
        inst_detail_data: mutable mapping describing a single institution.

    Returns:
        None; the argument itself is mutated.

    Raises:
        KeyError: if the record has no 'name' key.
    """
    for unwanted_key in ('results', 'version', 'mainUid'):
        inst_detail_data.pop(unwanted_key, None)

    # The pop default only applies when the key is absent; a record carrying
    # an explicit null address would hand None to update() and raise
    # TypeError, so fall back to an empty mapping in that case as well.
    inst_detail_data.update(inst_detail_data.pop('address', None) or {})

    inst_detail_data['name_clean'] = clean_institution_name(
        inst_detail_data['name'])


In [7]:
# Enrich every listed institution with its detail record (one request per
# institution) and normalise the merged record. The scientific listing is
# processed first, the academic listing second.
for listing in (scientific_institutions['scientificInstitutions'],
                academic_institutions['institutions']):
    for inst_data in listing:
        uid = inst_data['uid']
        inst_data.update(polon_api.institution(uid))
        clean_inst_detail_data(inst_data)


# Append the scientific institutions to the academic list so that a single
# list holds every record.
# NOTE: extend() mutates the list in place, so running this cell a second time
# without re-fetching appends the scientific records again.
academic_institutions['institutions'].extend(
    scientific_institutions['scientificInstitutions'])


In [8]:
import pandas as pd

# One row per record in the combined list (the scientific institutions were
# appended to the academic ones above).
institutions_df = pd.DataFrame(academic_institutions['institutions'])



In [9]:
# Convert institutions_disciplines so that every discipline ('dos_*') and every
# field ('fos_*') becomes its own indicator column, with one row per institution.
DISCIPLINE_COL = 'Dyscypliny - Nazwa dyscypliny'
FIELD_COL = 'Dyscypliny - Nazwa dziedziny'

# Forward-fill down the rows so a blank cell inherits the value above it.
institutions_disciplines = pd.read_excel(
    'zestawienie.xlsx', index_col='Lp').ffill(axis=0)

dos_dummy = pd.get_dummies(
    institutions_disciplines[DISCIPLINE_COL], prefix='dos', prefix_sep='_')
fos_dummy = pd.get_dummies(
    institutions_disciplines[FIELD_COL], prefix='fos', prefix_sep='_')

# Attach the indicator columns side by side. All three frames carry the very
# same index, so concat lines the rows up one-to-one. An index-keyed join would
# instead pair every row with every other row sharing its 'Lp' value, which
# multiplies rows and mixes indicators if 'Lp' is ever blank or repeated.
institutions_disciplines = pd.concat(
    [
        institutions_disciplines.drop(columns=[DISCIPLINE_COL, FIELD_COL]),
        dos_dummy,
        fos_dummy,
    ],
    axis=1,
)

# Collapse to one row per (record identifier, institution name); taking the
# maximum of each indicator marks it as set if any source row had it set.
institutions_disciplines = (
    institutions_disciplines
    .groupby(['Unikalny identyfikator rekordu', 'Nazwa instytucji'])
    .max()
    .reset_index()
)



In [10]:
# Row count after grouping, i.e. the number of distinct
# (record identifier, institution name) pairs.
len(institutions_disciplines)


283

In [11]:
# Attach the discipline indicators to the institutions by exact name match.
# The left join keeps every row of institutions_df; rows without a matching
# 'Nazwa instytucji' end up with missing values in the discipline columns.
# NOTE(review): the join key is the raw 'name' column; the 'name_clean' column
# computed during clean-up is not used for matching here - confirm intent.
in_discp = pd.merge(
    institutions_df,
    institutions_disciplines,
    how='left',
    left_on='name',
    right_on='Nazwa instytucji',
)


In [12]:

# Export the merged table. The `encoding` keyword is intentionally not passed:
# to_excel never used it, pandas 1.5 deprecated it and pandas 2.0 removed it,
# so supplying it raises TypeError on current pandas versions.
in_discp.to_excel('out/polon_discp.xlsx', index=False)
