In [1]:
import pandas as pd
from scripts.openalex_works import get_works
import config

In [2]:
# common prefix (directory + file-name stem) of every CSV file this notebook writes
output_path = f'{config.project_path}/tables/oalex_'

In [3]:
# Read every column as text and strip stray whitespace from the values.
# Empty cells still come back as NaN (a float) even with dtype=object, so only real
# strings are stripped -- calling .strip() on NaN would raise AttributeError.
affiliations = pd.read_csv(f'{config.project_path}/affiliations.csv', dtype=object).applymap(
    lambda x: x.strip() if isinstance(x, str) else x)

publications table

In [11]:
def extract_countries(work):
    """Collect the country codes of a work's authors (needed for collaborations).

    Every author contributes each distinct country code found on their
    institutions exactly once, so an author with several affiliations in the
    same country is counted once for that country.  Codes are emitted in
    first-seen order, which keeps the result reproducible between runs.
    A missing country code (``None``) is kept as a value, as before.

    Parameters
    ----------
    work : dict
        Work record with an optional ``authorships`` list; each authorship may
        carry an ``institutions`` list of dicts with a ``country_code`` key.
        Absent or ``None`` lists are treated as empty.

    Returns
    -------
    list
        Country codes of all authors, concatenated author by author.
    """
    work_countries = []
    for author in work.get('authorships') or []:
        institutions = author.get('institutions') or []
        # dict.fromkeys de-duplicates like a set but preserves insertion order
        author_countries = dict.fromkeys(
            institution.get('country_code') for institution in institutions)
        work_countries.extend(author_countries)
    return work_countries


def extract_metadata(work):
    """Build the flat publication record for one work.

    The fields id, doi, title, concepts and cited_by_count are copied from the
    work (``None`` when absent); ``countries`` holds the author country codes
    returned by ``extract_countries``.
    """
    copied_fields = ('id', 'doi', 'title', 'concepts', 'cited_by_count')
    record = {field: work.get(field) for field in copied_fields}
    record['countries'] = extract_countries(work)
    return record

In [15]:
subject_id = 'https://openalex.org/C15744967'  # psychology concept id
year_range = range(2016, 2022)  # range() excludes its end value, so this covers 2016-2021
publications = []
# one query per affiliation row and publication year; every returned work is stored
# together with the university and year it was requested for
for index, row in affiliations.iterrows():
    for year in year_range:
        works_filter = f'authorships.institutions.ror:{row["ror"]},publication_year:{year},concepts.id:{subject_id}'
        works = get_works(works_filter)
        for work in works:
            pub = extract_metadata(work)
            pub.update(university=row['university'], year=year)
            publications.append(pub)

In [16]:
# one record per affiliation row and work, so a work returned for several rows is counted repeatedly
len(publications)

30419

In [57]:
# Flatten the harvested records and keep a single row per work id (the first one harvested).
# The university and the raw concept list are dropped here; they go into their own tables.
publications_flat = pd.json_normalize(publications)
publications_unique = publications_flat.drop_duplicates(subset='id')
publications_table = publications_unique.drop(columns=['university', 'concepts'])

In [58]:
# persist the de-duplicated publication metadata (row index is not written)
publications_table.to_csv(f'{output_path}publications.csv', index=False)

affiliations table

In [19]:
# link table: which work id was harvested for which university (duplicates across universities kept)
harvested_records = pd.json_normalize(publications)
pubs_affs_table = harvested_records.loc[:, ['id', 'university']]

In [20]:
# persist the work-to-university link table (row index is not written)
pubs_affs_table.to_csv(f'{output_path}pubs_affs.csv', index=False)

concepts table
- level 1 (with parent psychology)
- level 2 (with grandparent psychology)
- rest category (psychology without a child concept)

In [40]:
# Long format: one row per (work, concept). Each work is kept once, its concept list is
# exploded into rows, and the index is renumbered so it lines up with the expanded columns.
concepts_long = (
    pd.json_normalize(publications)[['id', 'concepts']]
    .drop_duplicates('id')
    .explode('concepts')
    .reset_index(drop=True)
)
# expand every concept dict into its own columns, prefixed with "concept_"
concept_columns = pd.json_normalize(concepts_long['concepts']).add_prefix('concept_')
pubs_concepts_table = pd.concat([concepts_long, concept_columns], axis=1).drop(columns='concepts')

In [42]:
# concept ids are lowercase in this file but parent ids are not, so the parent ids
# are lowercased on load to make the two columns comparable
concepts_hierarchy = pd.read_csv('../openalex_concepts_hierarchy.csv').assign(
    parent_ids=lambda hierarchy: hierarchy['parent_ids'].str.lower()
)

In [101]:
def _has_parent_in(parent_ids, candidate_ids):
    """Return True if any id in the comma-separated ``parent_ids`` string is in ``candidate_ids``.

    Missing values (NaN) mean "no parents" and yield False.
    """
    if pd.isna(parent_ids):
        return False
    return any(parent.strip() in candidate_ids for parent in parent_ids.split(','))


# Children: concepts that list psychology among their parents. Ids are compared as whole
# tokens; a substring search would also select any concept whose parent id merely starts
# with the psychology id.
subject_ids = {subject_id.lower()}
psy_children = concepts_hierarchy[
    concepts_hierarchy.parent_ids.apply(lambda ids: _has_parent_in(ids, subject_ids))]

# Grandchildren: concepts with at least one parent among the psychology children.
# The child ids are put in a set so each membership test is constant-time.
psy_children_ids = set(psy_children['openalex_id'])
psy_grandchildren = concepts_hierarchy[
    concepts_hierarchy.parent_ids.apply(lambda ids: _has_parent_in(ids, psy_children_ids))]

In [104]:
# Level-1 concepts, split by whether they are direct children of psychology.
# The membership mask is computed once and reused for both halves.
level1_concepts = pubs_concepts_table[pubs_concepts_table.concept_level == 1]
is_psy_child = level1_concepts.concept_id.str.lower().isin(psy_children['openalex_id'])

# rest category (all remaining level 1 concepts), collapsed under a single label.
# .assign() returns a new frame, so nothing is written into a slice of another frame.
pubs_concepts_table1_rest = level1_concepts[~is_psy_child].assign(concept_display_name='REST')

# psychology children first, rest rows appended after them
pubs_concepts_table1 = pd.concat([level1_concepts[is_psy_child], pubs_concepts_table1_rest])

In [106]:
# persist the level-1 concept table, including the REST rows (row index is not written)
pubs_concepts_table1.to_csv(f'{output_path}pubs_concepts1.csv', index=False)

In [107]:
# Level-2 concepts, restricted to the grandchildren of psychology, then written to disk.
level2_concepts = pubs_concepts_table.loc[pubs_concepts_table.concept_level == 2]
is_psy_grandchild = level2_concepts.concept_id.str.lower().isin(psy_grandchildren['openalex_id'])
pubs_concepts_table2 = level2_concepts.loc[is_psy_grandchild]
pubs_concepts_table2.to_csv(f'{output_path}pubs_concepts2.csv', index=False)

collaborations

In [108]:
from scripts.collaborations import collaborations

In [110]:
# Attach the level-1 concepts and then the publication metadata to every work-university link.
# Both are left joins on the work id, so links without a level-1 concept stay in (as NaN).
links_with_concepts = pd.merge(pubs_affs_table, pubs_concepts_table1, how='left', on='id')
c_data = pd.merge(links_with_concepts, publications_table, how='left', on='id')

In [112]:
# fraction of works without any level 1 concept; level 1 concepts that are not
# psychology children (the REST rows) count as present here
ids_without_concept = c_data.loc[pd.isna(c_data.concept_display_name), 'id'].unique()
len(ids_without_concept) / len(c_data['id'].unique())

0.05504587155963303

In [113]:
# keep only the rows that carry a level 1 concept name
c_data = c_data[c_data.concept_display_name.notna()]

In [114]:
# NOTE(review): the argument meanings are defined in scripts.collaborations, which is not
# visible here; presumably the column names for concept, university, work id and countries,
# followed by the home country code -- confirm against that module
c_int, c_ext = collaborations(c_data, 'concept_display_name', 'university', 'id', 'countries', 'NL')

In [115]:
# wrap both results of collaborations() in DataFrames and persist them (row index is not written)
pd.DataFrame(c_int).to_csv(f'{output_path}collaborations.csv', index=False)
pd.DataFrame(c_ext).to_csv(f'{output_path}collaborations_rest.csv', index=False)