In [1]:
import pandas as pd
import json
import config
import glob
import gzip

# Create concept relations table for network visualisation
- first, find "core" level 2 concepts for the entire dataset (based on publication counts)
- keep publication counts per university for color coding

## first, prepare the data

In [2]:
# load publications data
# JSON text is UTF-8; the encoding is stated explicitly so that decoding does not
# depend on the platform's default locale encoding
with open('../data/publications.json', 'r', encoding='utf-8') as f:
    publications = pd.DataFrame(json.load(f))

In [3]:
# load concept hierarchy
# note: concept ids are already lowercase in the file, parent ids are not,
# so the parent ids are lowercased here to make both comparable
concepts_hierarchy = (
    pd.read_csv('../openalex_concepts_hierarchy.csv')
    .assign(parent_ids=lambda hierarchy: hierarchy['parent_ids'].str.lower())
)

In [4]:
# filter on psychology concepts
# Parent ids are stored as one ', '-separated string per concept. They are split
# into sets so that ids are compared as whole tokens: a plain substring search
# would also accept any parent id that merely starts with the psychology id.
psy_id = concepts_hierarchy.loc[concepts_hierarchy.display_name == 'Psychology', 'openalex_id'].iloc[0]
parent_id_sets = concepts_hierarchy.parent_ids.apply(
    lambda x: set(x.split(', ')) if isinstance(x, str) else set())  # missing parents -> empty set

# children: concepts that list psychology among their parents
psy_children = concepts_hierarchy[parent_id_sets.apply(lambda ids: psy_id in ids)]

# grandchildren: concepts with at least one parent among the psychology children
# (the child ids are collected once in a set instead of being re-scanned per row)
psy_children_ids = set(psy_children['openalex_id'])
psy_grandchildren = concepts_hierarchy[
    parent_id_sets.apply(lambda ids: not ids.isdisjoint(psy_children_ids))]

## create concepts subset

In [5]:
# one row per (publication, concept) pair
publications = publications.explode('concepts').reset_index(drop=True)
# flatten every concept dict into its own columns, prefixed with 'concept.'
concept_fields = (
    pd.json_normalize(publications['concepts'])
    .reset_index(drop=True)
    .add_prefix('concept.')
)
# put the flattened fields next to the publication columns and drop the raw dicts
publications = pd.concat([publications, concept_fields], axis=1).drop(columns='concepts')

In [6]:
# only level 2 concepts with grandparent psychology
is_level_2 = publications['concept.level'] == 2
publications_2 = publications.loc[is_level_2].copy()
# the hierarchy table uses lowercase ids, so lowercase before matching
publications_2['concept.id'] = publications_2['concept.id'].str.lower()
has_psy_grandparent = publications_2['concept.id'].isin(psy_grandchildren.openalex_id)
publications_2 = publications_2.loc[has_psy_grandparent]

In [7]:
# count how often each concept appears in the publication set
# then filter using a cutoff (~50 nodes?)
counts = publications_2.value_counts('concept.id')
n_nodes = 50
n_concepts = len(counts)
share_of_total = round((n_nodes / n_concepts) * 100)
print(f'{n_concepts} concepts in total')
print(f'{n_nodes} are {share_of_total} % of total')

1358 concepts in total
50 are 4 % of total


In [8]:
# keep the concepts whose count reaches the quantile that leaves roughly n_nodes concepts
count_cutoff = counts.quantile(1 - (n_nodes / len(counts)))
concepts_subset = counts[counts >= count_cutoff].index.tolist()
print('number of nodes (concepts):', len(concepts_subset))

number of nodes (concepts): 51


In [9]:
# restrict the publication/concept rows to the selected concepts
in_subset = publications_2['concept.id'].isin(concepts_subset)
publications_2_subset = publications_2.loc[in_subset].copy()

Aggregate, but keep the publication counts per university for color coding.

In [10]:
# counts per university
# NOTE: works are now counted more than once if they are shared amongst universities
works_count = (
    publications_2_subset
    .groupby(['university', 'concept.id'])['id']
    .count()
    .to_frame('count')
)

In [11]:
# write the per-university counts to disk; the index is kept, so the
# (university, concept.id) group keys are written as the leading columns
works_count.to_csv('../data/works_count.csv')

## create concept relations table

In [12]:
# keep only the concept columns and strip their 'concept.' prefix
concept_columns = ['concept.id', 'concept.display_name', 'concept.level', 'concept.score']
concepts_2 = (
    publications_2_subset[concept_columns]
    .copy()
    .rename(columns=lambda name: name.replace('concept.', ''))
)

In [13]:
# NOTE: works are now counted more than once if they are shared amongst universities
# every row receives the number of rows that share its concept id
rows_per_concept = concepts_2.groupby('id')['id'].transform('size')
concepts_2 = concepts_2.assign(works_count=rows_per_concept)

In [14]:
# keep a single row per concept: the first occurrence of each id wins, so the
# remaining per-row columns are taken from that first row
is_repeated_id = concepts_2.duplicated(subset='id', keep='first')
concepts_2 = concepts_2[~is_repeated_id]

In [15]:
# sanity check: one row per selected concept is expected here
print(concepts_2.shape[0])

51


In [16]:
# get concept relations from data snapshot
# every .gz file below the snapshot folder is read line by line,
# with each line parsed as one JSON document
snapshot_pattern = f'{config.project_path}/openalex-snapshot-concepts/**/*.gz'
concepts_meta = []
for snapshot_file in glob.glob(snapshot_pattern, recursive=True):
    with gzip.open(snapshot_file, 'rb') as handle:
        concepts_meta.extend(json.loads(record) for record in handle)

In [17]:
# sanity check: number of concept records read from the snapshot files
len(concepts_meta)  # that's the exact amount that is stated on https://api.openalex.org/concepts

65073

In [18]:
# only the id and the related concepts are needed from the snapshot records;
# ids are lowercased so they match the lowercased concept ids used elsewhere
concepts_meta_df = pd.DataFrame(concepts_meta).loc[:, ['id', 'related_concepts']]
concepts_meta_df = concepts_meta_df.assign(id=concepts_meta_df['id'].str.lower())

In [19]:
# attach the related concepts from the snapshot to every selected concept;
# a left join keeps concepts that have no matching snapshot record
concepts_2 = pd.merge(concepts_2, concepts_meta_df, on='id', how='left')

In [20]:
# one row per (concept, related concept) pair, with a fresh 0..n-1 index
concepts_2_relations = concepts_2.explode('related_concepts', ignore_index=True)

In [21]:
# flatten every related-concept dict into columns prefixed with 'rel_'
related_fields = pd.json_normalize(concepts_2_relations['related_concepts']).add_prefix('rel_')
concepts_2_relations = pd.concat([concepts_2_relations, related_fields], axis=1)

In [22]:
# lowercase the related-concept ids so they can be compared with the lowercased 'id' column
concepts_2_relations = concepts_2_relations.assign(
    rel_id=concepts_2_relations['rel_id'].str.lower()
)

In [23]:
# filter out the related concepts that are not part of our set
related_in_set = concepts_2_relations['rel_id'].isin(concepts_2['id'])
concepts_2_relations = concepts_2_relations.loc[related_in_set]

In [24]:
# level 1 concepts (parents) will be added to the graph
selected_hierarchy = psy_grandchildren[psy_grandchildren.openalex_id.isin(concepts_subset)]
# split the ', '-separated parent names/ids into lists and unfold them in
# parallel, giving one row per (concept, parent) pair
parents_relations = selected_hierarchy.assign(
    parent_display_names=selected_hierarchy['parent_display_names'].str.split(', '),
    parent_ids=selected_hierarchy['parent_ids'].str.split(', '),
).explode(['parent_display_names', 'parent_ids'])
# keep only the parents that are themselves children of psychology
is_psy_child = parents_relations.parent_ids.isin(psy_children.openalex_id)
parents_relations = parents_relations[is_psy_child]

In [25]:
# reshape the parent relations into the column layout of concepts_2_relations
aux = (
    parents_relations[['openalex_id', 'display_name', 'parent_ids', 'parent_display_names']]
    .rename(columns={'openalex_id': 'id',
                     'parent_ids': 'rel_id',
                     'parent_display_names': 'rel_display_name'})
    .assign(rel_level=1)
)

In [26]:
# append the parent relations, restricted to concepts that already have at
# least one relation row in concepts_2_relations
parent_edges = aux.loc[aux.id.isin(concepts_2_relations.id)]
concepts_2_relations = pd.concat([concepts_2_relations, parent_edges])

In [27]:
# write the relations table to disk; the index is left out of the file
concepts_2_relations.to_csv('../data/concepts_relations.csv', index=False)