In [9]:
import os
import re
import json
import psycopg2
import pickle
import pandas as pd
import numpy as np

#### Getting concept parents

In [8]:
def replace_id_with_pred(ids, vocab=None):
    """Translate concept id(s) through an id -> prediction-id mapping.

    Parameters
    ----------
    ids : list or scalar
        Either a list of ids or a single id.
    vocab : mapping, optional
        Mapping used for the lookup. When omitted, the notebook-level
        ``tag_id_vocab_inv`` is used; that name is not created in the
        visible cells, so it must already exist in the kernel.

    Returns
    -------
    list or object
        For a list input: the mapped values of the ids present in the
        mapping, in input order (ids missing from the mapping are dropped).
        For any other input: the mapped value, or ``""`` when the id is
        missing from the mapping.
    """
    if vocab is None:
        vocab = tag_id_vocab_inv
    if isinstance(ids, list):
        # Membership test instead of a " " sentinel, so a mapped value that
        # happens to equal " " is not mistaken for "missing".
        return [vocab[x] for x in ids if x in vocab]
    return vocab.get(ids, "")

In [9]:
def get_correct_levels(ids, levels, vocab=None):
    """Keep the levels whose paired id is present in the id mapping.

    Parameters
    ----------
    ids : iterable
        Ids, paired positionally with ``levels``.
    levels : iterable
        Levels, paired positionally with ``ids``.
    vocab : mapping, optional
        Mapping whose keys define which ids are kept. When omitted, the
        notebook-level ``tag_id_vocab_inv`` is used; that name is not created
        in the visible cells, so it must already exist in the kernel.

    Returns
    -------
    list
        The entries of ``levels`` whose id is a key of the mapping, in order.
    """
    if vocab is None:
        vocab = tag_id_vocab_inv
    # Test key membership rather than truthiness of the mapped value: an id
    # mapped to a falsy value such as 0 is still a known id, and
    # replace_id_with_pred keeps it, so its level must be kept as well.
    return [level for level, concept_id in zip(levels, ids) if concept_id in vocab]

In [10]:
# Placeholder: the connection setup was redacted from this notebook, so the
# line below is not valid Python as written and must be filled in before running.
# NOTE(review): psycopg2 is imported above, so presumably this was a
# psycopg2.connect(...) call -- confirm, and read credentials from the
# environment rather than hardcoding them here.
conn = ## DB connection

In [11]:
# Cursor on the connection above; later cells use it to run the concept query.
cursor = conn.cursor()

In [14]:
# Placeholder: the load of the concepts table was redacted, so the assignment
# below is not valid Python as written and must be filled in before running.
current_tags = ## concepts table from OpenAlex snapshot
print(current_tags.shape)
# Derive an integer id from the OpenAlex URL: take the last path segment and
# drop its first character (the letter prefix, e.g. ".../c67503058" -> 67503058).
current_tags['child_id'] = current_tags['openalex_id'].apply(lambda x: x.split("/")[-1][1:]).astype('int')
# Show one random row as a sanity check.
current_tags.sample()

(65026, 7)


Unnamed: 0,openalex_id,display_name,normalized_name,level,wikidata_id,parent_display_names,parent_ids,child_id
24411,https://openalex.org/c67503058,Antisymmetric tensor,antisymmetric tensor,3,https://www.wikidata.org/wiki/Q1325769,Gauge theory,https://openalex.org/C181830111,67503058


In [15]:
# Load the concept/ancestor pairs from a CSV in the working directory; per the
# preview further down, each row pairs a concept (id, name, level) with one
# ancestor (ancestor_id, ancestor_name, ancestor_level).
concept_ancestors = pd.read_csv("concept-ancestors.csv")

In [30]:
# Lower-case every ancestor name; values are passed through str() first, so
# non-string entries are stringified rather than raising.
concept_ancestors["ancestor_name"] = concept_ancestors["ancestor_name"].map(
    lambda raw_name: str(raw_name).lower()
)

In [31]:
# Preview five random rows (no seed is set, so the rows differ between runs).
concept_ancestors.sample(5)

Unnamed: 0,id,name,level,ancestor_id,ancestor_name,ancestor_level
239943,2780553850,Alphaherpesvirinae,5,86803240,biology,0
134624,29906990,Frameshift mutation,4,127716648,phenotype,3
110037,53728453,STX1A,5,148785051,synaptic vesicle,4
172006,555894677,Lanthanum,2,185592680,chemistry,0
373410,38506071,Freivalds' algorithm,5,33923547,mathematics,0


In [17]:
# Id and normalized name of every concept row that has a Wikidata id.
# The SQL text (including its line breaks and spacing) is kept unchanged.
query = (
    "select field_of_study_id as id, normalized_name as child_name\n"
    "   from mid.concept \n"
    "   where wikidata_id is not null"
)

In [18]:
# NOTE(review): the explicit ROLLBACK presumably clears any failed/open
# transaction left on the connection by an earlier cell -- confirm.
cursor.execute("ROLLBACK;")
cursor.execute(query)

# Materialise the result set, naming the columns after the cursor metadata
# (the first field of each description entry is the column name).
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
tables = pd.DataFrame(fetched_rows, columns=column_names)

In [19]:
# Shape after dropping rows with any missing value (display only; `tables`
# itself is not modified).
tables.dropna().shape

(65073, 2)

#### This is code to get the 'ancestor chains' used in the V3 model

In [32]:
# Ids present in the current concepts table; both ends of every
# child/ancestor pair must be in this list to be kept.
current_ids = current_tags['child_id'].tolist()

# Attach every ancestor row to its concept and keep only the columns used below.
child_ancestor_pairs = (
    tables
    .merge(concept_ancestors, how='inner', on='id')
    .rename(columns={'id': 'child_id'})
)
child_ancestor_pairs = child_ancestor_pairs[
    ['child_id', 'child_name', 'level', 'ancestor_id', 'ancestor_name', 'ancestor_level']
]

child_is_current = child_ancestor_pairs['child_id'].isin(current_ids)
ancestor_is_current = child_ancestor_pairs['ancestor_id'].isin(current_ids)
childs_with_ancestors = child_ancestor_pairs[child_is_current & ancestor_is_current].copy()

# Direct parents: ancestors that sit exactly one level above the child.
one_level_up = childs_with_ancestors['level'] == childs_with_ancestors['ancestor_level'] + 1
only_parents = childs_with_ancestors.loc[
    one_level_up, ['child_id', 'ancestor_id', 'ancestor_name', 'ancestor_level']
].copy()

In [33]:
def _attach_parent_levels(base, parents, depth=5):
    """Left-join the parent table onto ``base`` ``depth`` times in a row.

    Step 1 joins on ``child_id``; step k (k > 1) joins on ``ancestor_id_{k-1}``,
    i.e. it looks up the parents of the ancestors added by the previous step.
    Each step adds ``ancestor_id_k``, ``ancestor_name_k`` and
    ``ancestor_level_k``. A row with several parents at some step is
    duplicated by the join, once per parent.
    """
    expanded = base
    join_key = 'child_id'
    for step in range(1, depth + 1):
        parent_step = parents.rename(columns={'child_id': join_key,
                                              'ancestor_id': f'ancestor_id_{step}',
                                              'ancestor_level': f'ancestor_level_{step}',
                                              'ancestor_name': f'ancestor_name_{step}'})
        expanded = expanded.merge(parent_step, how='left', on=join_key)
        # The ancestors just attached are the children of the next step.
        join_key = f'ancestor_id_{step}'
    return expanded


# One starting row per concept, then five rounds of parent look-ups.
fully_expanded_hierarchy = _attach_parent_levels(
    childs_with_ancestors.drop_duplicates('child_id')[['child_id', 'child_name', 'level']],
    only_parents,
)

In [34]:
def get_hierarchy_row_list(level, anc_1, anc_2, anc_3, anc_4, anc_5):
    """Return the first ``level`` ancestor values as a list, in the order given.

    Parameters
    ----------
    level : number
        How many of the five ancestor values to keep. Any value other than
        1, 2, 3, 4 or 5 yields an empty list.
    anc_1 .. anc_5 : object
        Ancestor values. When ``anc_1`` is a ``str`` the kept values are
        returned untouched; otherwise each kept value is converted with
        ``int()``. Values beyond ``level`` are never inspected or converted.

    Returns
    -------
    list
        The kept ancestor values (possibly empty).
    """
    if level not in (1, 2, 3, 4, 5):
        return []

    kept = [anc_1, anc_2, anc_3, anc_4, anc_5][:int(level)]
    if isinstance(anc_1, str):
        return kept
    return [int(anc) for anc in kept]

In [35]:
def _ancestor_chain(row, prefix):
    """Build one row's chain from its `<prefix>_1` .. `<prefix>_5` columns, cut to the row's level."""
    return get_hierarchy_row_list(row['level'],
                                  *[row[f'{prefix}_{depth}'] for depth in range(1, 6)])


# Name chain first, then id chain, each computed row by row.
fully_expanded_hierarchy['anc_names_chain'] = fully_expanded_hierarchy.apply(
    lambda row: _ancestor_chain(row, 'ancestor_name'), axis=1)
fully_expanded_hierarchy['anc_id_chain'] = fully_expanded_hierarchy.apply(
    lambda row: _ancestor_chain(row, 'ancestor_id'), axis=1)

In [36]:
# Translate every id in each ancestor chain through tag_id_vocab_inv.
# Direct indexing is deliberate: an id missing from the mapping raises KeyError.
fully_expanded_hierarchy['anc_pred_id_chain'] = fully_expanded_hierarchy['anc_id_chain'].map(
    lambda chain: [tag_id_vocab_inv[ancestor_id] for ancestor_id in chain]
)

In [37]:
# Translate each concept's own id through tag_id_vocab_inv
# (an id missing from the mapping raises KeyError).
fully_expanded_hierarchy['pred_id'] = fully_expanded_hierarchy['child_id'].map(
    lambda concept_id: tag_id_vocab_inv[concept_id]
)

In [38]:
# Collapse to one row per (pred_id, level), gathering that concept's chains
# into a list of name chains and a list of pred-id chains.
group_keys = ['pred_id', 'level']
chain_columns = ['anc_names_chain', 'anc_pred_id_chain']
table_for_saving = (
    fully_expanded_hierarchy[group_keys + chain_columns]
    .groupby(group_keys)
    .agg(list)
    .reset_index()
)

In [39]:
# Preview five random rows (no seed is set, so the rows differ between runs).
table_for_saving.sample(5)

Unnamed: 0,pred_id,level,anc_names_chain,anc_pred_id_chain
46998,47015,4,"[[energy intensity, efficient energy use, elec...","[[56409, 1035, 22008, 52408], [56409, 1035, 39..."
38134,38149,5,"[[enterprise systems engineering, enterprise a...","[[25578, 1372, 46201, 34874, 1], [25578, 1372,..."
63117,63137,2,"[[thermodynamics, physics]]","[[11868, 60518]]"
9426,9433,3,"[[aqueous solution, physical chemistry, chemis...","[[61811, 38795, 677], [61811, 31978, 677]]"
58352,58371,3,"[[optical fiber, optics, physics], [optical fi...","[[7666, 38456, 60518], [7666, 53058, 2356], [7..."


In [40]:
# Lookup keyed by pred_id: each value holds that concept's name chains
# ('anc_tags') and pred-id chains ('anc_ids').
chain_records = [
    {'anc_tags': name_chains, 'anc_ids': id_chains}
    for name_chains, id_chains in zip(table_for_saving['anc_names_chain'].tolist(),
                                      table_for_saving['anc_pred_id_chain'].tolist())
]
ancestors = dict(zip(table_for_saving['pred_id'].tolist(), chain_records))

In [41]:
# Concepts that are in the current concepts table but never made it into the
# expanded hierarchy.
expanded_child_ids = fully_expanded_hierarchy['child_id'].tolist()
current_child_ids = current_tags['child_id'].tolist()

not_expanded = ~tables['id'].isin(expanded_child_ids)
in_current_tags = tables['id'].isin(current_child_ids)

childs_without_ancestors = tables[not_expanded & in_current_tags].copy()
# replace_id_with_pred yields "" for an id it cannot map.
childs_without_ancestors['child_pred_id'] = childs_without_ancestors['id'].apply(replace_id_with_pred)
childs_without_ancestors.shape

(19, 3)

In [42]:
# Give every concept without ancestors an entry with empty chains.
# NOTE(review): these entries carry a 'level' key, while the entries built
# from table_for_saving only have 'anc_tags' and 'anc_ids' -- confirm that
# consumers of the pickle tolerate both shapes.
orphan_pred_ids = childs_without_ancestors['child_pred_id'].tolist()
ancestors.update({
    orphan_pred_id: {'level': 0, 'anc_tags': [], 'anc_ids': []}
    for orphan_pred_id in orphan_pred_ids
})

In [43]:
# Number of entries in the lookup; the recorded output (65026) equals the row
# count printed for current_tags earlier in the notebook.
len(ancestors)

65026

In [46]:
# Persist the lookup to the working directory as a pickle.
with open("ancestor_chains.pkl", "wb") as pickle_file:
    pickle.dump(ancestors, pickle_file)