In [1]:
import pandas as pd
import matplotlib
from unidecode import unidecode
import requests
import json
import textdistance

In [2]:
edges = pd.read_csv('data/edges.csv')
row_filter = edges.academic_degree != 'mestrado'
col_filter = ['source_id_lattes', 'target_id_lattes', 'institution', 'conclusion_year', 'academic_degree']
df = edges.loc[row_filter, col_filter]
df


Unnamed: 0,source_id_lattes,target_id_lattes,institution,conclusion_year,academic_degree
3,3744329382300316,39015885890,instituto de física de são carlos,2015,pos-doutorado
4,4628431262205595,39015885890,universidade federal do abc,2014,doutorado
5,8623798867256227,39015885890,instituto de pesquisas energéticas e nucleares,2019,pos-doutorado
10,2710474728753403,104116232000,universidade estadual de maringá,2016,pos-doutorado
11,9856154294275621,104116232000,universidade estadual de maringá,2015,doutorado
...,...,...,...,...,...
1633236,9979596352166630,9975608697544074,universidade federal de goiás,2021,doutorado
1633237,9999274930956418,9975992877773188,universidade de são paulo,2010,doutorado
1633239,9986529923037985,9978858327330851,universidade federal de viçosa,2009,doutorado
1633246,9991718680751230,9992478466506285,universidade de são paulo,2005,doutorado


In [3]:
tmp = df.loc[df.academic_degree == 'doutorado', ['target_id_lattes', 'institution']]
tmp

Unnamed: 0,target_id_lattes,institution
4,39015885890,universidade federal do abc
11,104116232000,universidade estadual de maringá
19,111305956227,universidade federal do ceará
31,174385402367,universidade federal de são carlos
33,183606503103,universidade federal de goiás
...,...,...
1633236,9975608697544074,universidade federal de goiás
1633237,9975992877773188,universidade de são paulo
1633239,9978858327330851,universidade federal de viçosa
1633246,9992478466506285,universidade de são paulo


In [4]:
df.rename(columns={'institution': 'target_institution'}, inplace=True)
tmp.rename(columns={'institution': 'source_institution', 'target_id_lattes': 'source_id_lattes'}, inplace=True)

In [5]:
institutions_raw = df.merge(tmp, on='source_id_lattes', how='left').dropna()
institutions_raw

Unnamed: 0,source_id_lattes,target_id_lattes,target_institution,conclusion_year,academic_degree,source_institution
0,3744329382300316,39015885890,instituto de física de são carlos,2015,pos-doutorado,universidade de são paulo
1,4628431262205595,39015885890,universidade federal do abc,2014,doutorado,universidade de são paulo
2,8623798867256227,39015885890,instituto de pesquisas energéticas e nucleares,2019,pos-doutorado,instituto de pesquisas energéticas e nucleares
3,2710474728753403,104116232000,universidade estadual de maringá,2016,pos-doutorado,universidade estadual de campinas
4,9856154294275621,104116232000,universidade estadual de maringá,2015,doutorado,universidade federal do rio de janeiro
...,...,...,...,...,...,...
502755,9979596352166630,9975608697544074,universidade federal de goiás,2021,doutorado,universidade de brasília
502756,9999274930956418,9975992877773188,universidade de são paulo,2010,doutorado,universidade federal do rio de janeiro
502757,9986529923037985,9978858327330851,universidade federal de viçosa,2009,doutorado,universidade estadual do norte fluminense darc...
502758,9991718680751230,9992478466506285,universidade de são paulo,2005,doutorado,universidade de são paulo


In [6]:
for year in range(1900, 2023, 5):
    row_filter = (institutions_raw.conclusion_year >= year) & (institutions_raw.conclusion_year <= year+4)
    institutions_raw.loc[row_filter, 'year_interval'] = f'{year}-{year+4}'
institutions_raw.year_interval

  institutions_raw.loc[row_filter, 'year_interval'] = f'{year}-{year+4}'


0         2015-2019
1         2010-2014
2         2015-2019
3         2015-2019
4         2015-2019
            ...    
502755    2020-2024
502756    2010-2014
502757    2005-2009
502758    2005-2009
502759    1995-1999
Name: year_interval, Length: 429334, dtype: object

In [7]:
dataviz = institutions_raw[['source_institution', 'target_institution']]

sourceviz = dataviz.groupby('source_institution').count().reset_index().rename(columns={
    'target_institution': 'weight',
    'source_institution': 'institution'
    })
targetviz = dataviz.groupby('target_institution').count().reset_index().rename(columns={
    'source_institution': 'weight',
    'target_institution': 'institution'
    })

In [8]:
sourceviz.groupby('weight').count().to_csv('source_weight_count')
targetviz.groupby('weight').count().to_csv('target_weight_count')

In [9]:
sourceviz = sourceviz[sourceviz.weight > 50]
targetviz = targetviz[targetviz.weight > 50]

In [10]:
source_institutions = list(sourceviz.institution.unique())
target_institutions = list(targetviz.institution.unique())
institutions = set(source_institutions + target_institutions)


In [11]:
name_table = pd.read_csv('jaccard.csv')
name_table.drop(columns='Unnamed: 0', inplace=True)
new_names = set(name_table.ror_name.to_list())
name_table

Unnamed: 0,acacia_name,ror_name
0,universidad nacional de educación a distancia,universidad nacional de educación a distancia
1,escola de comunicações e artes da usp,escola de comunicações e artes da usp
2,universidade de coimbra,Hospitais da Universidade de Coimbra
3,technical university of denmark,Technical University of Denmark
4,faculdade de engenharia da universidade do porto,faculdade de engenharia da universidade do porto
...,...,...
715,université de caen,université de caen
716,pontifícia universidade gregoriana,pontifícia universidade gregoriana
717,university of tokyo,The University of Tokyo
718,technische universität münchen,technische universität münchen


In [12]:
institutions = new_names
name_table['distinct'] = 'distinct'
name_table.groupby('distinct').nunique()


Unnamed: 0_level_0,acacia_name,ror_name
distinct,Unnamed: 1_level_1,Unnamed: 2_level_1
distinct,720,694


In [13]:
name_table.drop(columns='distinct', inplace=True)
institutions_raw = institutions_raw.merge(
    name_table, left_on='source_institution', right_on='acacia_name'
    ).drop(
        columns=['acacia_name', 'source_institution']
        ).rename(columns={'ror_name': 'source_institution'})
institutions_raw = institutions_raw.merge(
    name_table, left_on='target_institution', right_on='acacia_name'
    ).drop(
        columns=['acacia_name', 'target_institution']
        ).rename(columns={'ror_name': 'target_institution'})
institutions_raw

Unnamed: 0,source_id_lattes,target_id_lattes,conclusion_year,academic_degree,year_interval,source_institution,target_institution
0,3744329382300316,39015885890,2015,pos-doutorado,2015-2019,Universidade de São Paulo,instituto de física de são carlos
1,8198392523716590,20850693692136,2003,doutorado,2000-2004,Universidade de São Paulo,instituto de física de são carlos
2,1426399772341383,191932527256862,2012,doutorado,2010-2014,Universidade de São Paulo,instituto de física de são carlos
3,194007981724312,10202349,2019,pos-doutorado,2015-2019,Universidade de São Paulo,instituto de física de são carlos
4,194007981724312,10256510,2012,pos-doutorado,2010-2014,Universidade de São Paulo,instituto de física de são carlos
...,...,...,...,...,...,...,...
363761,5791603940369864,10627359,2006,doutorado,2005-2009,Novosibirsk State University,Novosibirsk State University
363762,5791603940369864,10637073,1998,doutorado,1995-1999,Novosibirsk State University,Novosibirsk State University
363763,5791603940369864,10637073,1998,doutorado,1995-1999,Novosibirsk State University,Novosibirsk State University
363764,5791603940369864,10650488,2001,doutorado,2000-2004,Novosibirsk State University,Novosibirsk State University


In [14]:
# cols = ['institution', 'distancia', 'orientacoes', 'orientadores', 'orientandos', 'institutos', 'year_interval', 'academic_degree']
node_tree_metrics = pd.DataFrame()
edge_tree_metrics = pd.DataFrame()

node_metrics = pd.DataFrame()


for institution in institutions:
    visited_academics = set()
    seeds =  set(institutions_raw[institutions_raw.source_institution == institution].source_id_lattes)
    distance = 1
    
    
    while len(seeds) > 0:
        visited_academics.update(seeds)
        influence_df = institutions_raw[institutions_raw.source_id_lattes.isin(seeds)]

        node_tree_metrics_tmp = influence_df.groupby(['academic_degree', 'year_interval']).agg(
            orientacoes=('source_id_lattes', 'count'),
            orientadores=('source_id_lattes', 'nunique'),
            orientandos=('target_id_lattes', 'nunique'),
            institutos_fonte=('source_institution', 'nunique'),
            institutos_alvo=('target_institution', 'nunique'),
            primeira_ocorrencia=('conclusion_year', 'min')
        ).reset_index()
        
        node_tree_metrics_tmp['reference_institution'] = institution
        node_tree_metrics_tmp['distancia'] = distance
        node_tree_metrics = pd.concat([node_tree_metrics, node_tree_metrics_tmp])

        edge_tree_metrics_tmp = influence_df.groupby(
            ['source_institution', 'target_institution', 'academic_degree', 'year_interval']
            ).agg(
            orientacoes=('source_id_lattes', 'count'),
            orientadores=('source_id_lattes', 'nunique'),
            orientandos=('target_id_lattes', 'nunique'),
            primeira_ocorrencia=('conclusion_year', 'min')
        ).reset_index()

        edge_tree_metrics_tmp['reference_institution'] = institution
        edge_tree_metrics_tmp['distancia'] = distance
        edge_tree_metrics = pd.concat([edge_tree_metrics, edge_tree_metrics_tmp])


        distance += 1
        seeds = set(influence_df.target_id_lattes) - visited_academics
node_tree_metrics.sort_values(['reference_institution', 'year_interval','distancia'])

edge_tree_metrics.rename(columns={
    'source_institution': 'Source',
    'target_institution': 'Target'
}, inplace=True)
node_tree_metrics.to_csv('results/intervaled/search-tree-nodes.csv', index=False)
edge_tree_metrics.to_csv('results/intervaled/search-tree-edges.csv', index=False)

In [15]:
node_tree_metrics

Unnamed: 0,academic_degree,year_interval,orientacoes,orientadores,orientandos,institutos_fonte,institutos_alvo,primeira_ocorrencia,reference_institution,distancia
0,doutorado,1990-1994,1,1,1,1,1,1994,universidad nacional de educación a distancia,1
1,doutorado,2005-2009,1,1,1,1,1,2006,universidad nacional de educación a distancia,1
2,doutorado,2010-2014,5,2,5,1,3,2010,universidad nacional de educación a distancia,1
3,doutorado,2015-2019,5,2,4,2,2,2015,universidad nacional de educación a distancia,1
4,doutorado,2020-2024,1,1,1,1,1,2022,universidad nacional de educación a distancia,1
...,...,...,...,...,...,...,...,...,...,...
0,doutorado,2015-2019,10,4,10,3,5,2015,University of Stirling,2
1,doutorado,2020-2024,5,4,5,3,3,2020,University of Stirling,2
2,pos-doutorado,2005-2009,1,1,1,1,1,2007,University of Stirling,2
3,pos-doutorado,2010-2014,4,1,4,1,1,2010,University of Stirling,2


In [16]:
edge_tree_metrics

Unnamed: 0,Source,Target,academic_degree,year_interval,orientacoes,orientadores,orientandos,primeira_ocorrencia,reference_institution,distancia
0,Universidade Estadual Paulista (Unesp),Universidade Estadual Paulista (Unesp),doutorado,2015-2019,1,1,1,2018,universidad nacional de educación a distancia,1
1,universidad nacional de educación a distancia,Universidad Autónoma de Asunción,doutorado,2010-2014,3,1,3,2010,universidad nacional de educación a distancia,1
2,universidad nacional de educación a distancia,Universidad Autónoma de Asunción,doutorado,2015-2019,3,1,3,2015,universidad nacional de educación a distancia,1
3,universidad nacional de educación a distancia,Universidad Autónoma de Asunción,doutorado,2020-2024,1,1,1,2022,universidad nacional de educación a distancia,1
4,universidad nacional de educación a distancia,Universidad Complutense de Madrid,doutorado,1990-1994,1,1,1,1994,universidad nacional de educación a distancia,1
...,...,...,...,...,...,...,...,...,...,...
9,universidade estadual de maringá,universidade estadual de maringá,pos-doutorado,2015-2019,2,1,2,2015,University of Stirling,2
10,universidade federal do pará,Museu Paraense Emílio Goeldi,doutorado,2020-2024,1,1,1,2020,University of Stirling,2
11,universidade federal do pará,Museu Paraense Emílio Goeldi,pos-doutorado,2015-2019,1,1,1,2019,University of Stirling,2
12,universidade federal do pará,universidade federal do amazonas,doutorado,2015-2019,2,1,2,2016,University of Stirling,2


In [17]:
# cols = ['institution', 'distancia', 'orientacoes', 'orientadores', 'orientandos', 'institutos', 'year_interval', 'academic_degree']
node_tree_metrics = pd.DataFrame()
edge_tree_metrics = pd.DataFrame()


for institution in institutions:
    visited_academics = set()
    seeds =  set(institutions_raw[institutions_raw.source_institution == institution].source_id_lattes)
    distance = 1
    
    
    while len(seeds) > 0:
        visited_academics.update(seeds)
        influence_df = institutions_raw[institutions_raw.source_id_lattes.isin(seeds)]
        node_tree_metrics_tmp = influence_df.drop(columns='year_interval').groupby(['academic_degree']).agg(
            orientacoes=('source_id_lattes', 'count'),
            orientadores=('source_id_lattes', 'nunique'),
            orientandos=('target_id_lattes', 'nunique'),
            institutos_fonte=('source_institution', 'nunique'),
            institutos_alvo=('target_institution', 'nunique'),
            primeira_ocorrencia=('conclusion_year', 'min')
        ).reset_index()
        
        node_tree_metrics_tmp['reference_institution'] = institution
        node_tree_metrics_tmp['distancia'] = distance
        node_tree_metrics = pd.concat([node_tree_metrics, node_tree_metrics_tmp])

        edge_tree_metrics_tmp = influence_df.drop(columns='year_interval').groupby(
            ['source_institution', 'target_institution', 'academic_degree']
            ).agg(
            orientacoes=('source_id_lattes', 'count'),
            orientadores=('source_id_lattes', 'nunique'),
            orientandos=('target_id_lattes', 'nunique'),
            primeira_ocorrencia=('conclusion_year', 'min')
        ).reset_index()

        edge_tree_metrics_tmp['reference_institution'] = institution
        edge_tree_metrics_tmp['distancia'] = distance
        edge_tree_metrics = pd.concat([edge_tree_metrics, edge_tree_metrics_tmp])


        distance += 1
        seeds = set(influence_df.target_id_lattes) - visited_academics
node_tree_metrics.sort_values(['reference_institution','distancia'])

edge_tree_metrics.rename(columns={
    'source_institution': 'Source',
    'target_institution': 'Target'
}, inplace=True)

node_tree_metrics.to_csv('results/search-tree-nodes.csv', index=False)
edge_tree_metrics.to_csv('results/search-tree-edges.csv', index=False)

In [18]:
node_tree_metrics

Unnamed: 0,academic_degree,orientacoes,orientadores,orientandos,institutos_fonte,institutos_alvo,primeira_ocorrencia,reference_institution,distancia
0,doutorado,13,4,12,2,5,1994,universidad nacional de educación a distancia,1
0,doutorado,1,1,1,1,1,2015,universidad nacional de educación a distancia,2
0,doutorado,30,2,30,1,5,2010,universiadad del norte,1
0,doutorado,11,2,6,2,1,2012,universiadad del norte,2
0,doutorado,72,5,66,2,6,2003,universität dortmund,1
...,...,...,...,...,...,...,...,...,...
0,doutorado,8,2,8,2,2,2016,Universidad de Ciencias Médicas de la Habana,2
0,doutorado,83,11,83,1,10,1999,University of Stirling,1
1,pos-doutorado,19,5,19,1,7,2004,University of Stirling,1
0,doutorado,15,6,15,4,7,2015,University of Stirling,2


In [19]:
edge_tree_metrics

Unnamed: 0,Source,Target,academic_degree,orientacoes,orientadores,orientandos,primeira_ocorrencia,reference_institution,distancia
0,Universidade Estadual Paulista (Unesp),Universidade Estadual Paulista (Unesp),doutorado,1,1,1,2018,universidad nacional de educación a distancia,1
1,universidad nacional de educación a distancia,Universidad Autónoma de Asunción,doutorado,7,1,7,2010,universidad nacional de educación a distancia,1
2,universidad nacional de educación a distancia,Universidad Complutense de Madrid,doutorado,2,1,2,1994,universidad nacional de educación a distancia,1
3,universidad nacional de educación a distancia,Universidade Estadual Paulista (Unesp),doutorado,1,1,1,2018,universidad nacional de educación a distancia,1
4,universidad nacional de educación a distancia,universidad de granada,doutorado,1,1,1,2013,universidad nacional de educación a distancia,1
...,...,...,...,...,...,...,...,...,...
7,universidade estadual de maringá,universidade estadual de maringá,pos-doutorado,6,1,6,2010,University of Stirling,2
8,universidade federal do pará,Museu Paraense Emílio Goeldi,doutorado,1,1,1,2020,University of Stirling,2
9,universidade federal do pará,Museu Paraense Emílio Goeldi,pos-doutorado,1,1,1,2019,University of Stirling,2
10,universidade federal do pará,universidade federal do amazonas,doutorado,2,1,2,2016,University of Stirling,2


In [20]:
filtered_df = institutions_raw[
    institutions_raw.source_institution.isin(institutions) & institutions_raw.target_institution.isin(institutions)
]
edge_metrics = filtered_df.drop(columns='year_interval').groupby(['source_institution', 'target_institution', 'academic_degree']).agg(
    orientacoes=('source_id_lattes', 'count'),
    orientadores=('source_id_lattes', 'nunique'),
    orientandos=('target_id_lattes', 'nunique'),
    primeira_ocorrencia=('conclusion_year', 'min')
).reset_index().dropna(subset=['source_institution', 'target_institution'])
edge_metrics.rename(columns={
    'source_institution': 'Source',
    'target_institution': 'Target'
}, inplace=True)
edge_metrics.to_csv('results/edges.csv', index=False)
edge_metrics


Unnamed: 0,Source,Target,academic_degree,orientacoes,orientadores,orientandos,primeira_ocorrencia
0,ARC Centre of Excellence for the Digital Child,ARC Centre of Excellence for the Digital Child,doutorado,7,1,7,1996
1,ARC Centre of Excellence for the Digital Child,ARC Centre of Excellence for the Digital Child,pos-doutorado,10,1,10,2003
2,ARC Centre of Excellence for the Digital Child,Escola Nacional de Saúde Pública,doutorado,2,1,2,2005
3,ARC Centre of Excellence for the Digital Child,Hospital de Clínicas Universidade Federal do P...,doutorado,12,1,12,2004
4,ARC Centre of Excellence for the Digital Child,Hospital de Clínicas Universidade Federal do P...,pos-doutorado,2,1,2,2010
...,...,...,...,...,...,...,...
15805,école des hautes études en sciences sociales,universidade metodista de são paulo,doutorado,2,1,2,1998
15806,école des hautes études en sciences sociales,université de franche comté,doutorado,1,1,1,2019
15807,école des hautes études en sciences sociales,université libre de bruxelles,doutorado,1,1,1,2011
15808,école des hautes études en sciences sociales,université paris est créteil val de marne,doutorado,1,1,1,2013


In [21]:
filtered_df = institutions_raw[
    institutions_raw.source_institution.isin(institutions) & institutions_raw.target_institution.isin(institutions)
]
edge_metrics = filtered_df.groupby(['source_institution', 'target_institution', 'academic_degree', 'year_interval']).agg(
    orientacoes=('source_id_lattes', 'count'),
    orientadores=('source_id_lattes', 'nunique'),
    orientandos=('target_id_lattes', 'nunique'),
    primeira_ocorrencia=('conclusion_year', 'min')
).reset_index().dropna(subset=['source_institution', 'target_institution'])
edge_metrics.sort_values(['year_interval'])
edge_metrics.rename(columns={
    'source_institution': 'Source',
    'target_institution': 'Target'
}, inplace=True)
edge_metrics.to_csv('results/intervaled/edges.csv', index=False)
edge_metrics


Unnamed: 0,Source,Target,academic_degree,year_interval,orientacoes,orientadores,orientandos,primeira_ocorrencia
0,ARC Centre of Excellence for the Digital Child,ARC Centre of Excellence for the Digital Child,doutorado,1995-1999,3,1,3,1996
1,ARC Centre of Excellence for the Digital Child,ARC Centre of Excellence for the Digital Child,doutorado,2000-2004,4,1,4,2000
2,ARC Centre of Excellence for the Digital Child,ARC Centre of Excellence for the Digital Child,pos-doutorado,2000-2004,3,1,3,2003
3,ARC Centre of Excellence for the Digital Child,ARC Centre of Excellence for the Digital Child,pos-doutorado,2005-2009,3,1,3,2005
4,ARC Centre of Excellence for the Digital Child,ARC Centre of Excellence for the Digital Child,pos-doutorado,2010-2014,4,1,4,2011
...,...,...,...,...,...,...,...,...
38975,école des hautes études en sciences sociales,université libre de bruxelles,doutorado,2010-2014,1,1,1,2011
38976,école des hautes études en sciences sociales,université paris est créteil val de marne,doutorado,2010-2014,1,1,1,2013
38977,école des hautes études en sciences sociales,école des hautes études en sciences sociales,doutorado,2000-2004,1,1,1,2003
38978,école des hautes études en sciences sociales,école des hautes études en sciences sociales,doutorado,2010-2014,1,1,1,2014


In [22]:
def query_ror(name):
    query_name = unidecode(name).replace(' ', '%20')
    query_str = f'https://api.ror.org/organizations?query.advanced=name:{query_name}'
    results = requests.get(query_str)
    std_name = json.loads(results.text)
    if len(std_name['items']) == 0:
        return name
    match_val = 0
    idx = 0
    for i in range(int(std_name['number_of_results'])):
        if len(std_name['items'][i:]) < 1:
            continue
        tmp = textdistance.levenshtein.similarity(std_name['items'][i]['name'], name)
        if match_val < tmp:
            match_val = tmp
            idx = i
    return std_name['items'][idx]['name']
# name_table = pd.DataFrame(data={'acacia_name': list(institutions)})
# name_table['ror_name'] = name_table.acacia_name.apply(query_ror)
# name_table
# name_table.to_csv('levenshtein.csv')


In [23]:
name_table['distinct'] = 'distinct'
name_table.groupby('distinct').nunique()


Unnamed: 0_level_0,acacia_name,ror_name
distinct,Unnamed: 1_level_1,Unnamed: 2_level_1
distinct,720,694


In [25]:
src_nodes = institutions_raw.groupby(['source_institution', 'academic_degree', 'year_interval']).agg(
    orientacoes_externas=('source_id_lattes','count'),
    orientadores_externos=('source_id_lattes', 'nunique'),
    orientandos_externos=('target_id_lattes', 'nunique'),
    instituicoes_alvo=('target_institution', 'nunique'),
    primeira_ocorrencia_externa=('conclusion_year', 'min')
).reset_index().rename(columns={'source_institution': 'institution'})
tgt_nodes = institutions_raw.groupby(['target_institution', 'academic_degree', 'year_interval']).agg(
    orientacoes_internas=('source_id_lattes','count'),
    orientadores_internos=('source_id_lattes', 'nunique'),
    orientandos_internos=('target_id_lattes', 'nunique'),
    instituicoes_fonte=('source_institution', 'nunique'),
    primeira_ocorrencia_interna=('conclusion_year', 'min')
).reset_index().rename(columns={'target_institution': 'institution'})
tree = pd.read_csv('results/intervaled/search-tree-nodes.csv')
tree = tree[['distancia', 'reference_institution', 'year_interval', 'academic_degree']]
tree = tree.groupby(['distancia', 'reference_institution', 'year_interval', 'academic_degree']).max().reset_index()
tree = tree.rename({'reference_institution': 'institution'})
nodes = tgt_nodes.merge(src_nodes, on=['institution', 'academic_degree', 'year_interval'], how='outer')
nodes = nodes.merge(tree, on=['academic_degree', 'institution', 'year_interval'], how='left')
nodes = nodes.fillna(0)
cols = [col for col in nodes.columns if col not in ['academic_degree', 'institution', 'year_interval']]
nodes[cols] = nodes[cols].astype(int)
nodes.to_csv('results/intervaled/nodes.csv', index=False)

KeyError: 'institution'

In [None]:
src_nodes = institutions_raw.drop(columns='year_interval').groupby(['source_institution', 'academic_degree']).agg(
    orientacoes_externas=('source_id_lattes','count'),
    orientadores_externos=('source_id_lattes', 'nunique'),
    orientandos_externos=('target_id_lattes', 'nunique'),
    instituicoes_alvo=('target_institution', 'nunique'),
    primeira_ocorrencia_externa=('conclusion_year', 'min')
).reset_index().rename(columns={'source_institution': 'institution'})
tgt_nodes = institutions_raw.drop(columns='year_interval').groupby(['target_institution', 'academic_degree']).agg(
    orientacoes_internas=('source_id_lattes','count'),
    orientadores_internos=('source_id_lattes', 'nunique'),
    orientandos_internos=('target_id_lattes', 'nunique'),
    instituicoes_fonte=('source_institution', 'nunique'),
    primeira_ocorrencia_interna=('conclusion_year', 'min')
).reset_index().rename(columns={'target_institution': 'institution'})
tree = pd.read_csv('results/search-tree-nodes.csv')
tree = tree[['distancia', 'reference_institution', 'academic_degree']]
tree = tree.groupby(['distancia', 'reference_institution', 'academic_degree']).max().reset_index()
tree = tree.rename({'reference_institution': 'institution'})
nodes = tgt_nodes.merge(src_nodes, on=['institution', 'academic_degree'], how='outer')
nodes = nodes.merge(tree, on=['academic_degree', 'institution'], how='left')nodes = nodes.fillna(0)
cols = [col for col in nodes.columns if col not in ['academic_degree', 'institution']]
nodes[cols] = nodes[cols].astype(int)
nodes.to_csv('results/intervaled/nodes.csv', index=False)