In [1]:
import pandas as pd
from tqdm import tqdm
import networkx as nx
from bs4 import BeautifulSoup
import seaborn as sns
import numpy as np
import pickle as pkl
from typing import List
from collections import Counter
import torch
from torch_geometric.utils.convert import to_networkx, from_networkx
from networkx.algorithms.community.quality import modularity
from tqdm import tqdm
import gc; gc.enable();

* Given many graph clustering metrics and considering the current citation graph as ground truth, we will compare different communities partitioning,
firstly considering keyword and concepts extraction methods.
* OpenAlex keyword extraction uses https://github.com/ourresearch/openalex-keywords/tree/main/v1

In [2]:
df = pd.read_pickle("df_en_sorted_minimal.pkl")
edge_indices = torch.load("torch_geometric_ds.bin")

In [3]:
G_networkx = to_networkx(edge_indices)

In [4]:
gc.collect()

0

In [6]:
open_alex_keyword_communities_first_kw = {}
#keys_dict = {}

for i, row in df.iterrows():
    ks = eval(row["keywords"])
    if len(ks) > 0:
        first_row_key = ks[0]
    else:
        first_row_key = "no_keyword_attributed"
    
    if first_row_key not in open_alex_keyword_communities_first_kw:
        open_alex_keyword_communities_first_kw[first_row_key] = {i}
    else:
        open_alex_keyword_communities_first_kw[first_row_key] = open_alex_keyword_communities_first_kw[first_row_key] | {i}


In [7]:
modularity(G_networkx, list(open_alex_keyword_communities_first_kw.values()), weight=None, resolution=1)

0.024036344837202683

In [9]:
frozenset()

frozenset()

In [5]:
from itertools import chain, combinations

def powerset(iterable):
    s = list(iterable)
    return list(chain.from_iterable(combinations(s, r) for r in range(len(s)+1)))
    
def fill_tree(tree, sorted_keys, val):

    element = sorted_keys[0]
    if len(sorted_keys) == 1:
        if element in tree.keys():
            tree[element] = {
                "nodes": tree[element]["nodes"] | {val},
                "leafs": tree[element]["leafs"]
            }
        else:
            tree[element] = {
                "nodes": {val},
                "leafs": {}
            }
    else:    
        sorted_keys.pop(0)
        if element in tree:
            tree[element]["leafs"] = fill_tree(tree[element]["leafs"], sorted_keys, val)
        else:
            tree[element] = {
                "nodes": set(),
                "leafs": {}
            }
            tree[element]["leafs"] = fill_tree(tree[element]["leafs"], sorted_keys, val)
                
    return tree

In [6]:
def sum_tree_nodes(tree, visited=set()):
    
    for key in tree.keys():
        visited = visited | tree[key]["nodes"]
        visited = visited | sum_tree_nodes(tree[key]["leafs"], visited) 
    
    return visited

In [7]:
def reduce_tree(tree, target, visited=set()):

    for key in tree.keys():
        #if target.issubsetxx(key):
        if key in target:
            visited = visited | tree[key]["nodes"]
            visited = visited | sum_tree_nodes(tree[key]["leafs"], visited=visited)
        visited = visited | reduce_tree(tree[key]["leafs"], target, visited=visited)
    return visited

In [39]:
def partition_tree(tree, height=1, max_height=3):
    if len(tree["nodes"]) > 0:
        #partitions.append(tree["nodes"])
        yield tree["nodes"]
    
    for key in tree["leafs"].keys():
        if height + 1 == max_height:
            #print(tree["leafs"][key]["nodes"] | sum_tree_nodes(tree["leafs"][key]["leafs"], visited=set()))
            #partitions.append(tree["leafs"][key]["nodes"] | sum_tree_nodes(tree["leafs"][key]["leafs"], visited=set()))
            yield tree["leafs"][key]["nodes"] | sum_tree_nodes(tree["leafs"][key]["leafs"], visited=set())
        else:
            #partitions.append(partition_tree(tree["leafs"][key], height + 1, max_height=max_height))
            yield from partition_tree(tree["leafs"][key], height + 1, max_height=max_height)
    #return partitions

In [72]:
df_samp = df #.sample(n=10000).reset_index(drop=True)
nodes_keywords = []
threshold = 0.7
for i, kwd in tqdm(enumerate(df_samp["concepts"].tolist()), total=len(df_samp["concepts"])):
    kwd = [x[0] for x in eval(kwd) if x[1] >= threshold]#[:3]
    #kwd = frozenset(kwd)
    if len(kwd) == 0:
        nodes_keywords.append((i, ["no_keyword_attributed"], 1))
    else:
        nodes_keywords.append((i, kwd, len(kwd)))
#nodes_keywords = sorted(nodes_keywords, key=lambda x: x[2])

100%|████████████████████| 2048707/2048707 [01:43<00:00, 19800.54it/s]


In [73]:
tree = {}
for nodes_info in tqdm(nodes_keywords, total=len(nodes_keywords)):
    node_idx, node_keys, _ = nodes_info
    #if node_keys not in tree:
    #    tree[frozenset()]
    tree = fill_tree(tree, node_keys, node_idx)

100%|█████████████████████| 2048707/2048707 [30:36<00:00, 1115.77it/s]


In [108]:
import json
#pkl.dump(tree, open(f"trie_threshold_{int(100 * threshold)}.pkl", "wb"))

In [152]:
gen = partition_tree(tree["Graphene"], height=0, max_height=2)

In [144]:
all = []
for obj in gen:
    all.append(obj)

In [162]:
partitions = []
for k in tree.keys():
    #gen = partition_tree(tree[k], height=0, max_height=1)
    #for obj in gen:
    #    partitions.append(obj)
    partitions.append(tree[k]["nodes"] | sum_tree_nodes(tree[k]["leafs"], visited=set()))

In [166]:
modularity(G_networkx, partitions, weight=None, resolution=0.01)

0.24058036398363553

In [167]:
[len(x) for x in partitions]

[5355,
 3472,
 395896,
 329189,
 3748,
 577,
 297,
 300,
 2239,
 1,
 3401,
 35027,
 73,
 13,
 316,
 197,
 27,
 18,
 71,
 9474,
 850,
 1034,
 3873,
 2205,
 7,
 4925,
 3789,
 123,
 49,
 34,
 1439,
 696,
 379,
 4189,
 1024,
 57,
 61,
 38,
 3170,
 3504,
 382,
 19326,
 589,
 61,
 10898,
 13,
 433,
 129,
 754,
 595,
 1844,
 49584,
 8,
 51,
 186,
 970,
 12,
 4,
 391,
 1379,
 143,
 26,
 5416,
 6755,
 1101,
 1070,
 5,
 840,
 22,
 376,
 1622,
 203,
 2383,
 652,
 4246,
 10,
 862,
 2036,
 228,
 35,
 963,
 654,
 52,
 12,
 341,
 899,
 1037,
 8,
 320,
 1,
 1170,
 161,
 101,
 3855,
 87,
 2141,
 1,
 2403,
 4787,
 216,
 112,
 1036,
 7514,
 397,
 151,
 1340,
 1566,
 2153,
 2427,
 230,
 1164,
 92,
 1383,
 57,
 1241,
 955,
 70,
 338,
 210,
 106,
 540,
 7891,
 99,
 656,
 242,
 842,
 479,
 11,
 1552,
 2039,
 22,
 41,
 4078,
 2733,
 6,
 7,
 1671,
 232,
 1776,
 4720,
 10791,
 456,
 130,
 320,
 865,
 533,
 1179,
 36,
 980,
 163,
 103,
 5,
 24,
 507,
 1099,
 81,
 21,
 402,
 661,
 5625,
 162,
 908,
 178,
 458,
 2

In [112]:
tree["Graphene"]

{'nodes': {1802240,
  1753091,
  1732614,
  1568775,
  1433608,
  1630215,
  1945606,
  716813,
  983053,
  1687569,
  1237011,
  167958,
  1962007,
  458778,
  1757215,
  1933348,
  1314853,
  1462309,
  1208360,
  536618,
  1208362,
  524333,
  155696,
  1740849,
  24626,
  1073207,
  745529,
  1691708,
  901184,
  1646658,
  1794115,
  1728591,
  544848,
  1876052,
  684117,
  1589335,
  1962074,
  1486939,
  1646683,
  1777758,
  200799,
  323680,
  2003040,
  1929322,
  1634411,
  2031726,
  1634417,
  1917041,
  1007732,
  1179767,
  1024122,
  176251,
  2048126,
  389247,
  1179776,
  1790081,
  1470594,
  1155205,
  368774,
  8327,
  1540230,
  782476,
  389262,
  1147024,
  1679505,
  1880208,
  1491091,
  1835156,
  1376405,
  1122454,
  1667222,
  1671317,
  8346,
  1061025,
  1691811,
  1544359,
  2027691,
  1015988,
  1192123,
  1851581,
  1654979,
  1777860,
  1683653,
  1835203,
  979146,
  1491151,
  418004,
  1880276,
  1908952,
  1122523,
  1683676,
  1155297,
  11757

In [113]:
tree["Graphene"]["leafs"]["Crystallite"]["leafs"]

{'Nucleation': {'nodes': set(),
  'leafs': {'Materials science': {'nodes': {398460}, 'leafs': {}}}},
 'Materials science': {'nodes': {1887921},
  'leafs': {'Fracture toughness': {'nodes': {598381}, 'leafs': {}}}},
 'Misorientation': {'nodes': set(),
  'leafs': {'Materials science': {'nodes': set(),
    'leafs': {'Condensed matter physics': {'nodes': set(),
      'leafs': {'Band gap': {'nodes': {740112}, 'leafs': {}}}}}}}},
 'Grain boundary': {'nodes': set(),
  'leafs': {'Chemical vapor deposition': {'nodes': {744932}, 'leafs': {}},
   'Materials science': {'nodes': {1434680}, 'leafs': {}}}}}

In [70]:
df.iloc[398460]["concepts"]

"[('Graphene', 0.86116934), ('Crystallite', 0.7720017), ('Nucleation', 0.73972344), ('Materials science', 0.73344624), ('Grain boundary', 0.674698), ('Coalescence (physics)', 0.61393905), ('Chemical vapor deposition', 0.57172805), ('Scaling', 0.56947136), ('Condensed matter physics', 0.56415194), ('Charge carrier', 0.5582014), ('Grain size', 0.54764503), ('Chemical physics', 0.49424174), ('Mean free path', 0.423418), ('Nanotechnology', 0.3530321), ('Microstructure', 0.1381002), ('Chemistry', 0.12767544), ('Optoelectronics', 0.12635553), ('Composite material', 0.1180577), ('Physics', 0.09466535), ('Thermodynamics', 0.08643311), ('Optics', 0.07143453), ('Metallurgy', 0.06517342), ('Geometry', 0.0), ('Mathematics', 0.0), ('Astrobiology', 0.0), ('Scattering', 0.0)]"

In [14]:
#gen[1]

In [17]:
partitions = []
for obj in gen:
    partitions.append(obj)

In [18]:
next(gen)

StopIteration: 

In [15]:
partitions

[]

In [40]:
len(sum_tree_nodes(tree[frozenset(["Graphene"])]["leafs"]))

15015

In [12]:
tree["Graphene"].keys()

dict_keys(['nodes', 'leafs'])

In [87]:
#test_tree = tree[frozenset(['model'])] #["leafs"]
df.iloc[sorted(list(tree["Graphene"]["nodes"] | sum_tree_nodes(tree["Graphene"]["leafs"])))]

Unnamed: 0,id,abstract,title,publication_date,keywords,concepts
239,https://openalex.org/W1018953785,we show that graphene single crystals as large...,chemical vapor deposition growth of 5 mm hexag...,2015-11-01,"['graphene', 'ethanol', 'single-crystal']","[('Graphene', 0.8950863), ('Chemical vapor dep..."
267,https://openalex.org/W1021326259,we theoretically study the influence of surfac...,surface optical phonon-assisted cyclotron reso...,2015-08-01,"['graphene', 'cyclotron resonance', 'optical',...","[('Graphene', 0.9213928), ('Materials science'..."
413,https://openalex.org/W1031618044,the telescopic contact between graphene layers...,tunneling conductance of telescopic contacts b...,2015-11-01,"['graphene layers', 'tunneling conductance', '...","[('Graphene', 0.8410138), ('Quantum tunnelling..."
604,https://openalex.org/W1048209023,graphene is a promising material for high-perf...,"synthesis, charge transport and device applica...",2015-12-01,"['graphene nanoribbons', 'charge']","[('Graphene', 0.85890615), ('Graphene nanoribb..."
608,https://openalex.org/W1048450219,"graphene, a single atomic layer of graphite, h...",growth morphology and properties of metals on ...,2015-12-01,"['graphene', 'metals', 'growth']","[('Graphene', 0.9511391), ('Materials science'..."
...,...,...,...,...,...,...
2048126,https://openalex.org/W957922153,carrier mobility and chemical doping level are...,graphene mobility mapping,2015-07-24,['graphene mobility mapping'],"[('Graphene', 0.94014764), ('Materials science..."
2048128,https://openalex.org/W958056737,"graphene is a semimetal with zero band gap, wh...",semiconducting graphene on silicon from first-...,2015-08-10,"['graphene', 'semiconducting', 'silicon', 'fir...","[('Graphene', 0.9208318), ('Bilayer graphene',..."
2048218,https://openalex.org/W965259619,intrinsic localized modes or discrete breather...,long-lived discrete breathers in free-standing...,2016-06-01,"['discrete breathers', 'long-lived', 'free-sta...","[('Graphene', 0.90510213), ('Breather', 0.8442..."
2048525,https://openalex.org/W987021357,stacking graphene sheets forms graphite. two i...,graphite under uniaxial compression along thec...,2015-09-22,"['graphite', 'uniaxial compression', 'strain',...","[('Graphene', 0.747055), ('Graphite', 0.658367..."


In [27]:
idx = sorted(list(reduce_tree(tree, ["Graphene"], visited=set())))
df.iloc[idx]

KeyboardInterrupt: 

In [26]:
idx = sorted(list(sum_tree_nodes(tree["Graphene"]["leafs"], visited=set())))
df.iloc[idx]

Unnamed: 0,id,abstract,title,publication_date,keywords,concepts
239,https://openalex.org/W1018953785,we show that graphene single crystals as large...,chemical vapor deposition growth of 5 mm hexag...,2015-11-01,"['graphene', 'ethanol', 'single-crystal']","[('Graphene', 0.8950863), ('Chemical vapor dep..."
267,https://openalex.org/W1021326259,we theoretically study the influence of surfac...,surface optical phonon-assisted cyclotron reso...,2015-08-01,"['graphene', 'cyclotron resonance', 'optical',...","[('Graphene', 0.9213928), ('Materials science'..."
413,https://openalex.org/W1031618044,the telescopic contact between graphene layers...,tunneling conductance of telescopic contacts b...,2015-11-01,"['graphene layers', 'tunneling conductance', '...","[('Graphene', 0.8410138), ('Quantum tunnelling..."
604,https://openalex.org/W1048209023,graphene is a promising material for high-perf...,"synthesis, charge transport and device applica...",2015-12-01,"['graphene nanoribbons', 'charge']","[('Graphene', 0.85890615), ('Graphene nanoribb..."
608,https://openalex.org/W1048450219,"graphene, a single atomic layer of graphite, h...",growth morphology and properties of metals on ...,2015-12-01,"['graphene', 'metals', 'growth']","[('Graphene', 0.9511391), ('Materials science'..."
...,...,...,...,...,...,...
2048126,https://openalex.org/W957922153,carrier mobility and chemical doping level are...,graphene mobility mapping,2015-07-24,['graphene mobility mapping'],"[('Graphene', 0.94014764), ('Materials science..."
2048128,https://openalex.org/W958056737,"graphene is a semimetal with zero band gap, wh...",semiconducting graphene on silicon from first-...,2015-08-10,"['graphene', 'semiconducting', 'silicon', 'fir...","[('Graphene', 0.9208318), ('Bilayer graphene',..."
2048218,https://openalex.org/W965259619,intrinsic localized modes or discrete breather...,long-lived discrete breathers in free-standing...,2016-06-01,"['discrete breathers', 'long-lived', 'free-sta...","[('Graphene', 0.90510213), ('Breather', 0.8442..."
2048525,https://openalex.org/W987021357,stacking graphene sheets forms graphite. two i...,graphite under uniaxial compression along thec...,2015-09-22,"['graphite', 'uniaxial compression', 'strain',...","[('Graphene', 0.747055), ('Graphite', 0.658367..."


In [30]:
# normal

In [23]:
nodes_concepts = [list([c[0] for c in eval(x)[:5]]) for x in df["concepts"].tolist()]

In [24]:
df[list(map(lambda x: list(x)[0] == "Graphene", nodes_concepts))]

Unnamed: 0,id,abstract,title,publication_date,keywords,concepts
239,https://openalex.org/W1018953785,we show that graphene single crystals as large...,chemical vapor deposition growth of 5 mm hexag...,2015-11-01,"['graphene', 'ethanol', 'single-crystal']","[('Graphene', 0.8950863), ('Chemical vapor dep..."
267,https://openalex.org/W1021326259,we theoretically study the influence of surfac...,surface optical phonon-assisted cyclotron reso...,2015-08-01,"['graphene', 'cyclotron resonance', 'optical',...","[('Graphene', 0.9213928), ('Materials science'..."
413,https://openalex.org/W1031618044,the telescopic contact between graphene layers...,tunneling conductance of telescopic contacts b...,2015-11-01,"['graphene layers', 'tunneling conductance', '...","[('Graphene', 0.8410138), ('Quantum tunnelling..."
604,https://openalex.org/W1048209023,graphene is a promising material for high-perf...,"synthesis, charge transport and device applica...",2015-12-01,"['graphene nanoribbons', 'charge']","[('Graphene', 0.85890615), ('Graphene nanoribb..."
608,https://openalex.org/W1048450219,"graphene, a single atomic layer of graphite, h...",growth morphology and properties of metals on ...,2015-12-01,"['graphene', 'metals', 'growth']","[('Graphene', 0.9511391), ('Materials science'..."
...,...,...,...,...,...,...
2048126,https://openalex.org/W957922153,carrier mobility and chemical doping level are...,graphene mobility mapping,2015-07-24,['graphene mobility mapping'],"[('Graphene', 0.94014764), ('Materials science..."
2048128,https://openalex.org/W958056737,"graphene is a semimetal with zero band gap, wh...",semiconducting graphene on silicon from first-...,2015-08-10,"['graphene', 'semiconducting', 'silicon', 'fir...","[('Graphene', 0.9208318), ('Bilayer graphene',..."
2048218,https://openalex.org/W965259619,intrinsic localized modes or discrete breather...,long-lived discrete breathers in free-standing...,2016-06-01,"['discrete breathers', 'long-lived', 'free-sta...","[('Graphene', 0.90510213), ('Breather', 0.8442..."
2048525,https://openalex.org/W987021357,stacking graphene sheets forms graphite. two i...,graphite under uniaxial compression along thec...,2015-09-22,"['graphite', 'uniaxial compression', 'strain',...","[('Graphene', 0.747055), ('Graphite', 0.658367..."


In [9]:
tree["Superconductivity"]

{'nodes': set(),
 'leafs': {'Pairing': {'nodes': set(),
   'leafs': {'Condensed matter physics': {'nodes': {84,
      9882,
      24210,
      36448,
      60407,
      81717,
      101188,
      107409,
      146889,
      186471,
      190047,
      197240,
      208518,
      235224,
      236147,
      283450,
      315021,
      350360,
      365299,
      386555,
      397272,
      414573,
      428601,
      442278,
      450251,
      535690,
      567112,
      624503,
      709425,
      727307,
      778850,
      793793,
      796546,
      799436,
      829488,
      854986,
      867562,
      878194,
      883509,
      923481,
      961708,
      1030931,
      1039981,
      1044112,
      1089351,
      1144365,
      1153481,
      1188070,
      1210224,
      1228060,
      1240076,
      1277000,
      1306953,
      1312159,
      1319961,
      1365759,
      1426874,
      1445213,
      1451173,
      1554178,
      1559446,
      1580920,
      1598038,
    

In [9]:
df.iloc[list(tree["Superconductivity"]["nodes"])]

Unnamed: 0,id,abstract,title,publication_date,keywords,concepts


In [17]:
df.iloc[sorted(list(sum_tree_nodes(tree["Physics"]["leafs"], visited=set())))]

Unnamed: 0,id,abstract,title,publication_date,keywords,concepts
2,https://openalex.org/W100006313,in a wire spark-chamber experiment the squared...,experimental study on the τ± decay matrix element,1972-03-01,['matrix'],"[('Physics', 0.9268166), ('Matrix element', 0...."
5,https://openalex.org/W100035585,motivated by the peculiar features observed th...,nonlinearc-axis transport inbi2sr2cacu2o,2009-04-01,"['transport', 'nonlinearc-axis', 'two-barrier']","[('Physics', 0.73089707), ('Pseudogap', 0.6889..."
9,https://openalex.org/W1000585164,"in this work, we study systems composed of a $...",the $$\rho (\omega ) b^* (b)$$ ρ ( ω ) b ∗ ( b...,2016-02-01,['interaction'],"[('Physics', 0.83831257), ('Omega', 0.82759345..."
10,https://openalex.org/W1000603767,quantum steering inequalities allow to demonst...,einstein-podolsky-rosen steering: closing the ...,2013-02-13,"['detection loophole', 'einstein-podolsky-rose...","[('Physics', 0.90650284), ('Quantum key distri..."
17,https://openalex.org/W10010732,an estimate of the magnitude of the $\mathrm{s...,"the (1, 8)+(8, 1) term insu(3)×su(3)symmetry b...",1971-07-01,['insu3×su3symmetry'],"[('Physics', 0.590814), ('Symmetry breaking', ..."
...,...,...,...,...,...,...
2048696,https://openalex.org/W99876827,we consider the decay ${b}^{0}(t)\ensuremath{\...,newcpobservables inb0(t)→hyperon+antihyperonfr...,1998-10-29,"['parity violation', 'decay']","[('Physics', 0.8435714), ('Hyperon', 0.5917092..."
2048700,https://openalex.org/W99919431,we report a high-statistics experiment measuri...,observation of coherent interference pattern b...,1970-11-09,"['coherent interference pattern', 'between<mml...","[('Physics', 0.75475097), ('Omega', 0.7398769)..."
2048701,https://openalex.org/W99922069,we have done a new evaluation of the lowest-or...,hadronic part of the muon anomalous magnetic m...,1990-08-01,"['anomalous magnetic moment', 'muon', 'hadroni...","[('Physics', 0.8732732), ('Muon', 0.7624949), ..."
2048704,https://openalex.org/W99994197,measurements are reported of the transmission ...,infrared optical excitations inla2nio4,1995-10-01,['infrared optical excitations'],"[('Physics', 0.62062633), ('Absorption (acoust..."


In [None]:
# Symmetry breaking

In [22]:
k = "Hyperon"
df.iloc[sorted(list(tree[k]["nodes"] | sum_tree_nodes(tree[k]["leafs"], visited=set())))]

Unnamed: 0,id,abstract,title,publication_date,keywords,concepts
8574,https://openalex.org/W1493763504,the properties of \ensuremath{\sigma}-hyperons...,σσinteractions in finite-density qcd sum rules,2008-04-14,['finite-density'],"[('Hyperon', 0.845796), ('Quantum chromodynami..."
9577,https://openalex.org/W1497134819,"by using resonance model, we investigate ${k}^...",resonance model study onk+n→kpηnear threshold,2012-07-13,[],"[('Hyperon', 0.61568487), ('Resonance (particl..."
10957,https://openalex.org/W1501775630,we study the long range part of the \ensuremat...,baryon decuplet in the chiral dynamics of λ hy...,2007-03-28,"['chiral dynamics', 'hyperons', 'nuclear']","[('Hyperon', 0.95031285), ('Physics', 0.919366..."
14937,https://openalex.org/W1515654579,we calculate the longitudinal polarizations of...,longitudinal polarization of hyperons in highp...,2002-06-12,"['singly polarizedppcollisions', 'longitudinal...","[('Hyperon', 0.79821134), ('Physics', 0.773141..."
19113,https://openalex.org/W1531116484,a model for the process ${\ensuremath{\pi}}^{+...,resonance model forς+−k+production,1965-03-08,['model'],"[('Hyperon', 0.820969), ('Physics', 0.8052546)..."
...,...,...,...,...,...,...
2031752,https://openalex.org/W4235095995,we obtain a model-independent expression for t...,radiative corrections to the semileptonic dali...,2002-02-28,"['semileptonic dalitz plot', 'hyperons', 'radi...","[('Hyperon', 0.95104784), ('Physics', 0.938554..."
2031995,https://openalex.org/W4235644188,we calculate the bulk viscosity due to nonequi...,bulk viscosity of superfluid hyperon stars,2008-10-14,"['bulk viscosity', 'stars']","[('Hyperon', 0.975749), ('Physics', 0.9171221)..."
2035163,https://openalex.org/W4242816380,the formulas for spin and angular correlations...,beta decay of hyperons,1971-04-01,['beta decay'],"[('Hyperon', 0.94621056), ('Physics', 0.838032..."
2037660,https://openalex.org/W4248651992,we use baryon chiral perturbation theory in th...,hyperon polarizabilities,1992-10-01,[],"[('Hyperon', 0.91110796), ('Physics', 0.896715..."


In [14]:
k = 'Physics'
df.iloc[list(sum_tree_nodes(tree[k]["leafs"]["Superconductivity"]["leafs"], visited=set())) + list(tree[k]["nodes"])]

Unnamed: 0,id,abstract,title,publication_date,keywords,concepts
1572865,https://openalex.org/W2153625147,"in these lectures, a variety of non-equilibriu...",collective transport in random media: from sup...,1998-07-01,"['collective transport', 'random media', 'eart...","[('Physics', 0.8717339), ('Superconductivity',..."
1474564,https://openalex.org/W2117192125,abstract a theoretical model describing the fo...,twin spacing versus size of a monocrystal for ...,1993-08-01,"['superconductors', 'monocrystal', 'nonstoichi...","[('Physics', 0.8272569), ('Superconductivity',..."
1015818,https://openalex.org/W2055778563,we report the effects of 200-kev ${\mathrm{he}...,200-kevhe+-ion irradiation effects on the prop...,1991-03-01,"['irradiation', 'pulsed-laser-depositedyba']","[('Physics', 0.6325699), ('Superconductivity',..."
1974288,https://openalex.org/W3102829868,the planckian relaxation rate $\ensuremath{\hb...,reentrant superconductivity in a quantum dot c...,2019-12-26,"['reentrant superconductivity', 'quantum dot',...","[('Physics', 0.8672319), ('Superconductivity',..."
1441810,https://openalex.org/W2105172673,recent measurements of the ultrasonic at tenua...,on the ultrasonic attenuation in multigap supe...,1971-06-01,['ultrasonic attenuation'],"[('Physics', 0.911649), ('Superconductivity', ..."
...,...,...,...,...,...,...
671728,https://openalex.org/W2021652064,the study of β-γ circular polarization correla...,β−γcircular polarization correlation in aj−jtr...,1957-08-15,['display=inline><mmlmi>β</mmlmi><mmlmo>−</mml...,"[('Physics', 0.32917964)]"
1712113,https://openalex.org/W2413005821,a reply to the comment by r. l. workman et al....,thielet al.reply:,2013-04-19,"['alreply', 'alreply']","[('Physics', 0.44117755)]"
155634,https://openalex.org/W197082344,a pole-resonance model is used to fit the data...,parameters of low-energyλ−k0production,1968-09-25,"['parameters', 'low-energy']","[('Physics', 0.4975796)]"
991221,https://openalex.org/W2053332080,we report measurements of the normal state mag...,angular dependence of thec-axis normal state m...,1996-01-01,"['angular dependence', 'single crystal<mmlmath']","[('Physics', 0.38475502)]"


In [19]:
all_partitions = [list(sum_tree_nodes(tree[k]["leafs"], visited=set())) + list(tree[k]["nodes"]) for k in tqdm(tree.keys(), total=len(tree))]

  0%|                            | 2/21654 [00:03<10:21:40,  1.72s/it]

KeyboardInterrupt: 

In [None]:
modularity(G_networkx, list(open_alex_keyword_communities_first_kw.values()), weight=None, resolution=1)

In [13]:
tree["Superconductivity"]

{'nodes': set(),
 'leafs': {'Pairing': {'nodes': set(),
   'leafs': {'Condensed matter physics': {'nodes': set(),
     'leafs': {'Physics': {'nodes': set(),
       'leafs': {'Surface (topology)': {'nodes': {84}, 'leafs': {}},
        'Charge density wave': {'nodes': {9882}, 'leafs': {}},
        'Electron': {'nodes': {60407}, 'leafs': {}},
        'Coulomb': {'nodes': {81717}, 'leafs': {}},
        'Spin (aerodynamics)': {'nodes': {283450}, 'leafs': {}},
        'Hubbard model': {'nodes': {365299}, 'leafs': {}},
        'Phase (matter)': {'nodes': {397272}, 'leafs': {}},
        'Observable': {'nodes': {414573}, 'leafs': {}},
        'Singlet state': {'nodes': {450251}, 'leafs': {}},
        'Chemistry': {'nodes': {535690}, 'leafs': {}},
        'Anisotropy': {'nodes': {567112}, 'leafs': {}},
        'Photoemission spectroscopy': {'nodes': {709425}, 'leafs': {}},
        'Cooper pair': {'nodes': {793793}, 'leafs': {}},
        'Andreev reflection': {'nodes': {829488}, 'leafs': {}},
   

In [89]:
tree = {}
tree = fill_tree(tree, frozenset(['phase transitions']), 4)
for i in range(3):
    tree = fill_tree(tree, frozenset(['phase transitions', 'x-ray']), i+1) #["phase transitions"]

In [90]:
tree = fill_tree(tree, frozenset(['phase transitions', 'x-ray', 'balinha juquinha']), 5)

In [91]:
tree = fill_tree(tree, frozenset(['phase transitions', 'x-ray', 'maca verde gostosinha']), 6)

In [92]:
tree = fill_tree(tree, frozenset(['phase transitions']), 12)

In [93]:
tree.keys()

dict_keys([frozenset({'phase transitions'}), frozenset({'x-ray'}), frozenset({'balinha juquinha'}), frozenset({'maca verde gostosinha'})])

In [94]:
tree

{frozenset({'phase transitions'}): {'nodes': {4, 12},
  'leafs': {frozenset({'x-ray'}): {'nodes': {1, 2, 3},
    'leafs': {frozenset({'balinha juquinha'}): {'nodes': {5}, 'leafs': {}},
     frozenset({'maca verde gostosinha'}): {'nodes': {6}, 'leafs': {}}}},
   frozenset({'balinha juquinha'}): {'nodes': set(),
    'leafs': {frozenset({'x-ray'}): {'nodes': {5}, 'leafs': {}}}},
   frozenset({'maca verde gostosinha'}): {'nodes': set(),
    'leafs': {frozenset({'x-ray'}): {'nodes': {6}, 'leafs': {}}}}}},
 frozenset({'x-ray'}): {'nodes': set(),
  'leafs': {frozenset({'phase transitions'}): {'nodes': {1, 2, 3},
    'leafs': {frozenset({'balinha juquinha'}): {'nodes': {5}, 'leafs': {}},
     frozenset({'maca verde gostosinha'}): {'nodes': {6}, 'leafs': {}}}},
   frozenset({'balinha juquinha'}): {'nodes': set(),
    'leafs': {frozenset({'phase transitions'}): {'nodes': {5}, 'leafs': {}}}},
   frozenset({'maca verde gostosinha'}): {'nodes': set(),
    'leafs': {frozenset({'phase transitions'}):

In [95]:
tree = fill_tree(tree, frozenset(['x-ray']), 8)

In [96]:
tree = fill_tree(tree, frozenset(['phase transitions', 'x-ray']), 10) #["phase transitions"]

In [97]:
tree[frozenset(["phase transitions"])] #["leafs"]

{'nodes': {4, 12},
 'leafs': {frozenset({'x-ray'}): {'nodes': {1, 2, 3, 10},
   'leafs': {frozenset({'balinha juquinha'}): {'nodes': {5}, 'leafs': {}},
    frozenset({'maca verde gostosinha'}): {'nodes': {6}, 'leafs': {}}}},
  frozenset({'balinha juquinha'}): {'nodes': set(),
   'leafs': {frozenset({'x-ray'}): {'nodes': {5}, 'leafs': {}}}},
  frozenset({'maca verde gostosinha'}): {'nodes': set(),
   'leafs': {frozenset({'x-ray'}): {'nodes': {6}, 'leafs': {}}}}}}

In [99]:
tree

{frozenset({'phase transitions'}): {'nodes': {4, 12},
  'leafs': {frozenset({'x-ray'}): {'nodes': {1, 2, 3, 10},
    'leafs': {frozenset({'balinha juquinha'}): {'nodes': {5}, 'leafs': {}},
     frozenset({'maca verde gostosinha'}): {'nodes': {6}, 'leafs': {}}}},
   frozenset({'balinha juquinha'}): {'nodes': set(),
    'leafs': {frozenset({'x-ray'}): {'nodes': {5}, 'leafs': {}}}},
   frozenset({'maca verde gostosinha'}): {'nodes': set(),
    'leafs': {frozenset({'x-ray'}): {'nodes': {6}, 'leafs': {}}}}}},
 frozenset({'x-ray'}): {'nodes': {8},
  'leafs': {frozenset({'phase transitions'}): {'nodes': {1, 2, 3, 10},
    'leafs': {frozenset({'balinha juquinha'}): {'nodes': {5}, 'leafs': {}},
     frozenset({'maca verde gostosinha'}): {'nodes': {6}, 'leafs': {}}}},
   frozenset({'balinha juquinha'}): {'nodes': set(),
    'leafs': {frozenset({'phase transitions'}): {'nodes': {5}, 'leafs': {}}}},
   frozenset({'maca verde gostosinha'}): {'nodes': set(),
    'leafs': {frozenset({'phase transitio

In [None]:
def construct_hierarchy_tree_of_communities(nodes_communities, fill_empty="no_keyword_attributed"):
    tree = {}
    current_keys = []
    
    def dfs_search_tree(tree, criteria_fn, access_path=[] ):
        if len(list(tree.keys())) == 0:
            
        if criteria_fn()
    
    for i, c_curr in enumerate(tqdm(nodes_communities, total=len(nodes_communities))):
        #k_curr = set(eval(row["keywords"]))
        if len(c_curr) == 0:
            c_curr = {fill_empty}

        
    
        for j, k in enumerate(current_keys):
            s_sorted_keys = str(sorted(list(k_curr)))
            s_sorted_keys_old = str(sorted(list(k)))
            if len(k - k_curr) == 0:
                if len(k_curr) > len(k):
                    current_keys[j] = k_curr
                    oa_keyword_communities_agg[s_sorted_keys] = oa_keyword_communities_agg[s_sorted_keys_old] | {i}
                    del oa_keyword_communities_agg[s_sorted_keys_old]
                else:
                    oa_keyword_communities_agg[s_sorted_keys_old] = oa_keyword_communities_agg[s_sorted_keys_old] | {i}
        
        if len(ks) > 0:
            first_row_key = ks[0]
        else:
            first_row_key = "no_keyword_attributed"
        
        if first_row_key not in open_alex_keyword_communities_first_kw:
            open_alex_keyword_communities_first_kw[first_row_key] = {i}
        else:
            open_alex_keyword_communities_first_kw[first_row_key] = open_alex_keyword_communities_first_kw[first_row_key] | {i}

In [None]:
oa_keyword_communities_tree = {}
current_keys = []
for i, row in df.iterrows():
    k_curr = set(eval(row["keywords"]))
    if len(k_curr) == 0:
        k_curr = {no_keyword_attributed}

    for j, k in enumerate(current_keys):
        s_sorted_keys = str(sorted(list(k_curr)))
        s_sorted_keys_old = str(sorted(list(k)))
        if len(k - k_curr) == 0:
            if len(k_curr) > len(k):
                current_keys[j] = k_curr
                oa_keyword_communities_agg[s_sorted_keys] = oa_keyword_communities_agg[s_sorted_keys_old] | {i}
                del oa_keyword_communities_agg[s_sorted_keys_old]
            else:
                oa_keyword_communities_agg[s_sorted_keys_old] = oa_keyword_communities_agg[s_sorted_keys_old] | {i}
    
    if len(ks) > 0:
        first_row_key = ks[0]
    else:
        first_row_key = "no_keyword_attributed"
    
    if first_row_key not in open_alex_keyword_communities_first_kw:
        open_alex_keyword_communities_first_kw[first_row_key] = {i}
    else:
        open_alex_keyword_communities_first_kw[first_row_key] = open_alex_keyword_communities_first_kw[first_row_key] | {i}


In [3]:
df

Unnamed: 0,id,abstract,title,publication_date,keywords,concepts
0,https://openalex.org/W100001730,the raman spectra of the parent compound naxco...,raman spectroscopy study ofnaxcoo2and supercon...,2004-08-10,"['raman spectroscopy', 'spectroscopy study']","[('Raman spectroscopy', 0.89063823), ('Superco..."
1,https://openalex.org/W1000042173,publisher summary this chapter discusses the s...,chapter 1: vortices in rotating superfluid3he,1986-01-01,"['superfluid3he', 'vortices']","[('Vortex', 0.7865634), ('Physics', 0.376391),..."
2,https://openalex.org/W100006313,in a wire spark-chamber experiment the squared...,experimental study on the τ± decay matrix element,1972-03-01,['matrix'],"[('Physics', 0.9268166), ('Matrix element', 0...."
3,https://openalex.org/W1000096099,"using angle-resolved photoemission, we have ma...",fermi surface and electronic structure ofnd2−xce,1993-05-17,"['fermi surface', 'electronic structure']","[('Fermi surface', 0.59463906), ('Physics', 0...."
4,https://openalex.org/W1000312985,we demonstrate controlled manipulation of larg...,microparticle manipulation using inertial forces,2006-02-27,"['forces', 'manipulation']","[('Particle (ecology)', 0.62976944), ('Micropa..."
...,...,...,...,...,...,...
2048702,https://openalex.org/W999422512,a mostly single bcc phase with nanoscale grain...,high saturation magnetization and soft magneti...,1991-01-01,"['soft magnetic properties', 'high saturation ...","[('Materials science', 0.725253), ('Amorphous ..."
2048703,https://openalex.org/W999575332,publisher summary this chapter discusses a phe...,chapter 1 protein-lipid interactions and membr...,1993-01-01,"['membrane heterogeneity', 'protein-lipid']","[('Chemistry', 0.4417297), ('Biology', 0.38962..."
2048704,https://openalex.org/W99994197,measurements are reported of the transmission ...,infrared optical excitations inla2nio4,1995-10-01,['infrared optical excitations'],"[('Physics', 0.62062633), ('Absorption (acoust..."
2048705,https://openalex.org/W999961912,this chapter reviews the computational methods...,energies and asymptotic analysis for helium ry...,1993-01-01,['asymptotic analysis'],"[('Physics', 0.88181573), ('Rydberg formula', ..."
