In [1]:
from neo4j import GraphDatabase
import pandas as pd

# Init the connection to the database
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "neuroinformatics_orc_id"), encrypted=False)
pd.set_option('display.max_columns', None)

def cyperQueryToDataFrame(query):
  with driver.session() as session:
    result = session.run(query)
    return pd.DataFrame(result.data(), columns=result.keys())



## WCC 

In [2]:
wcc = cyperQueryToDataFrame("MATCH (n:Author) RETURN id(n), n.__wcc")

In [3]:
wcc

Unnamed: 0,id(n),n.__wcc
0,2730,0
1,2731,0
2,2732,2
3,2733,3
4,2734,4
...,...,...
13030,26680,13030
13031,26681,13031
13032,26682,13032
13033,26683,13033


In [4]:
wcc.astype(object).describe()

Unnamed: 0,id(n),n.__wcc
count,13035,13035
unique,13035,6253
top,2730,3
freq,1,4749


In [5]:
wcc_grouped = wcc.groupby(['n.__wcc']).count()

In [6]:
wcc_grouped

Unnamed: 0_level_0,id(n)
n.__wcc,Unnamed: 1_level_1
0,48
2,1
3,4749
4,1
10,2
...,...
13030,1
13031,1
13032,1
13033,1


In [7]:
wcc_grouped.describe()

Unnamed: 0,id(n)
count,6253.0
mean,2.084599
std,60.05027
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,4749.0


## Degree

In [8]:
degree_df = cyperQueryToDataFrame("MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree")

In [9]:
degree_df.describe()

Unnamed: 0,wcc,degree
count,13035.0,13035.0
mean,4263.024933,3.747142
std,4408.435855,12.666121
min,0.0,0.0
25%,3.0,0.0
50%,2948.0,1.0
75%,8232.0,3.0
max,13034.0,254.0


In [10]:
degree_df.astype(object).describe()

Unnamed: 0,n.id,wcc,degree
count,13035,13035,13035.0
unique,13035,6253,126.0
top,0000-0002-4132-3055,3,0.0
freq,1,4749,5091.0


In [11]:
degree_df.sort_values(by="degree", ascending=False) 

Unnamed: 0,n.id,wcc,degree
726,0000-0002-0198-4588,3,254.0
1005,0000-0002-1733-263x,3,233.0
2787,0000-0003-0308-5583,3,210.0
200,0000-0002-9595-3220,3,194.0
556,0000-0001-7133-4970,3,188.0
...,...,...,...
8977,0000-0002-4961-7832,8977,0.0
8976,0000-0002-6873-7768,8976,0.0
8975,0000-0003-2038-7954,8975,0.0
8973,0000-0003-3757-0376,8973,0.0


In [12]:
degree_df[degree_df['degree'] == 2]

Unnamed: 0,n.id,wcc,degree
0,0000-0002-2561-3458,0,2.0
5,0000-0003-1144-3272,3,2.0
9,0000-0002-0392-7608,3,2.0
56,0000-0003-1192-9942,3,2.0
63,0000-0002-2925-0244,3,2.0
...,...,...,...
12952,0000-0002-8168-5405,3,2.0
12980,0000-0001-6261-353x,11583,2.0
12981,0000-0003-2804-274x,11583,2.0
12982,0000-0002-6930-0699,3,2.0


## Degree by author and number of resource published

In [13]:
df = cyperQueryToDataFrame("MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources")

In [14]:
df.describe()

Unnamed: 0,wcc,degree,resources
count,13035.0,13035.0,13035.0
mean,4263.024933,3.747142,3.707633
std,4408.435855,12.666121,8.361192
min,0.0,0.0,1.0
25%,3.0,0.0,1.0
50%,2948.0,1.0,1.0
75%,8232.0,3.0,3.0
max,13034.0,254.0,275.0


già qui si potrebbe pensare ad un indicatore che esprime quanto l'autore è propenso a fare risorse con persone diverse

## Adding page rank

In [15]:
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank
""")

In [16]:
df

Unnamed: 0,n.id,wcc,degree,resources,pagerank
0,0000-0002-2561-3458,0,2.0,2,0.928861
1,0000-0003-0851-0665,0,3.0,3,1.114622
2,0000-0001-7479-2694,2,0.0,1,0.150000
3,0000-0003-3025-1292,3,6.0,26,0.918303
4,0000-0002-2918-4152,4,0.0,9,0.150000
...,...,...,...,...,...
13030,0000-0002-4485-2665,13030,0.0,1,0.150000
13031,0000-0001-8308-5393,13031,0.0,1,0.150000
13032,0000-0001-9765-0875,13032,0.0,1,0.150000
13033,0000-0002-0616-0617,13033,0.0,1,0.150000


In [17]:
df.describe()

Unnamed: 0,wcc,degree,resources,pagerank
count,13035.0,13035.0,13035.0,13035.0
mean,4263.024933,3.747142,3.707633,0.668021
std,4408.435855,12.666121,8.361192,0.726383
min,0.0,0.0,1.0,0.15
25%,3.0,0.0,1.0,0.15
50%,2948.0,1.0,1.0,0.49127
75%,8232.0,3.0,3.0,1.0
max,13034.0,254.0,275.0,13.099599


## Adding local clustering coefficient

In [18]:
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank,
    n.__localClusteringCoefficient as localclusteringcoefficient
""")

In [19]:
df

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient
0,0000-0002-2561-3458,0,2.0,2,0.928861,0.000000
1,0000-0003-0851-0665,0,3.0,3,1.114622,0.333333
2,0000-0001-7479-2694,2,0.0,1,0.150000,0.000000
3,0000-0003-3025-1292,3,6.0,26,0.918303,0.266667
4,0000-0002-2918-4152,4,0.0,9,0.150000,0.000000
...,...,...,...,...,...,...
13030,0000-0002-4485-2665,13030,0.0,1,0.150000,0.000000
13031,0000-0001-8308-5393,13031,0.0,1,0.150000,0.000000
13032,0000-0001-9765-0875,13032,0.0,1,0.150000,0.000000
13033,0000-0002-0616-0617,13033,0.0,1,0.150000,0.000000


In [20]:
df.describe()

Unnamed: 0,wcc,degree,resources,pagerank,localclusteringcoefficient
count,13035.0,13035.0,13035.0,13035.0,13035.0
mean,4263.024933,3.747142,3.707633,0.668021,0.250897
std,4408.435855,12.666121,8.361192,0.726383,0.398681
min,0.0,0.0,1.0,0.15,0.0
25%,3.0,0.0,1.0,0.15,0.0
50%,2948.0,1.0,1.0,0.49127,0.0
75%,8232.0,3.0,3.0,1.0,0.404904
max,13034.0,254.0,275.0,13.099599,1.0


In [21]:
df.sort_values(by="localclusteringcoefficient", ascending=False) 

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient
5679,0000-0002-3918-1860,5678,3.0,1,1.000000,1.0
6996,0000-0001-6797-5476,3,2.0,1,0.446698,1.0
5680,0000-0001-6119-4521,5678,3.0,1,1.000000,1.0
2842,0000-0002-2145-1404,2842,2.0,7,1.000000,1.0
8961,0000-0002-6505-403x,5463,2.0,1,1.000000,1.0
...,...,...,...,...,...,...
5847,0000-0002-2450-636x,5847,0.0,1,0.150000,0.0
5848,0000-0003-2663-0759,5848,0.0,1,0.150000,0.0
5849,0000-0001-7936-3522,3,1.0,1,0.350740,0.0
5850,0000-0003-3042-4878,2022,2.0,6,1.298246,0.0


Il 2 è un errore c'è una doppia relazione tra lo stesso autore e la stessa risorsa

## First approach using DBScan

In [22]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors

In [23]:
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank,
    n.__localClusteringCoefficient as localclusteringcoefficient
""")

In [24]:
X = StandardScaler().fit_transform(df[["degree", "resources", "pagerank", "localclusteringcoefficient"]].to_numpy())

In [25]:
db = DBSCAN(eps=0.05, min_samples=4, algorithm="ball_tree", leaf_size=1000).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

In [26]:
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

In [27]:
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 173
Estimated number of noise points: 2392


In [31]:
db2 = DBSCAN(eps=1, min_samples=4, algorithm="ball_tree", leaf_size=1000).fit(X)
core_samples_mask = np.zeros_like(db2.labels_, dtype=bool)
core_samples_mask[db2.core_sample_indices_] = True
labels = db2.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 5
Estimated number of noise points: 71


In [32]:
df[labels == -1]

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient
27,0000-0002-1376-8532,3,182.0,113,9.389351,0.312428
53,0000-0003-3379-8744,3,45.0,84,6.068915,0.089899
54,0000-0001-6363-2759,3,55.0,187,8.178575,0.054545
55,0000-0002-0465-2028,3,97.0,91,6.892115,0.366838
70,0000-0002-7821-117x,3,18.0,90,4.370136,0.045752
...,...,...,...,...,...,...
2786,0000-0003-2967-9662,3,184.0,17,8.234506,0.351271
2787,0000-0003-0308-5583,3,210.0,11,7.886872,0.343267
2789,0000-0003-4073-532x,3,98.0,19,4.701190,0.514622
2873,0000-0002-3413-570x,3,89.0,10,4.261657,0.552094


In [33]:
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.0001) # in pratica 20 neighbour sembra funzionare bene
y_pred = clf.fit_predict(X)

In [34]:
X_scores = clf.negative_outlier_factor_

In [35]:
X_scores.max()

-0.8989470928101306

In [36]:
X_scores.min()

-2152622371.843041

In [37]:
df['lof'] = X_scores

In [38]:
df.sort_values(by=['lof'], ascending=True)

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient,lof
6875,0000-0003-1073-2638,437,3.0,3,1.787789,0.0,-2.152622e+09
981,0000-0002-0136-1049,3,2.0,2,1.092730,0.0,-1.182014e+09
3424,0000-0002-8002-922x,3424,2.0,2,1.298246,0.0,-1.161731e+09
917,0000-0003-1174-6054,917,2.0,2,1.298246,0.0,-1.161731e+09
11824,0000-0001-7913-9109,3424,2.0,2,1.298246,0.0,-1.161731e+09
...,...,...,...,...,...,...,...
11123,0000-0002-7849-4325,3,2.0,1,0.438268,1.0,-9.315338e-01
5230,0000-0002-8418-1675,3,2.0,1,0.433075,1.0,-9.312373e-01
5229,0000-0002-1577-8806,3,2.0,1,0.433075,1.0,-9.312373e-01
10877,0000-0002-2022-5739,3,2.0,1,0.435922,1.0,-9.304815e-01


In [48]:
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank,
    n.__localClusteringCoefficient as localclusteringcoefficient, n.`__fastrp_embedding`, reduce(accumulator = 0, variable IN n.`__fastrp_embedding` | accumulator + variable) as sum_rp
""")

In [49]:
df

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient,n.`__fastrp_embedding`,sum_rp
0,0000-0002-2561-3458,0,2.0,2,0.928861,0.000000,"[0.3070467710494995, -0.3500320017337799, 0.42...",11.125182
1,0000-0003-0851-0665,0,3.0,3,1.114622,0.333333,"[0.10494255274534225, -0.7914800643920898, 0.6...",13.781511
2,0000-0001-7479-2694,2,0.0,1,0.150000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000
3,0000-0003-3025-1292,3,6.0,26,0.918303,0.266667,"[0.21004994213581085, 1.0289921760559082, -0.4...",2.105424
4,0000-0002-2918-4152,4,0.0,9,0.150000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000
...,...,...,...,...,...,...,...,...
13030,0000-0002-4485-2665,13030,0.0,1,0.150000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000
13031,0000-0001-8308-5393,13031,0.0,1,0.150000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000
13032,0000-0001-9765-0875,13032,0.0,1,0.150000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000
13033,0000-0002-0616-0617,13033,0.0,1,0.150000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000


In [51]:
len(df[df['sum_rp'] == 0]) # numero di nodi che hanno tutto a 0 (cioè non hanno collaborato con nessuno)

5094

In [40]:
X = np.array(df["n.`__fastrp_embedding`"].to_list())

In [52]:
X

array([[ 0.30704677, -0.350032  ,  0.4241665 , ...,  0.5577575 ,
        -0.47319341, -1.33884799],
       [ 0.10494255, -0.79148006,  0.64841884, ...,  0.69398236,
        -0.96658051, -1.79946589],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [53]:
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.0001) # in pratica 20 neighbour sembra funzionare bene
y_pred = clf.fit_predict(X)

In [54]:
X_scores.min()

-88420734381.45988

In [55]:
pd.set_option('display.max_rows', 100)
X_scores = clf.negative_outlier_factor_
df['lof'] = X_scores
df.sort_values(by=['lof'], ascending=True) # quelli che sono in comunità dissiminate, "lontane e staccate dalla grande community"

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient,n.`__fastrp_embedding`,sum_rp,lof
8986,0000-0002-3822-2923,8986,2.0,1,1.000000,1.000000,"[0.0, 0.8005028963088989, 0.0, 0.0, -0.0502952...",24.045234,-8.842073e+10
1973,0000-0003-1553-9327,1973,2.0,4,1.000000,1.000000,"[0.0, 0.0, -0.8096112608909607, 0.809611260890...",-3.836021,-8.841097e+10
8988,0000-0001-6307-8786,8986,2.0,1,1.000000,1.000000,"[0.0, 0.8028557300567627, 0.0, 0.0, 0.08094320...",24.154807,-8.839846e+10
6466,0000-0002-7529-001x,6465,2.0,1,1.000000,1.000000,"[-2.1589813232421875, 0.0, -0.7232538461685181...",4.359829,-8.839242e+10
1975,0000-0003-1655-1547,1973,2.0,2,1.000000,1.000000,"[0.0, 0.0, -0.745400071144104, 0.7454000711441...",-3.899730,-8.838234e+10
...,...,...,...,...,...,...,...,...,...
2118,0000-0002-1931-1365,3,4.0,7,0.578361,0.666667,"[0.32956355810165405, -0.4229787290096283, 0.5...",-0.616788,-9.579432e-01
544,0000-0002-0831-3541,3,11.0,25,1.905148,0.309091,"[-0.46868106722831726, 0.6814004182815552, -0....",1.863681,-9.571397e-01
2904,0000-0002-1103-0649,3,4.0,2,0.612479,1.000000,"[-0.3895499110221863, 0.03607933968305588, 0.9...",-10.143443,-9.564176e-01
6892,0000-0002-4407-8542,3,2.0,2,0.369976,1.000000,"[-0.09558090567588806, -0.24181139469146729, 1...",-3.802882,-9.553838e-01


In [56]:
# curioso vedere i nearest neighbors e trovare quali autori sono più vicini tra loro
# il paper fast rp raccomanda la cosine similarity, ma sklearn usa euclidean, quindi normalizzo fast rp per avere lo stesso ordinamento

In [58]:
l2_norm = np.sum(np.abs(X)**2,axis=-1)**(1./2)
l2_norm[l2_norm == 0] = 1
l2_norm.max()

9.984107199743846

In [59]:
X_norm = X / l2_norm[:, np.newaxis]

In [60]:
l2_norm = np.sum(np.abs(X_norm)**2,axis=-1)**(1./2)
l2_norm.max()

1.0000000000000002

In [61]:
neigh = NearestNeighbors(n_neighbors=2, metric='euclidean')
neigh.fit(X_norm)

NearestNeighbors(metric='euclidean', n_neighbors=2)

In [62]:
X_norm[1:10, :]

array([[ 0.01156006, -0.08718632,  0.07142726, ...,  0.07644635,
        -0.10647469, -0.19822205],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02443047,  0.11967991, -0.05065115, ...,  0.08362475,
         0.20845652,  0.02169579],
       ...,
       [-0.10201191,  0.1270666 ,  0.11397008, ...,  0.05029615,
         0.09507713,  0.1481531 ],
       [ 0.09511089, -0.08340565, -0.02773664, ...,  0.09013592,
         0.05882417, -0.18067247],
       [-0.06805397, -0.04492413, -0.06213671, ..., -0.01154949,
         0.04316761,  0.09857748]])

In [63]:
distances = neigh.kneighbors(X_norm, 2, return_distance=True) # è una tupla, primo elemento le distanze secondo gli indici, ovviamente come primo elemento mi ritornerà sempre se stesso, anche se stranamente non me lo ritorna a 0 (comunque un numero molto vicino a 0)

In [64]:
distances

(array([[0.        , 0.12931063],
        [0.        , 0.17197403],
        [0.        , 0.        ],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]]),
 array([[   0, 6867],
        [   1, 1290],
        [   2,    4],
        ...,
        [   2,    4],
        [   2,    4],
        [   2,    4]]))

In [65]:
distances_second = distances[0][:, 1]
indices_second = distances[1][:, 1]

In [66]:
nearest_neighbour_df = df[['n.id']].iloc[indices_second]

In [67]:
df['nearest_neighbour'] = nearest_neighbour_df['n.id'].to_list()

In [68]:
df['distance_to_nearest_neighbour'] = distances_second

In [69]:
df[df['degree'] > 1].sort_values(by=['distance_to_nearest_neighbour'], ascending=True) # sembra un pattern ricorrente scrivere il nome per esteso se sono solo e puntato se sono con altri
# il primo sono 2 collegati 2 volte con la stessa risorsa
# Brendan Kelley c'è una risorsa dove hanno contribuito tutti 2 volte

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient,n.`__fastrp_embedding`,sum_rp,lof,nearest_neighbour,distance_to_nearest_neighbour
7808,0000-0002-9547-1889,5666,2.0,3,0.819149,1.000000,"[-1.7583332061767578, 0.0, -1.5091028213500977...",-14.751538,-8.088133e+10,0000-0002-3154-1518,0.000000
13000,0000-0003-2265-0162,3,3.0,1,0.638614,1.000000,"[0.4152911603450775, -0.0067856572568416595, 0...",-8.538730,-9.850888e-01,0000-0003-2265-0162,0.000000
4704,0000-0001-5410-7135,4704,2.0,6,1.459459,0.000000,"[0.0, 0.5299989581108093, -0.5299989581108093,...",-10.781060,-6.540390e+10,0000-0002-9354-9486,0.000000
4020,0000-0002-0161-654x,3,2.0,10,0.429642,1.000000,"[0.5651296377182007, 0.3545893728733063, -1.37...",1.584527,-1.072636e+00,0000-0002-0161-654x,0.000000
4726,0000-0001-6743-360x,4725,2.0,3,1.459459,0.000000,"[0.7293249368667603, 0.15952207148075104, 0.0,...",6.313166,-6.093757e+10,0000-0001-6743-360x,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
7298,0000-0002-5487-8076,3,2.0,3,0.391041,0.000000,"[-0.3369683623313904, -0.9641990661621094, -0....",6.291831,-1.407289e+00,0000-0001-5180-7179,0.641140
927,0000-0002-4315-3299,3,3.0,15,0.706450,0.000000,"[0.23910915851593018, -1.8296360969543457, 0.7...",2.372397,-1.049076e+00,0000-0002-3438-3914,0.643799
3743,0000-0002-9618-6457,3,3.0,7,0.577955,0.333333,"[0.30925899744033813, -0.37121668457984924, -0...",1.121508,-1.026409e+00,0000-0002-8687-8185,0.653437
906,0000-0002-1345-6360,3,3.0,13,0.615464,0.000000,"[0.019334442913532257, -0.2596214711666107, 0....",-2.568416,-1.129022e+00,0000-0003-0320-1257,0.672024
