In [1]:
from neo4j import GraphDatabase
import pandas as pd

# Init the connection to the database
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "neuroinformatics"), encrypted=False)
pd.set_option('display.max_columns', None)

def cyperQueryToDataFrame(query):
  with driver.session() as session:
    result = session.run(query)
    return pd.DataFrame(result.data(), columns=result.keys())



## WCC 

In [6]:
wcc = cyperQueryToDataFrame("MATCH (n:Author) RETURN id(n), n.__wcc")

In [7]:
wcc

Unnamed: 0,id(n),n.__wcc
0,21446,0
1,21447,0
2,21448,0
3,21449,3
4,21450,3
...,...,...
425042,832552,425042
425043,832553,425042
425044,832554,425042
425045,832555,425045


In [9]:
wcc.astype(object).describe()

Unnamed: 0,id(n),n.__wcc
count,425047,425047
unique,425047,27344
top,21446,0
freq,1,330226


In [10]:
wcc_grouped = wcc.groupby(['n.__wcc']).count()

In [13]:
wcc_grouped

Unnamed: 0_level_0,id(n)
n.__wcc,Unnamed: 1_level_1
0,330226
3,4
12,1
58,5
75,5
...,...
425006,3
425020,3
425036,1
425042,3


In [12]:
wcc_grouped.describe()

Unnamed: 0,id(n)
count,27344.0
mean,15.544434
std,1996.992529
min,1.0
25%,1.0
50%,2.0
75%,4.0
max,330226.0


## Degree

In [21]:
degree_df = cyperQueryToDataFrame("MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree")

In [22]:
degree_df.describe()

Unnamed: 0,wcc,degree
count,425047.0,425047.0
mean,44970.782259,25.829275
std,101354.419554,98.524533
min,0.0,0.0
25%,0.0,4.0
50%,0.0,7.0
75%,0.0,13.0
max,425045.0,3607.0


In [23]:
degree_df.astype(object).describe()

Unnamed: 0,n.id,wcc,degree
count,425047,425047,425047.0
unique,425047,27344,1286.0
top,Rodrigo Basilio,0,4.0
freq,1,330226,38383.0


In [24]:
degree_df.sort_values(by="degree", ascending=False) 

Unnamed: 0,n.id,wcc,degree
3011,Paul M. Thompson,0,3607.0
15477,"Jahanshad, Neda",0,2771.0
5111,"Agartz, Ingrid",0,2573.0
5232,"McDonald, Colm",0,2534.0
3053,Arthur W. Toga,0,2425.0
...,...,...,...
199053,"Samel, Mirachel. D",199053,0.0
59682,Nils I. Bachen,59682,0.0
199052,"Jeong, Seong Wook",199052,0.0
59662,Barbara R. Jasny,59662,0.0


In [27]:
degree_df[degree_df['degree'] == 2]

Unnamed: 0,n.id,wcc,degree
0,"Maimon-Mor, Roni O.",0,2.0
2,"Makin, Tamar R.",0,2.0
51,"Cockett, Peter",0,2.0
52,"Yuan, Ye",0,2.0
194,"Song, Xiaopeng",0,2.0
...,...,...,...
425020,Sung-Woo Byun,425020,2.0
425021,Hyuk Soo Han,425020,2.0
425022,Seok-Pil Lee,425020,2.0
425043,Florian Gondesen,425042,2.0


## Degree by author and number of resource published

In [29]:
df = cyperQueryToDataFrame("MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources")

In [30]:
df.describe()

Unnamed: 0,wcc,degree,resources
count,425047.0,425047.0,425047.0
mean,44970.782259,25.829275,2.4432
std,101354.419554,98.524533,5.657282
min,0.0,0.0,1.0
25%,0.0,4.0,1.0
50%,0.0,7.0,1.0
75%,0.0,13.0,2.0
max,425045.0,3607.0,487.0


già qui si potrebbe pensare ad un indicatore che esprime quanto l'autore è propenso a fare risorse con persone diverse

## Adding page rank

In [35]:
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank
""")

In [36]:
df

Unnamed: 0,n.id,wcc,degree,resources,pagerank
0,"Maimon-Mor, Roni O.",0,2.0,1,0.388988
1,"Johansen-Berg, Heidi",0,27.0,6,2.340052
2,"Makin, Tamar R.",0,2.0,1,0.388988
3,"Chang, Luke J",3,3.0,1,1.000000
4,"Smith, Alec",3,3.0,1,1.000000
...,...,...,...,...,...
425042,Dieter Gollmann,425042,3.0,1,1.248175
425043,Florian Gondesen,425042,2.0,1,0.875912
425044,Matthias Marx,425042,2.0,1,0.875912
425045,Sunila Jain,425045,1.0,1,1.000000


In [37]:
df.describe()

Unnamed: 0,wcc,degree,resources,pagerank
count,425047.0,425047.0,425047.0,425047.0
mean,44970.782259,25.829275,2.4432,0.98219
std,101354.419554,98.524533,5.657282,1.080809
min,0.0,0.0,1.0,0.15
25%,0.0,4.0,1.0,0.550568
50%,0.0,7.0,1.0,0.815309
75%,0.0,13.0,2.0,1.0
max,425045.0,3607.0,487.0,79.315635


## Adding local clustering coefficient

In [4]:
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank,
    n.__localClusteringCoefficient as localclusteringcoefficient
""")

In [39]:
df

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient
0,"Maimon-Mor, Roni O.",0,2.0,1,0.388988,1.000000
1,"Johansen-Berg, Heidi",0,27.0,6,2.340052,0.267806
2,"Makin, Tamar R.",0,2.0,1,0.388988,1.000000
3,"Chang, Luke J",3,3.0,1,1.000000,1.000000
4,"Smith, Alec",3,3.0,1,1.000000,1.000000
...,...,...,...,...,...,...
425042,Dieter Gollmann,425042,3.0,1,1.248175,0.333333
425043,Florian Gondesen,425042,2.0,1,0.875912,1.000000
425044,Matthias Marx,425042,2.0,1,0.875912,1.000000
425045,Sunila Jain,425045,1.0,1,1.000000,0.000000


In [40]:
df.describe()

Unnamed: 0,wcc,degree,resources,pagerank,localclusteringcoefficient
count,425047.0,425047.0,425047.0,425047.0,425047.0
mean,44970.782259,25.829275,2.4432,0.98219,0.7932
std,101354.419554,98.524533,5.657282,1.080809,0.323026
min,0.0,0.0,1.0,0.15,0.0
25%,0.0,4.0,1.0,0.550568,0.6
50%,0.0,7.0,1.0,0.815309,1.0
75%,0.0,13.0,2.0,1.0,1.0
max,425045.0,3607.0,487.0,79.315635,2.0


In [41]:
df.sort_values(by="localclusteringcoefficient", ascending=False) 

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient
280205,R. Gentner,0,2.0,1,0.337758,2.0
0,"Maimon-Mor, Roni O.",0,2.0,1,0.388988,1.0
257955,RobertW. Baker,0,3.0,1,0.354722,1.0
257965,Paul Clouston,0,4.0,1,0.530304,1.0
257964,C.L Lim,0,4.0,1,0.530304,1.0
...,...,...,...,...,...,...
16926,William Bosl,16926,0.0,1,0.150000,0.0
163485,Rodrick Wallace,163484,1.0,1,1.000000,0.0
16928,Edward C. Clark,0,2.0,2,0.629109,0.0
181292,Elizabeth Hampson,181292,0.0,1,0.150000,0.0


Il 2 è un errore c'è una doppia relazione tra lo stesso autore e la stessa risorsa

## First approach using DBScan

In [55]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors

In [3]:
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank,
    n.__localClusteringCoefficient as localclusteringcoefficient
""")

In [4]:
X = StandardScaler().fit_transform(df[["degree", "resources", "pagerank", "localclusteringcoefficient"]].to_numpy())

In [None]:
db = DBSCAN(eps=0.05, min_samples=4, algorithm="ball_tree", leaf_size=1000).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

In [6]:
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

In [10]:
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 1270
Estimated number of noise points: 29408


In [5]:
db2 = DBSCAN(eps=0.05, min_samples=3, algorithm="ball_tree", leaf_size=1000).fit(X)
core_samples_mask = np.zeros_like(db2.labels_, dtype=bool)
core_samples_mask[db2.core_sample_indices_] = True
labels = db2.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 2025
Estimated number of noise points: 25635


In [8]:
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.0001) # in pratica 20 neighbour sembra funzionare bene
y_pred = clf.fit_predict(X)

In [9]:
X_scores = clf.negative_outlier_factor_

In [12]:
X_scores.max()

-0.8546343095113361

In [13]:
X_scores.min()

-3317180512.3399343

In [15]:
df['lof'] = X_scores

In [17]:
df.sort_values(by=['lof'], ascending=True)

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient,lof
76316,"Liu, Chang",0,630.0,2,1.487150,0.981104,-3.317181e+09
378373,Yulianti,0,629.0,2,1.481493,0.984203,-3.276834e+09
11113,"Bruchez, Marcel",0,629.0,2,1.447845,0.984203,-3.118579e+09
379142,"Li, Jun",0,628.0,2,1.400440,0.987322,-2.886776e+09
46418,"Kelly, Ciaran",0,628.0,2,1.400440,0.987322,-2.886776e+09
...,...,...,...,...,...,...,...
26874,Eun Jeong Kim,0,5.0,1,0.702767,1.000000,-8.591685e-01
398065,Manasi Iyer,120663,6.0,1,0.776154,1.000000,-8.546343e-01
398063,Jeremy M. Shea,120663,6.0,1,0.776154,1.000000,-8.546343e-01
398067,Géraldine Gontier,120663,6.0,1,0.776154,1.000000,-8.546343e-01


In [19]:
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank,
    n.__localClusteringCoefficient as localclusteringcoefficient, n.`__fastrp-embedding`
""")

In [25]:
X = np.array(df["n.`__fastrp-embedding`"].to_list())

In [26]:
X

array([[-0.83164656,  0.01154545,  0.30540568, ...,  0.48315668,
         1.23521674,  0.41503212],
       [-0.95615113, -0.03100635,  0.05266985, ...,  0.52140504,
         1.56521952,  0.36397046],
       [-0.8902142 ,  0.01169572,  0.24230045, ...,  0.48351014,
         1.17733634,  0.41553381],
       ...,
       [ 0.32591417,  1.17309511, -0.74403322, ..., -0.8471809 ,
         0.        , -0.8471809 ],
       [ 0.79056942,  0.7624929 , -0.02807654, ..., -0.7624929 ,
         0.        ,  0.        ],
       [ 0.79056942,  0.7624929 , -0.02807654, ..., -0.7624929 ,
         0.        ,  0.        ]])

In [27]:
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.0001) # in pratica 20 neighbour sembra funzionare bene
y_pred = clf.fit_predict(X)

In [29]:
X_scores.min()

-100000013518.23679

In [31]:
pd.set_option('display.max_rows', 100)
X_scores = clf.negative_outlier_factor_
df['lof'] = X_scores
df.sort_values(by=['lof'], ascending=True) # quelli più outlier secondo lui sono quelli che hanno più connessioni verso una risorsa

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient,n.`__fastrp-embedding`,lof
360103,Maxim Viktorovich Lukoyanov,360103,1.0,1,1.000000,0.000000,"[0.0, -1.524985909461975, 0.0, 0.0, 0.0, 0.0, ...",-1.000000e+11
168947,Max Ortiz-Catalan,168947,1.0,1,1.000000,0.000000,"[1.524985909461975, 0.0, 0.0, 1.52498590946197...",-1.000000e+11
366883,Elizabeth W Pang,366883,1.0,1,1.000000,0.000000,"[-1.524985909461975, -1.524985909461975, 1.524...",-1.000000e+11
20450,Johan Ræder,20450,1.0,1,1.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-1.000000e+11
153195,E.Gregory Keating,153195,1.0,1,1.000000,0.000000,"[-1.4285715818405151, 0.0, 0.0, 0.0, -1.428571...",-1.000000e+11
...,...,...,...,...,...,...,...,...
322789,L Parolin,0,4.0,2,0.484293,1.000000,"[0.6991167068481445, -2.390488624572754, -1.14...",-9.204082e-01
36497,P.K. Sadasivan,0,6.0,5,0.869979,0.533333,"[-0.6779718995094299, -0.3040192723274231, 1.2...",-9.188990e-01
172507,Shyam Diwakar,172507,40.0,10,3.934544,0.197436,"[0.8750414848327637, 0.021053701639175415, -0....",-9.186044e-01
83645,Kristinn Johnsen,0,75.0,22,3.211445,0.344505,"[-0.4510290026664734, 1.5770740509033203, 0.28...",-9.150334e-01


In [35]:
df.sort_values(by=['lof'], ascending=True).to_csv('lof_fastrp_embedding_author.csv', index=False)

In [37]:
# curioso vedere i nearest neighbors e trovare quali autori sono più vicini tra loro
# il paper fast rp raccomanda la cosine similarity, ma sklearn usa euclidean, quindi normalizzo fast rp per avere lo stesso ordinamento

(425047,)

In [47]:
X_norm = np.linalg.norm(X, axis = 1, keepdims = True)

In [52]:
l2_norm = np.sum(np.abs(X)**2,axis=-1)**(1./2)
l2_norm[l2_norm == 0] = 1
l2_norm.max()

10.000001464459567

In [53]:
X_norm = X / l2_norm[:, np.newaxis]

In [54]:
l2_norm = np.sum(np.abs(X_norm)**2,axis=-1)**(1./2)
l2_norm.max()

1.0000000000000002

In [56]:
neigh = NearestNeighbors(n_neighbors=2, metric='euclidean')
neigh.fit(X_norm)

NearestNeighbors(metric='euclidean', n_neighbors=2)

In [58]:
X_norm[1:10, :]

array([[-0.09912667, -0.00321451,  0.00546042, ...,  0.05405542,
         0.16227038,  0.03773376],
       [-0.09904093,  0.00130121,  0.02695718, ...,  0.053793  ,
         0.13098475,  0.04623028],
       [-0.00016658,  0.16453352,  0.00749547, ..., -0.08185427,
         0.        , -0.15703807],
       ...,
       [ 0.01586033, -0.10599009,  0.07787759, ...,  0.01253639,
         0.05840234, -0.0707914 ],
       [ 0.01826123, -0.09536059,  0.09877386, ...,  0.00903046,
         0.05834375, -0.06015463],
       [ 0.04254953, -0.0965631 ,  0.04749346, ...,  0.01397605,
         0.07222436, -0.04980686]])

In [62]:
distances = neigh.kneighbors(X_norm, 2, return_distance=True) # è una tupla, primo elemento le distanze secondo gli indici, ovviamente come primo elemento mi ritornerà sempre se stesso, anche se stranamente non me lo ritorna a 0 (comunque un numero molto vicino a 0)

In [63]:
distances

(array([[0.        , 0.05893549],
        [0.        , 0.24664167],
        [0.        , 0.05893549],
        ...,
        [0.        , 0.04254493],
        [0.        , 0.        ],
        [0.        , 0.        ]]),
 array([[     0,      2],
        [     1,      2],
        [     2,      0],
        ...,
        [425044, 425042],
        [425045, 425046],
        [425045, 425046]]))

In [68]:
distances_second = distances[0][:, 1]
indices_second = distances[1][:, 1]

In [79]:
nearest_neighbour_df = df[['n.id']].iloc[indices_second]

In [80]:
df['nearest_neighbour'] = nearest_neighbour_df['n.id'].to_list()

In [83]:
df['distance_to_nearest_neighbour'] = distances_second

In [87]:
df[df['degree'] > 1].sort_values(by=['distance_to_nearest_neighbour'], ascending=True) # sembra un pattern ricorrente scrivere il nome per esteso se sono solo e puntato se sono con altri
# il primo sono 2 collegati 2 volte con la stessa risorsa
# Brendan Kelley c'è una risorsa dove hanno contribuito tutti 2 volte

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient,n.`__fastrp-embedding`,lof,nearest_neighbour,distance_to_nearest_neighbour
112349,Michael J. Kovacs,0,6.0,1,0.500855,0.666667,"[-1.277119517326355, 0.3321249783039093, 0.222...",-1.070089e+00,Eric Wright,0.000000
13775,Rob Bartha,0,567.0,8,6.194972,0.792847,"[-0.19466306269168854, 1.8628027439117432, -0....",-1.201219e+09,Rob Bartha,0.000000
276337,Lorenzo eSani,0,15.0,2,1.324498,0.609524,"[0.31591281294822693, -0.03289956972002983, 1....",-1.016638e+00,Emiliano eRicciardi,0.000000
13789,Brendan Kelley,0,486.0,6,5.289691,0.871910,"[-0.1259988695383072, 1.940391182899475, -0.10...",-1.221920e+00,Hristina Koleva,0.000000
66774,Filipe L. Andrés,66774,2.0,2,1.459459,0.000000,"[1.132277011871338, 0.7808688282966614, 0.0, -...",-6.419982e+10,Hendrik Van der Loos,0.000000
...,...,...,...,...,...,...,...,...,...,...
132867,Dennis A. Turner,0,4.0,4,0.384847,0.000000,"[-0.49793142080307007, 0.500878095626831, -0.3...",-1.134408e+00,Allen R. Wyler,0.794037
79158,Babak Mahmoudi,0,7.0,7,0.781997,0.142857,"[0.002955872565507889, 1.0737414360046387, -0....",-1.330229e+00,Mark Connolly,0.800522
291542,Daphne L. Wang,0,3.0,1,0.278055,1.000000,"[1.1258503198623657, 0.8384608030319214, -0.61...",-1.470406e+00,Tracy T. Batchelor,0.811722
289106,Sunita Venkateswaran,0,3.0,2,0.285750,0.333333,"[0.16328322887420654, 0.6464352011680603, -0.2...",-1.507481e+00,Michael Shevell,0.827964


In [86]:
df.sort_values(by=['lof'], ascending=True).to_csv('nearest_neighbour.csv', index=False)

# TSNE + LOF

In [1]:
import matplotlib.pyplot as plt
import ast
import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
%matplotlib widget



In [2]:
df = pd.read_csv('csvs/lof_fastrp_embedding_author.csv')

In [3]:
df["fast_rp_formatted"] = df["n.`__fastrp-embedding`"].map(lambda x : ast.literal_eval(x))

In [5]:
X = np.array(df["fast_rp_formatted"].to_list())
l2_norm = np.sum(np.abs(X)**2,axis=-1)**(1./2)
l2_norm[l2_norm == 0] = 1
X = X / l2_norm[:, np.newaxis]

In [6]:
X_embedded = TSNE(n_components=2).fit_transform(X)

In [7]:
with open('tsne_fastrp_embedding_by_resource_author.npy', 'wb') as f:
    np.save(f, X_embedded)

In [11]:
with open('tsne_fastrp_embedding_by_resource_author.npy', 'rb') as f:
    X_imp = np.load(f)

In [12]:
s = np.array(df["lof"].to_list())

In [13]:
plt.scatter(X_imp[:, 0], X_imp[:, 1], c=s/s.max(), s=0.5) # idee per visualizzarlo meglio?
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …