In [2]:
import os

from neo4j import GraphDatabase
import pandas as pd

# Init the connection to the database.
# Connection settings can be overridden through the NEO4J_URI / NEO4J_USER /
# NEO4J_PASSWORD environment variables, so credentials do not have to be edited
# into (and committed with) the notebook. The fallbacks keep the original
# local-development values, so behaviour is unchanged when nothing is set.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "neuroinformatics")
driver = GraphDatabase.driver(uri, auth=(neo4j_user, neo4j_password), encrypted=False)

# Do not truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

def cyperQueryToDataFrame(query, parameters=None):
  """Run a Cypher query and return its result as a pandas DataFrame.

  Args:
    query: Cypher query text passed to ``session.run``.
    parameters: Optional dict of query parameters forwarded to
      ``session.run``. Prefer this over building the query text by string
      formatting. Defaults to ``None`` (no parameters), which matches the
      previous single-argument behaviour.

  Returns:
    A DataFrame with one row per returned record and one column per key
    in the query's RETURN clause, in the order reported by the result.
  """
  # Uses the module-level `driver`; the session is closed when the block exits.
  with driver.session() as session:
    result = session.run(query, parameters)
    # Read the column names first, then consume the records while the
    # session is still open.
    columns = result.keys()
    records = result.data()
  return pd.DataFrame(records, columns=columns)



## WCC 

In [6]:
# Fetch, for every Author node, Neo4j's internal node id and the stored `__wcc`
# property (presumably the weakly-connected-component id, per the section
# heading — TODO confirm how it was written to the graph).
wcc = cyperQueryToDataFrame("MATCH (n:Author) RETURN id(n), n.__wcc")

In [7]:
# Display the frame; pandas elides the middle rows in the rendered output.
wcc

Unnamed: 0,id(n),n.__wcc
0,21446,0
1,21447,0
2,21448,0
3,21449,3
4,21450,3
...,...,...
425042,832552,425042
425043,832553,425042
425044,832554,425042
425045,832555,425045


In [9]:
# Cast to object so describe() reports count/unique/top/freq instead of numeric
# statistics: both columns are identifiers, not quantities.
wcc.astype(object).describe()

Unnamed: 0,id(n),n.__wcc
count,425047,425047
unique,425047,27344
top,21446,0
freq,1,330226


In [10]:
# Count how many Author rows share each `n.__wcc` value.
wcc_grouped = wcc.groupby('n.__wcc').count()

In [13]:
# Display the per-`n.__wcc` row counts (index: `n.__wcc`, column: `id(n)`).
wcc_grouped

Unnamed: 0_level_0,id(n)
n.__wcc,Unnamed: 1_level_1
0,330226
3,4
12,1
58,5
75,5
...,...
425006,3
425020,3
425036,1
425042,3


In [12]:
# Summary statistics of the group sizes, i.e. how many authors fall under each
# `n.__wcc` value.
wcc_grouped.describe()

Unnamed: 0,id(n)
count,27344.0
mean,15.544434
std,1996.992529
min,1.0
25%,1.0
50%,2.0
75%,4.0
max,330226.0


## Degree

In [21]:
# Fetch each Author's `id` property (the author name in the recorded output)
# together with the stored `__wcc` and `__degree` properties.
degree_df = cyperQueryToDataFrame("MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree")

In [22]:
# Numeric summary of the frame; note that `wcc` is an identifier, so its
# mean/std are not meaningful — only the `degree` column is of interest here.
degree_df.describe()

Unnamed: 0,wcc,degree
count,425047.0,425047.0
mean,44970.782259,25.829275
std,101354.419554,98.524533
min,0.0,0.0
25%,0.0,4.0
50%,0.0,7.0
75%,0.0,13.0
max,425045.0,3607.0


In [23]:
# Cast to object so describe() reports count/unique/top/freq for every column,
# including the non-numeric `n.id`.
degree_df.astype(object).describe()

Unnamed: 0,n.id,wcc,degree
count,425047,425047,425047.0
unique,425047,27344,1286.0
top,Rodrigo Basilio,0,4.0
freq,1,330226,38383.0


In [24]:
# Authors ordered from highest to lowest degree.
degree_df.sort_values("degree", ascending=False)

Unnamed: 0,n.id,wcc,degree
3011,Paul M. Thompson,0,3607.0
15477,"Jahanshad, Neda",0,2771.0
5111,"Agartz, Ingrid",0,2573.0
5232,"McDonald, Colm",0,2534.0
3053,Arthur W. Toga,0,2425.0
...,...,...,...
199053,"Samel, Mirachel. D",199053,0.0
59682,Nils I. Bachen,59682,0.0
199052,"Jeong, Seong Wook",199052,0.0
59662,Barbara R. Jasny,59662,0.0


In [27]:
# Keep only the authors whose degree is exactly 2.
degree_df.loc[degree_df['degree'].eq(2)]

Unnamed: 0,n.id,wcc,degree
0,"Maimon-Mor, Roni O.",0,2.0
2,"Makin, Tamar R.",0,2.0
51,"Cockett, Peter",0,2.0
52,"Yuan, Ye",0,2.0
194,"Song, Xiaopeng",0,2.0
...,...,...,...
425020,Sung-Woo Byun,425020,2.0
425021,Hyuk Soo Han,425020,2.0
425022,Seok-Pil Lee,425020,2.0
425043,Florian Gondesen,425042,2.0


## Degree by author and number of resources published

In [29]:
# Same per-author query as before, extended with the `resource_published`
# property (returned as `resources`).
df = cyperQueryToDataFrame("MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources")

In [30]:
# Numeric summary, now including `resources`.
df.describe()

Unnamed: 0,wcc,degree,resources
count,425047.0,425047.0,425047.0
mean,44970.782259,25.829275,2.4432
std,101354.419554,98.524533,5.657282
min,0.0,0.0,1.0
25%,0.0,4.0,1.0
50%,0.0,7.0,1.0
75%,0.0,13.0,2.0
max,425045.0,3607.0,487.0


Già qui si potrebbe pensare a un indicatore che esprima quanto l'autore è propenso a produrre risorse con persone diverse.

## Adding page rank

In [35]:
# Re-run the per-author query with the PageRank score added as `pagerank`.
# NOTE(review): the property is read as `n.__parerank` (sic). The recorded
# output shows populated values, so the property is presumably stored under
# that spelling — confirm against the graph before renaming it.
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank
""")

In [36]:
# Display the per-author frame; pandas elides the middle rows when rendering.
df

Unnamed: 0,n.id,wcc,degree,resources,pagerank
0,"Maimon-Mor, Roni O.",0,2.0,1,0.388988
1,"Johansen-Berg, Heidi",0,27.0,6,2.340052
2,"Makin, Tamar R.",0,2.0,1,0.388988
3,"Chang, Luke J",3,3.0,1,1.000000
4,"Smith, Alec",3,3.0,1,1.000000
...,...,...,...,...,...
425042,Dieter Gollmann,425042,3.0,1,1.248175
425043,Florian Gondesen,425042,2.0,1,0.875912
425044,Matthias Marx,425042,2.0,1,0.875912
425045,Sunila Jain,425045,1.0,1,1.000000


In [37]:
# Numeric summary, now including `pagerank`.
df.describe()

Unnamed: 0,wcc,degree,resources,pagerank
count,425047.0,425047.0,425047.0,425047.0
mean,44970.782259,25.829275,2.4432,0.98219
std,101354.419554,98.524533,5.657282,1.080809
min,0.0,0.0,1.0,0.15
25%,0.0,4.0,1.0,0.550568
50%,0.0,7.0,1.0,0.815309
75%,0.0,13.0,2.0,1.0
max,425045.0,3607.0,487.0,79.315635


## Adding local clustering coefficient

In [3]:
# Re-run the per-author query with the `__localClusteringCoefficient` property
# added as `localclusteringcoefficient`. This is the frame used by the
# clustering cells further down.
# NOTE(review): PageRank is read from `n.__parerank` (sic); the property is
# presumably stored under that spelling — confirm before renaming it.
df = cyperQueryToDataFrame("""
    MATCH (n:Author) RETURN n.id, n.__wcc as wcc, n.__degree as degree, n.resource_published as resources, n.__parerank as pagerank,
    n.__localClusteringCoefficient as localclusteringcoefficient
""")

In [39]:
# Display the per-author frame with all metrics; pandas elides the middle rows.
df

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient
0,"Maimon-Mor, Roni O.",0,2.0,1,0.388988,1.000000
1,"Johansen-Berg, Heidi",0,27.0,6,2.340052,0.267806
2,"Makin, Tamar R.",0,2.0,1,0.388988,1.000000
3,"Chang, Luke J",3,3.0,1,1.000000,1.000000
4,"Smith, Alec",3,3.0,1,1.000000,1.000000
...,...,...,...,...,...,...
425042,Dieter Gollmann,425042,3.0,1,1.248175,0.333333
425043,Florian Gondesen,425042,2.0,1,0.875912,1.000000
425044,Matthias Marx,425042,2.0,1,0.875912,1.000000
425045,Sunila Jain,425045,1.0,1,1.000000,0.000000


In [40]:
# Numeric summary, now including `localclusteringcoefficient`.
# NOTE: the recorded output reports a maximum of 2.0 for this column, which is
# outside the usual [0, 1] range of a clustering coefficient and points to a
# data problem worth investigating.
df.describe()

Unnamed: 0,wcc,degree,resources,pagerank,localclusteringcoefficient
count,425047.0,425047.0,425047.0,425047.0,425047.0
mean,44970.782259,25.829275,2.4432,0.98219,0.7932
std,101354.419554,98.524533,5.657282,1.080809,0.323026
min,0.0,0.0,1.0,0.15,0.0
25%,0.0,4.0,1.0,0.550568,0.6
50%,0.0,7.0,1.0,0.815309,1.0
75%,0.0,13.0,2.0,1.0,1.0
max,425045.0,3607.0,487.0,79.315635,2.0


In [41]:
# Authors ordered from highest to lowest local clustering coefficient,
# which brings any out-of-range values to the top.
df.sort_values("localclusteringcoefficient", ascending=False)

Unnamed: 0,n.id,wcc,degree,resources,pagerank,localclusteringcoefficient
280205,R. Gentner,0,2.0,1,0.337758,2.0
0,"Maimon-Mor, Roni O.",0,2.0,1,0.388988,1.0
257955,RobertW. Baker,0,3.0,1,0.354722,1.0
257965,Paul Clouston,0,4.0,1,0.530304,1.0
257964,C.L Lim,0,4.0,1,0.530304,1.0
...,...,...,...,...,...,...
16926,William Bosl,16926,0.0,1,0.150000,0.0
163485,Rodrick Wallace,163484,1.0,1,1.000000,0.0
16928,Edward C. Clark,0,2.0,2,0.629109,0.0
181292,Elizabeth Hampson,181292,0.0,1,0.150000,0.0


Il 2 è un errore: c'è una doppia relazione tra lo stesso autore e la stessa risorsa.

## First approach using DBSCAN

In [4]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [5]:
# Build the feature matrix from the four numeric per-author metrics and
# standardise it before clustering.
feature_columns = ["degree", "resources", "pagerank", "localclusteringcoefficient"]
feature_matrix = df[feature_columns].to_numpy()
X = StandardScaler().fit_transform(feature_matrix)

In [6]:
# Fit DBSCAN on the standardised features.
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
labels = db.labels_

# Boolean mask, aligned with `labels`, that is True for DBSCAN core samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [None]:
# The label -1 marks noise points, so it does not count as a cluster.
distinct_labels = set(labels)
has_noise = -1 in labels
n_clusters_ = len(distinct_labels) - (1 if has_noise else 0)

# Number of points labelled as noise.
n_noise_ = sum(1 for label in labels if label == -1)