In [2]:
from neo4j import GraphDatabase
import pandas as pd

# Init the connection to the database
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "neuroinformatics_orc_id"), encrypted=False)
pd.set_option('display.max_columns', None)

def cyperQueryToDataFrame(query):
  with driver.session() as session:
    result = session.run(query)
    return pd.DataFrame(result.data(), columns=result.keys())



In [3]:
keywords = cyperQueryToDataFrame("MATCH (n:Keyword) RETURN n.id")

In [4]:
!pip install -U sentence-transformers



In [5]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('stsb-roberta-base') # roberta base good and not too large

# Two lists of sentences
sentences1 = ['The cat sits outside',
             'A man is playing guitar',
             'The new movie is awesome']

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

#Compute cosine-similarits
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

#Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))

The cat sits outside 		 The dog plays in the garden 		 Score: -0.0686
A man is playing guitar 		 A woman watches TV 		 Score: 0.0891
The new movie is awesome 		 The new movie is so great 		 Score: 0.9907


In [6]:
import re
def camel_case_split(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return ' '.join([m.group(0) for m in matches])

In [7]:
sentences = keywords['n.id'].squeeze().map(lambda x: ' '.join(camel_case_split(x[:-4]).split('_'))).tolist() # rimozione camelcase e underscore

In [8]:
embeddings = model.encode(sentences, convert_to_tensor=True)

In [12]:
keywords

Unnamed: 0,n.id
0,Article_sub
1,Developmental Biology_sub
2,General Neuroscience_sub
3,Molecular Biology_sub
4,Clinical Neurology_sub
...,...
60799,Trend analysis_sub
60800,Mandate_sub
60801,Spatial regression_sub
60802,Partially compliant_sub


In [10]:
json_to_import = []
for i in range(len(sentences)):
    json_to_import.append({ "id": keywords['n.id'][i], "embeddings": embeddings[i].tolist(),  })

In [11]:
pd.DataFrame(json_to_import).to_csv('embeddings_keyword.csv')

In [13]:
from neo4j import GraphDatabase

# Init the connection to the database
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "neuroinformatics_orc_id"), encrypted=False)
session = driver.session()
for i in range(len(sentences)):
    session.run("MATCH (n:Keyword) WHERE n.id = $id SET n.r_embeddings = $embeddings RETURN n", { "id": keywords['n.id'][i], "embeddings": embeddings[i].tolist() })

In [None]:
# meglio farlo sulle risorse (descrizione e/o titolo) bert sulle keyword potrebbe essere non molto pulito (proviamo comunque sulle keyword)

In [19]:
keywords = cyperQueryToDataFrame("MATCH (n:Keyword) RETURN n.id, n.r_embeddings, n.__fastrp_embedding")

In [3]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors
import pandas as pd
from sklearn.manifold import TSNE
%matplotlib widget

In [21]:
X = np.array(keywords["n.r_embeddings"].to_list())

In [22]:
unq, count = np.unique(X, axis=0, return_counts=True)

In [23]:
count.max() # il massimo è solo 6 simboli, quindi al massimo abbiamo 6 UNK

6

Nearest neighbour using transformer

In [24]:
l2_norm = np.sum(np.abs(X)**2,axis=-1)**(1./2)
l2_norm[l2_norm == 0] = 1
l2_norm.max()

22.45608757823422

In [25]:
X_norm = X / l2_norm[:, np.newaxis]

In [26]:
l2_norm = np.sum(np.abs(X_norm)**2,axis=-1)**(1./2)
l2_norm.max()

1.0000000000000002

In [27]:
neigh = NearestNeighbors(n_neighbors=2, metric='euclidean')
neigh.fit(X_norm)

NearestNeighbors(metric='euclidean', n_neighbors=2)

In [28]:
distances = neigh.kneighbors(X_norm, 2, return_distance=True)
distances_second = distances[0][:, 1]
indices_second = distances[1][:, 1]
nearest_neighbour_df = keywords[['n.id']].iloc[indices_second]
keywords['nearest_neighbour'] = nearest_neighbour_df['n.id'].to_list()
keywords['distance_to_nearest_neighbour'] = distances_second

In [29]:
keywords.sort_values(by=['distance_to_nearest_neighbour'], ascending=True)

Unnamed: 0,n.id,n.r_embeddings,n.__fastrp_embedding,nearest_neighbour,distance_to_nearest_neighbour
0,Article_sub,"[0.29068514704704285, -0.520392894744873, 0.27...","[0.23946213722229004, 0.9097933173179626, 0.37...",article_sub,0.000000
5550,Genetic model_sub,"[-0.21972937881946564, -0.8356800079345703, 0....","[0.18015313148498535, 0.9190300703048706, 0.21...",GENETIC MODEL_sub,0.000000
23452,biotechnology_sub,"[0.38012242317199707, -0.5731474757194519, -0....","[0.46034255623817444, 1.2117494344711304, 0.25...",biotechnology_sub,0.000000
23477,audiovisual_sub,"[-0.18781010806560516, -0.24385662376880646, 0...","[0.47778013348579407, 0.8880183100700378, 0.21...",audiovisual_sub,0.000000
23483,parcellation_sub,"[-0.24408778548240662, -0.7265926599502563, 0....","[0.4743480086326599, 0.769284725189209, 0.1475...",Parcellation_sub,0.000000
...,...,...,...,...,...
10516,Thinning_sub,"[0.30108463764190674, -1.0121861696243286, -0....","[0.4265685975551605, 0.8930532336235046, 0.235...",Dichotic pitch_sub,1.022137
50822,ASIP centennial review_sub,"[0.2732141315937042, -0.5266939401626587, -0.5...","[0.32552725076675415, 0.8336598873138428, 0.14...",Asian lineage_sub,1.024123
44190,Door to needle time_sub,"[1.1658705472946167, 0.2911945581436157, 0.098...","[0.16837739944458008, 0.8343154191970825, 0.29...",EEG synchronization_sub,1.026619
52695,Downtown_sub,"[-0.07620495557785034, -0.5523253679275513, 0....","[0.3217189610004425, 1.0120137929916382, 0.241...",Dual modality_sub,1.034853


In [30]:
keywords.sort_values(by=['distance_to_nearest_neighbour'], ascending=True).to_csv('keyword_embeddings_similarity.csv')

In [31]:
X = np.array(keywords["n.__fastrp_embedding"].to_list())
l2_norm = np.sum(np.abs(X)**2,axis=-1)**(1./2)
l2_norm[l2_norm == 0] = 1
l2_norm.max()
X_norm = X / l2_norm[:, np.newaxis]
l2_norm = np.sum(np.abs(X_norm)**2,axis=-1)**(1./2)
l2_norm.max()
neigh = NearestNeighbors(n_neighbors=2, metric='euclidean')
neigh.fit(X_norm)
distances = neigh.kneighbors(X_norm, 2, return_distance=True)
distances_second = distances[0][:, 1]
indices_second = distances[1][:, 1]
nearest_neighbour_df = keywords[['n.id']].iloc[indices_second]
keywords['nearest_neighbour_rp'] = nearest_neighbour_df['n.id'].to_list()
keywords['distance_to_nearest_neighbour_rp'] = distances_second

In [32]:
keywords.sort_values(by=['distance_to_nearest_neighbour_rp'], ascending=True)

Unnamed: 0,n.id,n.r_embeddings,n.__fastrp_embedding,nearest_neighbour,distance_to_nearest_neighbour,nearest_neighbour_rp,distance_to_nearest_neighbour_rp
428,Neurosciences. Biological psychiatry. Neuropsy...,"[-0.03752695769071579, 0.5647879242897034, 0.1...","[0.4839213192462921, 0.9260246753692627, 0.010...","Biological Psychiatry, Neuropsychology and Phy...",5.747103e-01,RC321-571_sub,0.001695
429,RC321-571_sub,"[0.41022780537605286, -0.2790752649307251, 0.0...","[0.4839308559894562, 0.9277102947235107, 0.010...",571 Physiology_sub,8.766453e-01,Neurosciences. Biological psychiatry. Neuropsy...,0.001695
550,Neurology. Diseases of the nervous system_sub,"[-0.5539271831512451, 0.06927037984132767, 0.8...","[0.16167712211608887, 0.83883136510849, 0.1945...",neurological diseases_sub,5.493369e-01,RC346-429_sub,0.001748
551,RC346-429_sub,"[0.2908068001270294, -0.9086668491363525, 0.00...","[0.16346226632595062, 0.8388150930404663, 0.19...",RC0346_sub,6.699589e-01,Neurology. Diseases of the nervous system_sub,0.001748
1071,R858-859.7_sub,"[-0.012210744433104992, -0.4148079454898834, -...","[0.1112583726644516, 0.7617290019989014, 0.179...",R858_sub,4.831673e-01,Computer applications to medicine. Medical inf...,0.001792
...,...,...,...,...,...,...,...
48186,ecosystem services_sub,"[-0.28248322010040283, -0.17272460460662842, -...","[-0.17734858393669128, 0.5480749607086182, 0.2...",Ecosystem services_sub,6.710488e-07,Bayesian hierarchical analysis_sub,0.274478
24268,Environmental data_sub,"[-0.6154889464378357, -0.764665961265564, -0.5...","[-0.3294067978858948, 1.0884798765182495, -0.0...",environmental_sub,4.104480e-01,Cyclostratigraphy_sub,0.301508
34929,value_sub,"[0.1971978396177292, -0.9848495125770569, -0.4...","[0.46996498107910156, 0.6772952675819397, 0.04...",value_sub,0.000000e+00,relation_sub,0.310805
40493,Deep sea mining_sub,"[-0.32977399230003357, 0.25428643822669983, -1...","[-0.46577492356300354, 0.5656290650367737, -0....",deep-sea mining_sub,2.784853e-01,European Union Framework 7_sub,0.315627


In [33]:
keywords.sort_values(by=['distance_to_nearest_neighbour_rp'], ascending=True).to_csv('keyword_rp_similarity.csv')

In [35]:
keywords['combined_distance'] = keywords['distance_to_nearest_neighbour_rp'] + keywords['distance_to_nearest_neighbour']

In [36]:
keywords.sort_values(by=['combined_distance'], ascending=True) # perchè il NN è rispetto due distanze che potrebbero essere diverse e ordinate per la combined

Unnamed: 0,n.id,n.r_embeddings,n.__fastrp_embedding,nearest_neighbour,distance_to_nearest_neighbour,nearest_neighbour_rp,distance_to_nearest_neighbour_rp,combined_distance
99,business_sub,"[-0.5647993683815002, 0.7837275862693787, 0.28...","[0.24657195806503296, 0.7429737448692322, 0.14...",Business_sub,0.000000e+00,business.industry_sub,0.005533,0.005533
841,Discrete Mathematics_sub,"[-0.12998123466968536, -0.5200858116149902, -0...","[0.22510449588298798, 1.1104384660720825, 0.01...",Discrete mathematics_sub,2.980232e-08,Combinatorics_sub,0.005829,0.005829
679,law_sub,"[0.07385210692882538, -0.6456049084663391, -0....","[0.23039433360099792, 0.9937765598297119, 0.09...",Law_sub,2.107342e-08,law.invention_sub,0.008311,0.008311
1833,Chromatin modification_sub,"[0.8373721837997437, -0.35834360122680664, 0.1...","[0.3418125510215759, 0.8903435468673706, 0.043...",Chromatin Modification_sub,4.712161e-08,Chromosome biology_sub,0.009104,0.009104
1831,Chromosome biology_sub,"[0.34271085262298584, -0.34560465812683105, 0....","[0.34230875968933105, 0.8904055953025818, 0.05...",Chromosome Biology_sub,6.618867e-07,Chromatin modification_sub,0.009104,0.009105
...,...,...,...,...,...,...,...,...
20841,To be checked for WOS id_sub,"[0.262027382850647, 0.3499324321746826, 0.3919...","[0.06904032826423645, 0.8562159538269043, 0.02...",Wada testing_sub,1.008030e+00,Physiology_sub,0.142877,1.150907
52695,Downtown_sub,"[-0.07620495557785034, -0.5523253679275513, 0....","[0.3217189610004425, 1.0120137929916382, 0.241...",Dual modality_sub,1.034853e+00,Neurorobotics_sub,0.119665,1.154517
36762,"Festschrift for James S. Hyde, PhDPart 2Forewo...","[0.06858428567647934, -0.8310294151306152, 0.0...","[0.3184698224067688, 0.8697881698608398, 0.259...",crypt of Lieberkühn_sub,9.993494e-01,Psychoanalysis_sub,0.158761,1.158110
58152,Shoulder joint_sub,"[0.40083780884742737, -0.15101096034049988, -0...","[0.290085107088089, 0.8747177124023438, 0.2765...",Shroud_sub,1.037086e+00,Active exercise_sub,0.129564,1.166650


In [38]:
keywords.sort_values(by=['combined_distance'], ascending=True).to_csv('keyword_similarity_combined_distance.csv')

# TSNE

In [4]:
keywords = cyperQueryToDataFrame("MATCH (n:Keyword) RETURN n.id, n.r_embeddings, n.__fastrp_embedding")

In [26]:
X = np.array(keywords["n.__fastrp_embedding"].to_list()).astype(float)
l2_norm = np.sum(np.abs(X)**2,axis=-1)**(1./2)
l2_norm[l2_norm == 0] = 1
X = X / l2_norm[:, np.newaxis]
X_embedded = TSNE(n_components=2).fit_transform(X)

In [27]:
with open('tsne_fastrp_embedding_keyword.npy', 'wb') as f:
    np.save(f, X_embedded)

In [1]:
with open('tsne_fastrp_embedding_keyword.npy', 'rb') as f:
    X_imp = np.load(f)

NameError: name 'np' is not defined

In [29]:
import matplotlib.pyplot as plt
%matplotlib widget
plt.scatter(X_imp[:, 0], X_imp[:, 1], s=0.5)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [30]:
plt.close('all')
X = np.array(keywords["n.r_embeddings"].to_list()).astype(float)
l2_norm = np.sum(np.abs(X)**2,axis=-1)**(1./2)
l2_norm[l2_norm == 0] = 1
X = X / l2_norm[:, np.newaxis]
X_embedded = TSNE(n_components=2).fit_transform(X)

In [31]:
with open('tsne_bert_embedding_keyword.npy', 'wb') as f:
    np.save(f, X_embedded)

In [32]:
with open('tsne_bert_embedding_keyword.npy', 'rb') as f:
    X_imp = np.load(f)

In [33]:
plt.scatter(X_imp[:, 0], X_imp[:, 1], s=0.5)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …