In [1]:
from neo4j import GraphDatabase
import pandas as pd

# Open a Neo4j driver over Bolt against the local instance, with encryption
# turned off.
# NOTE(review): the credentials are hardcoded in the notebook; consider loading
# them from the environment before sharing it.
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(
    uri,
    auth=("neo4j", "neuroinformatics"),
    encrypted=False,
)

# Lift the cap on how many columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)

def cyperQueryToDataFrame(query, parameters=None):
  """Run a Cypher query and return its records as a pandas DataFrame.

  Args:
    query: Cypher statement to execute.
    parameters: optional dict of values for the `$name` placeholders in
      `query`. Defaults to None, so existing one-argument calls behave
      exactly as before. Prefer this over formatting values into the query
      string.

  Returns:
    A DataFrame built from `result.data()`, with its columns taken from
    `result.keys()` (so an empty result still carries the column names).
  """
  with driver.session() as session:
    result = session.run(query, parameters)
    # Read the records first, then the keys, while the session is still open.
    records = result.data()
    return pd.DataFrame(records, columns=result.keys())



In [2]:
# Fetch the `id` property of every Keyword node into a DataFrame
# (single column, named "n.id" after the RETURN expression).
keyword_id_query = "MATCH (n:Keyword) RETURN n.id"
keywords = cyperQueryToDataFrame(keyword_id_query)

In [3]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-1.0.2.tar.gz (74 kB)
[K     |████████████████████████████████| 74 kB 3.1 MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
  Downloading transformers-4.4.2-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 14.9 MB/s 
[?25hCollecting tqdm
  Downloading tqdm-4.59.0-py2.py3-none-any.whl (74 kB)
[K     |████████████████████████████████| 74 kB 7.5 MB/s 
[?25hCollecting torch>=1.6.0
  Downloading torch-1.8.0-cp38-none-macosx_10_9_x86_64.whl (119.6 MB)
[K     |████████████████████████████████| 119.6 MB 11.2 MB/s 
Collecting nltk
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 11.5 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.95-cp38-cp38-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 15.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.1-cp38-cp38-macosx_10_11_x86_64.whl (2.2

In [3]:
from sentence_transformers import SentenceTransformer, util

# Author's choice of checkpoint: RoBERTa base, judged good and not too large.
model = SentenceTransformer('stsb-roberta-base')

# Sanity check on two aligned lists of sentences.
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The new movie is awesome',
]
sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
]

# Encode each list, asking for tensors back.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Cosine-similarity scores between the two sets of embeddings.
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Print only the aligned pairs: entry [k][k] scores the k-th sentence of each list.
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for idx, (left, right) in enumerate(zip(sentences1, sentences2)):
    print(row_template.format(left, right, cosine_scores[idx][idx]))

The cat sits outside 		 The dog plays in the garden 		 Score: -0.0686
A man is playing guitar 		 A woman watches TV 		 Score: 0.0891
The new movie is awesome 		 The new movie is so great 		 Score: 0.9907


In [4]:
import re
def camel_case_split(identifier):
    """Insert spaces at the camelCase word boundaries of `identifier`.

    A boundary sits between a lowercase letter and a following uppercase
    letter, or between an uppercase letter and a following uppercase+lowercase
    pair (so a run of capitals such as ``HTTP`` in ``HTTPResponse`` stays
    together). Other characters, underscores included, are left untouched.

    Returns the pieces joined by single spaces; an empty string yields ''.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    words = (piece.group(0) for piece in re.finditer(boundary_pattern, identifier))
    return ' '.join(words)

In [5]:
# Turn every keyword id into a plain-text phrase: drop the last 4 characters
# (the ids displayed in this notebook end in "_sub" -- assumed to hold for all
# of them), split camelCase words, and replace underscores with spaces.
# `.squeeze()` is deliberately not applied: on a one-row Series it returns a
# bare scalar, which has no `.map`, so the cell would fail for a single keyword.
sentences = keywords['n.id'].map(lambda x: camel_case_split(x[:-4]).replace('_', ' ')).tolist()

In [6]:
# Encode all keyword phrases in a single call, requesting a tensor result.
embeddings = model.encode(
    sentences,
    convert_to_tensor=True,
)

In [7]:
# Display the keyword DataFrame (as the cell's last expression) to inspect the raw ids.
keywords

Unnamed: 0,n.id
0,Brain_sub
1,ComputingMilieux_PERSONALCOMPUTING_sub
2,Biomedical Engineering_sub
3,fMRI_sub
4,dfMRI_sub
...,...
192882,CORTICOTROPIN-RELEASING FACTOR_sub
192883,COLOCALIZATION_sub
192884,Glutathione S-Transferase Alpha_sub
192885,Glutathione S-Transferase pi_sub


In [8]:
# Pair each keyword id with its embedding, converted to a plain Python list.
json_to_import = [
    {"id": keywords['n.id'][i], "embeddings": embeddings[i].tolist()}
    for i in range(len(sentences))
]

In [9]:
# Persist the id/embedding records to a CSV file in the working directory.
embeddings_frame = pd.DataFrame(json_to_import)
embeddings_frame.to_csv('embeddings_keyword.csv')

In [10]:
from neo4j import GraphDatabase

# Re-create the connection to the database.
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "neuroinformatics"), encrypted=False)

# Store each embedding on its Keyword node as the `r_embeddings` property.
# The session is used as a context manager so it is always closed, even if a
# write fails part-way through (previously it was left open for the lifetime
# of the kernel). Each result is consumed right away so the returned node is
# drained and discarded before the next statement is sent.
update_query = "MATCH (n:Keyword) WHERE n.id = $id SET n.r_embeddings = $embeddings RETURN n"
with driver.session() as session:
    for i in range(len(sentences)):
        session.run(
            update_query,
            {"id": keywords['n.id'][i], "embeddings": embeddings[i].tolist()},
        ).consume()