In [None]:
import spacy
from spacy import displacy #displaCy is spaCy’s built-in visualizer for syntactic dependencies and named entities. It helps you “see” how words relate to each other.
from IPython.display import HTML, display #Without these, you’d only see plain text output in Jupyter. Helps render rich outputs (like formatted HTML, tables, or interactive
                                        #visualizations) inside Jupyter notebooks.
import pandas as pd #for data handling/tables
import numpy as np #for numerical operations
import os #tools for interacting with files, directories, environment variables.
from pathlib import Path #a modern, object-oriented way of handling paths. to read text file text1.txt

from neo4j import GraphDatabase #to connect Python to Neo4j database.

# Smoke-test the Neo4j connection.
# GraphDatabase.driver() only builds the driver object; verify_connectivity()
# forces a real round trip to the server, so the success message below is printed
# only when Neo4j is reachable with these credentials.
# Connection settings can be overridden through environment variables; the
# fallbacks are the values of the local development instance.
# The context manager closes the driver even if the check raises.
with GraphDatabase.driver(
    os.getenv("NEO4J_URI", "bolt://127.0.0.1:7687"),
    auth=(os.getenv("NEO4J_USER", "neo4j"), os.getenv("NEO4J_PASSWORD", "testpassword")),
) as driver:
    driver.verify_connectivity()
print("Connected to Neo4j via Python!")

# Load the French model
nlp = spacy.load("fr_core_news_sm")

# Load your text
text_path = Path("text1.txt")
TEXT = text_path.read_text(encoding="utf-8")

doc = nlp(TEXT)  # doc is a spaCy Doc object, a container for the parsed tokens

# Draw the dependency parse of the document inline in the notebook
displacy.render(doc, style="dep", jupyter=True)

Connected to Neo4j via Python!


In [None]:
# One tuple of linguistic features per token (word or punctuation mark) of the text:
# surface form, lemma, part of speech, dependency label and character offset,
# followed by the surface form and character offset of the token's syntactic head.
token_infos_lines = [
    (tok.text, tok.lemma_, tok.pos_, tok.dep_, tok.idx, tok.head.text, tok.head.idx)
    for tok in doc
]

In [None]:
# Arrange the per-token feature tuples into a table (one row per token) and render it
df_tokens = pd.DataFrame(
    data=token_infos_lines,
    columns=["text", "lemma", "pos", "dep", "idx", "head", "head_idx"],
)
display(df_tokens)

Unnamed: 0,text,lemma,pos,dep,idx,head,head_idx
0,Longtemps,longtemps,ADV,advmod,0,couché,22
1,",",",",PUNCT,punct,9,couché,22
2,je,je,PRON,nsubj,11,couché,22
3,me,me,PRON,iobj,14,couché,22
4,suis,être,AUX,aux:tense,17,couché,22
...,...,...,...,...,...,...,...
1885,tout,tout,ADV,advmod,8575,éveillé,8587
1886,à,à,ADP,fixed,8580,tout,8575
1887,fait,fait,NOUN,fixed,8582,tout,8575
1888,éveillé,éveillé,ADJ,obj,8587,serais,8568


In [None]:
# Convert the spaCy Doc into two graph-friendly structures:
#   tokens - one dict per token (the future graph nodes)
#   deps   - one dict per head -> child dependency (the future graph edges)
doc_id = "text1"

# Each node is identified by "<doc_id>:<token index>", which is unique within the document.
tokens = [
    {
        "uid": f"{doc_id}:{tok.i}",
        "text": tok.text,
        "lemma": tok.lemma_,
        "pos": tok.pos_,
        "dep": tok.dep_,
    }
    for tok in doc
]

# A token whose head is itself gets no edge, so no self-loops are produced.
deps = [
    {
        "head_uid": f"{doc_id}:{tok.head.i}",
        "child_uid": f"{doc_id}:{tok.i}",
        "label": tok.dep_,
    }
    for tok in doc
    if tok.head.i != tok.i
]

In [None]:
# Build the adjacency matrix: the dependency graph as an N x N array of 0/1
N = len(tokens)
uid_to_index = {entry["uid"]: position for position, entry in enumerate(tokens)}
adj_matrix = np.zeros((N, N), dtype=int)

# Row = head token, column = dependent token; a 1 means "edge from head to child".
head_rows = np.fromiter((uid_to_index[edge["head_uid"]] for edge in deps), dtype=np.intp)
child_cols = np.fromiter((uid_to_index[edge["child_uid"]] for edge in deps), dtype=np.intp)
adj_matrix[head_rows, child_cols] = 1

print("Adjacency matrix shape:", adj_matrix.shape)
print("Number of edges:", adj_matrix.sum())


Adjacency matrix shape: (1890, 1890)
Number of edges: 1834


In [None]:
# Connect to Neo4j
#
# Connection settings are read from the environment when present. The fallbacks
# are the address and credentials of the local instance that the other cells of
# this notebook connect to, so the driver works out of the box on that setup.
uri = os.getenv("NEO4J_URI", "bolt://127.0.0.1:7687")
user = os.getenv("NEO4J_USER", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "testpassword")

driver = GraphDatabase.driver(uri, auth=(user, password))  # connection manager to Neo4j


In [None]:
# Insert token nodes

def insert_tokens(tx, token_list):
    """Create or update one ``Token`` node per entry of ``token_list``.

    Args:
        tx: Object exposing ``run(query, **parameters)``; in this notebook, the
            transaction handed over by ``session.execute_write``.
        token_list: List of dicts. The query reads the keys ``uid``, ``text``,
            ``lemma``, ``pos`` and ``dep`` of each entry.

    The whole list is sent as the ``$tokens`` parameter and expanded with UNWIND,
    so all nodes are written by a single query. MERGE matches on ``uid``, so
    running the function again updates the existing nodes.
    """
    upsert_nodes_query = """
    UNWIND $tokens AS t
    MERGE (token:Token {uid: t.uid})
    SET token.text = t.text, token.lemma = t.lemma, token.pos = t.pos, token.dep = t.dep
    """
    tx.run(upsert_nodes_query, tokens=token_list)

In [None]:
# Insert dependency edges

def insert_deps(tx, dep_list):
    """Create or update one ``DEP`` relationship per entry of ``dep_list``.

    Args:
        tx: Object exposing ``run(query, **parameters)``; in this notebook, the
            transaction handed over by ``session.execute_write``.
        dep_list: List of dicts. The query reads the keys ``head_uid``,
            ``child_uid`` and ``label`` of each entry.

    Both endpoints are looked up with MATCH, so the ``Token`` nodes must already
    exist; this function only connects them, from head to child, and stores the
    dependency label on the relationship.
    """
    link_tokens_query = """
    UNWIND $deps AS d      
    MATCH (h:Token {uid:d.head_uid}), (c:Token {uid:d.child_uid})
    MERGE (h)-[r:DEP]->(c)
    SET r.label = d.label
    """
    tx.run(link_tokens_query, deps=dep_list)

In [87]:
# Run insertion
#
# The driver is used as a context manager so that it is closed even when one of
# the writes raises. Connection settings can be overridden through environment
# variables; the fallbacks are the values of the local development instance.
# GraphDatabase is already imported at the top of the notebook.
with GraphDatabase.driver(
    os.getenv("NEO4J_URI", "bolt://127.0.0.1:7687"),
    auth=(os.getenv("NEO4J_USER", "neo4j"), os.getenv("NEO4J_PASSWORD", "testpassword")),
) as driver:
    with driver.session() as session:
        # Nodes first: insert_deps looks the Token nodes up with MATCH.
        session.execute_write(insert_tokens, tokens)
        session.execute_write(insert_deps, deps)

print(" Graph inserted in Neo4j")

 Graph inserted in Neo4j




Si vous avez cette erreur :

https://github.com/explosion/spacy/issues/13864

avec 
```
displacy.render(doc, style="dep", jupyter=True)
```

changez dans le fichier incriminé :
```
from IPython.core.display import HTML, display
```

par : 
```
from IPython.display import HTML, display
```

# PERSONAL NOTE
Create a venv (in a folder called .venv)
```
python3.13 -m venv .venv
```
Activate it
```
source .venv/bin/activate
```

Install spacy inside the venv
```
pip install -U pip
pip install spacy
```

Install ipykernel inside the venv
```
pip install ipykernel
```

Add the venv as a Jupyter kernel
```
python -m ipykernel install --user --name=myvenv --display-name "Python (myvenv)"
```