## 📦 Étape 1 — Installation des dépendances

In [None]:

!pip install spacy rdflib pykeen torch scikit-learn matplotlib
!python -m spacy download en_core_web_sm


## 🧠 Étape 2 — Extraction automatique de relations depuis le .txt

In [9]:
import spacy
from spacy.matcher import Matcher

def extract_all_relations(txt_path):
    nlp = spacy.load("en_core_web_sm")

    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    doc = nlp(text)

    relations = []

    # 1. "X is the Y of Z"
    matcher = Matcher(nlp.vocab)
    pattern = [
        {"DEP": "nsubj"},
        {"LEMMA": "be"},
        {"LOWER": "the"},
        {"POS": "NOUN"},
        {"LOWER": "of"},
        {"DEP": "pobj"},
    ]
    matcher.add("is_relation_of", [pattern])
    matches = matcher(doc)
    for _, start, end in matches:
        span = doc[start:end]
        relations.append((span[0].text, span[3].text, span[5].text))

    # 2. "X is Y"
    for sent in doc.sents:
        for token in sent:
            if token.dep_ == "attr" and token.head.lemma_ == "be":
                subj = [t for t in token.head.children if t.dep_ == "nsubj"]
                if subj:
                    relations.append((subj[0].text, "is", token.text))

    # 3. "X verb Y"
    for sent in doc.sents:
        for token in sent:
            if token.dep_ in ("nsubj", "nsubjpass") and token.head.pos_ == "VERB":
                verb = token.head.lemma_
                subject = token.text
                for child in token.head.children:
                    if child.dep_ in ("dobj", "attr", "pobj"):
                        relations.append((subject, verb, child.text))

    # 4. "X was Vpp by Y" (voix passive)
    for sent in doc.sents:
        for token in sent:
            if token.dep_ == "nsubjpass" and token.head.pos_ == "VERB":
                verb = token.head.lemma_
                object_ = token.text
                agent = [child for child in token.head.children if child.dep_ == "agent"]
                if agent:
                    by_obj = [t for t in agent[0].children if t.dep_ == "pobj"]
                    if by_obj:
                        relations.append((by_obj[0].text, verb, object_))

    # 5. "X verb with Y" → ex: "Mando travels with Grogu"
    for sent in doc.sents:
        for token in sent:
            if token.pos_ == "VERB":
                subject = [t for t in token.children if t.dep_ == "nsubj"]
                prep_with = [t for t in token.children if t.dep_ == "prep" and t.text == "with"]
                if subject and prep_with:
                    pobj = [t for t in prep_with[0].children if t.dep_ == "pobj"]
                    if pobj:
                        relations.append((subject[0].text, token.lemma_, pobj[0].text))

    return list(set(relations))  # élimine doublons

# 📥 Utilisation directe avec le fichier "devtest.txt"
relations = extract_all_relations("devtest.txt")
print(f"{len(relations)} relations extraites")
for r in relations[:10]:
    print(r)


47 relations extraites
('Ahsoka', 'is', 'Padawan')
('R2D2', 'serve', 'Anakin')
('Jabba', 'hire', 'BobaFett')
('Lando', 'help', 'Chewbacca')
('Finn', 'join', 'Resistance')
('Han', 'pilot', 'MillenniumFalcon')
('DeathStar', 'is', 'superweapon')
('Luke', 'train', 'KyloRen')
('ObiWan', 'train', 'Anakin')
('Padmé', 'is', 'wife')


## 🧱 Étape 3 — Construction du graphe RDF

In [10]:

from rdflib import Graph, Namespace, RDF

g = Graph()
EX = Namespace("http://example.org/")

for s, p, o in relations:
    g.add((EX[s], EX[p], EX[o]))

print("Nombre de triplets RDF :", len(g))
next(iter(g))


Nombre de triplets RDF : 47


(rdflib.term.URIRef('http://example.org/ObiWan'),
 rdflib.term.URIRef('http://example.org/train'),
 rdflib.term.URIRef('http://example.org/Anakin'))

## 🧪 Étape 4 — Vérification de la couverture du graphe avant split

In [12]:

from collections import Counter

entities = []
relations_set = []

for s, p, o in g:
    entities.extend([s, o])
    relations_set.append(p)

entity_counts = Counter(entities)
relation_counts = Counter(relations_set)

print(f"Nombre total d'entités : {len(set(entities))}")
print(f"Nombre total de relations : {len(set(relations_set))}")
print("Top 5 entités :", entity_counts.most_common(5))
print("Top 5 relations :", relation_counts.most_common(5))


Nombre total d'entités : 49
Nombre total de relations : 21
Top 5 entités : [(rdflib.term.URIRef('http://example.org/Leia'), 7), (rdflib.term.URIRef('http://example.org/Han'), 6), (rdflib.term.URIRef('http://example.org/Luke'), 6), (rdflib.term.URIRef('http://example.org/Anakin'), 5), (rdflib.term.URIRef('http://example.org/KyloRen'), 5)]
Top 5 relations : [(rdflib.term.URIRef('http://example.org/is'), 19), (rdflib.term.URIRef('http://example.org/train'), 5), (rdflib.term.URIRef('http://example.org/lead'), 2), (rdflib.term.URIRef('http://example.org/help'), 2), (rdflib.term.URIRef('http://example.org/son'), 2)]


## 🤖 Étape 5 — Entraînement du modèle TransE avec PyKEEN

In [13]:

from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
import numpy as np

triples = [(str(s), str(p), str(o)) for s, p, o in g]
triples_array = np.array(triples, dtype=str)
full_tf = TriplesFactory.from_labeled_triples(triples_array)

training, validation, testing = full_tf.split([0.8, 0.1, 0.1])

results = pipeline(
    training=training,
    validation=validation,
    testing=testing,
    model='TransE',
    model_kwargs=dict(embedding_dim=50),
    training_kwargs=dict(num_epochs=100, batch_size=32),
    optimizer_kwargs=dict(lr=0.01),
    random_seed=42,
)


using automatically assigned random_state=263303681


ValueError: Could not find a coverage of all entities and relation with only 37 triples.

## 📊 Étape 6 — Évaluation des performances

In [None]:

metrics = results.metric_results.to_dict()
for key, value in metrics["both"].items():
    if isinstance(value, (int, float)):
        print(f"{key}: {value:.4f}")


## 🧭 Étape 7 — Visualisation des embeddings (t-SNE)

In [None]:

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

entity_embeddings = results.model.entity_representations[0]().detach().numpy()
id_to_label = results.training.entity_labeling.label_to_id.inverse

reduced = TSNE(n_components=2, random_state=42).fit_transform(entity_embeddings)
labels = list(id_to_label.values())

plt.figure(figsize=(10, 10))
plt.scatter(reduced[:, 0], reduced[:, 1], alpha=0.7)
for i, label in enumerate(labels):
    short_label = label.split("/")[-1]
    plt.annotate(short_label, (reduced[i, 0], reduced[i, 1]))
plt.title("Visualisation des entités (TransE Embeddings)")
plt.grid(True)
plt.show()
