# In [13]:
import json
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
import tiktoken
import numpy as np
import re

class SimpleKnowledgeGraph:
    """Build a simple knowledge graph from a tab-separated table of articles.

    The workflow is: ``load_data`` (read the TSV and extract "Voyez ..."
    cross-references), ``create_embeddings`` (one embedding vector per
    article via the OpenAI embeddings endpoint), then ``export_to_jsonld``
    (write one node per article, with its triples, to a JSON file).
    """

    # Default per-request token budget. The original author chose 8000 to
    # leave a safety margin (presumably below the embedding model's input
    # limit -- confirm against the model documentation).
    MAX_TOKENS = 8000

    # Captures the text following "Voyez" up to the next '.', ',' or ';'.
    # Compiled once here instead of on every call.
    _REFERENCE_PATTERN = re.compile(r'Voyez\s+([^.,;]+)', re.IGNORECASE)

    def __init__(self, openai_api_key, embedding_model="text-embedding-3-small"):
        """Create the OpenAI client and the tokenizer.

        Args:
            openai_api_key: API key passed to the OpenAI client.
            embedding_model: Name of the embedding model to request.
        """
        self.client = OpenAI(api_key=openai_api_key)
        self.embedding_model = embedding_model
        self.df = None  # populated by load_data()
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def _require_data(self):
        """Return the loaded DataFrame or fail with an explicit message."""
        if self.df is None:
            raise RuntimeError("No data loaded: call load_data() first.")
        return self.df

    def load_data(self, file_path):
        """Read the TSV at *file_path* and prepare it for processing.

        Rows whose ``content`` is missing or whitespace-only are dropped, and
        a ``references`` column (list of strings) is derived from the content.
        """
        df = pd.read_csv(file_path, sep='\t')
        # Missing content becomes an empty string so it is filtered out below.
        df['content'] = df['content'].fillna('').astype(str)
        # .copy() makes the filtered frame independent, so adding columns to
        # it is unambiguous (no SettingWithCopyWarning).
        df = df[df['content'].str.strip() != ''].copy()
        df['references'] = df['content'].apply(self.extract_references)
        self.df = df

    def create_embeddings(self):
        """Compute an embedding for every article and store it in ``embedding``.

        Raises:
            RuntimeError: if ``load_data`` has not been called.
        """
        df = self._require_data()
        tqdm.pandas()  # registers progress_apply on pandas objects
        df['embedding'] = df['content'].progress_apply(self.get_embedding)

    def extract_references(self, text):
        """Return the stripped targets of every "Voyez ..." cross-reference.

        Matching is case-insensitive; each target runs up to the next
        ``.``, ``,`` or ``;``.
        """
        return [match.strip() for match in self._REFERENCE_PATTERN.findall(text)]

    @staticmethod
    def _split_tokens(tokens, size):
        """Split *tokens* into consecutive chunks of at most *size* items."""
        return [tokens[start:start + size] for start in range(0, len(tokens), size)]

    def get_embedding(self, text, max_tokens=None):
        """Return the embedding of *text* as a list of floats.

        Text that tokenizes to more than *max_tokens* tokens is split into
        consecutive token chunks; each chunk is embedded separately and the
        element-wise (unweighted) mean of the chunk embeddings is returned.

        Args:
            text: The text to embed.
            max_tokens: Per-request token budget; defaults to ``MAX_TOKENS``.

        Raises:
            ValueError: if *max_tokens* is not positive.
        """
        limit = self.MAX_TOKENS if max_tokens is None else max_tokens
        if limit <= 0:
            raise ValueError("max_tokens must be a positive integer")

        tokens = self.tokenizer.encode(text)
        if len(tokens) <= limit:
            return self._get_embedding_for_text(text)

        # Too long for a single request: embed fixed-size token chunks.
        vectors = [
            self._get_embedding_for_text(self.tokenizer.decode(chunk))
            for chunk in self._split_tokens(tokens, limit)
        ]
        # Average the chunk embeddings into a single vector.
        return np.mean(vectors, axis=0).tolist()

    def _get_embedding_for_text(self, text):
        """Request a single embedding for *text* from the OpenAI API."""
        response = self.client.embeddings.create(
            input=text,
            model=self.embedding_model
        )
        return response.data[0].embedding

    @staticmethod
    def _value_or_default(row, column, default):
        """Return ``row[column]``, or *default* if absent or a missing scalar.

        ``row.get(column, default)`` alone only covers an absent column; a
        NaN cell would otherwise be written as a bare ``NaN`` token, which is
        not valid JSON.
        """
        value = row.get(column, default)
        # Lists/arrays (references, embeddings) are never "missing" here.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return default
        return value

    def export_to_jsonld(self, file_path):
        """Write every article as a node with its triples to *file_path*.

        Each node carries the article fields plus a ``triples`` list with
        ``is_written_by``, ``belongs_to_domain`` and one ``references``
        triple per extracted cross-reference.

        Raises:
            RuntimeError: if ``load_data`` has not been called.
        """
        df = self._require_data()
        jsonld_data = []
        for _, row in df.iterrows():
            node_id = f"http://example.org/node/{row['id_enccre']}"
            author = self._value_or_default(row, 'author', 'Unknown')
            domain = self._value_or_default(row, 'domaine_enccre', 'Unknown')
            references = list(row['references'])
            embedding = self._value_or_default(row, 'embedding', [])
            if isinstance(embedding, np.ndarray):
                # json cannot serialize NumPy arrays directly.
                embedding = embedding.tolist()

            # Triples for the knowledge graph: author, domain, then one per
            # cross-reference.
            triples = [
                {"subject": node_id, "predicate": "is_written_by", "object": author},
                {"subject": node_id, "predicate": "belongs_to_domain", "object": domain},
            ]
            triples.extend(
                {"subject": node_id, "predicate": "references", "object": ref}
                for ref in references
            )

            jsonld_data.append({
                "@context": "http://schema.org",
                "@type": "Article",
                "@id": node_id,
                "title": self._value_or_default(row, 'head', ''),
                "authors": author,
                "content": row['content'],
                "references": references,
                "embedding": embedding,
                "triples": triples,
            })

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(jsonld_data, f, ensure_ascii=False, indent=2)

        print(f"Graphe exporté au format JSON-LD dans {file_path}")

# Example usage
import os

# SECURITY: the API key is read from the environment rather than being
# hard-coded in the source. Any key that was previously committed in this
# file must be treated as leaked and revoked.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running.")
kg = SimpleKnowledgeGraph(openai_api_key)

# Load the articles, embed each one, then export the graph.
kg.load_data("data/EDdA_500dataframe.tsv")
kg.create_embeddings()
kg.export_to_jsonld("data/output_kg.jsonld")

# Output:  45%|████▌     | 227/500 [01:08<01:16,  3.56it/s]