In [12]:
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
import re


class Graphe:
    def __init__(self, documents, custom_stop_words=None):
        self.documents = documents
        self.terms = []
        self.co_occurrence = None
        self.distance_matrix = None
        self.graph = None
        # Liste des mots personnalisés à exclure
        self.custom_stop_words = custom_stop_words if custom_stop_words else list(stopwords.words(
            'english'))

    def preprocess(self):
        self.documents = [re.sub(
            r'(#\S+|@\S+|\S*@\S*\s?|http\S+|[^A-Za-z0-9]\'\'|\d+|<[^>]*>|[^A-Za-z0-9\'\- ]+)', "", doc) for doc in self.documents]
        vectorizer = CountVectorizer(ngram_range=(
            1, 2), stop_words=self.custom_stop_words)
        X = vectorizer.fit_transform(self.documents)
        self.terms = vectorizer.get_feature_names_out()
        self.co_occurrence = (X.T @ X).toarray()
        np.fill_diagonal(self.co_occurrence, 0)
        print(X.shape, self.terms.shape, self.co_occurrence.shape)

    def compute_distance_matrix(self):
        self.distance_matrix = np.zeros(self.co_occurrence.shape)
        for i in tqdm(range(self.co_occurrence.shape[0])):
            for j in range(self.co_occurrence.shape[1]):
                if self.co_occurrence[i, j] > 0:
                    self.distance_matrix[i, j] = 1 / self.co_occurrence[i, j]
                else:
                    self.distance_matrix[i, j] = np.inf

    def build_graph(self):
        self.graph = nx.Graph()
        for i in tqdm(range(len(self.terms))):
            for j in range(len(self.terms)):
                if i != j:
                    self.graph.add_edge(
                        self.terms[i], self.terms[j], weight=self.distance_matrix[i, j])

    def detect_keywords(self):
        keyword_scores = {}
        for i, term in enumerate(self.terms):
            neighbors = list(self.graph.neighbors(term))
            if neighbors:
                sum_distance = np.sum(
                    [self.distance_matrix[i, self.terms.tolist().index(n)] for n in neighbors])
                keyword_scores[term] = sum_distance

        # Trier par moyenne des distances
        keywords = sorted(keyword_scores.items(), key=lambda item: item[1])
        return keywords

    def summarize(self, num_sentences=2):
        sentences = []
        for doc in self.documents:
            sentences.extend(sent_tokenize(doc, language='english'))

        vectorizer = CountVectorizer(stop_words='english')
        X_sentences = vectorizer.fit_transform(sentences)
        co_occurrence_sent = (X_sentences.T @ X_sentences).toarray()
        np.fill_diagonal(co_occurrence_sent, 0)

        graph_sent = nx.Graph()
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j and co_occurrence_sent[i, j] > 0:
                    graph_sent.add_edge(
                        sentences[i], sentences[j], weight=1/co_occurrence_sent[i, j])

        centrality_sent = nx.degree_centrality(graph_sent)
        ranked_sentences = sorted(
            centrality_sent.items(), key=lambda item: item[1], reverse=True)

        summary = [sentence for sentence,
                   _ in ranked_sentences[:num_sentences]]
        return summary

    def analyze(self):
        self.preprocess()
        self.compute_distance_matrix()
        self.build_graph()
        return self.detect_keywords()

In [13]:
from tqdm import tqdm
import os
import glob
from dotenv import load_dotenv
load_dotenv()
if __name__ == '__main__':
    documents = []
    for file in tqdm(glob.glob(os.path.join(os.getenv('TXT_FOLDER2'), "*.txt"))[0:1]):
        with open(file, "r", encoding="utf-8") as f:
            doc = f.read().strip().lower()
            if len(doc) > 10:
                documents.append(doc)
    # print(documents[0])
    graphe = Graphe(documents)
    keywords = graphe.analyze()

    print("Mots-clés détectés :")
    for word, score in keywords:
        print(f"{word}: {score:.4f}")

    summary = graphe.summarize(num_sentences=1)
    print("\nRésumé des documents :")
    for sentence in summary:
        print(f"- {sentence}")

100%|██████████| 1/1 [00:00<00:00, 467.38it/s]


(1, 9506) (9506,) (9506, 9506)


100%|██████████| 9506/9506 [00:21<00:00, 439.08it/s]
100%|██████████| 9506/9506 [02:59<00:00, 52.82it/s]


Mots-clés détectés :
aaron: 1.0000
aaron krolik: 1.0000
abby: 1.0000
abby rockefeller: 1.0000
abilities: 1.0000
abilities able: 1.0000
abilities backgrounds: 1.0000
abilities bias: 1.0000
abilities capabilities: 1.0000
abilities deliver: 1.0000
abilities product: 1.0000
abilities tests: 1.0000
abilities year: 1.0000
ability: 1.0000
ability blockreport: 1.0000
ability identify: 1.0000
ability language: 1.0000
ability users: 1.0000
able: 1.0000
able answer: 1.0000
able build: 1.0000
able correct: 1.0000
able hide: 1.0000
able justify: 1.0000
able leverage: 1.0000
able mitigate: 1.0000
able predict: 1.0000
able reason: 1.0000
able solack: 1.0000
able suspend: 1.0000
able understand: 1.0000
able use: 1.0000
abolitionist: 1.0000
abolitionist tools: 1.0000
absolute: 1.0000
absolute due: 1.0000
abubakar: 1.0000
abubakar frustrating: 1.0000
abuse: 1.0000
abuse committed: 1.0000
academics: 1.0000
academics business: 1.0000
acceleration: 1.0000
acceleration ai: 1.0000
accept: 1.0000
accept rejec

In [11]:
graphe.distance_matrix.shape
list(graphe.graph.nodes())

['0001',
 '0001 oxfordhb',
 '001',
 '001 0001',
 '00451',
 '00451 accessed',
 '0085',
 '0085 accessed',
 '01',
 '01 08',
 '01 state',
 '02',
 '02 22',
 '021',
 '021 00451',
 '03',
 '03 1088182',
 '03 120307132206',
 '03 weapons',
 '04',
 '04 business',
 '06',
 '06 08',
 '06668',
 '06668 pdf',
 '08',
 '08 14',
 '08 17',
 '08 3bcb1832',
 '08 breach',
 '10',
 '10 1007',
 '10 1093',
 '10 1098',
 '10 11',
 '10 1145',
 '10 2020',
 '10 foundational',
 '10 important',
 '10 september',
 '10 tested',
 '10 year',
 '10 young',
 '10 yukti',
 '100',
 '100 100',
 '100 intended',
 '100 much',
 '100 technology',
 '1007',
 '1007 s13347',
 '1088182',
 '1088182 artificial',
 '1093',
 '1093 oxfordhb',
 '1098',
 '1098 rsta',
 '11',
 '11 04',
 '11 13',
 '11 28',
 '11 age',
 '11 checklist',
 '11 society',
 '11 technology',
 '11 year',
 '1126930',
 '1126930 accessed',
 '1145',
 '1145 3290605',
 '1145 3325867',
 '1145 3359180',
 '1145 3419764',
 '11e6',
 '11e6 ae4a',
 '12',
 '12 20',
 '12 2021',
 '12 april',
 '