In [12]:
import re
import json
import os
import unicodedata

def clean_filename(filename):
    """Turn an arbitrary string into an ASCII-only, filesystem-friendly name.

    Accented characters are decomposed (NFKD) and their non-ASCII parts
    dropped, every character other than word characters, '-', '_', '.' and
    space becomes '_', spaces become '_', runs of '_' are collapsed, and the
    result is truncated to 200 characters.
    """
    ascii_only = (
        unicodedata.normalize('NFKD', filename)
        .encode('ASCII', 'ignore')
        .decode('ASCII')
    )
    underscored = re.sub(r'[^\w\-_\. ]', '_', ascii_only).replace(' ', '_')
    collapsed = re.sub(r'_+', '_', underscored)
    return collapsed[:200]

def parse_article(text):
    """Parse one raw article into a dictionary of its structural parts.

    Args:
        text: The raw text of a single article, starting with its headword.

    Returns:
        None when the text does not start with a (non-empty) run of capital
        letters, otherwise a dict that always has 'vedette' and may have:
          - 'indication_grammaticale': text between the first comma and the
            parenthesised designant (e.g. 's. m.');
          - 'designants': list of the '&'-separated items found inside the
            parentheses matched together with that comma;
          - 'signature': single capital letter in parentheses closing the text;
          - 'renvois': every target following a "Voyez" in the text;
          - 'contenu': the text after the first ')' (or, when the text has no
            ')' at all, everything after the headword).
    """
    article = {}

    # Headword: the leading run of capitals. The character class also accepts
    # whitespace, so a blank chunk would "match"; reject it after stripping.
    vedette_match = re.match(r'^([A-ZÉÈ\s]+)', text)
    vedette = vedette_match.group(1).strip() if vedette_match else ''
    if not vedette:
        return None  # No headword: probably not a valid article
    article['vedette'] = vedette

    # Grammatical indication and designant(s). Dots are accepted inside the
    # parentheses so abbreviated designants such as "(Art. milit.)" match.
    info_match = re.search(r',\s*([\w\.\s]+)?\s*\(([\w\s&\.]+)\)', text)
    if info_match:
        if info_match.group(1):
            article['indication_grammaticale'] = info_match.group(1).strip()
        article['designants'] = [
            d.strip() for d in info_match.group(2).split('&') if d.strip()
        ]

    # Signature: tolerate trailing spaces/newlines after the closing "(X)".
    signature_match = re.search(r'\(([A-Z])\)\s*\Z', text)
    if signature_match:
        article['signature'] = signature_match.group(1)

    # Cross-references: collect the targets of every "Voyez", not just the
    # first one, and drop empty fragments left by the ',' / '&' split.
    renvois = [
        target.strip()
        for voyez in re.finditer(r'Voyez\s+([\w\s,&]+)', text)
        for target in re.split(r'[,&]', voyez.group(1))
        if target.strip()
    ]
    if renvois:
        article['renvois'] = renvois

    # Main content: everything after the first ')' up to the next article
    # header (blank line + capitals + ',' or '(') or the end of the text.
    content_match = re.search(r'\)\s*(.*?)(?=\n\n[A-ZÉÈ\s]+[,\(]|\Z)', text, re.DOTALL)
    if content_match:
        article['contenu'] = content_match.group(1).strip()
    else:
        # No ')' anywhere: keep the body instead of silently losing it.
        remainder = text[vedette_match.end():].lstrip(' \t\r\n,').rstrip()
        if remainder:
            article['contenu'] = remainder

    return article

def process_file(input_filepath, output_directory):
    """Split a merged corpus file into articles and write one JSON file each.

    The input is read as UTF-8 and split at every blank line that is followed
    by a run of capitals and a ',' or '('. Each chunk is parsed with
    ``parse_article``; chunks it rejects are skipped. Output files are named
    after the lower-cased, cleaned headword.

    Several articles can share a headword, so repeated names receive a
    numeric suffix (``name.json``, ``name_2.json``, ...) instead of
    overwriting the earlier article.

    Args:
        input_filepath: Path of the merged text file to read.
        output_directory: Directory that receives the JSON files; created if
            it does not exist.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_directory, exist_ok=True)

    with open(input_filepath, 'r', encoding='utf-8') as file:
        content = file.read()

    articles = re.split(r'\n\n(?=[A-ZÉÈ\s]+[,\(])', content)

    article_count = 0
    used_filenames = set()   # names already written during this run
    next_suffix = {}         # base name -> next numeric suffix to try

    for article_text in articles:
        parsed_article = parse_article(article_text)
        if not parsed_article:
            continue

        # Fall back to a generic stem if cleaning leaves nothing usable,
        # so we never write a file literally called ".json".
        base_name = clean_filename(parsed_article['vedette'].lower()) or 'article'
        filename = base_name + '.json'
        if filename in used_filenames:
            suffix = next_suffix.get(base_name, 2)
            while f"{base_name}_{suffix}.json" in used_filenames:
                suffix += 1
            filename = f"{base_name}_{suffix}.json"
            next_suffix[base_name] = suffix + 1
        used_filenames.add(filename)

        filepath = os.path.join(output_directory, filename)
        with open(filepath, 'w', encoding='utf-8') as json_file:
            json.dump(parsed_article, json_file, ensure_ascii=False, indent=2)

        article_count += 1
        if article_count % 100 == 0:
            print(f"Processed {article_count} articles")

    print(f"Total articles processed: {article_count}")

if __name__ == "__main__":
    # Split the merged corpus into one JSON file per article.
    input_filepath, output_directory = "data/cleanedMergedDiderot.txt", "cleaned_articles"
    process_file(input_filepath, output_directory)

Processed 100 articles
Processed 200 articles
Processed 300 articles
Processed 400 articles
Processed 500 articles
Processed 600 articles
Processed 700 articles
Processed 800 articles
Processed 900 articles
Processed 1000 articles
Processed 1100 articles
Processed 1200 articles
Processed 1300 articles
Processed 1400 articles
Processed 1500 articles
Processed 1600 articles
Processed 1700 articles
Processed 1800 articles
Processed 1900 articles
Processed 2000 articles
Processed 2100 articles
Processed 2200 articles
Processed 2300 articles
Processed 2400 articles
Processed 2500 articles
Processed 2600 articles
Processed 2700 articles
Processed 2800 articles
Processed 2900 articles
Processed 3000 articles
Processed 3100 articles
Processed 3200 articles
Processed 3300 articles
Processed 3400 articles
Processed 3500 articles
Processed 3600 articles
Processed 3700 articles
Processed 3800 articles
Processed 3900 articles
Processed 4000 articles
Processed 4100 articles
Processed 4200 articles
P

In [None]:
import re
import json

def clean_filename(filename):
    """Return an ASCII-only, filesystem-friendly version of ``filename``.

    Steps: NFKD-decompose and drop non-ASCII code points, replace anything
    that is not a word character, '-', '_', '.' or space with '_', turn
    spaces into '_', collapse repeated '_', and keep at most 200 characters.
    """
    import unicodedata  # local import: this cell's import list omits it

    decomposed = unicodedata.normalize('NFKD', filename)
    ascii_text = decomposed.encode('ASCII', 'ignore').decode('ASCII')
    safe = re.sub(r'[^\w\-_\. ]', '_', ascii_text)
    safe = re.sub(r'_+', '_', safe.replace(' ', '_'))
    return safe[:200]

def parse_article(text):
    """Parse one raw article into a dictionary of its structural parts.

    Args:
        text: The raw text of a single article, starting with its headword.

    Returns:
        None when the text does not start with a (non-empty) run of capital
        letters, otherwise a dict that always has 'vedette' and may have:
          - 'indication_grammaticale': text between the first comma and the
            parenthesised designant (e.g. 's. m.');
          - 'designants': list of the '&'-separated items found inside the
            parentheses matched together with that comma;
          - 'signature': single capital letter in parentheses closing the text;
          - 'renvois': every target following a "Voyez" in the text;
          - 'contenu': the text after the first ')' (or, when the text has no
            ')' at all, everything after the headword).
    """
    article = {}

    # Headword: the leading run of capitals. The character class also accepts
    # whitespace, so a blank chunk would "match"; reject it after stripping.
    vedette_match = re.match(r'^([A-ZÉÈ\s]+)', text)
    vedette = vedette_match.group(1).strip() if vedette_match else ''
    if not vedette:
        return None  # No headword: probably not a valid article
    article['vedette'] = vedette

    # Grammatical indication and designant(s). Dots are accepted inside the
    # parentheses so abbreviated designants such as "(Art. milit.)" match.
    info_match = re.search(r',\s*([\w\.\s]+)?\s*\(([\w\s&\.]+)\)', text)
    if info_match:
        if info_match.group(1):
            article['indication_grammaticale'] = info_match.group(1).strip()
        article['designants'] = [
            d.strip() for d in info_match.group(2).split('&') if d.strip()
        ]

    # Signature: tolerate trailing spaces/newlines after the closing "(X)".
    signature_match = re.search(r'\(([A-Z])\)\s*\Z', text)
    if signature_match:
        article['signature'] = signature_match.group(1)

    # Cross-references: collect the targets of every "Voyez", not just the
    # first one, and drop empty fragments left by the ',' / '&' split.
    renvois = [
        target.strip()
        for voyez in re.finditer(r'Voyez\s+([\w\s,&]+)', text)
        for target in re.split(r'[,&]', voyez.group(1))
        if target.strip()
    ]
    if renvois:
        article['renvois'] = renvois

    # Main content: everything after the first ')' up to the next article
    # header (blank line + capitals + ',' or '(') or the end of the text.
    content_match = re.search(r'\)\s*(.*?)(?=\n\n[A-ZÉÈ\s]+[,\(]|\Z)', text, re.DOTALL)
    if content_match:
        article['contenu'] = content_match.group(1).strip()
    else:
        # No ')' anywhere: keep the body instead of silently losing it.
        remainder = text[vedette_match.end():].lstrip(' \t\r\n,').rstrip()
        if remainder:
            article['contenu'] = remainder

    return article

# Smoke test on one specific article: parse the sample below and print the
# resulting dictionary as indented JSON (non-ASCII characters kept as-is).
article_text = """
ZÉTÉTIQUE, adj. (mathématiques) épithète que quelques auteurs donnent à une branche des mathématiques, qui s'occupe de la recherche ou de la solution des problèmes. Voyez Résolution & Problème.

Ce mot est formé du grec ζητειν, chercher.

Les anciens ont donné le nom de zététique à une méthode dont ils se servoient pour résoudre un problème. Ils commençoient par supposer que ce qu'on cherchoit étoit déjà trouvé ; puis ils examinoient ce qui résultoit de cette supposition ; & suivant le fil de ce raisonnement, ils parvenoient ou à découvrir ce qu'ils cherchoient, ou à en reconnoître l'impossibilité.

On donne aussi le nom de zététiques aux Pyrrhoniens. Voyez Pyrrhonien. (O)
"""

# The literal starts with a newline; parse_article's headword pattern accepts
# leading whitespace, so the sample is passed through unchanged.
parsed_article = parse_article(article_text)
print(json.dumps(parsed_article, ensure_ascii=False, indent=2))

In [1]:
# Read the tab-separated EDdA article table (with content) into a DataFrame.
import pandas as pd
df = pd.read_csv(
    'data/EDdA_dataframe_withContent.tsv',
    sep='\t',
)

In [16]:
# Show the loaded table as the cell output.
df

Unnamed: 0,volume,numero,head,normClass,classEDdA,author,id_enccre,domaine_enccre,ensemble_domaine_enccre,content,contentWithoutClass,firstParagraph,nb_words
0,1,1,Title Page,unclassified,unclassified,unsigned,,,,"ENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES SCIE...",encyclopédie \n dictionnaire raisonné \n scien...,encyclopédie \n dictionnaire raisonné \n scien...,178
1,1,2,A MONSEIGNEUR LE COMTE D'ARGENSON,unclassified,unclassified,Diderot & d'Alembert,,,,"A MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINISTRE\...",a monseigneur \n comte argenson \n ministre \n...,a monseigneur \n comte argenson \n ministre \n...,322
2,1,3,DISCOURS PRÉLIMINAIRE DES EDITEURS,unclassified,unclassified,d'Alembert,,,,DISCOURS PRÉLIMINAIRE\nDES EDITEURS.\nL'Encycl...,discours préliminaire \n editeurs \n encyclopé...,discours préliminaire \n editeurs \n encyclopé...,58294
3,1,5,"A, a & a",Grammaire,"ordre Encyclopéd. Entend. Science de l'homme, ...",Dumarsais5,v1-1-0,grammaire,Grammaire,"A, a & a s.m. (ordre Encyclopéd.\nEntend. Scie...",a a a s.m ordre encyclopéd \n entend science h...,a a a s.m ordre encyclopéd \n entend science h...,1092
4,1,6,A,unclassified,unclassified,Dumarsais5,v1-1-1,grammaire,Grammaire,"A, mot, est 1. la troisieme personne du présen...",a mot 1 troisieme personne présent \n indicati...,a mot 1 troisieme personne présent \n indicati...,381
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74185,17,3204,TRANSFUGE,Art militaire,Art. milit.,Saint-Lambert5,v17-2197-0,,,"TRANSFUGE, s. m. (Art. milit.) La plus grande\...",transfuge s. m. grand \n partie europe étonn...,transfuge s. m. grand \n partie europe étonn...,9576
74186,17,3205,VÉNUS,Astronomie,Astronom.,unsigned,v17-2198-0,astronomie,Physique - [Sciences physico-mathématiques],"VÉNUS, (Astronom.) satellites de Vénus. Depuis...",vénus satellite vénus depuis \n\n découverte...,vénus satellite vénus depuis \n\n découverte...,3754
74187,17,3206,VERS falisque,Poésie latine,Poésie latine.,Jaucourt,v17-2199-0,poésie,Belles-lettres - Poésie,"VERS falisque, (Poésie latine.) vers latin de\...",ver falisque ver latin \n mesure précises a ...,ver falisque ver latin \n mesure précises a ...,97
74188,17,3207,"VIBRATION, ou OSCILLATION",Horlogerie,Horlog.,Romilly5,v17-2200-0,horlogerie,Métiers,"VIBRATION, ou OSCILLATION, s. f. (Horlog.) ter...",vibration oscillation s. f. terme synonyme c...,vibration oscillation s. f. terme synonyme c...,4945


In [5]:
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_JUSTIFY
import pandas as pd
import html
import re

def clean_text(text):
    """Prepare a raw cell value for use as ReportLab paragraph markup.

    Args:
        text: The value to render. Missing values (NaN/None) produce a
            placeholder; anything else is converted with ``str``.

    Returns:
        An ASCII-only string in which '&', '<' and '>' from the source text
        are XML-escaped and newlines are represented by ``<br/>`` tags.
    """
    if pd.isna(text):
        return "Contenu non disponible"

    # Local import: the surrounding cell does not import unicodedata.
    import unicodedata

    # Decode HTML entities first so "&amp;" is not double-escaped below.
    text = html.unescape(str(text))
    # Escape XML special characters BEFORE inserting markup; doing it after
    # would either leave literal '<' / '>' unescaped or mangle the <br/> tags.
    text = html.escape(text, quote=False)
    # Replace line breaks with <br/>
    text = text.replace('\n', '<br/>')
    # Transliterate accented letters to their base letter (é -> e) rather
    # than blanking them, which would split French words apart.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Whatever is still non-ASCII (e.g. Greek) is replaced by a single space.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text

# Load the data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep='\t')

# Configure the styles
styles = getSampleStyleSheet()
styles.add(ParagraphStyle(name='Justify', alignment=TA_JUSTIFY, fontName='Times-Roman'))

# Create the document
doc = SimpleDocTemplate("Encyclopedie_Diderot_dAlembert.pdf", pagesize=A4,
                        rightMargin=72, leftMargin=72,
                        topMargin=72, bottomMargin=18)

Story = []

# Add the title
Story.append(Paragraph("Encyclopédie de Diderot et d'Alembert", styles['Title']))
Story.append(Spacer(1, 12))

# Walk through the entries and add them to the document
processed_count = 0
for position, (index, row) in enumerate(df.iterrows(), start=1):
    try:
        info = f"Volume: {row['volume']}, Numéro: {row['numero']}, Auteur: {row['author']}"

        # Pass missing content through untouched so clean_text can emit its
        # placeholder; str(NaN) would otherwise print the literal "nan".
        raw_content = row['content']
        content = clean_text(raw_content if pd.isna(raw_content) else str(raw_content))

        # Build the whole entry first: if any paragraph fails, nothing from
        # this entry is added, so no orphan heading is left in the story.
        entry_flowables = [
            # Entry title
            Paragraph(clean_text(str(row['head'])), styles['Heading2']),
            Spacer(1, 6),
            # Entry information
            Paragraph(clean_text(info), styles['Italic']),
            Spacer(1, 6),
            # Entry content
            Paragraph(content, styles['Justify']),
            Spacer(1, 12),
        ]
    except Exception as e:
        print(f"Error processing article {position}: {str(e)}")
        continue

    Story.extend(entry_flowables)
    processed_count += 1

    # Show progress
    if position % 100 == 0:
        print(f"Processed {position} articles")

# Generate the PDF
try:
    doc.build(Story)
    # Report the entries actually added, not the number of rows read.
    print(f"Total articles processed: {processed_count}")
    print("PDF generated successfully")
except Exception as e:
    print(f"Error generating PDF: {str(e)}")

Processed 100 articles
Processed 200 articles
Processed 300 articles
Processed 400 articles
Processed 500 articles
Processed 600 articles
Processed 700 articles
Processed 800 articles
Processed 900 articles
Processed 1000 articles
Processed 1100 articles
Processed 1200 articles
Processed 1300 articles
Processed 1400 articles
Processed 1500 articles
Processed 1600 articles
Processed 1700 articles
Processed 1800 articles
Processed 1900 articles
Processed 2000 articles
Processed 2100 articles
Processed 2200 articles
Processed 2300 articles
Processed 2400 articles
Processed 2500 articles
Processed 2600 articles
Processed 2700 articles
Processed 2800 articles
Processed 2900 articles
Processed 3000 articles
Processed 3100 articles
Processed 3200 articles
Processed 3300 articles
Processed 3400 articles
Processed 3500 articles
Processed 3600 articles
Processed 3700 articles
Processed 3800 articles
Processed 3900 articles
Processed 4000 articles
Processed 4100 articles
Processed 4200 articles
P

In [5]:
from PyPDF2 import PdfReader, PdfWriter

# Source document and destination of the excerpt
pdf_path = "Encyclopedie_Diderot_dAlembert.pdf"
output_pdf = "Encyclopedie_Diderot_dAlembert_subset.pdf"

# Read the source and count its pages
pdf_reader = PdfReader(pdf_path)
num_pages = len(pdf_reader.pages)

# Keep the first 15 pages, or every page when the document is shorter
num_pages_subset = num_pages if num_pages < 15 else 15

# Copy the leading pages into a fresh writer
pdf_writer = PdfWriter()
for page_num in range(num_pages_subset):
    pdf_writer.add_page(pdf_reader.pages[page_num])

# Save the excerpt as a new PDF file
with open(output_pdf, 'wb') as output_file:
    pdf_writer.write(output_file)

print(f"Created subset PDF with {num_pages_subset} pages: {output_pdf}")

Created subset PDF with 15 pages: Encyclopedie_Diderot_dAlembert_subset.pdf
