# Add predicted domains DUFLT

Version février 2026 intégrant les 40 domaines de Denis

In [1]:
from transformers import BertForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import torch
from tqdm.auto import tqdm
import os
import datetime

# date as "yyyymmdd"
suffix = datetime.datetime.now().strftime("%y%m%d")
suffix

'260220'

In [2]:

# 1. Setup Device
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

TEXT_COLUMN = "text"

# 2. Load Model and Tokenizer
MODEL_PATH = "../models/final_multiclass_model"
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model.to(device)
model.eval() # Set to evaluation mode

# 3. Create a simple Dataset class
class SimpleDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

def predict_dataframe(df, text_column, batch_size=16):
    texts = df[text_column].astype(str).tolist()
    dataset = SimpleDataset(texts)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    
    all_preds = []
    
    # Wrap dataloader with tqdm for the progress bar
    # 'total' helps tqdm calculate the percentage correctly
    progress_bar = tqdm(dataloader, desc="Predicting", unit="batch")
    
    with torch.no_grad():
        for batch in progress_bar:
            inputs = tokenizer(
                batch, 
                padding=True, 
                truncation=True, 
                max_length=512, 
                return_tensors="pt"
            ).to(device)
            
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, axis=-1).cpu().numpy()
            all_preds.extend(preds)
            
            # Optional: Add real-time info to the bar
            # progress_bar.set_postfix({"last_batch_size": len(preds)})

    df['predicted_label_id'] = all_preds
    df['predDomain'] = df['predicted_label_id'].apply(lambda x: model.config.id2label[x])
    return df



In [3]:
sentence = "Brin , se dit aussi des menus jets des herbes, des joncs, des cheveux, & de tout ce que des racines poussent. Coliculus, surculus. Il faut mettre deux ou trois brins de ciboulette dans cette salade. Il n'est resté à ce convalescent que deux ou trois brins de cheveux. Les brins des vergettes sont faits de petits joncs. Les tresses de cheveux se font brin à brin. Philyra."

inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
outputs = model(**inputs)
pred = torch.argmax(outputs.logits, axis=-1).cpu().item()
pred_label = model.config.id2label[pred]
print(f"Predicted label: {pred_label} (ID: {pred})")

Predicted label: Métiers [alimentation] (ID: 26)


In [None]:
#filepath = '/Users/lmoncla/Nextcloud/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_articles_250517.tsv'
filepath = '/Users/lmoncla/Nextcloud/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_paragraphs_230407.tsv'

df = pd.read_csv(filepath, sep='\t', encoding='utf-8')
df.rename(columns={"edda_class":"srcDomain", "paragraph_id":"paragrapId", "content":"text"}, inplace=True)
df.drop(columns=['author', "nb_place_spacy", "nb_person_spacy"], inplace=True)
#df = pd.read_csv(os.path.join('..', 'data', '1743_LeRobert', 'Trevoux1743_html_macro_260213.tsv'), sep='\t', encoding='utf-8')
df.head()

Unnamed: 0,volume,numero,head,author,edda_class,paragraph_id,content,nb_place_spacy,nb_person_spacy
0,1,1,Title Page,unsigned,unclassified,1,"\n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...",0,1
1,1,1,Title Page,unsigned,unclassified,2,"\nMis en ordre & publié par M. DIDEROT, de l'A...",2,2
2,1,1,Title Page,unsigned,unclassified,3,"\n\nTantùm series juncturaque pollet,\nTantùm ...",0,2
3,1,1,Title Page,unsigned,unclassified,4,"\nA PARIS,\nChez\n BRIASSON, rue Sa...",4,5
4,1,1,Title Page,unsigned,unclassified,5,\nM. DCC. L I.\nAVEC APPROBATION ET PRIVILEGE ...,0,0


In [17]:
df.shape

(261633, 9)

In [9]:
# get a sample of rows where text is longer than 100 characters or src-domain is not null
# DUFLT
#df_sample = df[(df['text'].str.len() > 100) | (df['srcDomain'].isna() == False)]

# EDdA
df_sample = df[(df['text'].str.len() > 100) | ((df['srcDomain'].isna() == False) & (df['srcDomain'] != 'pending') & (df['srcDomain'] != '0') & (df['srcDomain'] != 'unclassified'))]

df_sample

Unnamed: 0,book,volume,numero,head,author,srcDomain,text
0,EDdA,1,1,Title Page,unsigned,unclassified,"ENCYCLOPÉDIE, DICTIONNAIRE RAISONNÉ DES SCIENC..."
1,EDdA,1,2,A MONSEIGNEUR LE COMTE D'ARGENSON,Diderot & d'Alembert,unclassified,"A MONSEIGNEUR LE COMTE D'ARGENSON, MINISTRE ET..."
2,EDdA,1,3,DISCOURS PRÉLIMINAIRE DES EDITEURS,d'Alembert,unclassified,DISCOURS PRÉLIMINAIRE DES EDITEURS. L'Encyclop...
3,EDdA,1,5,"A, a & a",Dumarsais5,Grammaire,"A, a & a s.m. (ordre Encyclopéd. Entend. Scien..."
4,EDdA,1,6,A,Dumarsais5,unclassified,"A, mot, est 1. la troisieme personne du présen..."
...,...,...,...,...,...,...,...
74185,EDdA,17,3204,TRANSFUGE,Saint-Lambert5,Art militaire,"TRANSFUGE, s. m. (Art. milit.) La plus grande ..."
74186,EDdA,17,3205,VÉNUS,unsigned,Astronomie,"VÉNUS, (Astronom.) satellites de Vénus. Depuis..."
74187,EDdA,17,3206,VERS falisque,Jaucourt,Poésie latine,"VERS falisque, (Poésie latine.) vers latin de ..."
74188,EDdA,17,3207,"VIBRATION, ou OSCILLATION",Romilly5,Horlogerie,"VIBRATION, ou OSCILLATION, s. f. (Horlog.) ter..."


In [10]:
df_sample_pred = predict_dataframe(df_sample, TEXT_COLUMN, batch_size=16)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting:   0%|          | 0/4447 [00:00<?, ?batch/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted_label_id'] = all_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predDomain'] = df['predicted_label_id'].apply(lambda x: model.config.id2label[x])


In [11]:
df_sample.head()

Unnamed: 0,book,volume,numero,head,author,srcDomain,text,predicted_label_id,predDomain
0,EDdA,1,1,Title Page,unsigned,unclassified,"ENCYCLOPÉDIE, DICTIONNAIRE RAISONNÉ DES SCIENC...",12,Grammaire
1,EDdA,1,2,A MONSEIGNEUR LE COMTE D'ARGENSON,Diderot & d'Alembert,unclassified,"A MONSEIGNEUR LE COMTE D'ARGENSON, MINISTRE ET...",12,Grammaire
2,EDdA,1,3,DISCOURS PRÉLIMINAIRE DES EDITEURS,d'Alembert,unclassified,DISCOURS PRÉLIMINAIRE DES EDITEURS. L'Encyclop...,31,Métiers [papier]
3,EDdA,1,5,"A, a & a",Dumarsais5,Grammaire,"A, a & a s.m. (ordre Encyclopéd. Entend. Scien...",12,Grammaire
4,EDdA,1,6,A,Dumarsais5,unclassified,"A, mot, est 1. la troisieme personne du présen...",12,Grammaire


In [12]:
df.loc[df_sample_pred.index, 'predDomain'] = df_sample_pred['predDomain']

In [13]:
df

Unnamed: 0,book,volume,numero,head,author,srcDomain,text,predDomain
0,EDdA,1,1,Title Page,unsigned,unclassified,"ENCYCLOPÉDIE, DICTIONNAIRE RAISONNÉ DES SCIENC...",Grammaire
1,EDdA,1,2,A MONSEIGNEUR LE COMTE D'ARGENSON,Diderot & d'Alembert,unclassified,"A MONSEIGNEUR LE COMTE D'ARGENSON, MINISTRE ET...",Grammaire
2,EDdA,1,3,DISCOURS PRÉLIMINAIRE DES EDITEURS,d'Alembert,unclassified,DISCOURS PRÉLIMINAIRE DES EDITEURS. L'Encyclop...,Métiers [papier]
3,EDdA,1,5,"A, a & a",Dumarsais5,Grammaire,"A, a & a s.m. (ordre Encyclopéd. Entend. Scien...",Grammaire
4,EDdA,1,6,A,Dumarsais5,unclassified,"A, mot, est 1. la troisieme personne du présen...",Grammaire
...,...,...,...,...,...,...,...,...
74185,EDdA,17,3204,TRANSFUGE,Saint-Lambert5,Art militaire,"TRANSFUGE, s. m. (Art. milit.) La plus grande ...",Militaire
74186,EDdA,17,3205,VÉNUS,unsigned,Astronomie,"VÉNUS, (Astronom.) satellites de Vénus. Depuis...",Sciences physico-mathématiques
74187,EDdA,17,3206,VERS falisque,Jaucourt,Poésie latine,"VERS falisque, (Poésie latine.) vers latin de ...",Belles-lettres
74188,EDdA,17,3207,"VIBRATION, ou OSCILLATION",Romilly5,Horlogerie,"VIBRATION, ou OSCILLATION, s. f. (Horlog.) ter...",Métiers [autres]


In [None]:
filepath = '/Users/lmoncla/Nextcloud/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_paragraphs_pred40domains_260220.tsv'
df.to_csv(os.path.join(filepath), sep="\t", index=False)

In [60]:
path = os.path.join('..', 'data', '1743_LeRobert')
df.to_csv(os.path.join(path, "Trevoux1743_html_macro_pred_"+suffix+".tsv"), sep="\t", index=False)


In [None]:
path = os.path.join('..', 'data', '1743_LeRobert')
df.to_csv(os.path.join(path, "Trevoux1743_paragraphs_macro_pred_"+suffix+".tsv"), sep="\t", index=False)
df.to_excel(os.path.join(path, "Trevoux1743_paragraphs_macro_pred_"+suffix+".xlsx"), index=False)