# Add predicted domains DUFLT

Version février 2026 intégrant les 40 domaines de Denis

In [18]:
from transformers import BertForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import torch
from tqdm.auto import tqdm
import os
import datetime

# date as "yyyymmdd"
suffix = datetime.datetime.now().strftime("%y%m%d")
suffix

'260209'

In [19]:

# 1. Setup Device
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

TEXT_COLUMN = "text"

# 2. Load Model and Tokenizer
MODEL_PATH = "../models/final_multiclass_model"
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model.to(device)
model.eval() # Set to evaluation mode

# 3. Create a simple Dataset class
class SimpleDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

def predict_dataframe(df, text_column, batch_size=16):
    texts = df[text_column].astype(str).tolist()
    dataset = SimpleDataset(texts)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    
    all_preds = []
    
    # Wrap dataloader with tqdm for the progress bar
    # 'total' helps tqdm calculate the percentage correctly
    progress_bar = tqdm(dataloader, desc="Predicting", unit="batch")
    
    with torch.no_grad():
        for batch in progress_bar:
            inputs = tokenizer(
                batch, 
                padding=True, 
                truncation=True, 
                max_length=512, 
                return_tensors="pt"
            ).to(device)
            
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, axis=-1).cpu().numpy()
            all_preds.extend(preds)
            
            # Optional: Add real-time info to the bar
            # progress_bar.set_postfix({"last_batch_size": len(preds)})

    df['predicted_label_id'] = all_preds
    df['predicted_label'] = df['predicted_label_id'].apply(lambda x: model.config.id2label[x])
    return df



In [20]:
df = pd.read_csv(os.path.join('..', 'data', '1743_LeRobert', 'Trevoux1743_macro_260209.tsv'), sep='\t', encoding='utf-8')
df.head()

Unnamed: 0,book,volume,numero,head,id,type,src-domain,text,macro-domain
0,DUFLT_1743,1,1,A,1,mainEntry,,A est la première Lettre de l'Alphabet Françoi...,
1,DUFLT_1743,1,2,AAHUS,1,mainEntry,,AAHUS Aahusium. Ville de l'Evéché de Munster. ...,
2,DUFLT_1743,1,3,AAR,1,mainEntry,,"AAR ou AHR. subst. Aara, Abrinca. Rivière d'Al...",
3,DUFLT_1743,1,4,AAR,1,mainEntry,,"AAR Arula ou Arola, & non pas Arosa, comme on ...",
4,DUFLT_1743,1,5,AARBRER,1,mainEntry,,AARBRER Terme ancien qui n'est pas aujourd'hui...,


In [21]:
df.shape

(81374, 9)

In [22]:
# get a sample of rows where text is longer than 100 characters or src-domain is not null
df_sample = df[(df['text'].str.len() > 100) | (df['src-domain'].isna() == False)]
df_sample

Unnamed: 0,book,volume,numero,head,id,type,src-domain,text,macro-domain
0,DUFLT_1743,1,1,A,1,mainEntry,,A est la première Lettre de l'Alphabet Françoi...,
1,DUFLT_1743,1,2,AAHUS,1,mainEntry,,AAHUS Aahusium. Ville de l'Evéché de Munster. ...,
2,DUFLT_1743,1,3,AAR,1,mainEntry,,"AAR ou AHR. subst. Aara, Abrinca. Rivière d'Al...",
3,DUFLT_1743,1,4,AAR,1,mainEntry,,"AAR Arula ou Arola, & non pas Arosa, comme on ...",
4,DUFLT_1743,1,5,AARBRER,1,mainEntry,,AARBRER Terme ancien qui n'est pas aujourd'hui...,
...,...,...,...,...,...,...,...,...,...
81368,DUFLT_1743,6,5581,ZYGETH,2,relatedEntry,,"ZYGETH Zygeth , est aussi un village de la Hau...",
81369,DUFLT_1743,6,5582,ZYGÔME,1,mainEntry,terme de médecine & d'anatomie,ZYGÔME Terme de Médecine & d'Anatomie . Orzygo...,Médecine
81370,DUFLT_1743,6,5583,ZYMOSIMÈTRE,1,mainEntry,,ZYMOSIMÈTRE est un instrument proposé par Swam...,
81371,DUFLT_1743,6,5584,ZYP,1,mainEntry,,ZYP C'étoit autrefois un grand marais de la No...,


In [23]:
df_sample_pred = predict_dataframe(df_sample, TEXT_COLUMN, batch_size=16)

Predicting:   0%|          | 0/4433 [00:00<?, ?batch/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted_label_id'] = all_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted_label'] = df['predicted_label_id'].apply(lambda x: model.config.id2label[x])


In [24]:
df_sample.head()

Unnamed: 0,book,volume,numero,head,id,type,src-domain,text,macro-domain,predicted_label_id,predicted_label
0,DUFLT_1743,1,1,A,1,mainEntry,,A est la première Lettre de l'Alphabet Françoi...,,12,Grammaire
1,DUFLT_1743,1,2,AAHUS,1,mainEntry,,AAHUS Aahusium. Ville de l'Evéché de Munster. ...,,13,Géographie
2,DUFLT_1743,1,3,AAR,1,mainEntry,,"AAR ou AHR. subst. Aara, Abrinca. Rivière d'Al...",,13,Géographie
3,DUFLT_1743,1,4,AAR,1,mainEntry,,"AAR Arula ou Arola, & non pas Arosa, comme on ...",,13,Géographie
4,DUFLT_1743,1,5,AARBRER,1,mainEntry,,AARBRER Terme ancien qui n'est pas aujourd'hui...,,12,Grammaire


In [25]:
df.loc[df_sample_pred.index, 'predicted_label'] = df_sample_pred['predicted_label']

In [26]:
df

Unnamed: 0,book,volume,numero,head,id,type,src-domain,text,macro-domain,predicted_label
0,DUFLT_1743,1,1,A,1,mainEntry,,A est la première Lettre de l'Alphabet Françoi...,,Grammaire
1,DUFLT_1743,1,2,AAHUS,1,mainEntry,,AAHUS Aahusium. Ville de l'Evéché de Munster. ...,,Géographie
2,DUFLT_1743,1,3,AAR,1,mainEntry,,"AAR ou AHR. subst. Aara, Abrinca. Rivière d'Al...",,Géographie
3,DUFLT_1743,1,4,AAR,1,mainEntry,,"AAR Arula ou Arola, & non pas Arosa, comme on ...",,Géographie
4,DUFLT_1743,1,5,AARBRER,1,mainEntry,,AARBRER Terme ancien qui n'est pas aujourd'hui...,,Grammaire
...,...,...,...,...,...,...,...,...,...,...
81369,DUFLT_1743,6,5582,ZYGÔME,1,mainEntry,terme de médecine & d'anatomie,ZYGÔME Terme de Médecine & d'Anatomie . Orzygo...,Médecine,Anatomie
81370,DUFLT_1743,6,5583,ZYMOSIMÈTRE,1,mainEntry,,ZYMOSIMÈTRE est un instrument proposé par Swam...,,Chimie
81371,DUFLT_1743,6,5584,ZYP,1,mainEntry,,ZYP C'étoit autrefois un grand marais de la No...,,Géographie
81372,DUFLT_1743,6,5585,ZYTHI,1,mainEntry,,ZYTHI Voyez Zuthi.\n\n,,


In [27]:
path = os.path.join('..', 'data', '1743_LeRobert')
df.to_csv(os.path.join(path, "Trevoux1743_macro_pred_"+suffix+".tsv"), sep="\t", index=False)
df.to_excel(os.path.join(path, "Trevoux1743_macro_pred_"+suffix+".xlsx"), index=False)