# Domain classification

The model is on HuggingFace: https://huggingface.co/GEODE/bert-base-multilingual-cased-edda-domain-classification/

In [None]:
!pip install transformers
!pip install sentencepiece

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# for MacOS
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

We will use the GPU


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

from transformers import BertTokenizer, BertForSequenceClassification, TextClassificationPipeline
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler


In [3]:
def generate_dataloader(tokenizer, sentences, batch_size = 8, max_len = 512):

    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            # This function also supports truncation and conversion
                            # to pytorch tensors, but I need to do padding, so I
                            # can't use these features.
                            #max_length = max_len,          # Truncate all sentences.
                            #return_tensors = 'pt',     # Return pytorch tensors.
                    )
        input_ids_test.append(encoded_sent)

    # Pad our input tokens
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.extend([i[:max_len]])
        else:
            padded_test.extend([i + [0] * (max_len - len(i))])
    input_ids_test = np.array(padded_test)

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids_test:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    inputs = torch.tensor(input_ids_test)
    masks = torch.tensor(attention_masks)

    # Create the DataLoader.
    data = TensorDataset(inputs, masks)
    prediction_sampler = SequentialSampler(data)

    return DataLoader(data, sampler=prediction_sampler, batch_size=batch_size)


def data(dataset):
    for d in dataset:
        yield f"{d}"


In [4]:
model_path = 'GEODE/bert-base-multilingual-cased-edda-domain-classification'

print('Loading Bert Tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

print('Loading GEODE Classifier...')
model = BertForSequenceClassification.from_pretrained(model_path).to(device.type)

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=None, device=device, batch_size=8)

# https://stackoverflow.com/questions/67849833/how-to-truncate-input-in-the-huggingface-pipeline
tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}

Loading Bert Tokenizer...
Loading GEODE Classifier...


model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [5]:
path = os.path.join('..', 'data', '1743_LeRobert')
df = pd.read_csv(os.path.join(path, 'Trevoux1743.tsv'), sep='\t')
df.head(10)

Unnamed: 0,volume,entry,entry_lemma,subordinate,subordinate_lemma,subordinate_domain,content
0,1,250000010,A,1,,,A est la première Lettre de l'Alphabet Françoi...
1,1,250000010,A,2,,,C'est inutilement que la plupart des Grammairi...
2,1,250000010,A,3,,,"A se prononce du gozier, ce qui ne rend pas ce..."
3,1,250000010,A,4,,,Le son de l'a est ordinairement un son clair. ...
4,1,250000010,A,5,,,Le son de l'a est un de ceux que les muets for...
5,1,250000010,A,6,,,"A devant un e, avec lequel il fait une diphton..."
6,1,250000010,A,7,,,"A devant un i, ou devant un y, avec lequel il ..."
7,1,250000010,A,8,,,A devant o & ne faisant qu'une même syllabe av...
8,1,250000010,A,9,,,"A devant u se prononce comme un o, comme dans ..."
9,1,250000010,A,10,,,A devant y a le meme son que devant i ; il fau...


In [6]:
dataset = df['content'].values

In [7]:
pred = []

for out in tqdm(pipe(data(dataset), **tokenizer_kwargs)):
    out = sorted(out, key=lambda d: d['score'], reverse=True)
    #print(out[0]['label'], out[0]['score'], out[1]['label'], out[1]['score'], out[2]['label'], out[2]['score'])
    pred.append([out[0]['label'], out[0]['score'], out[1]['label'], out[1]['score'], out[2]['label'], out[2]['score']])

pred = np.array(pred)

135004it [26:44, 84.14it/s] 


In [8]:
pred

array([['Religion', '0.5409356951713562', 'Belles-lettres',
        '0.3419780135154724', 'Philosophie', '0.04260238632559776'],
       ['Philosophie', '0.5389487743377686', 'Belles-lettres',
        '0.37168973684310913', 'Religion', '0.02738337032496929'],
       ['Philosophie', '0.9752424955368042', 'Belles-lettres',
        '0.005087039899080992', 'Médecine', '0.0036539973225444555'],
       ...,
       ['Géographie', '0.9952837824821472', 'Histoire',
        '0.0026258418802171946', 'Militaire', '0.0002585176262073219'],
       ['Géographie', '0.7414860129356384', 'Histoire',
        '0.10729145258665085', 'Histoire naturelle',
        '0.08297926187515259'],
       ['Géographie', '0.9985746145248413', 'Histoire',
        '0.00023022794630378485', 'Militaire', '0.00012393835640978068']],
      dtype='<U32')

In [None]:
df['domainPred1'] = pred[:,0]
df['domainProba1'] = pred[:,1]
df['domainPred2'] = pred[:,2]
df['domainProba2'] = pred[:,3]
df['domainPred3'] = pred[:,4]
df['domainProba3'] = pred[:,5]

In [11]:
df.to_csv(os.path.join(path,"Trevoux1743_domPred.tsv"), sep="\t", index=False)

In [None]:
# pour chaque duflt_domain la proportion des 3 premiers domains prédits (en premier)
# grouper par duflt_domain 
# calculer la proportion de chaque domain prédit
# df.groupby('duflt_domain')['domainPred1'].value_counts(normalize=True).unstack().fillna(0)

In [19]:
import pandas as pd
import os

In [20]:
path = os.path.join('..', 'data', '1743_LeRobert')
df = pd.read_csv(os.path.join(path, "Trevoux1743_domPred.tsv"), sep="\t")
l = len(df)
df.head()

Unnamed: 0,volume,entry,entry_lemma,subordinate,subordinate_lemma,subordinate_domain,content,domainPred1,domainProba1,domainPred2,domainProba2,domainPred3,domainProba3
0,1,250000010,A,1,,,A est la première Lettre de l'Alphabet Françoi...,Religion,0.540936,Belles-lettres,0.341978,Philosophie,0.042602
1,1,250000010,A,2,,,C'est inutilement que la plupart des Grammairi...,Philosophie,0.538949,Belles-lettres,0.37169,Religion,0.027383
2,1,250000010,A,3,,,"A se prononce du gozier, ce qui ne rend pas ce...",Philosophie,0.975242,Belles-lettres,0.005087,Médecine,0.003654
3,1,250000010,A,4,,,Le son de l'a est ordinairement un son clair. ...,Philosophie,0.912985,Belles-lettres,0.023113,Musique,0.019635
4,1,250000010,A,5,,,Le son de l'a est un de ceux que les muets for...,Médecine,0.744529,Métiers,0.162147,Musique,0.029677


In [21]:
# drop row where subordinate_domain is NaN
df = df.dropna(subset=['subordinate_domain'])
new_l = len(df)
df.head()

Unnamed: 0,volume,entry,entry_lemma,subordinate,subordinate_lemma,subordinate_domain,content,domainPred1,domainProba1,domainPred2,domainProba2,domainPred3,domainProba3
65,1,250000150,ABADIR,1,,terme de Mythologie,"ABADIR ou ABADDIR ; car Priscien, qui nous a c...",Histoire,0.8785,Religion,0.088441,Belles-lettres,0.0116
69,1,250000160,ABAEUZ,1,,Terme de Coutume,ABAEUZ Terme de Coutume . Biens abaeuz. Bona v...,Droit Jurisprudence,0.95268,Histoire,0.027877,Politique,0.006872
70,1,250000170,ABAJOUR,1,,Terme d'Architecture,"ABAJOUR Terme d'Architecture , Spiraculum, esp...",Beaux-arts,0.985802,Métiers,0.005415,Physique,0.001047
72,1,250000170,ABAJOUR,3,Abajour,Terme de Botanique,Abajour . s. m. Terme de Botanique . Spiraculu...,Histoire naturelle,0.575177,Beaux-arts,0.315066,Agriculture,0.037692
76,1,250000190,ABAISSE,1,,terme de Pâtissier,ABAISSE terme de Pâtissier . C'est la pâte qui...,Agriculture,0.979142,Chasse,0.002907,Histoire naturelle,0.002278


In [22]:
print(f"Total number of subordinates: {l}")
print(f"Number of subordinates with domain: {new_l}")

Total number of subordinates: 135004
Number of subordinates with domain: 12287


In [23]:
df['subordinate_domain'].value_counts()

subordinate_domain
Terme de Marine                 718
termes de Marine                467
Terme de Médecine               415
Terme de Fleuriste              339
Terme d'Anatomie                327
                               ... 
terme de Fauconnérie              1
termes du Droit Romain            1
terme de Fondeur                  1
terme de Droit & de Commerce      1
terme d'Alchymie                  1
Name: count, Length: 1720, dtype: int64

In [24]:
df['subordinate_domain'] = df['subordinate_domain'].str.replace('Terme de ', '')
df['subordinate_domain'] = df['subordinate_domain'].str.replace('Termes de ', '')
df['subordinate_domain'] = df['subordinate_domain'].str.replace('terme de ', '')
df['subordinate_domain'] = df['subordinate_domain'].str.replace('termes de ', '')
df['subordinate_domain'] = df['subordinate_domain'].str.replace("Terme d'", '')
df['subordinate_domain'] = df['subordinate_domain'].str.replace("Termes d'", '')
df['subordinate_domain'] = df['subordinate_domain'].str.replace("terme d'", '')
df['subordinate_domain'] = df['subordinate_domain'].str.replace("termes d'", '')

In [25]:
dom = df['subordinate_domain'].value_counts().reset_index()
dom


Unnamed: 0,subordinate_domain,count
0,Marine,1252
1,Médecine,516
2,Blason,490
3,Anatomie,410
4,Fleuriste,377
...,...,...
1246,Jardinier & de Laboureur,1
1247,Horlogerie & de Machines,1
1248,Agriculture & de labourage,1
1249,quelques Arts,1


In [26]:
data = df.groupby('subordinate_domain')['domainPred1'].value_counts(normalize=True).unstack().fillna(0)
data = data.reset_index()
data

domainPred1,subordinate_domain,Agriculture,Beaux-arts,Belles-lettres,Chasse,Commerce,Droit Jurisprudence,Géographie,Histoire,Histoire naturelle,Militaire,Musique,Médecine,Métiers,Philosophie,Physique,Religion
0,Académiste,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,Accoucheur & de Sage-femme,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.0,0.0,0.0
2,Achitecture,0.000000,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3,Agriculture,0.942857,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.014286,0.014286,0.0,0.0,0.0
4,Agriculture & de Coutumes,0.000000,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246,Éxécuteur,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,1.0,0.0,0.0
1247,Împrimerie,0.000000,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1248,écriture,0.000000,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1249,évolution militaire,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.0


In [27]:
dom = dom.merge(data, how='right')
dom

Unnamed: 0,subordinate_domain,count,Agriculture,Beaux-arts,Belles-lettres,Chasse,Commerce,Droit Jurisprudence,Géographie,Histoire,Histoire naturelle,Militaire,Musique,Médecine,Métiers,Philosophie,Physique,Religion
0,Académiste,2,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,Accoucheur & de Sage-femme,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.0,0.0,0.0
2,Achitecture,1,0.000000,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3,Agriculture,70,0.942857,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.014286,0.014286,0.0,0.0,0.0
4,Agriculture & de Coutumes,1,0.000000,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246,Éxécuteur,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,1.0,0.0,0.0
1247,Împrimerie,1,0.000000,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1248,écriture,1,0.000000,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1249,évolution militaire,1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.0


In [28]:
dom.sort_values(by='count', ascending=False, inplace=True)
#dom.drop(columns=['count'], inplace=True)
dom

Unnamed: 0,subordinate_domain,count,Agriculture,Beaux-arts,Belles-lettres,Chasse,Commerce,Droit Jurisprudence,Géographie,Histoire,Histoire naturelle,Militaire,Musique,Médecine,Métiers,Philosophie,Physique,Religion
505,Marine,1252,0.000000,0.000799,0.000799,0.006390,0.005591,0.001597,0.003195,0.007987,0.010383,0.957668,0.0,0.000000,0.000000,0.004792,0.000799,0.0
590,Médecine,516,0.000000,0.000000,0.000000,0.001938,0.000000,0.000000,0.000000,0.000000,0.001938,0.000000,0.0,0.992248,0.000000,0.001938,0.001938,0.0
92,Blason,490,0.000000,0.000000,0.000000,0.000000,0.000000,0.004082,0.010204,0.010204,0.000000,0.969388,0.0,0.000000,0.000000,0.006122,0.000000,0.0
16,Anatomie,410,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.992683,0.000000,0.002439,0.004878,0.0
312,Fleuriste,377,0.599469,0.007958,0.000000,0.002653,0.000000,0.000000,0.000000,0.005305,0.222812,0.007958,0.0,0.037135,0.111406,0.000000,0.005305,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,Prosodie Grèque & Latine,1,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
722,Prosodie Latine,1,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
723,Pyrotechnie,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000,0.0
270,Droit Écclésiastique,1,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0


In [33]:
import csv
# Liste pour stocker les lignes du futur CSV
output_rows = []

# Parcours de chaque ligne
for _, row in dom.iterrows():
    domain = row['subordinate_domain']
    count = row['count']
    sorted_cols = row.drop(['subordinate_domain', 'count']).sort_values(ascending=False)
    top3 = list(sorted_cols.items())[:3]
    
    # Formate une ligne comme demandé : [nom, label1, val1, label2, val2, label3, val3]
    formatted_row = [domain, count]
    for label, val in top3:
        formatted_row.append(label)
        formatted_row.append(str(round(val * 100, 2)))
    
    output_rows.append(formatted_row)

# Entêtes pour le fichier CSV
headers = ['subordinate_domain', '# article', 'top1_label', 'top1_value', 'top2_label', 'top2_value', 'top3_label', 'top3_value']

# Écriture dans un nouveau fichier CSV
with open("top3_output.csv", mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(output_rows)

print("Fichier 'top3_output.csv' généré avec succès.")


Fichier 'top3_output.csv' généré avec succès.
