# Domain classification

The model is on HuggingFace: https://huggingface.co/GEODE/bert-base-multilingual-cased-edda-domain-classification/

In [None]:
!pip install transformers
!pip install sentencepiece

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# for MacOS
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

We will use the GPU


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

from transformers import BertTokenizer, BertForSequenceClassification, TextClassificationPipeline
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler


In [3]:
def generate_dataloader(tokenizer, sentences, batch_size = 8, max_len = 512):

    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids_test = []
    # For every sentence...
    for sent in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            # This function also supports truncation and conversion
                            # to pytorch tensors, but I need to do padding, so I
                            # can't use these features.
                            #max_length = max_len,          # Truncate all sentences.
                            #return_tensors = 'pt',     # Return pytorch tensors.
                    )
        input_ids_test.append(encoded_sent)

    # Pad our input tokens
    padded_test = []
    for i in input_ids_test:
        if len(i) > max_len:
            padded_test.extend([i[:max_len]])
        else:
            padded_test.extend([i + [0] * (max_len - len(i))])
    input_ids_test = np.array(padded_test)

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids_test:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert to tensors.
    inputs = torch.tensor(input_ids_test)
    masks = torch.tensor(attention_masks)

    # Create the DataLoader.
    data = TensorDataset(inputs, masks)
    prediction_sampler = SequentialSampler(data)

    return DataLoader(data, sampler=prediction_sampler, batch_size=batch_size)


def data(dataset):
    for d in dataset:
        yield f"{d}"


In [4]:
model_path = 'GEODE/bert-base-multilingual-cased-edda-domain-classification'

print('Loading Bert Tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

print('Loading GEODE Classifier...')
model = BertForSequenceClassification.from_pretrained(model_path).to(device.type)

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=None, device=device, batch_size=8)

# https://stackoverflow.com/questions/67849833/how-to-truncate-input-in-the-huggingface-pipeline
tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}

Loading Bert Tokenizer...
Loading GEODE Classifier...


model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [5]:
path = os.path.join('..', 'data', '1743_LeRobert')
df = pd.read_csv(os.path.join(path, 'Trevoux1743.tsv'), sep='\t')
df.head(10)

Unnamed: 0,volume,entry,entry_lemma,subordinate,subordinate_lemma,subordinate_domain,content
0,1,250000010,A,1,,,A est la première Lettre de l'Alphabet Françoi...
1,1,250000010,A,2,,,C'est inutilement que la plupart des Grammairi...
2,1,250000010,A,3,,,"A se prononce du gozier, ce qui ne rend pas ce..."
3,1,250000010,A,4,,,Le son de l'a est ordinairement un son clair. ...
4,1,250000010,A,5,,,Le son de l'a est un de ceux que les muets for...
5,1,250000010,A,6,,,"A devant un e, avec lequel il fait une diphton..."
6,1,250000010,A,7,,,"A devant un i, ou devant un y, avec lequel il ..."
7,1,250000010,A,8,,,A devant o & ne faisant qu'une même syllabe av...
8,1,250000010,A,9,,,"A devant u se prononce comme un o, comme dans ..."
9,1,250000010,A,10,,,A devant y a le meme son que devant i ; il fau...


In [6]:
dataset = df['content'].values

In [7]:
pred = []

for out in tqdm(pipe(data(dataset), **tokenizer_kwargs)):
    out = sorted(out, key=lambda d: d['score'], reverse=True)
    #print(out[0]['label'], out[0]['score'], out[1]['label'], out[1]['score'], out[2]['label'], out[2]['score'])
    pred.append([out[0]['label'], out[0]['score'], out[1]['label'], out[1]['score'], out[2]['label'], out[2]['score']])

pred = np.array(pred)

135004it [26:44, 84.14it/s] 


In [8]:
pred

array([['Religion', '0.5409356951713562', 'Belles-lettres',
        '0.3419780135154724', 'Philosophie', '0.04260238632559776'],
       ['Philosophie', '0.5389487743377686', 'Belles-lettres',
        '0.37168973684310913', 'Religion', '0.02738337032496929'],
       ['Philosophie', '0.9752424955368042', 'Belles-lettres',
        '0.005087039899080992', 'Médecine', '0.0036539973225444555'],
       ...,
       ['Géographie', '0.9952837824821472', 'Histoire',
        '0.0026258418802171946', 'Militaire', '0.0002585176262073219'],
       ['Géographie', '0.7414860129356384', 'Histoire',
        '0.10729145258665085', 'Histoire naturelle',
        '0.08297926187515259'],
       ['Géographie', '0.9985746145248413', 'Histoire',
        '0.00023022794630378485', 'Militaire', '0.00012393835640978068']],
      dtype='<U32')

In [None]:
df['domainPred1'] = pred[:,0]
df['domainProba1'] = pred[:,1]
df['domainPred2'] = pred[:,2]
df['domainProba2'] = pred[:,3]
df['domainPred3'] = pred[:,4]
df['domainProba3'] = pred[:,5]

In [11]:
df.to_csv(os.path.join(path,"Trevoux1743_domPred.tsv"), sep="\t", index=False)