**Connect to drive**


In [None]:
# Mount Google Drive into the Colab filesystem; the spreadsheets read in the
# data-loading cell are addressed through this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


**Install packages**

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 8.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 31.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

**Import packages**

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from torch.utils.data import Dataset
import pandas as pd
import torch

**Import data**

In [None]:
# Every article (the NLI "premise") is paired with the same hypothesis sentence;
# the label records whether the article is about the sea (0) or not (1).
# NOTE(review): labels are assumed to follow the checkpoint's NLI label ids
# (presumably 0 = entailment). Confirm with `model.config.id2label`, and check
# whether negatives should map to the "contradiction" id instead of 1.
HYPOTHESIS = "C'est un article traitant de mer."

# Sea-related articles -> label 0.
marine = pd.read_excel("/content/drive/MyDrive/mer.xlsx")
marine = marine.rename(columns={'content': 'premise'})
marine['hypothesis'] = HYPOTHESIS
marine['label'] = 0

# Articles about anything else -> label 1.
# NOTE(review): the rename key differs in case from the one above
# ('Content' vs 'content') -- verify against the actual spreadsheets.
non_marine = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/non_mer.xlsx")
non_marine = non_marine.rename(columns={'Content': 'premise'})
non_marine['hypothesis'] = HYPOTHESIS
non_marine['label'] = 1  # previously assigned on `non_non_marine`, an undefined name

# `rename` silently ignores a missing key, so fail here rather than at tokenization.
assert 'premise' in marine.columns, "mer.xlsx: expected a 'content' column"
assert 'premise' in non_marine.columns, "non_mer.xlsx: expected a 'Content' column"

pdList = [marine, non_marine]  # previously `[marine, non]`; `non` was never defined
df = pd.concat(pdList, ignore_index=True)
df

**Dataset Class**

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves tokenizer encodings together with labels.

    The base class is spelled out as ``torch.utils.data.Dataset`` rather than the
    bare name ``Dataset``: this class reuses that name, so re-running the cell
    with ``class Dataset(Dataset)`` would subclass the previous definition of
    itself instead of the torch base class.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence (only ``.items()`` and ``val[idx]`` are used).
        labels: Indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

**Load the tokenizer and model**

In [None]:
# Pretrained French NLI checkpoint; tokenizer and model must come from the same one.
MODEL_NAME = "BaptisteDoyen/camembert-base-xnli"

# `padding`, `truncation` and `return_tensors` are arguments of the tokenizer *call*,
# not of `from_pretrained`; they are supplied where the text is actually encoded.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/882 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/299 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

**Prepare data for the training**

In [None]:
# Encode one (premise, hypothesis) pair per row of `df`.
# - Both columns come from `df`: the earlier `marine.premise` covered only the
#   sea-related rows, so the two lists had different lengths.
# - The premise goes first, matching how the zero-shot pipeline used for testing
#   builds its inputs (sequence first, hypothesis second).
train_encodings = tokenizer(
    df.premise.tolist(),
    df.hypothesis.tolist(),
    truncation=True,
    padding=True,
)
train_dataset = Dataset(train_encodings, df.label.tolist())

**Set the training arguments and train the model**

In [None]:
# Put the network in training mode before handing it to the Trainer.
model.train()

# Hyper-parameters for fine-tuning; a checkpoint is written to the current
# directory at the end of every epoch.
args = TrainingArguments(
    output_dir="./",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
)

# No evaluation set is passed: this run only trains.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run the fine-tuning loop and show the resulting training summary.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 60
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 20


Step,Training Loss


Saving model checkpoint to ./checkpoint-4
Configuration saved in ./checkpoint-4/config.json
Model weights saved in ./checkpoint-4/pytorch_model.bin
tokenizer config file saved in ./checkpoint-4/tokenizer_config.json
Special tokens file saved in ./checkpoint-4/special_tokens_map.json
Saving model checkpoint to ./checkpoint-8
Configuration saved in ./checkpoint-8/config.json
Model weights saved in ./checkpoint-8/pytorch_model.bin
tokenizer config file saved in ./checkpoint-8/tokenizer_config.json
Special tokens file saved in ./checkpoint-8/special_tokens_map.json
Saving model checkpoint to ./checkpoint-12
Configuration saved in ./checkpoint-12/config.json
Model weights saved in ./checkpoint-12/pytorch_model.bin
tokenizer config file saved in ./checkpoint-12/tokenizer_config.json
Special tokens file saved in ./checkpoint-12/special_tokens_map.json
Saving model checkpoint to ./checkpoint-16
Configuration saved in ./checkpoint-16/config.json
Model weights saved in ./checkpoint-16/pytorch_mo

TrainOutput(global_step=20, training_loss=1.1007635116577148, metrics={'train_runtime': 193.1565, 'train_samples_per_second': 1.553, 'train_steps_per_second': 0.104, 'total_flos': 7708400910000.0, 'train_loss': 1.1007635116577148, 'epoch': 5.0})

**Set the model in evaluation mode**

In [None]:
# Switch to evaluation mode (the printed module tree shows Dropout layers, which
# eval mode disables); the call returns the model, hence the repr shown below.
model.eval()

CamembertForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Laye

**Test the model**

In [None]:
# Wrap the fine-tuned model and its tokenizer in a zero-shot classification pipeline.
classifier = pipeline(
    "zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
)

# Same hypothesis wording as the training data, with the topic left as a slot.
hypothesis_template = "C'est un article traitant de {}."
candidate_labels = ["mer"]
sequence = "le mer est magnéfique en été"

# The cell's output is the pipeline's result for the sample sentence.
classifier(
    sequence,
    candidate_labels,
    hypothesis_template=hypothesis_template,
)
