In [None]:
!git lfs install
!git clone https://huggingface.co/oscarfossey/job_classification
!pip install pickle
!pip install spacy
!pip install keras
!spacy download fr_core_news_sm
!pip install transformers
!pip install sentencepiece
!pip install datasets

In [1]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
# Module-level state read by preprocessing_camembert: the pretrained CamemBERT
# tokenizer and the fixed length every encoded text is truncated/padded to.
# No `global` statement is needed here: names assigned at the top level of a
# cell are already module globals.
camembert_tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)
MAX_LEN = 200

def preprocessing_camembert(texts_array):
  """Encode an array of strings into CamemBERT input ids and attention masks.

  Every string of the flattened array is tokenized with ``camembert_tokenizer``
  (special tokens added, truncated to ``MAX_LEN``); the resulting id sequences
  are then padded at the end so that each one has exactly ``MAX_LEN`` entries.

  Returns a tuple ``(input_ids, attention_masks)``: the padded id array built by
  ``pad_sequences`` and, for each of its rows, a list of floats that is 1.0
  where the id is greater than 0 and 0.0 everywhere else.
  """
  encoded = []
  for sentence in texts_array.flatten():
    encoded.append(
      camembert_tokenizer.encode(sentence, add_special_tokens=True, max_length=MAX_LEN, truncation=True)
    )

  input_ids = pad_sequences(encoded, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

  # One mask row per sequence: 1.0 for ids above 0, 0.0 for the remaining positions.
  attention_masks = [[float(token_id > 0) for token_id in row] for row in input_ids]

  return (input_ids, attention_masks)

In [6]:
import torch
import pickle
import numpy as np
from transformers import CamembertForSequenceClassification
from datasets import load_dataset

# Category letters, indexed by the class id predicted by the model.
# NOTE(review): 'N' is listed before 'M' — presumably this mirrors the label
# encoding used at training time; confirm before reordering.
labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'N', 'M']

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if DEVICE.type != 'cpu':
  print(torch.cuda.get_device_name(DEVICE))

# One output unit per entry of `labels` (the 14 categories A to N).
camembert_model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=len(labels))
# map_location remaps the stored tensors onto DEVICE while loading, so a
# checkpoint whose tensors were saved from a GPU still loads on a CPU-only runtime.
camembert_model.load_state_dict(
  torch.load("/content/job_classification/model_camembert_unbalancedv2.pth", map_location=DEVICE)
)

if DEVICE.type != 'cpu':
  camembert_model.to(DEVICE)

tags_name = load_dataset("oscarfossey/NLP_Pole_emploi", data_files='tags_name.csv')

def predict_camembert(texts_array, step = 100):
  """Predict one category letter per input text, processing the inputs in batches.

  The texts are encoded with ``preprocessing_camembert`` and sent through
  ``camembert_model`` in consecutive slices of at most ``step`` rows, so the
  whole input never has to go through a single forward pass.

  Args:
    texts_array: array of strings, forwarded unchanged to
      ``preprocessing_camembert``.
    step: maximum number of texts per forward pass; must be at least 1.

  Returns:
    A list holding one entry of ``labels`` per input text, in input order.

  Raises:
    ValueError: if ``step`` is smaller than 1 (a non-positive step could never
      advance through the inputs).
  """
  # Checked first so that an invalid step fails fast, before any encoding work.
  if step < 1:
    raise ValueError(f"step must be a positive integer, got {step!r}")

  inputs, masks = preprocessing_camembert(texts_array)
  inputs = torch.tensor(inputs)
  masks = torch.tensor(masks)
  camembert_model.eval()

  predictions = []
  for start in range(0, len(inputs), step):
    # Slicing past the end is clamped by the tensor, and Tensor.to returns the
    # tensor itself when it already lives on DEVICE, so no CPU/GPU branch is needed.
    local_inputs = inputs[start:start + step].to(DEVICE)
    local_masks = masks[start:start + step].to(DEVICE)
    with torch.no_grad():
      outputs = camembert_model(local_inputs, token_type_ids = None, attention_mask = local_masks)
    # Always hand NumPy a real ndarray, whichever device produced the logits.
    logits = outputs[0].detach().cpu().numpy()
    predictions.extend(np.argmax(logits, axis=1).flatten())

  # Translate class ids into their category letters.
  return [labels[i] for i in predictions]

Tesla P100-PCIE-16GB


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 