In [1]:
# Importing standard libraries for every machine/deep learning pipeline
import pandas as pd
import torch
from tqdm import tqdm, trange
import numpy as np


# Importing specific libraries for data prerpcessing, model archtecture choice, training and evaluation
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from transformers import AdamW
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [3]:
df=pd.read_csv('C:\\Users\\User\\datafinal.csv')[['author_id','name','username',
                                                 'text','created_at','Sexe']]

In [4]:
df.head()

Unnamed: 0,author_id,name,username,text,created_at,Sexe
0,1.33e+18,Alexandre M A Caron,Alexand14248460,Tellement fier d'être vacciné trois fois contr...,2022-04-29T15:47:14.000Z,M
1,1.33e+18,Alexandre M A Caron,Alexand14248460,@JeMouth Tellement fier d'être vacciné trois f...,2022-04-29T15:33:49.000Z,M
2,1469749000.0,GeB. 🥕🥕🥕💀💀💀,GerardBondeau,@DrEliDavid @idrissaberkane 🔴 Si... #Moderna b...,2022-04-29T08:46:59.000Z,M
3,1469749000.0,GeB. 🥕🥕🥕💀💀💀,GerardBondeau,@JeanYvesCAPO @maillardjeanch3 @alainhoupert P...,2022-04-26T19:57:10.000Z,M
4,1469749000.0,GeB. 🥕🥕🥕💀💀💀,GerardBondeau,"""L’ancien responsable de la rech respiratoire ...",2022-04-21T17:42:44.000Z,M


In [5]:
len(df)

8611

In [6]:
# Defining constants
epochs = 5
MAX_LEN = 128
batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# Load the dataset, I selected only 5000 sample because of memory limitation
df2 = pd.read_csv('C:\\Users\\User\\Desktop\\Base des donnees\\french_tweets.csv').sample(5000).reset_index(drop=True)
df2.head()

Unnamed: 0,label,text
0,0,Arrête de pleurer ton bonheur
1,0,J'aimerais être en stockholm maintenant ...
2,0,Fait une mission
3,0,En regardant le spectacle de jay leno dernier ...
4,0,J'ai enseigné jordan à prendre la tête ce soir...


In [8]:
# Initialize CamemBERT tokenizer
tokenizer =CamembertTokenizer.from_pretrained(
    'camembert-base',do_lower_case=True)

In [9]:
# Creates list of texts and labels
text = df2['text'].to_list()
labels = df2['label'].to_list()

#user tokenizer to convert sentences into tokenizer
input_ids  = [tokenizer.encode(sent,add_special_tokens=True,max_length=MAX_LEN) for sent in text]

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]  
    attention_masks.append(seq_mask)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(input_ids, labels, attention_masks,
                                                            random_state=42, test_size=0.2)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [11]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=2)
model.to(device)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 

CamembertForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Laye

In [12]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=10e-8)

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)



In [13]:
# Store our loss and accuracy for plotting if we want to visualize training evolution per epochs after the training process
train_loss_set = []

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):  
    # Tracking variables for training
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  
    # Train the model
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # Get loss value
        loss = outputs[0]
        # Add it to train loss list
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
    
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    


    # Tracking variables for validation
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Validation of the model
    model.eval()
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs =  model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss, logits = outputs[:2]
    
        # Move logits and labels to CPU if GPU is used
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 0.5719897872805595


Epoch:  20%|██        | 1/5 [58:16<3:53:05, 3496.35s/it]

Validation Accuracy: 0.7698412698412699
Train loss: 0.4124925308227539


Epoch:  40%|████      | 2/5 [1:58:34<2:58:24, 3568.15s/it]

Validation Accuracy: 0.7926587301587301
Train loss: 0.29708193969726565


Epoch:  60%|██████    | 3/5 [2:57:55<1:58:49, 3564.86s/it]

Validation Accuracy: 0.7926587301587301
Train loss: 0.1955599757358432


Epoch:  80%|████████  | 4/5 [3:57:02<59:17, 3557.87s/it]  

Validation Accuracy: 0.7807539682539683
Train loss: 0.13299895744770765


Epoch: 100%|██████████| 5/5 [4:56:55<00:00, 3563.17s/it]

Validation Accuracy: 0.7896825396825397





In [8]:
comments = ["La vie est très mauvaise, Je la déteste", "Quelle belle voiture c'est très magnifique"]

In [14]:
# Encode the comments


In [14]:
# Apply the finetuned model (Camembert)
flat_pred = []
with torch.no_grad():
    # Forward pass, calculate logit predictions
    outputs = model(prediction_inputs.to(device),token_type_ids=None, attention_mask=prediction_masks.to(device))
    logits = outputs[0]
    logits = logits.detach().cpu().numpy() 
    flat_pred.extend(np.argmax(logits, axis=1).flatten())

NameError: name 'prediction_inputs' is not defined

In [16]:
for i in range(len(flat_pred)):
    print('Comment: ', comments[i])
    print('Label', flat_pred[i])

Comment:  La vie est très mauvaise, Je la déteste
Label 0
Comment:  Quelle belle voiture c'est très magnifique
Label 1


In [17]:
import re
def nlp_pipeline(a):
    i=[]
    for text in a:
        

        text = text.lower()
        text = text.replace('\n', ' ').replace('\r', '')
        text = ' '.join(text.split())
        text = re.sub(r"[A-Za-z\.]*[0-9]+[A-Za-z%°\.]*", "", text)
        text = re.sub(r"(\s\-\s|-$)", "", text)
        text = re.sub(r"[,\!\?\%\(\)\/\"]", "", text)
        text = re.sub(r"\&\S*\s", "", text)
        text = re.sub(r"\&", "", text)
        text = re.sub(r"\+", "", text)
        text = re.sub(r"\#", "", text)
        text = re.sub(r"\$", "", text)
        text = re.sub(r"\£", "", text)
        text = re.sub(r"\%", "", text)
        text = re.sub(r"\:", "", text)
        text = re.sub(r"\@", "", text)
        text = re.sub(r"\-", "", text)
        i.append(text)

    return i

In [18]:
df['text']=nlp_pipeline(df['text'])

In [16]:
df['text'][3]

"jeanyvescapo  alainhoupert pas compris c'est récent  ils veulent imposer la vax des femmes enceintes  contre indiquee sur la notice initial pfizer "

In [40]:
comments = df['text'][:1000]


In [41]:
len(comments)

1000

In [42]:
df['text'][0]

"tellement fier d'être vacciné trois fois contre le avec le vaccin pfizer vendredi mars mercredi mars jeudi décembre httpst.coqgeqrhuwos"

In [43]:
# Encode the comments
tokenized_comments_ids = [tokenizer.encode(comment,add_special_tokens=True,max_length=MAX_LEN) for comment in comments]
# Pad the resulted encoded comments
tokenized_comments_ids = pad_sequences(tokenized_comments_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks 
attention_masks = []
for seq in tokenized_comments_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

prediction_inputs = torch.tensor(tokenized_comments_ids)
prediction_masks = torch.tensor(attention_masks)

In [44]:
flat_pred = []
with torch.no_grad():
    # Forward pass, calculate logit predictions
    outputs =  model(prediction_inputs.to(device),token_type_ids=None, attention_mask=prediction_masks.to(device))
    logits = outputs[0]
    logits = logits.detach().cpu().numpy() 
    flat_pred.extend(np.argmax(logits, axis=1).flatten())

In [22]:
liste1=flat_pred

In [23]:
len(liste2)

2000

In [33]:
liste1[0:10]

[1, 1, 0, 0, 1, 1, 0, 0, 1, 0]

In [27]:
liste2=flat_pred

In [50]:
liste3==liste4

False

In [31]:
liste3=flat_pred

In [36]:
liste4=flat_pred

In [37]:
listefinal=liste1+liste2+liste3+liste4

In [47]:
len(listefinal)

8611

In [39]:
len(liste3)

2000

In [45]:
liste0=flat_pred

In [46]:
listefinal=liste0+liste1+liste2+liste3+liste4

In [48]:
df['label']=listefinal

In [49]:
df.head()

Unnamed: 0,author_id,name,username,text,created_at,Sexe,label
0,1.33e+18,Alexandre M A Caron,Alexand14248460,tellement fier d'être vacciné trois fois contr...,2022-04-29T15:47:14.000Z,M,1
1,1.33e+18,Alexandre M A Caron,Alexand14248460,jemouth tellement fier d'être vacciné trois fo...,2022-04-29T15:33:49.000Z,M,1
2,1469749000.0,GeB. 🥕🥕🥕💀💀💀,GerardBondeau,drelidavid idrissaberkane 🔴 si... moderna bais...,2022-04-29T08:46:59.000Z,M,0
3,1469749000.0,GeB. 🥕🥕🥕💀💀💀,GerardBondeau,jeanyvescapo alainhoupert pas compris c'est ré...,2022-04-26T19:57:10.000Z,M,0
4,1469749000.0,GeB. 🥕🥕🥕💀💀💀,GerardBondeau,l’ancien responsable de la rech respiratoire c...,2022-04-21T17:42:44.000Z,M,0


In [50]:
df.to_csv('bddfinalized.csv')

In [51]:
df['label'].value_counts()

0    4641
1    3970
Name: label, dtype: int64

In [53]:
df.groupby('username')

ValueError: Boolean array expected for the condition, not object

In [106]:
comments =["Le vaccin Covid semble être efficace, ce qui donne confiance à de nombreuses personnes. Mais pour moi, je pense qu'il est dangereux"]

In [107]:
tokenized_comments_ids = [tokenizer.encode(comment,add_special_tokens=True,max_length=MAX_LEN) for comment in comments]
# Pad the resulted encoded comments
tokenized_comments_ids = pad_sequences(tokenized_comments_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks 
attention_masks = []
for seq in tokenized_comments_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

prediction_inputs = torch.tensor(tokenized_comments_ids)
prediction_masks = torch.tensor(attention_masks)

In [108]:
flat_pred = []
with torch.no_grad():
    # Forward pass, calculate logit predictions
    outputs =  model(prediction_inputs.to(device),token_type_ids=None, attention_mask=prediction_masks.to(device))
    logits = outputs[0]
    logits = logits.detach().cpu().numpy() 
    flat_pred.extend(np.argmax(logits, axis=1).flatten())

In [109]:
for i in range(len(flat_pred)):
    print('Comment: ', comments[i])
    print('Label', flat_pred[i])

Comment:  Le vaccin Covid semble être efficace, ce qui donne confiance à de nombreuses personnes. Mais pour moi, je pense qu'il est dangereux
Label 0
