In [None]:
import pandas as pd
import numpy as np
from transformers import CamembertTokenizer, CamembertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader

# Manually labelled sample that is later excluded from the corpus before inference.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only load trusted files.
df_labeled = pd.read_pickle("data/manually_labelled_df.pkl")

# Full cleaned corpus that still needs predicted labels.
df_entire = pd.read_pickle("data/df_clean.pkl")

In [None]:
# Corpus size is printed; the labelled-sample size is the cell's displayed value.
print(df_entire.shape[0])
df_labeled.shape[0]

401987


4019

In [None]:
# Number of distinct author ids present in the corpus.
len(pd.unique(df_entire["author_id"]))

38836

In [None]:
# Rows that agree on every one of these columns are treated as duplicates.
columns_to_check = [
    'author_id',
    'created_at',
    'retweet_count',
    'reply_count',
    'like_count',
    'quote_count',
    'lang',
    'tweet',
]

# De-duplicate the corpus in place, keeping the first occurrence of each repeated row,
# then display how many rows are left.
df_entire.drop_duplicates(subset=columns_to_check, keep='first', inplace=True)
df_entire.shape[0]

401067

In [None]:
# Expected number of rows left once the labelled sample is removed from the
# de-duplicated corpus. Derived from the frames instead of numbers copied from
# earlier outputs, so it stays correct if the data changes.
len(df_entire) - len(df_labeled)

397048

In [None]:
# Remove the manually labelled rows from the corpus (anti-join on the key columns).

# Drop bookkeeping columns and the old label column from the labelled sample.
df_labeled = df_labeled.drop(columns=["_merge", "day", "week", "labels"])

# Columns that identify the same tweet in both frames.
columns_to_compare = ['author_id', 'created_at', 'retweet_count', 'reply_count', 'like_count', 'quote_count', 'lang', 'tweet', 'tweet_clean0']

# Merge against the key columns only. Merging the whole labelled frame would copy its
# other columns (e.g. "new_labels") into df_remaining as all-NaN columns, or create
# _x/_y suffixed duplicates for any non-key column the two frames share.
labeled_keys = df_labeled[columns_to_compare].drop_duplicates()
df_remaining = pd.merge(df_entire, labeled_keys, on=columns_to_compare, how='left', indicator=True)

# Keep the rows with no match in the labelled sample, then drop the merge indicator.
df_remaining = df_remaining[df_remaining['_merge'] == 'left_only'].drop(columns=['_merge'])

In [None]:
# Number of corpus rows that still need a predicted label.
df_remaining.shape[0]

397048

In [None]:
# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Restore the fine-tuned classifier from its local checkpoint directory.
model = CamembertForSequenceClassification.from_pretrained("models/camembert_model")

# Report which device was selected.
if device.type == 'cuda':
    print("GPU is available.")
    print(f"Using device: {device}")
else:
    print("GPU is not available. Using CPU.")

GPU is available.
Using device: cuda


In [None]:
# CamemBERT tokenizer loaded from the public base checkpoint.
# NOTE(review): confirm these tokenizer settings match the ones used when the model was fine-tuned.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)

# Texts to classify, as a plain Python list in the row order of df_remaining.
text_remaining = df_remaining['tweet_clean0'].tolist()

# Number of sequences fed to the model per inference step.
batch_size = 16

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]



In [None]:
# Maximum sequence length, in tokens, that the tokenizer reports for this checkpoint.
model_max_length = tokenizer.model_max_length
print(model_max_length)

512


In [None]:
# Longest tokenized text (special tokens included) among the texts to classify;
# 0 when there are no texts.
max_length = max(
    (len(tokenizer.encode(sent, add_special_tokens=True)) for sent in text_remaining),
    default=0,
)

# Padded sequence length: the longest text plus a small buffer, capped at the limit the
# tokenizer reports for the model so the padded inputs never exceed what the model accepts.
MAX_LEN = min(max_length + 10, tokenizer.model_max_length)

def preprocess_data(texts, max_len=None, tok=None):
    """Tokenize texts into fixed-length input-id and attention-mask tensors.

    Args:
        texts: Iterable of strings to encode.
        max_len: Length every sequence is padded or truncated to. Defaults to the
            notebook-level MAX_LEN.
        tok: Tokenizer to call. Defaults to the notebook-level `tokenizer`.

    Returns:
        A tuple (input_ids, attention_masks) of tensors with shape
        (len(texts), max_len). Both have shape (0, max_len) when texts is empty.
    """
    if tok is None:
        tok = tokenizer
    if max_len is None:
        max_len = MAX_LEN

    texts = list(texts)
    if not texts:
        # torch.cat rejects an empty list, so return correctly shaped empty tensors.
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()

    # Encode in chunks so the intermediate Python lists stay small on large corpora.
    chunk_size = 4096
    id_chunks = []
    mask_chunks = []
    for start in range(0, len(texts), chunk_size):
        encoded = tok(
            texts[start:start + chunk_size],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            # Without truncation a text longer than max_len produces a longer row and
            # the concatenation below fails on mismatched widths.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        id_chunks.append(encoded['input_ids'])
        mask_chunks.append(encoded['attention_mask'])

    input_ids = torch.cat(id_chunks, dim=0)
    attention_masks = torch.cat(mask_chunks, dim=0)

    return input_ids, attention_masks

In [None]:
# Display the padded sequence length that preprocess_data uses.
MAX_LEN

151

In [None]:
# Announce the tokenization step.
print("Tokenizing and preprocessing the remaining data...")

# Encode every remaining text into input-id and attention-mask tensors.
inputs, masks = preprocess_data(text_remaining)

# Wrap the tensors in a dataset and iterate it in its original order, so that the
# predictions can later be assigned back to df_remaining row by row.
remaining_data = TensorDataset(inputs, masks)
remaining_sampler = SequentialSampler(remaining_data)
remaining_dataloader = DataLoader(
    dataset=remaining_data,
    batch_size=batch_size,
    sampler=remaining_sampler,
)

Tokenizing and preprocessing the remaining data...


In [None]:
# Number of encoded sequences (size of the first dimension of the input-id tensor).
inputs.shape[0]

397048

In [None]:
# Scratch arithmetic. NOTE(review): 16 matches batch_size, and 22276 is presumably a
# batch count from an earlier run — confirm where it comes from or remove this cell.
22276 * 16

356416

In [None]:
# Inference over the remaining (unlabelled) data.
model.to(device)  # the model must live on the same device as the batches

# Evaluation mode: switches off dropout and other training-only behaviour.
model.eval()

# Predicted class index for every remaining row, in dataloader order.
predictions_remaining = []

total_batches = len(remaining_dataloader)
for step, (input_ids, input_mask) in enumerate(remaining_dataloader, start=1):
    # Move this batch to the model's device.
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)

    # Gradients are not needed for prediction.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=input_mask).logits

    # The predicted class is the index of the largest logit.
    predictions_remaining.extend(logits.argmax(dim=1).cpu().numpy())

    # Report progress every 1000 batches.
    if step % 1000 == 0:
        print(f"Processed batch {step}/{total_batches}")

# Signal that all batches have been processed.
print("Model evaluation completed.")

Processed batch 1000/24816
Processed batch 2000/24816
Processed batch 3000/24816
Processed batch 4000/24816
Processed batch 5000/24816
Processed batch 6000/24816
Processed batch 7000/24816
Processed batch 8000/24816
Processed batch 9000/24816
Processed batch 10000/24816
Processed batch 11000/24816
Processed batch 12000/24816
Processed batch 13000/24816
Processed batch 14000/24816
Processed batch 15000/24816
Processed batch 16000/24816
Processed batch 17000/24816
Processed batch 18000/24816
Processed batch 19000/24816
Processed batch 20000/24816
Processed batch 21000/24816
Processed batch 22000/24816
Processed batch 23000/24816
Processed batch 24000/24816
Model evaluation completed.


In [None]:
# Attach the model's predictions to the rows that were not manually labelled.
# predictions_remaining is in dataloader order, which follows the row order of df_remaining.
df_remaining['labels'] = predictions_remaining

# The manual annotations live in "new_labels"; give them the same column name as the predictions.
df_labeled = df_labeled.rename(columns={"new_labels": "labels"})

# Stack predicted and manually labelled rows. ignore_index gives the combined frame a
# fresh 0..n-1 index instead of repeating index labels from the two source frames.
df_final = pd.concat([df_remaining, df_labeled], axis=0, ignore_index=True)

# Save the full labelled dataset (predicted + manual labels) to a CSV file.
df_final.to_csv("data/entire_dataset_labeled_final.csv", index=False)

In [None]:
# Total number of rows in the combined, fully labelled dataset.
df_final.shape[0]

401067

In [None]:
# Count the occurrences of each label. minlength=2 guarantees entries for both
# classes, so counts[1] exists even when no row carries label 1.
counts = np.bincount(df_final["labels"], minlength=2)

# Share of each class among all labelled rows, as a percentage.
total_count = len(df_final["labels"])
percentage_1 = (counts[1] / total_count) * 100
percentage_0 = (counts[0] / total_count) * 100

print(f"Percentage of 1s: {percentage_1:.2f}%")
print(f"Percentage of 0s: {percentage_0:.2f}%")

Percentage of 1s: 84.44%
Percentage of 0s: 15.56%


In [None]:
# Draw 100 random rows for a manual spot check; the fixed seed makes a re-run show the same rows.
sample = df_final.sample(100, random_state=42)

In [None]:
# Renumber the sampled rows from 0.
sample = sample.reset_index(drop=True)

# Print each sampled tweet followed by its label. Iterating over the rows themselves
# (rather than a hard-coded range(100)) keeps this cell correct if the sample size changes.
for text, label in zip(sample["tweet_clean0"], sample["labels"]):
    print(text)
    print(label)
    print("*******************")

Bonjour 👋Allez c'est repartie pour la semaine ,alors je vous souhaite de passer une bonne journée ,et bon courage aussi🤗🙏☀️😚😘#NonAuPassDeLaHonte #NonAuVaccinObligatoire #OnNeTouchePasAMesEnfants 🇫🇷✊🇫🇷💙🤍❤️
1
*******************
Le pédophile soixante-huitard #CohnBendit s’en prend aux parents qui refusent le vakxkxin à leurs enfants : « Le vakxkxin c’est formidable.les pauvres petits ils ne veulent pas être vakxkxinés ? Qu’ils aillent se faire foutre. »#Vaccin #vaccination #DictatureSanitaire
0
*******************
FLASH🇫🇷- Les parc aquatique pourront réouvrir des le 19 mai… mais sans eau . #COVID19 #NonAuPassSanitaire
1
*******************
Et la marmotte elle met le chocolat dans le papier !NonAuPassDeLaHonte#NonALaVaccinationObligatoire#DictatureSanitaire#FautArreterDeNousPrendrePourDesCons#ÇaSuffitLesConneries
1
*******************
J'espère que #Pfizer #AstraZeneca #Moderna vous remercieront #Macron20h #Macron #LR #Melenchon #Presidentielle2022 #LREM #ITA #CONCOURS #TDF2021 #TDF #Xbox 