In [None]:
import os
import io
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import os
# Raise the Hugging Face Hub network timeouts (values are seconds, as strings)
# so that large downloads do not abort on a slow connection.
# NOTE(review): set these before `transformers` / `huggingface_hub` is first
# imported; the library is expected to read them at import time.
os.environ['HF_HUB_TIMEOUT'] = '600'  # original setting, kept for backward compatibility
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '600'  # timeout while downloading a file
os.environ['HF_HUB_ETAG_TIMEOUT'] = '600'  # timeout for the file-metadata request


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [None]:
# Load the tokenizer published with the 'AhmedBou/TuniBert' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('AhmedBou/TuniBert')

# Load the same checkpoint wrapped for sequence classification.
# NOTE(review): no `num_labels` is passed here, so the size of the output layer
# presumably comes from the checkpoint's own config — confirm it matches the
# label set in the dataset.
model = AutoModelForSequenceClassification.from_pretrained('AhmedBou/TuniBert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

# Uploading our Data


In [None]:
def text_preprocessing(text):
    """Clean a raw comment before it is tokenized.

    Steps, in order:
      1. Remove every '@mention' (an '@' plus the non-whitespace run that
         follows it), including a mention that ends the text.
      2. Replace the HTML entity '&amp;' with '&'.
      3. Collapse each run of whitespace into a single space and strip both ends.

    @param    text (str): Raw text.
    @return   (str): Cleaned text.
    """
    # Remove '@name'. Matching the non-whitespace run (instead of requiring a
    # whitespace character after the mention) also removes a mention that is
    # the last thing in the text.
    text = re.sub(r'@\S*', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Collapse repeated whitespace and trim leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Read the labelled dataset; later cells use its `text` and `label` columns.
# NOTE(review): absolute Colab path — the CSV has to exist at /content/df_tot.csv.
df=pd.read_csv("/content/df_tot.csv")

In [None]:
import torch

# Pick the compute device: the CUDA GPU when one is usable, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla T4


In [None]:
# Fixed sequence length in tokens: texts are truncated or padded to this length
# by `preprocessing_for_bert`, and it is the default `max_len` used at prediction time.
MAX_LEN = 128

In [None]:
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.

    Every entry is cleaned with `text_preprocessing` and then encoded by the
    module-level `tokenizer`: `[CLS]`/`[SEP]` are added and the sequence is
    truncated or padded to exactly `MAX_LEN` token ids.

    @param    data (iterable): Texts to be processed. Missing values (None/NaN)
              are treated as empty strings; any other non-string value is
              converted with `str()`.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model,
              shape (number of texts, MAX_LEN).
    @return   attention_masks (torch.Tensor): Tensor of the same shape specifying
              which tokens should be attended to by the model.
    """
    # Normalise and clean every entry first so the tokenizer only sees strings.
    cleaned_texts = []
    for t in data:
        if not isinstance(t, str):
            t = "" if pd.isna(t) else str(t)
        cleaned_texts.append(text_preprocessing(t))

    # Nothing to encode: return correctly shaped, integer-typed empty tensors
    # instead of handing an empty batch to the tokenizer.
    if not cleaned_texts:
        empty = torch.empty((0, MAX_LEN), dtype=torch.long)
        return empty, empty.clone()

    # One batched tokenizer call replaces the per-sentence `encode_plus`
    # (deprecated in favour of calling the tokenizer directly). It will:
    #    (1) Tokenize each sentence
    #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
    #    (3) Truncate/Pad each sentence to max length
    #    (4) Map tokens to their IDs
    #    (5) Create the attention masks
    #    (6) Return the results as PyTorch tensors
    encoded = tokenizer(
        cleaned_texts,
        add_special_tokens=True,     # Add `[CLS]` and `[SEP]`
        max_length=MAX_LEN,          # Max length to truncate/pad
        truncation=True,             # Activate truncation
        padding='max_length',        # Pad every sentence to max length
        return_attention_mask=True,  # Return attention masks
        return_tensors='pt',         # Return torch tensors directly
    )

    return encoded['input_ids'], encoded['attention_mask']


In [None]:
# Count the rows whose `text` value is not already a Python string.
is_not_string = df['text'].apply(lambda value: not isinstance(value, str))
non_string_count = is_not_string.sum()
print(f"Number of non-string entries: {non_string_count}")

# Normalise the column: missing values become "", every other value is cast with str().
df['text'] = df['text'].apply(lambda value: "" if pd.isna(value) else str(value))


Number of non-string entries: 1665


In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the raw texts, targets are their labels.
X = df["text"].values
y = df["label"].values

# Stage 1: hold out 20% of the rows (stratified on the label); the remaining
# 80% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Stage 2: cut the held-out part in half (again stratified), giving a
# validation set and a test set of about 10% of the original data each.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")


Training set size: 179859
Validation set size: 22482
Test set size: 22483


In [None]:
# Encode the training and validation texts into token-id and attention-mask tensors.
# Note: the test split (X_test) is not tokenized in this cell.
print('Tokenizing data at the moment...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)

Tokenizing data at the moment...


In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Labels as tensors, so they can be bundled with the encoded inputs.
train_labels, val_labels = torch.tensor(y_train), torch.tensor(y_val)

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order; validation batches keep dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)

# Training the model

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.

    Relies on module-level state: the already loaded `model`, the target
    `device`, and `train_dataloader` (its length is the number of optimizer
    steps per epoch).

    Note that the returned classifier IS the module-level `model` object; no
    fresh copy is created, so calling this function again does not reset
    weights that have already been trained.

    @param    epochs (int): Number of epochs the learning-rate schedule spans.
    @return   bert_classifier: `model`, moved to `device`.
    @return   optimizer (torch.optim.AdamW): Optimizer over all model parameters.
    @return   scheduler: Linear schedule with no warmup over
              `len(train_dataloader) * epochs` steps.
    """
    # Reuse the pretrained classifier loaded earlier and move it to the device.
    bert_classifier = model
    bert_classifier.to(device)

    # `transformers.AdamW` is deprecated in favour of the PyTorch optimizer.
    # `weight_decay=0.0` is passed explicitly because the PyTorch default is
    # 0.01, whereas the deprecated class defaulted to no weight decay.
    optimizer = torch.optim.AdamW(bert_classifier.parameters(),
                                  lr=5e-5,           # Learning rate
                                  eps=1e-8,          # Epsilon value
                                  weight_decay=0.0   # Keep the previous behaviour
                                  )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [None]:
import random
import time
import torch as nn
import torch
import torch.nn as nn

# Specify loss function: cross-entropy between the model's logits and the
# class labels. `train` and `evaluate` both read this module-level object.
loss_fn = nn.CrossEntropyLoss()


def set_seed(seed_value=42):
    """Seed every random number generator used in this notebook.

    Covers Python's `random`, NumPy's global generator, PyTorch on the CPU and
    PyTorch on all CUDA devices, in that order.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False,
          optimizer=None, scheduler=None):
    """Train the BertClassifier model.

    Runs `epochs` passes over `train_dataloader`. For every batch it computes
    the module-level `loss_fn` on the model's logits, back-propagates, clips
    the gradient norm to 1.0 and steps the optimizer (and the scheduler, when
    there is one). A progress line is printed every 20 steps and at the last
    step of each epoch.

    @param    model: Model whose forward pass returns an object with `.logits`.
    @param    train_dataloader: Yields (input_ids, attention_mask, labels) batches.
    @param    val_dataloader: Same batch layout; required when `evaluation=True`.
    @param    epochs (int): Number of passes over the training data.
    @param    evaluation (bool): If True, run `evaluate` after every epoch and
              print the validation loss and accuracy.
    @param    optimizer: Optimizer to step. If omitted, the module-level
              `optimizer` (as created by `initialize_model`) is used.
    @param    scheduler: Learning-rate scheduler stepped after every optimizer
              step. If `optimizer` is omitted, the module-level `scheduler` is
              used; if none is available the learning rate is left unscheduled.
    @raise    ValueError: If `evaluation=True` without a `val_dataloader`, or if
              no optimizer is given and none exists at module level.
    """
    # Fail before any training starts rather than after the first full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader.")

    # Backward compatibility: older calls relied on `optimizer` / `scheduler`
    # being module-level variables, so fall back to those when not passed in.
    if optimizer is None:
        module_state = globals()
        optimizer = module_state.get("optimizer")
        if scheduler is None:
            scheduler = module_state.get("scheduler")
    if optimizer is None:
        raise ValueError("No optimizer given and no module-level `optimizer` is defined.")

    num_batches = len(train_dataloader)

    print("Start training...\n")
    for epoch_i in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()
        # total_loss covers the whole epoch; batch_loss / batch_counts cover
        # only the steps since the last progress line.
        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Clear gradients left over from the previous step.
            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_attn_mask)
            logits = outputs.logits  # Extract logits from the outputs

            loss = loss_fn(logits, b_labels)
            step_loss = loss.item()  # read the scalar once
            batch_loss += step_loss
            total_loss += step_loss

            loss.backward()
            # Clip the gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            if (step % 20 == 0 and step != 0) or (step == num_batches - 1):
                time_elapsed = time.time() - t0_batch
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / num_batches
        print("-"*70)

        if evaluation:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    print("Training complete!")



def evaluate(model, val_dataloader):
    """Evaluate the model on the validation set.

    Puts the model in eval mode and runs every batch without gradient
    tracking. Loss and accuracy are averaged over individual examples, so a
    smaller final batch is not over-weighted (which happens when per-batch
    means are averaged).

    @param    model: Model whose forward pass returns an object with `.logits`.
    @param    val_dataloader: Yields (input_ids, attention_mask, labels) batches.
    @return   val_loss (float): Mean per-example loss. This assumes the
              module-level `loss_fn` returns the batch mean, as
              `nn.CrossEntropyLoss()` does with its default reduction.
    @return   val_accuracy (float): Percentage (0-100) of correctly classified
              examples. Both values are NaN if the dataloader yields no examples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0

    for batch in val_dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attn_mask)
            logits = outputs.logits  # Extract logits from the outputs
            loss = loss_fn(logits, b_labels)

        n_examples = b_labels.size(0)
        # Undo the per-batch mean so every example carries the same weight.
        loss_sum += loss.item() * n_examples

        preds = torch.argmax(logits, dim=1).flatten()
        correct += (preds == b_labels).sum().item()
        seen += n_examples

    if seen == 0:
        # Nothing was evaluated; report NaN rather than dividing by zero.
        return float("nan"), float("nan")

    val_loss = loss_sum / seen
    val_accuracy = 100.0 * correct / seen

    return val_loss, val_accuracy


In [None]:
# Build the classifier, optimizer and scheduler; the learning-rate schedule is
# sized for 3 epochs over `train_dataloader`. `train` picks up `optimizer` and
# `scheduler` from these module-level names.
bert_classifier, optimizer, scheduler = initialize_model(epochs=3)



In [None]:
# Concatenate the train set and the validation set
full_train_data = torch.utils.data.ConcatDataset([train_data, val_data])
full_train_sampler = RandomSampler(full_train_data)
# Use the notebook-wide `batch_size` rather than repeating the literal 32, so
# this loader stays in step with the train/validation loaders.
full_train_dataloader = DataLoader(full_train_data, sampler=full_train_sampler, batch_size=batch_size)

In [None]:
def predict_comment_posneg_reqneu(comment, model, tokenizer, max_len=MAX_LEN):
    """Preprocess the input comment and use the model to predict the binary label.

    The comment goes through the same cleaning (`text_preprocessing`) and the
    same tokenizer settings as the training data, so the model sees inputs in
    the format it was trained on.

    Args:
    - comment (str): The input comment to classify. A missing value (None/NaN)
      is treated as an empty string; other non-strings are converted with `str()`.
    - model (torch.nn.Module): The trained BERT model.
    - tokenizer (transformers.PreTrainedTokenizer): The tokenizer associated with the BERT model.
    - max_len (int): Maximum length for padding/truncation.

    Returns:
    - label (int): Predicted label for the comment (0 for neutral/request, 1 for positive/negative).

    Raises:
    - ValueError: If the model predicts a class index other than 0, 1 or 2.
    """
    # Same non-string handling as `preprocessing_for_bert`.
    if not isinstance(comment, str):
        comment = "" if pd.isna(comment) else str(comment)

    # Clean and encode the comment (calling the tokenizer directly replaces
    # the deprecated `encode_plus`).
    encoded_comment = tokenizer(
        text_preprocessing(comment),  # Same cleaning as at training time
        add_special_tokens=True,      # Add special tokens [CLS] and [SEP]
        max_length=max_len,           # Pad or truncate to max length
        padding='max_length',         # Pad to max length
        truncation=True,              # Truncate longer sentences
        return_attention_mask=True,   # Generate attention mask
        return_tensors='pt'           # Return PyTorch tensors
    )

    # Move tensors to the device (GPU/CPU)
    input_ids = encoded_comment['input_ids'].to(device)
    attention_mask = encoded_comment['attention_mask'].to(device)

    # Put the model in evaluation mode
    model.eval()

    with torch.no_grad():
        # Perform forward pass and get logits
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Softmax does not change which class scores highest, so the predicted
    # class is read straight from the logits. This also removes the need for
    # `torch.nn.functional`, which is imported only in a later cell.
    predicted_class = torch.argmax(logits, dim=1).item()

    # Map the model's class index to the binary label
    if predicted_class == 0:
        return 0  # Neutral/Request
    if predicted_class in (1, 2):
        return 1  # Positive/Negative (class 2 is assumed to belong here too)
    raise ValueError("Unexpected predicted class.")

In [None]:
import torch
import torch.nn.functional as F

In [None]:
# Load the saved model
model_path = '/content/bert_posneg_reqneu.pth'

# Read the fine-tuned weights onto the device selected earlier.
# `weights_only=True` restricts unpickling to tensors and plain containers,
# which is all a state dict needs, and avoids executing arbitrary pickled code
# from the checkpoint file.
state_dict = torch.load(model_path, map_location=device, weights_only=True)

# Assuming you have already defined or loaded `bert_classifier`
# Load the state dict into your BERT classifier
bert_classifier.load_state_dict(state_dict)
bert_classifier.eval()  # Set the model to evaluation mode


  bert_classifier.load_state_dict(torch.load(model_path, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Example message to classify
input_text = "walahi"

# Predict the label using the `predict_comment_posneg_reqneu` function
# (0 = neutral/request, 1 = positive/negative, per that function's docstring).
predicted_label = predict_comment_posneg_reqneu(input_text, bert_classifier, tokenizer)

# Output the result
print(f"Predicted label: {predicted_label}")

Predicted label: 1
