In [None]:
# Import all necessary libraries
import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer,
    BertForSequenceClassification
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import os
from google.colab import files
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizerFast




In [None]:

# Debug aid for CUDA errors.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is documented to make kernel launches
# synchronous, which usually costs throughput - confirm it is still wanted
# for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


def setup_device():
    """Choose the torch device for this notebook and print what was picked.

    Returns:
        torch.device: ``cuda`` when PyTorch reports a usable GPU, else ``cpu``.
    """
    # Guard clause: no GPU means there is nothing else to report.
    if not torch.cuda.is_available():
        selected = torch.device('cpu')
        print("Using CPU")
        return selected

    selected = torch.device('cuda')
    gpu_name = torch.cuda.get_device_name(0)
    print(f"Using GPU: {gpu_name}")
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU Memory: {total_gib:.1f} GB")
    # Drop PyTorch's cached GPU memory before the heavy cells run.
    torch.cuda.empty_cache()
    return selected


device = setup_device()

Using GPU: Tesla T4
GPU Memory: 14.7 GB


In [None]:
# Ask for the training CSV through Colab's upload widget.
uploaded = files.upload()

# Echo every received file together with its size in bytes.
for fn in uploaded:
  print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')

Saving Training_clean.csv to Training_clean.csv
User uploaded file "Training_clean.csv" with length 8740240 bytes


In [None]:
# ---- Run configuration ----
CSV_FILE_PATH = "Training_clean.csv"  # Path of the training CSV uploaded above
MODEL_NAME = 'bert-base-uncased'  # Use 'distilbert-base-uncased' for faster training
MAX_LENGTH = 128  # Token length every tweet is padded/truncated to
BATCH_SIZE = 8  # Lower this further (e.g. 4) if you get memory errors
EPOCHS = 3
LEARNING_RATE = 2e-5

# ---- Tokenizer for the configured checkpoint ----
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# ---- Read the CSV and print a quick overview ----
df = pd.read_csv(CSV_FILE_PATH)
print(f"Dataset shape: {df.shape}")
column_names = df.columns.tolist()
print(column_names)
print(df.head())

# ---- Report missing values per column, then drop incomplete rows ----
print("\nMissing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

df = df.dropna()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Dataset shape: (39650, 3)
['Tweet_ID', 'type', 'tweet']
      Tweet_ID             type  \
0  ID_0022DWKP  sexual_violence   
1  ID_00395QYM  sexual_violence   
2  ID_003EOSSF  sexual_violence   
3  ID_004BBHOD  sexual_violence   
4  ID_004F7516  sexual_violence   

                                               tweet  
0  had a dream i got raped last night by a guy i ...  
1  he thought the word raped means sex and told m...  
2  she not talking to me i was raped by men moles...  
3  i was sexually abused for years at age to no o...  
4  chessy prout can do better by telling the trut...  

Missing values:
Tweet_ID    0
type        0
tweet       0
dtype: int64


In [None]:
# Turn the string categories in `type` into integer class ids.
le = LabelEncoder()
df['label'] = le.fit_transform(df["type"])

# Hold out 10% for validation, stratified on the label so the class
# proportions are kept in both parts; the fixed seed makes the split repeatable.
all_tweets = df['tweet'].tolist()
all_label_ids = df['label'].tolist()
train_text, val_text, train_label, val_label = train_test_split(
    all_tweets,
    all_label_ids,
    test_size=0.1,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# Rebind `tokenizer` to the fast implementation, loaded from the configured
# MODEL_NAME instead of a repeated checkpoint literal so the two cannot drift.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per requested item.

    Args:
        texts: Iterable of raw texts (list, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every text is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialize to lists so __getitem__ always indexes by POSITION.
        # A pandas Series would be indexed by label instead, which breaks
        # (KeyError or wrong row) as soon as its index is not 0..n-1,
        # e.g. after dropna() or filtering.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        # str() keeps non-string cells from crashing the tokenizer.
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # The tokenizer output carries a leading batch dimension; flatten
            # so the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
# Pass MAX_LENGTH explicitly so the configured sequence length is what is
# actually used, rather than silently relying on the class default.
train_dataset = TweetDataset(train_text, train_label, tokenizer, MAX_LENGTH)
val_dataset = TweetDataset(val_text, val_label, tokenizer, MAX_LENGTH)

# NOTE(review): training/validation batches use 16 while BATCH_SIZE (8) is
# used for inference later on - confirm whether these should be unified.
TRAIN_BATCH_SIZE = 16

# Only the training data is shuffled; validation order is irrelevant to the metrics.
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=TRAIN_BATCH_SIZE)


In [None]:
# Build the classifier from the configured checkpoint (MODEL_NAME) with one
# output unit per class found by the label encoder.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(le.classes_)
)
# Assigning the result keeps the cell from echoing the full module repr.
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Use the learning rate from the configuration cell instead of a repeated literal.
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Take the epoch count from the configuration cell; `epochs` is kept as a name
# for any later cell that refers to it.
epochs = EPOCHS

for epoch in range(epochs):
    model.train()
    # The description is constant for the whole epoch, so set it once here
    # instead of on every batch.
    loop = tqdm(train_loader, leave=True, desc=f'Epoch {epoch+1}')
    total_loss = 0.0
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the classification loss directly.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the running sum and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    avg_train_loss = total_loss / len(train_loader)
    print(f'Average Training Loss: {avg_train_loss:.4f}')


Epoch 1: 100%|██████████| 2231/2231 [13:48<00:00,  2.69it/s, loss=0.000323]


Average Training Loss: 0.0319


Epoch 2: 100%|██████████| 2231/2231 [13:43<00:00,  2.71it/s, loss=0.000646]


Average Training Loss: 0.0045


Epoch 3: 100%|██████████| 2231/2231 [13:42<00:00,  2.71it/s, loss=0.000177]

Average Training Loss: 0.0040





In [None]:
# Score the fine-tuned model on the held-out validation split.
model.eval()
val_preds, val_true = [], []

with torch.no_grad():  # inference only, no gradients needed
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        preds = logits.argmax(dim=1)

        val_preds += list(preds.cpu().numpy())
        val_true += list(labels.cpu().numpy())

# `epoch` is the loop variable left over from the training cell.
print(f'Validation Classification Report Epoch {epoch+1}:')
report = classification_report(val_true, val_preds, target_names=le.classes_)
print(report)


Validation Classification Report Epoch 3:
                              precision    recall  f1-score   support

Harmful_Traditional_practice       1.00      1.00      1.00        19
           Physical_violence       0.99      1.00      1.00       594
           economic_violence       1.00      1.00      1.00        22
          emotional_violence       1.00      1.00      1.00        65
             sexual_violence       1.00      1.00      1.00      3265

                    accuracy                           1.00      3965
                   macro avg       1.00      1.00      1.00      3965
                weighted avg       1.00      1.00      1.00      3965



In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded as a unit.
SAVE_DIRECTORY = './save_gbv_bert_model'

for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_DIRECTORY)
print(f"Model saved to {SAVE_DIRECTORY}")

# Optional: Load model (for future use)
def load_saved_model(model_directory):
    """Load a previously saved model and tokenizer from ``model_directory``.

    Args:
        model_directory: Directory written by ``save_pretrained``.

    Returns:
        tuple: ``(model, tokenizer)`` on success, with the model already moved
        to the notebook's ``device``. On any failure the error is printed and
        ``(None, None)`` is returned, so the documented
        ``loaded_model, loaded_tokenizer = load_saved_model(...)`` unpacking
        never raises a TypeError.
    """
    try:
        loaded_model = BertForSequenceClassification.from_pretrained(model_directory)
        # Use the same (fast) tokenizer class that was saved alongside the model.
        loaded_tokenizer = BertTokenizerFast.from_pretrained(model_directory)
        loaded_model.to(device)
        print(f"Model loaded from {model_directory}")
        return loaded_model, loaded_tokenizer
    except Exception as e:
        # Best-effort loader: report the problem and hand back an unpackable pair.
        print(f"Error loading model: {e}")
        return None, None

# Example of loading (uncomment to test)
# loaded_model, loaded_tokenizer = load_saved_model(SAVE_DIRECTORY)

💾 Saving model...
✅ Model saved to ./save_gbv_bert_model


In [None]:
uploaded = files.upload()

# Fail with a clear message instead of a NameError on NEW_CSV_FILE_PATH below
# when nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; please upload the CSV to run predictions on.")

# Colab may rename the upload (e.g. "Test (1).csv"), so use the name it
# reports rather than assuming a fixed one.
for filename in uploaded.keys():
    print(f'User uploaded file "{filename}" with length {len(uploaded[filename])} bytes')
    NEW_CSV_FILE_PATH = filename # Use the actual uploaded filename (last one wins)

new_data = pd.read_csv(NEW_CSV_FILE_PATH)

print("\nNew dataset loaded successfully!")
print(f"Dataset shape: {new_data.shape}")
print("\nFirst few rows of the new dataset:")
print(new_data.head())

Saving Test.csv to Test (1).csv
User uploaded file "Test (1).csv" with length 3289617 bytes

New dataset loaded successfully!
Dataset shape: (15581, 2)

First few rows of the new dataset:
      Tweet_ID                                              tweet
0  ID_0095QL4S  because he was my boyfriend, and if I said no,...
1  ID_00DREW5O  lol no, I'm telling you it's not legal. It's l...
2  ID_00E9F5X9  Somalia's semi-autonomous Puntland region has ...
3  ID_00G9OSKZ  University of Cape Coast students being robbed...
4  ID_00HU96U6  "Somebody came up behind him and stabbed him i...


In [None]:
# TweetDataset requires labels, so feed it dummy zeros; the prediction cell
# only reads input_ids and attention_mask.
placeholder_labels = [0] * len(new_data)

# Hand over a plain list (as the training path does) so items are looked up by
# position and not by the DataFrame's index labels.
new_texts = new_data['tweet'].tolist()
new_dataset = TweetDataset(new_texts, placeholder_labels, tokenizer, MAX_LENGTH)

# Keep the original row order (shuffle=False) so predictions line up with Tweet_ID.
new_loader = DataLoader(new_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("Data loader for new data created successfully!")
print(f"Number of batches in the new data loader: {len(new_loader)}")

# Smoke-test the loader by pulling a single batch and printing tensor shapes.
sample_batch_new = next(iter(new_loader))
print("\nSample batch shapes from new data loader:")
print(f"Input IDs: {sample_batch_new['input_ids'].shape}")
print(f"Attention Mask: {sample_batch_new['attention_mask'].shape}")
print(f"Labels: {sample_batch_new['labels'].shape}")

Data loader for new data created successfully!
Number of batches in the new data loader: 1948

Sample batch shapes from new data loader:
Input IDs: torch.Size([8, 128])
Attention Mask: torch.Size([8, 128])
Labels: torch.Size([8])


In [None]:
# Put the model in evaluation mode before running inference.
model.eval()

# Collects one predicted class id per row of the new data, in loader order.
new_predictions = []

print("Predicting on new data...")

# No gradients are needed for inference, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in tqdm(new_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predicted class = index of the largest logit along the last dimension.
        batch_predictions = outputs.logits.argmax(dim=-1)

        # Move to CPU before converting so the list holds NumPy scalars.
        new_predictions.extend(batch_predictions.cpu().numpy())

print("\nPrediction complete!")

Predicting on new data...


Predicting: 100%|██████████| 1948/1948 [02:01<00:00, 15.99it/s]


Prediction complete!





In [None]:
# Class id -> original string label, in the LabelEncoder's class order.
label_map = dict(enumerate(le.classes_))

# Same id -> label direction as label_map (nothing is inverted here); kept as
# its own dict under the name the rest of the cell uses.
id_to_label = dict(label_map)

# Translate every numeric prediction into its string label. int() normalizes
# NumPy scalar predictions to plain Python ints before the lookup.
predicted_labels = [id_to_label[int(prediction)] for prediction in new_predictions]

print("Numerical predictions have been mapped back to the original labels.")
print(f"Example predicted labels: {predicted_labels[:10]}")

Numerical predictions have been mapped back to the original labels.
Example predicted labels: ['sexual_violence', 'Harmful_Traditional_practice', 'Harmful_Traditional_practice', 'sexual_violence', 'sexual_violence', 'sexual_violence', 'sexual_violence', 'emotional_violence', 'sexual_violence', 'sexual_violence']


In [None]:
# Pair every tweet id with its predicted category in a single frame.
results_df = pd.DataFrame({
    'Tweet_ID': new_data['Tweet_ID'],
    'Predicted_Type': predicted_labels,
})

# Write the predictions to disk without the DataFrame index column.
output_filename = 'predictions.csv'
results_df.to_csv(output_filename, index=False)

# Confirm where the file went and show a preview.
print(f"Prediction results saved to {output_filename}")
print("\nFirst few rows of the results DataFrame:")
print(results_df.head())

Prediction results saved to predictions.csv

First few rows of the results DataFrame:
      Tweet_ID                Predicted_Type
0  ID_0095QL4S               sexual_violence
1  ID_00DREW5O  Harmful_Traditional_practice
2  ID_00E9F5X9  Harmful_Traditional_practice
3  ID_00G9OSKZ               sexual_violence
4  ID_00HU96U6               sexual_violence


In [None]:
# Download the file that was just written; reuse output_filename so the name
# cannot drift from the one passed to to_csv.
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>