<a href="https://colab.research.google.com/github/muajnstu/Multi-Class-Classification-of-YouTube-Videos-Using-A-BERT-enhanced-Machine-Learning-approach/blob/main/RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages for this notebook (IPython shell magics).
# NOTE(review): ktrain is installed here but is not referenced anywhere in the
# visible code — confirm it is still needed.
!pip install nlpaug
!pip install ktrain
!pip install tensorflow
!pip install transformers

In [None]:
# Imports
import re
import string

import nlpaug.augmenter.word as naw
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    RobertaTokenizer,
    TFBertForSequenceClassification,
    TFRobertaForSequenceClassification,
)

# Load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/muajnstu/ML-Datasets/refs/heads/main/Youtube%20Video%20Dataset.csv')

# Map category labels to integers.
# A category that is not listed in this table is mapped to NaN.
df["Category"] = df["Category"].map({
    "travel blog": 0,
    "Science&Technology": 1,
    "Food": 2,
    "Art&Music": 3,
    "manufacturing": 4,
    "History": 5,
})

# Shuffle the dataset (the fixed seed keeps the row order reproducible)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Combine Title and Description into one text field.
# A missing value on either side is treated as an empty string: without this,
# `NaN + str` evaluates to NaN and the row's remaining text would be lost.
df['processed_text'] = df['Title'].fillna('') + " " + df['Description'].fillna('')

# Drop unneeded columns
df = df.drop(columns=['Title', 'Description', 'Videourl'])

# Keep only what follows the first " - " style separator
def extract_txt(text):
    """Return the rest of the line after the first whitespace-dash-whitespace.

    The input is converted with ``str`` first. When no such separator is
    present, the converted text is returned unchanged.
    """
    as_string = str(text)
    # The look-behind anchors the match immediately after the separator;
    # ``.`` does not cross a newline, so the result ends at the line break.
    found = re.search(r"(?<=\s\-\s).*", as_string)
    if found is None:
        return as_string
    return found.group(0)

# Run the separator-based extraction over every row of the combined text
df['processed_text'] = df['processed_text'].map(extract_txt)

# Basic text cleaning
# Patterns are compiled once here instead of on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_BRACKETED_RE = re.compile(r'\[.*?\]')
_PUNCTUATION_RE = re.compile(r'[%s]' % re.escape(string.punctuation))
_DIGIT_RE = re.compile(r'\d')


def clean_text(text):
    """Lower-case *text* and strip URLs, bracketed spans, punctuation and digits.

    The steps run in this order, each on the result of the previous one:
    lower-casing, URL removal, removal of ``[...]`` spans, removal of every
    ``string.punctuation`` character, removal of digits, and replacement of
    newlines with single spaces. Whitespace is not collapsed, so removed
    tokens can leave consecutive spaces behind.

    Args:
        text: Value to clean; it is converted with ``str`` first.

    Returns:
        The cleaned string.
    """
    text = str(text).lower()
    # URLs must go before punctuation stripping, which would break them apart.
    text = _URL_RE.sub('', text)
    text = _BRACKETED_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    # Digits are removed one by one, keeping the letters around them. The
    # former follow-up pass r'\w*\d\w*' could never match once no digit is
    # left, so it has been dropped; the output is unchanged.
    text = _DIGIT_RE.sub('', text)
    return text.replace('\n', ' ')

# Normalise every row of the combined text
df['processed_text'] = df['processed_text'].map(clean_text)

# Rename for simplicity (optional)
df = df.rename(columns={'processed_text': 'text'})

# Split the data: 20% is held out first, then that hold-out is halved
# into validation and test parts (both splits use the same fixed seed)
X_train, X_temp, y_train, y_temp = train_test_split(
    df['text'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Load the RoBERTa tokenizer.
# The tokenizer has to come from the same checkpoint as the classifier loaded
# further down: ids produced from a BERT vocabulary do not correspond to the
# tokens RoBERTa was trained on.
MODEL_NAME = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

# Function to tokenize texts
def encode_texts(tokenizer, texts, max_length=256):
    """Tokenize each text and return the collected ids and attention masks.

    Args:
        tokenizer: Object exposing ``encode_plus`` (here, the tokenizer
            loaded just above).
        texts: Iterable of strings to encode.
        max_length: Length every text is padded or truncated to.

    Returns:
        A ``(input_ids, attention_masks)`` pair of tensors built from the
        per-text encodings, in the order the texts were given.
    """
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf',
        )
        # Each call encodes a single text; index 0 picks that one row out of
        # the returned tensors.
        input_ids.append(encoded['input_ids'][0])
        attention_masks.append(encoded['attention_mask'][0])

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_masks)

# Tokenize each split
train_input_ids, train_attention_masks = encode_texts(tokenizer, X_train)
val_input_ids, val_attention_masks = encode_texts(tokenizer, X_val)
test_input_ids, test_attention_masks = encode_texts(tokenizer, X_test)

# Final label tensors
y_train = tf.convert_to_tensor(y_train.values)
y_val = tf.convert_to_tensor(y_val.values)
y_test = tf.convert_to_tensor(y_test.values)

# Category names in label order: position i is the integer that category is
# encoded as in the "Category" column.
class_names = [
    "travel blog",
    "Science&Technology",
    "Food",
    "Art&Music",
    "manufacturing",
    "History",
]

# Classification head sized to the number of categories
model = TFRobertaForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(class_names))

# Model compilation
# from_logits=True because the loss is fed the model's raw scores.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

# Early stopping to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Model training with validation data
history = model.fit(
    [train_input_ids, train_attention_masks],
    y_train,
    batch_size=20,
    validation_data=([val_input_ids, val_attention_masks], y_val),
    epochs=5,
    callbacks=[early_stopping]
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([test_input_ids, test_attention_masks], y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)
