<a href="https://colab.research.google.com/github/nehadubey1205/ResearchWork/blob/main/Copy_of__31March_NLP_Research_Multilingual_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Multilingual BERT without emoji (text only, `Processed_Tweets` column)

In [None]:
pip install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
# Read the CSV dataset into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='/content/Bully_Sent_Emo_IEEEIC_6084.csv')

In [None]:


# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)


In [None]:
# Name the pretrained checkpoint once so the tokenizer and the model are
# always loaded from the same weights/vocabulary.
MODEL_NAME = 'bert-base-multilingual-cased'
NUM_LABELS = 2  # size of the classification head (binary task)

# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the BERT model with a freshly initialised sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)


In [None]:
# Tokenize the text messages.
# A text cell that is empty in the CSV is read by pandas as NaN (a float),
# which the tokenizer rejects. Map missing text to '' and force every entry
# to str so a few blank rows cannot abort the whole run.
train_texts = train_df['Processed_Tweets'].fillna('').astype(str).tolist()
test_texts = test_df['Processed_Tweets'].fillna('').astype(str).tolist()

# truncation=True clips over-long inputs; padding=True pads each batch of
# texts passed here to its longest member.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Convert the labels to integer arrays. The explicit int64 cast fails fast on
# missing/non-numeric labels and guarantees integer class ids for the loss.
train_labels = train_df['Bully_Label'].to_numpy(dtype=np.int64)
test_labels = test_df['Bully_Label'].to_numpy(dtype=np.int64)


In [None]:
# Wrap the tokenizer output and labels in a PyTorch dataset
class CyberbullyingDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing encoded inputs with their labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence; it
            must support ``.items()`` and integer indexing of its values.
        labels: Indexable, sized collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples (taken from the label collection)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` entry."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:


# Define a function to compute metrics
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    Args:
        pred: Evaluation output exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same 0.0 score as the default behaviour when
    # the positive class is never predicted, but without emitting an
    # UndefinedMetricWarning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [None]:
# Create a Trainer object

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer
# transformers releases, and the old spelling is no longer accepted by recent
# versions. Because the install above is unpinned, pick whichever keyword the
# installed TrainingArguments dataclass actually declares.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{eval_strategy_key: 'epoch'},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=CyberbullyingDataset(train_encodings, train_labels),
    eval_dataset=CyberbullyingDataset(test_encodings, test_labels),
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


In [None]:
# Run a final evaluation pass with the trainer's evaluation dataset
eval_result = trainer.evaluate()

# Report the headline metrics, one per line
for title, key in (
    ('Accuracy', 'eval_accuracy'),
    ('F1 Score', 'eval_f1'),
    ('Precision', 'eval_precision'),
    ('Recall', 'eval_recall'),
):
    print(f"{title}: {eval_result[key]}")


## Multilingual BERT with emoji + text (raw `Tweets` column)

In [None]:
pip install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Read the CSV dataset into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='/content/Bully_Sent_Emo_IEEEIC_6084.csv')

In [None]:

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)

In [None]:
# Name the pretrained checkpoint once so the tokenizer and the model are
# always loaded from the same weights/vocabulary.
MODEL_NAME = 'bert-base-multilingual-cased'
NUM_LABELS = 2  # size of the classification head (binary task)

# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load a fresh copy of the BERT model with a newly initialised classification head
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

In [None]:
# Tokenize the text messages.
# A text cell that is empty in the CSV is read by pandas as NaN (a float),
# which the tokenizer rejects. Map missing text to '' and force every entry
# to str so a few blank rows cannot abort the whole run.
train_texts = train_df['Tweets'].fillna('').astype(str).tolist()
test_texts = test_df['Tweets'].fillna('').astype(str).tolist()

# truncation=True clips over-long inputs; padding=True pads each batch of
# texts passed here to its longest member.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Convert the labels to integer arrays. The explicit int64 cast fails fast on
# missing/non-numeric labels and guarantees integer class ids for the loss.
train_labels = train_df['Bully_Label'].to_numpy(dtype=np.int64)
test_labels = test_df['Bully_Label'].to_numpy(dtype=np.int64)

In [None]:
# Torch dataset built on top of the tokenizer encodings and the label array
class CyberbullyingDataset(torch.utils.data.Dataset):
    """Serve one encoded example, with its label, per index.

    Args:
        encodings: Mapping of feature name -> indexable per-example values
            (must provide ``.items()``).
        labels: Sized, indexable container with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx``; the label is stored under ``'labels'``."""
        tensors = dict(
            (name, torch.tensor(values[idx]))
            for name, values in self.encodings.items()
        )
        tensors.update(labels=torch.tensor(self.labels[idx]))
        return tensors

    def __len__(self):
        """Dataset size, i.e. the number of labels."""
        return len(self.labels)

In [None]:
# Define a function to compute metrics
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    Args:
        pred: Evaluation output exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same 0.0 score as the default behaviour when
    # the positive class is never predicted, but without emitting an
    # UndefinedMetricWarning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Create a Trainer object

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer
# transformers releases, and the old spelling is no longer accepted by recent
# versions. Because the install above is unpinned, pick whichever keyword the
# installed TrainingArguments dataclass actually declares.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

# This experiment writes to its own output/log directories so that its
# checkpoints and logs do not overwrite or mix with those of another run
# that uses './results' and './logs'.
training_args = TrainingArguments(
    output_dir='./results_emoji',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_emoji',
    logging_steps=10,
    **{eval_strategy_key: 'epoch'},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=CyberbullyingDataset(train_encodings, train_labels),
    eval_dataset=CyberbullyingDataset(test_encodings, test_labels),
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

In [None]:
# Run a final evaluation pass with the trainer's evaluation dataset
eval_result = trainer.evaluate()

# Print the results: one "<name>: <value>" line per metric
metric_lines = [
    f"Accuracy: {eval_result['eval_accuracy']}",
    f"F1 Score: {eval_result['eval_f1']}",
    f"Precision: {eval_result['eval_precision']}",
    f"Recall: {eval_result['eval_recall']}",
]
print(*metric_lines, sep="\n")