<a href="https://colab.research.google.com/github/nehadubey1205/ResearchWork/blob/main/NLP_Research_Multilingual_BERT_new_6feb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
# Read the dataset CSV into a DataFrame
DATASET_CSV = '/content/Bully_Sent_Emo_IEEEIC_6084.csv'
df = pd.read_csv(DATASET_CSV)

In [None]:


# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable
split_options = {'test_size': 0.1, 'random_state': 42}
train_df, test_df = train_test_split(df, **split_options)


In [None]:
# Checkpoint name shared by the tokenizer and the classifier
MODEL_NAME = 'bert-base-multilingual-cased'

# Tokenizer for the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model configured with two output labels
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


In [None]:
# Encode the 'Processed_Tweets' text of each split with truncation and padding enabled
tokenizer_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(list(train_df['Processed_Tweets']), **tokenizer_options)
test_encodings = tokenizer(list(test_df['Processed_Tweets']), **tokenizer_options)

# Collect the 'Bully_Label' targets of each split as NumPy arrays
train_labels, test_labels = (
    np.array(list(split['Bully_Label'])) for split in (train_df, test_df)
)


In [None]:
# Create a PyTorch dataset from the encoded data
class CyberbullyingDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            sequence (anything exposing ``.items()``).
        labels: Indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:


# Define a function to compute metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as
            the predicted class).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # With average='binary' the first three entries are precision, recall, F1.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }


In [None]:
# Configure fine-tuning and build the Trainer
import dataclasses

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. Use whichever keyword the installed TrainingArguments declares so
# this cell does not raise a TypeError on either side of the rename.
_training_arg_names = {f.name for f in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{_eval_strategy_key: 'epoch'},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=CyberbullyingDataset(train_encodings, train_labels),
    eval_dataset=CyberbullyingDataset(test_encodings, test_labels),
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


In [None]:
# Run evaluation on the dataset the Trainer was given as eval_dataset
eval_result = trainer.evaluate()

# Report each metric under its display name
metric_titles = {
    'Accuracy': 'eval_accuracy',
    'F1 Score': 'eval_f1',
    'Precision': 'eval_precision',
    'Recall': 'eval_recall',
}
for title, key in metric_titles.items():
    print(f"{title}: {eval_result[key]}")


# Second Approach

In [None]:
pip install indic-transliteration

In [None]:
from indic_transliteration import sanscript

In [None]:
import pandas as pd

In [None]:
# Read the dataset CSV again and show its last three rows
dataset_csv = '/content/Bully_Sent_Emo_IEEEIC_6084.csv'
df = pd.read_csv(dataset_csv)
df.tail(3)

In [None]:
# Swap every newline in the 'Tweets' column for a single space
tweets_without_newlines = df['Tweets'].str.replace('\n', ' ')
df['Tweets'] = tweets_without_newlines

In [None]:
# Show the last two rows of df.
df.tail(2)

In [None]:
def devnagiri_to_hinglish(devnagiri):
    """Transliterate text from the Devanagari scheme to the ITRANS scheme.

    Args:
        devnagiri: Text to transliterate; passed straight to
            ``sanscript.transliterate``.

    Returns:
        The value returned by ``sanscript.transliterate`` for the
        DEVANAGARI -> ITRANS scheme pair.
    """
    return sanscript.transliterate(devnagiri, sanscript.DEVANAGARI, sanscript.ITRANS)

# Add a 'Hinglish' column holding the transliteration of each entry in 'Tweets'.
df['Hinglish'] = df['Tweets'].apply(devnagiri_to_hinglish)

In [None]:
# Show the last three rows of df.
df.tail(3)

In [None]:
# Lower-case every string cell in the frame; non-string cells are left as they are.
def _lower_if_str(value):
    return value.lower() if isinstance(value, str) else value

# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
# Prefer the new name when the installed pandas provides it, and fall back to
# applymap on older releases.
if hasattr(pd.DataFrame, 'map'):
    df = df.map(_lower_if_str)
else:
    df = df.applymap(_lower_if_str)

In [None]:
# Remove the 'Hinglish' column and re-insert it at column index 2
hinglish_column = df.pop('Hinglish')
df.insert(loc=2, column='Hinglish', value=hinglish_column)


In [None]:
# Show the last seven rows of df.
df.tail(7)

In [None]:
#remove emoji
import re
import pandas as pd

# Sample dataframe with emojis
#df = pd.DataFrame({'text': ['I love 🍕 and 🍔', 'This is 🔥']})

# Function to remove emojis from text
# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(Hinglish):
    """Return the text with every run of emoji characters removed.

    Args:
        Hinglish: Input string.

    Returns:
        A copy of the input with all characters matched by the emoji
        pattern deleted; surrounding whitespace is left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', Hinglish)

# Run remove_emoji over every entry of the 'Hinglish' column and overwrite the column with the result
df['Hinglish'] = df['Hinglish'].apply(remove_emoji)


In [None]:
# Show the first five rows of df.
df.head(5)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
pip install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable
split_options = {'test_size': 0.1, 'random_state': 42}
train_df, test_df = train_test_split(df, **split_options)

In [None]:
# Checkpoint name shared by the tokenizer and the classifier
MODEL_NAME = 'bert-base-multilingual-cased'

# Tokenizer for the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model configured with two output labels
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

In [None]:
# Encode the 'Hinglish' text of each split with truncation and padding enabled
tokenizer_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(list(train_df['Hinglish']), **tokenizer_options)
test_encodings = tokenizer(list(test_df['Hinglish']), **tokenizer_options)

# Collect the 'Bully_Label' targets of each split as NumPy arrays
train_labels, test_labels = (
    np.array(list(split['Bully_Label'])) for split in (train_df, test_df)
)

In [None]:
# Create a PyTorch dataset from the encoded data
class CyberbullyingDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            sequence (anything exposing ``.items()``).
        labels: Indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Define a function to compute metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as
            the predicted class).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # With average='binary' the first three entries are precision, recall, F1.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }


In [None]:
# Configure fine-tuning and build the Trainer
import dataclasses

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. Use whichever keyword the installed TrainingArguments declares so
# this cell does not raise a TypeError on either side of the rename.
_training_arg_names = {f.name for f in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{_eval_strategy_key: 'epoch'},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=CyberbullyingDataset(train_encodings, train_labels),
    eval_dataset=CyberbullyingDataset(test_encodings, test_labels),
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


In [None]:
# Run evaluation on the dataset the Trainer was given as eval_dataset
eval_result = trainer.evaluate()

# Report each metric under its display name
metric_titles = {
    'Accuracy': 'eval_accuracy',
    'F1 Score': 'eval_f1',
    'Precision': 'eval_precision',
    'Recall': 'eval_recall',
}
for title, key in metric_titles.items():
    print(f"{title}: {eval_result[key]}")