# BERT

## Install Dependency

In [1]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Download

In [2]:
!wget https://raw.githubusercontent.com/owncodezs/Cyberbullying-classification/main/Dataset.csv

--2023-08-16 08:04:37--  https://raw.githubusercontent.com/owncodezs/Cyberbullying-classification/main/Dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 880938 (860K) [text/plain]
Saving to: ‘Dataset.csv’


2023-08-16 08:04:37 (111 MB/s) - ‘Dataset.csv’ saved [880938/880938]



## Import Datasets

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("Dataset.csv")
df.head()

Unnamed: 0,text,category
0,Viswasam trailer paathutu Inka vanthavanga lik...,unknown_state
1,Thalaiva.. full Support from kerala Mohanlal Fans,unknown_state
2,Paruthiveeran Karthi Back Lokesh Kangaraj Scr...,unknown_state
3,Hey Makkalay yaaru Darbar Motion Poster announ...,unknown_state
4,Padaiyatchi nu sonna mattum ungalukku jaathi v...,unknown_state


In [5]:
df.shape

(12595, 2)

In [6]:

df.rename(columns={'category':'label_text'}, inplace=True)
df['label_text'].value_counts(normalize = True)

Positive          0.673601
Negative          0.128067
Mixed_feelings    0.113061
unknown_state     0.053751
not-Tamil         0.031520
Name: label_text, dtype: float64

In [7]:
labels=df['label_text']
unique_labels = df['label_text'].unique()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Convert labels to numerical values
df['lable'] = [label2id[label] for label in df['label_text']]
print(label2id)
print(id2label)

{'unknown_state': 0, 'Positive': 1, 'not-Tamil': 2, 'Negative': 3, 'Mixed_feelings': 4}
{0: 'unknown_state', 1: 'Positive', 2: 'not-Tamil', 3: 'Negative', 4: 'Mixed_feelings'}


In [8]:
df.head()

Unnamed: 0,text,label_text,lable
0,Viswasam trailer paathutu Inka vanthavanga lik...,unknown_state,0
1,Thalaiva.. full Support from kerala Mohanlal Fans,unknown_state,0
2,Paruthiveeran Karthi Back Lokesh Kangaraj Scr...,unknown_state,0
3,Hey Makkalay yaaru Darbar Motion Poster announ...,unknown_state,0
4,Padaiyatchi nu sonna mattum ungalukku jaathi v...,unknown_state,0


##Splite tha dataset

In [9]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [10]:
train_texts, temp_texts, train_labels, temp_labels, train_label_texts, temp_label_texts = train_test_split(
    df['text'], df['lable'],df['label_text'], test_size=0.2, random_state=42
)

val_texts, test_texts, val_labels, test_labels, val_label_texts, test_label_texts = train_test_split(
    temp_texts, temp_labels, temp_label_texts, test_size=0.5, random_state=42
)

dataset_dict = DatasetDict({
    'train': Dataset.from_dict({'text': train_texts, 'label': train_labels, 'label_text': train_label_texts}),
    'validation': Dataset.from_dict({'text': val_texts, 'label': val_labels, 'label_text': val_label_texts}),
    'test': Dataset.from_dict({'text': test_texts, 'label': test_labels, 'label_text': test_label_texts}),
})


print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 10076
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1259
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1260
    })
})


## Download PreTrained Model

In [11]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
model_name= "bert-base-uncased"

In [12]:
model = TFAutoModel.from_pretrained(model_name,num_labels=unique_labels.size,label2id=label2id,id2label=id2label)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Tokenizer

In [14]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [15]:
emotions_encoded = dataset_dict.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/10076 [00:00<?, ? examples/s]

Map:   0%|          | 0/1259 [00:00<?, ? examples/s]

Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

In [16]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10076
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1259
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1260
    })
})

In [17]:


emotions_encoded.set_format('tf',
                            columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])


BATCH_SIZE = 30

def order(inp):
    '''
    This function will group all the inputs of BERT
    into a single dictionary and then output it with
    labels.
    '''
    data = list(inp.values())
    return {
        'input_ids': data[1],
        'attention_mask': data[2],
        'token_type_ids': data[3]
    }, data[0]

train_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['train'][:])

train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(1000)

train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

In [18]:
inp, out = next(iter(train_dataset))
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(30, 253), dtype=int64, numpy=
array([[  101, 11687,  3286, ...,     0,     0,     0],
       [  101, 10930,  2860, ...,     0,     0,     0],
       [  101, 22794,  2721, ...,     0,     0,     0],
       ...,
       [  101,  9117,  2050, ...,     0,     0,     0],
       [  101,  1050, 18413, ...,     0,     0,     0],
       [  101,  8038, 15728, ...,     0,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(30, 253), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'token_type_ids': <tf.Tensor: shape=(30, 253), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>} 

 tf.Tensor([1 3 4 2 1 1 1 0 4 0 4 1 

## Model Definition

In [19]:
class BERTForClassification(tf.keras.Model):

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

In [20]:
classifier = BERTForClassification(model, num_classes=unique_labels.size)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

## Traing

In [21]:
history = classifier.fit(
    train_dataset,
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Test

In [22]:
classifier.evaluate(test_dataset)



[0.9460921287536621, 0.6809523701667786]