# Dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir("/content/drive/MyDrive/Share/Classification/")

In [18]:
import pandas as pd
import numpy as np

train_name = 'dataset/mixed/train-bert.csv'
val_name = 'dataset/mixed/val-bert.csv'
test_name = 'dataset/mixed/test-bert.csv'

In [None]:
# from sklearn.model_selection import train_test_split
# df_train, df_test = train_test_split(df, test_size=0.25)
# df_val, df_test = train_test_split(df_test, test_size=0.50)

# df_train = df_train.reset_index(drop=True)
# df_test = df_test.reset_index(drop=True)
# df_val = df_val.reset_index(drop=True)

# df_train.to_csv('dataset/train.csv', index=False)
# df_val.to_csv('dataset/val.csv', index=False)
# df_test.to_csv('dataset/test.csv', index=False)

In [4]:
df_train = pd.read_csv(train_name)
df_val = pd.read_csv(val_name)
df_test = pd.read_csv(test_name)

In [5]:
# rename the columns of DataFrames
df_train.columns = ['text', 'label']
df_val.columns = ['text', 'label']
df_test.columns = ['text', 'label']

## Data Pre-processing

In [None]:
# For example
# remove the meaningless or columns contain 'NULL' or 'Nan' value

# remove punctuation

# lemmatization / Stemming

# remove whitespace

In [None]:
# save the Processed data
df_train.to_csv(train_name, index=False)
df_val.to_csv(val_name, index=False)
df_test.to_csv(test_name, index=False)

# BERT classification


In [8]:
!pip install transformers
!pip install Datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 6.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.1 MB/s 
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.12.1 transformers-4.19.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
# set model name
model_name = 'bert-base-uncased' 
# set the checkpoint dirctory
checkpoint_dir = 'checkpoints/BERT'

train_name = 'dataset/mixed/train-bert.csv'
val_name = 'dataset/mixed/val-bert.csv'
test_name = 'dataset/mixed/test-bert.csv'

In [10]:
import datasets
from datasets import load_dataset

dataset = load_dataset('csv', data_files={'train': [train_name], 
                                          'val': [val_name]})

Using custom data configuration default-1724e66c50f2e3ea


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1724e66c50f2e3ea/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1724e66c50f2e3ea/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10311
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 1719
    })
})

In [12]:
dataset['train'][0]

{'label': 0,
 'text': 'RT @AFTRebecca: I wrote about how new stretch marks turned me into a massive hypocrite when it comes to sex: http://t.co/LWJjmDXJaU http://…'}

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

# the dataset.map will avoid the RAM crash in the tokenized process if the dataset is too large
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [14]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# train model

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = checkpoint_dir,
    num_train_epochs = 2,
    logging_steps = 10,
    load_best_model_at_end = True,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    warmup_steps = 100,
#     weight_decay = 0.01,
    logging_dir = 'logs',
    save_total_limit =20,
    seed=0,
    learning_rate = 5e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

!nvidia-smi

trainer.train()

final_dir = checkpoint_dir + '/final'
trainer.save_model(final_dir)

!nvidia-smi

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/re

Tue Jun 14 02:30:51 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  70W |   3810MiB / 15109MiB |     15%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10311
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5156


Epoch,Training Loss,Validation Loss
1,0.0,0.008879
2,0.0,0.006583


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1719
  Batch size = 4
Saving model checkpoint to checkpoints/BERT/checkpoint-2578
Configuration saved in checkpoints/BERT/checkpoint-2578/config.json
Model weights saved in checkpoints/BERT/checkpoint-2578/pytorch_model.bin
tokenizer config file saved in checkpoints/BERT/checkpoint-2578/tokenizer_config.json
Special tokens file saved in checkpoints/BERT/checkpoint-2578/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exa

Tue Jun 14 02:39:28 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P0    32W /  70W |   5322MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Evaluation 

In [17]:
# load the trained model from checkpoint

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(final_dir, num_labels=2)

loading configuration file checkpoints/BERT/final/config.json
Model config BertConfig {
  "_name_or_path": "checkpoints/BERT/final",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file checkpoints/BERT/final/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequ

In [19]:
# load the dataset only for the test part
dataset = load_dataset('csv', data_files={'test': [test_name]})
tokenized_dataset = dataset.map(preprocess_function, batched=True)

pred_dataset = tokenized_dataset["test"]

# Run predictions
predictions = trainer.predict(pred_dataset)

# map labels and their meanings
model.config.id2label[0] = 'Non-hateful'
model.config.id2label[1] = 'Hateful'
model.config.id2label

# Transform predictions to labels
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1)

df_test = pd.read_csv(test_name)
pred_texts = df_test['text'].astype('str').tolist()


# Create DataFrame with texts, predictions, labels, and prediction scores
df = pd.DataFrame(list(zip(pred_texts,preds,labels,scores)), columns=['bug1','pred','label','score'])
df['pred'].value_counts()
df

Using custom data configuration default-dff1d67b811a420e


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-dff1d67b811a420e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-dff1d67b811a420e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1719
  Batch size = 4


Unnamed: 0,bug1,pred,label,score
0,then you pay for illegals free services too fa...,1,Hateful,0.999983
1,latest unhinged anti white rant at ferguson ug...,1,Hateful,0.999977
2,It's like Hunger Games. But with internet prot...,0,Non-hateful,0.999995
3,"@BladeOfCreation shhh, you're going to hurt th...",0,Non-hateful,0.999995
4,"In regards to being harassed by GamerGate, i w...",0,Non-hateful,0.999995
...,...,...,...,...
1714,maybe they will reconsider in every single ele...,1,Hateful,0.999982
1715,can whites have black a ists shoved down their...,1,Hateful,0.999981
1716,i hate you all racist note from the black bo...,1,Hateful,0.999982
1717,youre a black nationalist they just add white ...,1,Hateful,0.999982


In [20]:
from sklearn import metrics

y_ture = df_test['label'].to_numpy()
y_pred = preds

acc = metrics.accuracy_score(y_ture, y_pred)
f1 = metrics.f1_score(y_ture, y_pred, labels=[0, 1])
recall = metrics.recall_score(y_ture, y_pred, labels=[0, 1])
precision = metrics.precision_score(y_ture, y_pred, labels=[0, 1])
matrix = metrics.confusion_matrix(y_ture, y_pred, labels=[0, 1])
report = metrics.classification_report(y_ture, y_pred, labels=[0, 1])

print('acc is: '+ str(acc))
print('recall is: '+ str(recall))
print('precision is: '+ str(precision))
print('f1 is: '+ str(f1))
class_names1 = ['True Negative', 'True Positive']
class_names2 = ['Pred Negative', 'Pred Positive']
df_cm = pd.DataFrame(matrix, index=class_names1, columns=class_names2)
print(df_cm)
print(report)

#                Pred Negative  Pred Positive
# True Negative           TN          FP
# True Positive           FN          TP

acc is: 0.9982547993019197
recall is: 1.0
precision is: 0.9963054187192119
f1 is: 0.9981492905613819
               Pred Negative  Pred Positive
True Negative            907              3
True Positive              0            809
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       910
           1       1.00      1.00      1.00       809

    accuracy                           1.00      1719
   macro avg       1.00      1.00      1.00      1719
weighted avg       1.00      1.00      1.00      1719



# Appendix
