# Install the Required Libraries

In [1]:
!pip install transformers
!pip install datasets
!pip install numpy
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 12.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 66.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 69.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[K     |████████████████████████████████| 452 kB 3

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the Data

In [3]:
import pandas as pd
df=pd.read_csv("TwitterHate.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
df.label.unique()


array([0, 1])

In [5]:
# drop function which is used in removing or deleting rows or columns from the CSV files
df.drop('id', inplace=True, axis=1)

In [6]:
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [7]:
from sklearn.model_selection import train_test_split
train,test= train_test_split(df, test_size=0.20, random_state=1)
train.to_csv('train.csv')
test.to_csv('test.csv')

 ### train and test datasets stored as CSV files. Let’s see how we can load them as datasets. Notice that HuggingFace requires the data to be as Dataset Dictionary

In [8]:
import datasets
from datasets import load_dataset, load_from_disk
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})
dataset



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6fd7520b0ef6fd6b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6fd7520b0ef6fd6b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'label', 'tweet'],
        num_rows: 25569
    })
    test: Dataset({
        features: ['Unnamed: 0', 'label', 'tweet'],
        num_rows: 6393
    })
})

### Fine-Tune the Model Keep in mind that the “target” variable should be called “label” and should be numeric. In this dataset, we are dealing with a binary problem, 0 (Ham) or 1 (Spam). So we will start with the “distilbert-base-cased” and then we will fine-tune it. First, we will load the tokenizer.

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
def tokenize_function(examples):
    return tokenizer(examples["tweet"], padding="max_length", truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

### load the model for the Sequence Classification.

In [10]:
from transformers import AutoModelForSequenceClassification
checkpoint = "distilbert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bia

In [11]:
import numpy as np
from datasets import load_metric
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

# Train the Model%colors

In [12]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, tweet. If Unnamed: 0, tweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25569
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9591
  Number of trainable parameters = 65783042


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1314,0.168935,0.964649
2,0.0967,0.156363,0.96809
3,0.0391,0.164177,0.969185


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

TrainOutput(global_step=9591, training_loss=0.10093868299565192, metrics={'train_runtime': 3892.2215, 'train_samples_per_second': 19.708, 'train_steps_per_second': 2.464, 'total_flos': 1.0161176748705792e+16, 'train_loss': 0.10093868299565192, 'epoch': 3.0})

### Save the model

In [13]:
model.save_pretrained("CustomModels/CustomHamSpam")
# alternatively save the trainer
# trainer.save_model("CustomModels/CustomHamSpam")
tokenizer.save_pretrained("CustomModels/CustomHamSpam")

Configuration saved in CustomModels/CustomHamSpam/config.json
Model weights saved in CustomModels/CustomHamSpam/pytorch_model.bin
tokenizer config file saved in CustomModels/CustomHamSpam/tokenizer_config.json
Special tokens file saved in CustomModels/CustomHamSpam/special_tokens_map.json


('CustomModels/CustomHamSpam/tokenizer_config.json',
 'CustomModels/CustomHamSpam/special_tokens_map.json',
 'CustomModels/CustomHamSpam/vocab.txt',
 'CustomModels/CustomHamSpam/added_tokens.json',
 'CustomModels/CustomHamSpam/tokenizer.json')

### load the model

In [14]:

from transformers import AutoModelForSequenceClassification
load_model = AutoModelForSequenceClassification.from_pretrained("CustomModels/CustomHamSpam")
load_tokenizer = AutoTokenizer.from_pretrained("CustomModels/CustomHamSpam")

loading configuration file CustomModels/CustomHamSpam/config.json
Model config DistilBertConfig {
  "_name_or_path": "CustomModels/CustomHamSpam",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 28996
}

loading weights file CustomModels/CustomHamSpam/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at 

### Make Predictions

In [15]:
from transformers import pipeline
my_pipeline  = pipeline("text-classification", model=load_model, tokenizer=load_tokenizer)
data = ["Sometimes, you think that you want to disappear, but all you really want is to be found."]
my_pipeline(data)

Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'label': 'LABEL_0', 'score': 0.9994813799858093}]

In [16]:
from transformers import pipeline
my_pipeline  = pipeline("text-classification", model=load_model, tokenizer=load_tokenizer)
data = ["I love you", "XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9993807077407837},
 {'label': 'LABEL_0', 'score': 0.9997209906578064}]

In [17]:
data=["Even the darkest night will end, and the sun will rise."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9996923208236694}]

In [18]:
data=["You are not born a winner. You are not born a loser. You are born a chooser."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.99933260679245}]

In [19]:
data=["never been this down on myself in my entire life."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9996942281723022}]

In [20]:
data=["my cousins are asking why do i always where shawl...di ko lang masabi 'coz po i'm getting fat and m.."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.999718964099884}]

In [21]:
data = ["I love you", "XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9993807077407837},
 {'label': 'LABEL_0', 'score': 0.9997209906578064}]

In [22]:
data=["Even the darkest night will end, and the sun will rise."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9996923208236694}]

In [23]:
data=["You are not born a winner. You are not born a loser. You are born a chooser."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.99933260679245}]

In [24]:
data=["sometime we only need a little break from everything"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9997525811195374}]

### predict some hate tweets

In [25]:

data=["	tweet78	@user hey, white people: you can call people 'white' by @user  #race  #identity #medâ¦"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9994751811027527}]

In [26]:
data=["""If you have to start a sentence with 'I'm not racist, but...then chances are you're pretty racist. Opinions my own. RT≠endorsement, obviously."""]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9098004698753357}]

In [27]:
data=["over-excited women is attacked by monkey while opening  present  that monkey repping banana yo!"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9971112012863159}]

In [28]:
data=["""If we do not stand now and perform our god given duty to keep OUR country clean of all the Blacks,
 Jews and Yellow scum from Asia, WE are just as bad as the enemy, if not worse. 
We are trading our race for that of an inferior form of trash."""]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9990617632865906}]

In [29]:
data=["My only take on the idiots complaining that people who go to Pride parades might *~gasp~* witness some kink"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9996786117553711}]

In [30]:
data=["""Revenge of the Sith is actually good in that it accurately depicts how society can be willingly led into fascism 
through conspiracy theories about "elites" and the use of ethnic scapegoating to explain economic oppression of the 
masses, and how liberalism is powerless to stop it""" ]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9979373216629028}]

In [31]:
data=[" is still rooted in our society's attitude towards black sKin's people. read more from candide uyanze:Ã¢Â€Â¦"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9992867112159729}]

In [32]:
data=["suppoer racist antirac"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.965352475643158}]