In [59]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Install the Required Libraries

In [60]:
!pip install transformers
!pip install datasets
!pip install numpy
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Load the Data

In [61]:
import pandas as pd
df=pd.read_csv("TwitterHate.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [62]:
df.label.unique()


array([0, 1])

In [63]:
df.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [64]:
# drop function which is used in removing or deleting rows or columns from the CSV files
df.drop('id', inplace=True, axis=1)

In [65]:
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [66]:
from sklearn.model_selection import train_test_split
train,test= train_test_split(df, test_size=0.20, random_state=1)
train.to_csv('train.csv')
test.to_csv('test.csv')

 ### train and test datasets stored as CSV files. Let’s see how we can load them as datasets. Notice that HuggingFace requires the data to be as Dataset Dictionary

In [67]:
import datasets
from datasets import load_dataset, load_from_disk
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})
dataset



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-061fecee0c1df07e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-061fecee0c1df07e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'label', 'tweet'],
        num_rows: 25569
    })
    test: Dataset({
        features: ['Unnamed: 0', 'label', 'tweet'],
        num_rows: 6393
    })
})

### Fine-Tune the Model Keep in mind that the “target” variable should be called “label” and should be numeric. In this dataset, we are dealing with a binary problem, 0 (Ham) or 1 (Spam). So we will start with the “distilbert-base-cased” and then we will fine-tune it. First, we will load the tokenizer.

In [68]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
def tokenize_function(examples):
    return tokenizer(examples["tweet"], padding="max_length", truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde99

  0%|          | 0/26 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

### load the model for the Sequence Classification.

In [69]:
from transformers import AutoModelForSequenceClassification
checkpoint = "distilbert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassif

In [72]:
import numpy as np
from datasets import load_metric
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Train the Model

In [73]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=1)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, tweet. If Unnamed: 0, tweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25569
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3197
  Number of trainable parameters = 65783042


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1146,0.112367,0.96809


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

TrainOutput(global_step=3197, training_loss=0.1502459349765008, metrics={'train_runtime': 1313.5313, 'train_samples_per_second': 19.466, 'train_steps_per_second': 2.434, 'total_flos': 3387058916235264.0, 'train_loss': 0.1502459349765008, 'epoch': 1.0})

### Save the model

In [74]:
model.save_pretrained("CustomModels/CustomHamSpam")
# alternatively save the trainer
# trainer.save_model("CustomModels/CustomHamSpam")
tokenizer.save_pretrained("CustomModels/CustomHamSpam")

Configuration saved in CustomModels/CustomHamSpam/config.json
Model weights saved in CustomModels/CustomHamSpam/pytorch_model.bin
tokenizer config file saved in CustomModels/CustomHamSpam/tokenizer_config.json
Special tokens file saved in CustomModels/CustomHamSpam/special_tokens_map.json


('CustomModels/CustomHamSpam/tokenizer_config.json',
 'CustomModels/CustomHamSpam/special_tokens_map.json',
 'CustomModels/CustomHamSpam/vocab.txt',
 'CustomModels/CustomHamSpam/added_tokens.json',
 'CustomModels/CustomHamSpam/tokenizer.json')

### load the model

In [75]:

from transformers import AutoModelForSequenceClassification
load_model = AutoModelForSequenceClassification.from_pretrained("CustomModels/CustomHamSpam")
load_tokenizer = AutoTokenizer.from_pretrained("CustomModels/CustomHamSpam")

loading configuration file CustomModels/CustomHamSpam/config.json
Model config DistilBertConfig {
  "_name_or_path": "CustomModels/CustomHamSpam",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 28996
}

loading weights file CustomModels/CustomHamSpam/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at 

### Make Predictions

In [76]:
from transformers import pipeline
my_pipeline  = pipeline("text-classification", model=load_model, tokenizer=load_tokenizer)
data = ["Sometimes, you think that you want to disappear, but all you really want is to be found."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9968916773796082}]

In [77]:
from transformers import pipeline
data=["Even the darkest night will end, and the sun will rise."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.999202311038971}]

In [78]:
data=["You are not born a winner. You are not born a loser. You are born a chooser."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.992078959941864}]

In [79]:
data=["	tweet78	@user hey, white people: you can call people 'white' by @user  #race  #identity #medâ¦"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.983745813369751}]

In [80]:

data=["	tweet57	@user lets fight against"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9061253666877747}]

In [81]:
data=["never been this down on myself in my entire life."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9993380904197693}]

In [82]:
data=["sometime we only need a little break from everything"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9995245933532715}]

In [83]:
data=[" is still rooted in our society's attitude towards black sKin's people. read more from candide uyanze:Ã¢Â€Â¦"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9779971241950989}]

In [84]:
data=["over-excited women is attacked by monkey while opening  present  that monkey repping banana yo!"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9719732999801636}]

In [85]:
data=["""If you have to start a sentence with 'I'm not racist, but...then chances are you're pretty racist. Opinions my own. RT≠endorsement, obviously."""]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.5279669165611267}]