# Install the Required Libraries

In [1]:
!pip install transformers
!pip install datasets
!pip install numpy
!pip install pandas
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 15.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 52.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[K     |████████████████████████████████| 452 kB 1

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the Data

In [3]:
import pandas as pd
df=pd.read_csv("TwitterHate.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
df.label.value_counts()


0    29720
1     2242
Name: label, dtype: int64

#### By looking at distribution of label, we have strong data imballance
#### To solve this problem, I have implemented nlpaug library to generate Augmented data.
#### Here is the link Documentation  https://nlpaug.readthedocs.io/en/latest/augmenter/word/context_word_embs.html

In [10]:
import nlpaug.augmenter.word.context_word_embs as aug

In [6]:
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [5]:
# drop function which is used in removing or deleting rows or columns from the CSV files
df.drop('id', inplace=True, axis=1)

In [7]:
from sklearn.model_selection import train_test_split
train,test= train_test_split(df, test_size=0.20, random_state=1)


#### Original sample text 

In [9]:
sample_text

'there are some truly sick ppl out there.   '

In [8]:
sample_text = df['tweet'].iloc[100]

In [11]:
augmenter = aug.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert",device='cuda')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

#### augmented sample text which is very similar to the original sample text.

In [12]:
augmented_sample_text = augmenter.augment(sample_text)
augmented_sample_text

['there truly are some truly sick damn ppl out out there.']

In [13]:
train['label'].value_counts()

0    23760
1     1809
Name: label, dtype: int64

In [14]:
from tqdm.auto import tqdm
import numpy as np
from sklearn.utils import shuffle

#### capturing only hate tweets

In [15]:
hate_tweet=train[train.label==1]


In [16]:
hate_tweet.head()

Unnamed: 0,label,tweet
23677,1,@user by the time you get to saying #notallmen...
18661,1,@user #feminismiscancer #feminismisterrorism #...
6727,1,two bots one girl @user @user @user #tcot #p2...
23379,1,yes because apparently being qualified for a j...
26704,1,@user you might be a libtard if... #libtard #...


In [17]:
hate_tweet.label.value_counts()

1    1809
Name: label, dtype: int64

In [18]:
hate_tweet.iloc[1][1]

'@user #feminismiscancer #feminismisterrorism #feminismmuktbharat why  #malevote is ignored  @user'

#### function for creating Augmented tweets

In [19]:
def augment_tweet(train,augmenter, repetitions=1, samples=1809,):
    augmented_texts=[]
    for i in tqdm(np.random.randint(0, len(train), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(train['tweet'].iloc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'label': 1,
        'tweet': augmented_texts
    }
    aug_= pd.DataFrame(data)
    aug_train = shuffle(train.append(aug_).reset_index(drop=True))
    return aug_train 

In [20]:
aug_train = augment_tweet(hate_tweet, augmenter, samples=1809)

  0%|          | 0/1809 [00:00<?, ?it/s]

In [21]:
aug_train.head()

Unnamed: 0,label,tweet
628,1,@user if i'd posted your flag logo in 2008 i'd...
201,1,book - #german #colonialism in a global age h...
3064,1,[# black democrat for pay why has alway been t...
1282,1,@user go fuck yourself richie. we like it and ...
989,1,@user @user @user worked with the worst congre...


In [None]:

# augmented_sample_text = augmenter.augment(hate_tweet.iloc[100])
# augmented_sample_text

In [None]:
# def augmentMyData(df, augmenter, repetitions=1, samples=1809):
#     augmented_texts = []
#     # select only the minority class samples
#     spam_df = df[df['label'] == 1].reset_index(drop=True) # removes unecessary index column
#     for i in tqdm(np.random.randint(0, len(spam_df), samples)):
#         # generating 'n_samples' augmented texts
#         for _ in range(repetitions):
#             augmented_text = augmenter.augment(spam_df['tweet'].iloc[i])
#             augmented_texts.append(augmented_text)
    
#     data = {
#         'label': 1,
#         'tweet': augmented_texts
#     }
#     aug_df = pd.DataFrame(data)
#     df = shuffle(df.append(aug_df).reset_index(drop=True))
#     return df

In [22]:
aug_train.head()

Unnamed: 0,label,tweet
628,1,@user if i'd posted your flag logo in 2008 i'd...
201,1,book - #german #colonialism in a global age h...
3064,1,[# black democrat for pay why has alway been t...
1282,1,@user go fuck yourself richie. we like it and ...
989,1,@user @user @user worked with the worst congre...


In [23]:

train['label'].value_counts()

0    23760
1     1809
Name: label, dtype: int64

In [24]:
aug_train['label'].value_counts()

1    3618
Name: label, dtype: int64

In [25]:
print("Original: ", train.shape)
print("Augmented: ", aug_train.shape)

Original:  (25569, 2)
Augmented:  (3618, 2)


In [26]:
aug_train.head(10)

Unnamed: 0,label,tweet
628,1,@user if i'd posted your flag logo in 2008 i'd...
201,1,book - #german #colonialism in a global age h...
3064,1,[# black democrat for pay why has alway been t...
1282,1,@user go fuck yourself richie. we like it and ...
989,1,@user @user @user worked with the worst congre...
343,1,please don't forget to use the word ! that is ...
2324,1,"[oh yes!... and apparently this rude, nasty ma..."
2107,1,[you either might also be a youth libtard also...
2644,1,[# missing theresistance # google notmypreside...
1172,1,@user no liberal believes that. we are the pay...


In [27]:
#aug_train.drop('text', inplace=True, axis=1)
aug_train = aug_train.dropna()

In [28]:
non_hate_tweet=train[train.label==0]

In [29]:
non_hate_tweet.label.value_counts()

0    23760
Name: label, dtype: int64

In [32]:
non_hate_tweet= non_hate_tweet.iloc[0:4000]

In [33]:
new_aug_train=pd.concat([aug_train,non_hate_tweet],axis=0)

In [34]:
new_aug_train.head(10)

Unnamed: 0,label,tweet
628,1,@user if i'd posted your flag logo in 2008 i'd...
201,1,book - #german #colonialism in a global age h...
3064,1,[# black democrat for pay why has alway been t...
1282,1,@user go fuck yourself richie. we like it and ...
989,1,@user @user @user worked with the worst congre...
343,1,please don't forget to use the word ! that is ...
2324,1,"[oh yes!... and apparently this rude, nasty ma..."
2107,1,[you either might also be a youth libtard also...
2644,1,[# missing theresistance # google notmypreside...
1172,1,@user no liberal believes that. we are the pay...


#### final Tweets data for training

In [35]:
new_aug_train.label.value_counts()

0    4000
1    3618
Name: label, dtype: int64

 ### train and test datasets stored as CSV files. Let’s see how we can load them as datasets. Notice that HuggingFace requires the data to be as Dataset Dictionary

In [36]:
from sklearn.model_selection import train_test_split
train,test= train_test_split(new_aug_train, test_size=0.20, random_state=77)
train.to_csv('new_aug_train.csv')
test.to_csv('test.csv')

In [None]:
# aug_train.to_csv('./new_aug_train.csv',encoding='utf-8')
# test.to_csv('test.csv')

In [37]:
import datasets
from datasets import load_dataset, load_from_disk
dataset = load_dataset('csv', data_files={'train': 'new_aug_train.csv', 'test': 'test.csv'})
dataset



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-26f0a5fcbee0070f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-26f0a5fcbee0070f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'label', 'tweet'],
        num_rows: 6094
    })
    test: Dataset({
        features: ['Unnamed: 0', 'label', 'tweet'],
        num_rows: 1524
    })
})

### Fine-Tune the Model Keep in mind that the “target” variable should be called “label” and should be numeric. In this dataset, we are dealing with a binary problem, 0 (Ham) or 1 (Spam). So we will start with the “distilbert-base-cased” and then we will fine-tune it. First, we will load the tokenizer.

In [38]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
def tokenize_function(examples):
    return tokenizer(examples["tweet"], padding="max_length", truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

### load the model for the Sequence Classification.

In [39]:
from transformers import AutoModelForSequenceClassification
checkpoint = "distilbert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.b

In [40]:
import numpy as np
from datasets import load_metric
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

# Train the Model%colors

In [42]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, tweet. If Unnamed: 0, tweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6094
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810
  Number of trainable parameters = 65783042


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0714,0.46408,0.924541
2,0.0246,0.454214,0.938976
3,0.0061,0.491432,0.937664
4,0.0052,0.539833,0.940289
5,0.0019,0.548564,0.938976


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, tweet. If Unnamed: 0, tweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1524
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, tweet. If Unnamed: 0, tweet are not expected by `DistilBertForSequenceClassification.forward`,  you can safely i

TrainOutput(global_step=3810, training_loss=0.023879011691085937, metrics={'train_runtime': 1640.2911, 'train_samples_per_second': 18.576, 'train_steps_per_second': 2.323, 'total_flos': 4036281637048320.0, 'train_loss': 0.023879011691085937, 'epoch': 5.0})

### Save the model

In [43]:
model.save_pretrained("CustomModels/CustomHamSpam")
# alternatively save the trainer
# trainer.save_model("CustomModels/CustomHamSpam")
tokenizer.save_pretrained("CustomModels/CustomHamSpam")

Configuration saved in CustomModels/CustomHamSpam/config.json


('CustomModels/CustomHamSpam/tokenizer_config.json',
 'CustomModels/CustomHamSpam/special_tokens_map.json',
 'CustomModels/CustomHamSpam/vocab.txt',
 'CustomModels/CustomHamSpam/added_tokens.json',
 'CustomModels/CustomHamSpam/tokenizer.json')

### load the model

In [44]:

from transformers import AutoModelForSequenceClassification
load_model = AutoModelForSequenceClassification.from_pretrained("CustomModels/CustomHamSpam")
load_tokenizer = AutoTokenizer.from_pretrained("CustomModels/CustomHamSpam")

loading configuration file CustomModels/CustomHamSpam/config.json
Model config DistilBertConfig {
  "_name_or_path": "CustomModels/CustomHamSpam",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 28996
}



### Make Predictions

In [46]:
from transformers import pipeline
my_pipeline  = pipeline("text-classification", model=load_model, tokenizer=load_tokenizer)
data = ["Sometimes, you think that you want to disappear, but all you really want is to be found."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9999864101409912}]

In [47]:

data = ["I love you", "XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9999914169311523},
 {'label': 'LABEL_0', 'score': 0.9999924898147583}]

In [48]:
data=["Even the darkest night will end, and the sun will rise."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9999871253967285}]

In [49]:
data=["You are not born a winner. You are not born a loser. You are born a chooser."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9996776580810547}]

In [50]:

data=["	tweet78	@user hey, white people: you can call people 'white' by @user  #race  #identity #medâ¦"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9999738931655884}]

In [51]:

data=["	tweet57	@user lets fight against"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.999976396560669}]

In [52]:
data=["never been this down on myself in my entire life."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9999934434890747}]

In [53]:
data=["my cousins are asking why do i always where shawl...di ko lang masabi 'coz po i'm getting fat and m.."]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9999945163726807}]

In [54]:
data=["i dont like the way rwitter is going"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9999920129776001}]

In [55]:
data=["sometime we only need a little break from everything"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9999955892562866}]

### predict some hate tweets

In [56]:
data=["suppoer racist antirac"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9999737739562988}]

In [57]:
data=[" is still rooted in our society's attitude towards black sKin's people. read more from candide uyanze:Ã¢Â€Â¦"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9999814033508301}]

In [58]:
data=["""Revenge of the Sith is actually good in that it accurately depicts how society can be willingly led into fascism 
through conspiracy theories about "elites" and the use of ethnic scapegoating to explain economic oppression of the 
masses, and how liberalism is powerless to stop it""" ]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9992437362670898}]

In [59]:
data=["My only take on the idiots complaining that people who go to Pride parades might *~gasp~* witness some kink"]
my_pipeline(data)

[{'label': 'LABEL_0', 'score': 0.9999955892562866}]

In [60]:
data=["""If we do not stand now and perform our god given duty to keep OUR country clean of all the Blacks,
 Jews and Yellow scum from Asia, WE are just as bad as the enemy, if not worse. 
We are trading our race for that of an inferior form of trash."""]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9994024038314819}]

In [61]:
data=["over-excited women is attacked by monkey while opening  present  that monkey repping banana yo!"]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9999685287475586}]

In [62]:
data=["""If you have to start a sentence with 'I'm not racist, but...then chances are you're pretty racist. Opinions my own. RT≠endorsement, obviously."""]
my_pipeline(data)

[{'label': 'LABEL_1', 'score': 0.9999781847000122}]