<a href="https://colab.research.google.com/github/liyueling13/BERT-for-Sarcasm-Detection/blob/main/Distilbert%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## First some setup

In [None]:
import pandas as pd

In [None]:
!pip install datasets tensorflow transformers
import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses
from datasets import load_dataset
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import pickle

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━

## Load the dataset

In [None]:
HF_irony = load_dataset('tweet_eval', 'irony')

Downloading builder script:   0%|          | 0.00/9.72k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/21.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/108k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/32.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/36.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244 [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2862 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/784 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/955 [00:00<?, ? examples/s]

In [None]:
train = HF_irony['train'].to_pandas()
test = HF_irony['test'].to_pandas()
validation = HF_irony['validation'].to_pandas()

In [None]:
irony_df = pd.concat([train, test, validation])
irony_df

Unnamed: 0,text,label
0,seeing ppl walking w/ crutches makes me really...,1
1,"look for the girl with the broken smile, ask h...",0
2,Now I remember why I buy books online @user #s...,1
3,@user @user So is he banded from wearing the c...,1
4,Just found out there are Etch A Sketch apps. ...,1
...,...,...
950,Abraham was actually from modern day Iraq (Ur ...,0
951,@user which one is more disturbing dan? Tickli...,1
952,@user @user haha that's cool! I had a feeling ...,0
953,@user @user Let the Western bastards bank acco...,1


In [None]:
# checking if we have any missing values

irony_df.isna().value_counts()

text   label
False  False    4601
dtype: int64

In [None]:
# checking if it's balanced

irony_df.iloc[:, 1].value_counts()

0    2389
1    2212
Name: label, dtype: int64

## Build the tensorflow dataset:
First create encodings for X,
then assemble together with y

In [None]:
first_line = irony_df.iloc[0,0]

In [None]:
MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 50

tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)

inputs = tkzr(first_line, max_length=MAX_LEN, truncation=True, padding=True)

print(f'first_line: \'{first_line}\'')
print(f'input ids: {inputs["input_ids"]}')
print(f'attention mask: {inputs["attention_mask"]}')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

first_line: 'seeing ppl walking w/ crutches makes me really excited for the next 3 weeks of my life'
input ids: [101, 3773, 4903, 2140, 3788, 1059, 1013, 13675, 4904, 8376, 3084, 2033, 2428, 7568, 2005, 1996, 2279, 1017, 3134, 1997, 2026, 2166, 102]
attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
X = irony_df.iloc[:, 0].tolist()
X

['seeing ppl walking w/ crutches makes me really excited for the next 3 weeks of my life',
 'look for the girl with the broken smile, ask her if she wants to stay while, and she will be loved. 💕🎵',
 'Now I remember why I buy books online @user #servicewithasmile',
 '@user @user So is he banded from wearing the clothes?  #Karma',
 'Just found out there are Etch A Sketch apps.  #oldschool #notoldschool',
 "Hey what do you know, one of the witnesses supporting Darren Wilson's story lied! And is racist! Mind blown!",
 '@user on stage at #flzjingleball at the @user in #Tampa #iheartradio',
 "You know it's going to be a great day when you're Garmin resets itself and you spill some cinnamon down yourself  #slowclap",
 'Halfway thorough my workday ... Woooo',
 'Would like to thank my nephew for giving me his horrible cold & sore throat etc.. Much appreciated!',
 "I forked node!  Get ready for the future.  (Where's my interviews)",
 "@user @user @user @user I'm off to visit great-nephew very il

In [None]:
y = irony_df.iloc[:, 1].tolist()
y

[1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,


In [None]:
MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 50

tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)

def construct_encodings(X, tkzr, max_len, trucation=True, padding=True):
    return tkzr(X, max_length=max_len, truncation=trucation, padding=padding)

encodings = construct_encodings(X, tkzr, max_len=MAX_LEN)

In [None]:
def construct_tfdataset(encodings, y=None):
    if y:
        return tf.data.Dataset.from_tensor_slices((dict(encodings),y))
    else:
        # this case is used when making predictions on unseen samples aftera training
        return tf.data.Dataset.from_tensor_slices(dict(encodings))

tfdataset = construct_tfdataset(encodings, y)

In [None]:
for element in tfdataset.take(1):
    print(element)

({'input_ids': <tf.Tensor: shape=(50,), dtype=int32, numpy=
array([  101,  3773,  4903,  2140,  3788,  1059,  1013, 13675,  4904,
        8376,  3084,  2033,  2428,  7568,  2005,  1996,  2279,  1017,
        3134,  1997,  2026,  2166,   102,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(50,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int32)>}, <tf.Tensor: shape=(), dtype=int32, numpy=1>)


## Split into train and test

In [None]:
TEST_SPLIT = 0.2
BATCH_SIZE = 64

train_size = int(len(X) * (1-TEST_SPLIT))

tfdataset = tfdataset.shuffle(len(X))
tfdataset_train = tfdataset.take(train_size)
tfdataset_test = tfdataset.skip(train_size)

tfdataset_train = tfdataset_train.batch(BATCH_SIZE)
tfdataset_test = tfdataset_test.batch(BATCH_SIZE)

## Setup the model

In [None]:
N_EPOCHS = 5

model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = optimizers.Adam(learning_rate=1e5)
loss = losses.BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

model.fit(tfdataset_train, batch_size=BATCH_SIZE, epochs=N_EPOCHS)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7915eb758310>

In [None]:
benchmarks = model.evaluate(tfdataset_test, return_dict=True, batch_size=BATCH_SIZE)
print(benchmarks)

{'loss': 8.377974510192871, 'accuracy': 0.4505971670150757}


## Predict some irony

In [None]:
def create_predictor(model, model_name, max_len):
  tkzr = DistilBertTokenizer.from_pretrained(model_name)
  def predict_proba(text):
      x = [text]

      encodings = construct_encodings(x, tkzr, max_len=max_len)
      tfdataset = construct_tfdataset(encodings)
      tfdataset = tfdataset.batch(1)

      preds = model.predict(tfdataset).logits
      preds = activations.softmax(tf.convert_to_tensor(preds)).numpy()
      return preds[0][0]

  return predict_proba

clf = create_predictor(model, MODEL_NAME, MAX_LEN)
print(clf('wow I cant believe Im so smart'))

0.0


In [None]:
print(clf('I have food poisoning so clearly I feel great'))

0.0


## Save the model

In [None]:
model.save_pretrained('./model/clf')
with open('./model/info.pkl', 'wb') as f:
    pickle.dump((MODEL_NAME, MAX_LEN), f)

In [None]:
new_model = TFDistilBertForSequenceClassification.from_pretrained('./model/clf')
model_name, max_len = pickle.load(open('./model/info.pkl', 'rb'))

clf = create_predictor(new_model, model_name, max_len)
print(clf('wow I cant believe Im so smart'))

Some layers from the model checkpoint at ./model/clf were not used when initializing TFDistilBertForSequenceClassification: ['dropout_179']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./model/clf and are newly initialized: ['dropout_199']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


nan


In [None]:
print(clf('Nice job breaking the vase, I really appreciate your carefulness'))

0.0


## Another attempt with a subset of the data

In [None]:
Xsm = irony_df.iloc[:10000, 0].tolist()
ysm = irony_df.iloc[:10000, 1].tolist()

In [None]:
encodingssm = construct_encodings(Xsm, tkzr, max_len=MAX_LEN)
tfdatasetsm = construct_tfdataset(encodingssm, ysm)

for elementsm in tfdatasetsm.take(1):
    print(elementsm)

({'input_ids': <tf.Tensor: shape=(50,), dtype=int32, numpy=
array([  101,  3773,  4903,  2140,  3788,  1059,  1013, 13675,  4904,
        8376,  3084,  2033,  2428,  7568,  2005,  1996,  2279,  1017,
        3134,  1997,  2026,  2166,   102,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(50,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int32)>}, <tf.Tensor: shape=(), dtype=int32, numpy=1>)


In [None]:
TEST_SPLIT = 0.2
BATCH_SIZEsm = 4

train_sizesm = int(len(Xsm) * (1-TEST_SPLIT))

tfdatasetsm = tfdatasetsm.shuffle(len(Xsm))
tfdataset_trainsm = tfdatasetsm.take(train_sizesm)
tfdataset_testsm = tfdatasetsm.skip(train_sizesm)

tfdataset_trainsm = tfdataset_trainsm.batch(BATCH_SIZEsm)
tfdataset_testsm = tfdataset_testsm.batch(BATCH_SIZEsm)

In [None]:
N_EPOCHS = 2

modelsm = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = optimizers.Adam(learning_rate=1e5)
loss = losses.BinaryCrossentropy()
modelsm.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

modelsm.fit(tfdataset_trainsm, batch_size=BATCH_SIZEsm, epochs=N_EPOCHS)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7915db480dc0>

Unfortunately it wasn't any better. I'm not sure if it's the data itself that's difficult to predict? Or is it that I need to unfreeze some of the training layers?