## 1. DATA CLEANING AND PRE PROCESSING

In [4]:
import pandas as pd
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('train-balanced-sarcasm.csv')
df

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...
...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,..."
1010822,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...
1010823,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...
1010824,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...


In [6]:
df.shape

(1010826, 10)

In [7]:
df = df[['label', 'comment']]
df

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.
...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...
1010822,1,"whatever you do, don't vote green!"
1010823,1,Perhaps this is an atheist conspiracy to make ...
1010824,1,The Slavs got their own country - it is called...


In [8]:
df.isna().sum()

Unnamed: 0,0
label,0
comment,55


In [9]:
df.dropna(inplace = True)
df.isna().sum()

Unnamed: 0,0
label,0
comment,0


##droping the symbols and numbers

In [10]:
df['comment'] = df['comment'].str.replace(r'[^a-zA-Z\s]', '', regex = True)

## converting the data into lowercase

In [11]:
def lowercase(text):
  return text.lower()

df['comment']= df['comment'].apply(lowercase)

In [12]:
df = df[:50000]
df

Unnamed: 0,label,comment
0,0,nc and nh
1,0,you do know west teams play against west teams...
2,0,they were underdogs earlier today but since gr...
3,0,this meme isnt funny none of the new york nigg...
4,0,i could use one of those tools
...,...,...
49996,0,thats pretty specific howd you get there consi...
49997,0,shitty tactics get you shitty results
49998,0,remember different rules for the rich nothing ...
49999,0,warp a minion horde before arrows come down


## 2. TOKENIZATION

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### function for tokenization

In [14]:
def tokenize_data(text, max_length = 100):
  return tokenizer(
      text.tolist(),
      max_length = max_length,
      truncation = True,
      padding = 'max_length',
      return_tensors = 'np'
  )

tokenized_data = tokenize_data(df['comment'])

In [15]:
tokenized_data

{'input_ids': array([[  101, 13316,  1998, ...,     0,     0,     0],
       [  101,  2017,  2079, ...,     0,     0,     0],
       [  101,  2027,  2020, ...,     0,     0,     0],
       ...,
       [  101,  3342,  2367, ...,     0,     0,     0],
       [  101, 24136,  1037, ...,     0,     0,     0],
       [  101,  2092,  1996, ...,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}

## 3. TRAIN TEST SPLIT

In [16]:
X = tokenized_data['input_ids']
y = df['label']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [18]:
X_train.shape, X_test.shape

((40000, 100), (10000, 100))

## Building the model according to the proposed architecture

In [19]:
# Hierarchical BERT model definition
class HierarchicalBERT(tf.keras.Model):

    # Model initialization
    def __init__(self, bert_model, lstm_units, cnn_filters, dense_units):
        super(HierarchicalBERT, self).__init__()

        # Pretrained BERT model
        self.bert = bert_model

        # Sentence encoding layer
        self.dense_sentence = tf.keras.layers.Dense(768, activation='relu')

        # Context summarization layer
        self.mean_pooling = tf.keras.layers.GlobalAveragePooling1D()

        # Context encoding BiLSTM layer
        self.bilstm_encoder = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(lstm_units, return_sequences=True)
        )

        # CNN feature extraction layer
        self.conv = tf.keras.layers.Conv1D(cnn_filters, kernel_size=1, activation='relu')

        # Global max pooling layer
        self.pool = tf.keras.layers.GlobalMaxPooling1D()

        # Fully connected dense layer
        self.dense_df = tf.keras.layers.Dense(dense_units, activation='relu')

        # Output classification layer
        self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

    # Forward pass
    def call(self, inputs, training=False):

        # BERT embeddings
        bert_output = self.bert(inputs, training=training).last_hidden_state

        # Sentence-level encoding
        sentence_encoding = self.dense_sentence(bert_output)

        # Context summarization
        context_summarized = self.mean_pooling(sentence_encoding)

        # Expand dimension for LSTM
        context_summarized = tf.expand_dims(context_summarized, axis=1)

        # Context encoding
        context_encoded = self.bilstm_encoder(context_summarized)

        # Convolution operation
        conv_output = self.conv(context_encoded)

        # Pooling operation
        pooled_output = self.pool(conv_output)

        # Fully connected layer
        dense_output = self.dense_df(pooled_output)

        # Final prediction
        final_output = self.output_layer(dense_output)

        return final_output


In [20]:
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    use_fast=False   # avoids safetensors fast-path
)

bert_model = TFBertModel.from_pretrained(
    "bert-base-uncased",
    from_pt=True,        # convert PyTorch → TensorFlow safely
    ignore_mismatched_sizes=True
)


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Al

In [21]:
# defining the hierarchical bert model
model = HierarchicalBERT(bert_model, lstm_units = 128, cnn_filters = 64, dense_units = 32)

In [22]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-5),
    loss='binary_crossentropy',
    metrics=['accuracy']
)


In [23]:
model.fit(X_train, y_train, epochs = 3, batch_size = 64)

Epoch 1/3




Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x782310fb5670>

In [24]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Model accuracy : {accuracy*100}")

Model accuracy : 72.04999923706055


## Testing my Model


In [45]:
import tensorflow as tf

def test_sentence(sentence, model, tokenizer, max_length=128, threshold=0.35):
    """
    Test a single sentence for sarcasm using trained model.
    Returns: (label, confidence)
    """

    # Tokenize input (same as training)
    inputs = tokenizer(
        sentence,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf'
    )

    # Model inference
    preds = model(inputs, training=False)

    # Convert model output to probability
    if preds.shape[-1] == 2:
        probs = tf.nn.softmax(preds, axis=-1)
        sarcasm_prob = float(probs[0][1])
    else:
        sarcasm_prob = float(preds[0][0])  # sigmoid output

    # Apply threshold
    label = "Sarcastic" if sarcasm_prob >= threshold else "Not Sarcastic"

    return label, round(sarcasm_prob, 3)


In [47]:
sentence = "Amazing service, waited only two hours."

label, confidence = test_sentence(sentence, model, tokenizer)

print("Sentence:", sentence)
print("Prediction:", label)
print("Confidence:", confidence)


Sentence: Amazing service, waited only two hours.
Prediction: Sarcastic
Confidence: 0.486
