In [12]:
# Installing the transformers for use of DistilBERT
!pip install transformers



In [10]:
# Import the libraries
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
import tensorflow as tf
import numpy as np
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

In [11]:
# Mount the google drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Import the train data
df = pd.read_csv('/content/drive/MyDrive/NLP/train.csv')

# Check for the null values
print(df.isnull().any(axis = 0))

id                   False
text                 False
is_humor             False
humor_rating          True
humor_controversy     True
offense_rating       False
dtype: bool


In [14]:
# Checking for shape of the dataset
df.shape

(8000, 6)

## **TASK1(A)** Humor detection

In [15]:
# Train feature
X_train=list(df['text'])

In [16]:
# Train Label
y_train=list(df['is_humor'])

In [17]:
# Spliting the data into training and testing the dataset
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.20, random_state = 0)

In [18]:
# DistilBertTokenizerFast for tokenize the text
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




In [19]:
# Encode the text into id vector with truncation and padding
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [20]:
# Making the training dataset using tensorflow

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [21]:
# Making Training Argument for model training
training_args = TFTrainingArguments(
    output_dir='./results_is_humor',          
    num_train_epochs=2,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=16,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs_is_humor',            
    logging_steps=10,
    eval_steps=16
)

In [22]:
# Use 'distilbert-base-uncased' as pre-trained model to get weights 
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = TFTrainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=test_dataset             
)

trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [23]:
# Evaluate the model for measuring loss
trainer.evaluate(test_dataset)



{'eval_loss': 0.2925073051452637}

In [24]:
# Load the test dataset
df_test = pd.read_csv('/content/drive/MyDrive/NLP/gold_test.csv')
X_test=list(df_test['text'])
y_test=list(df_test['is_humor'])

# Encode the test dataset
test_encodings = tokenizer(X_test, truncation=True, padding=True)

# make testdataset using tensorflow
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [25]:
# Test the model using test dataset
output=trainer.predict(test_dataset)[1]



In [26]:
# Make confusion matrix
from sklearn.metrics import confusion_matrix

cm=confusion_matrix(y_test,output)
cm

array([[385,   0],
       [  0, 615]])

In [27]:
# Save the model for future use
trainer.save_model('is_humor_model')

## **TASK1(B)** Humor rating

In [28]:
# Read the train data for Humor Rating task
df = pd.read_csv('/content/drive/MyDrive/NLP/train.csv')
df['humor_rating'] = df['humor_rating'].fillna(0)
df = df[(df.is_humor == 1)]

In [29]:
# Train feature
X_train_hr=list(df['text'])

In [30]:
# Train label
y_train_hr=list(df['humor_rating'])

In [31]:
# Spliting the data into training and testing the dataset
X_train_hr, X_test_hr, y_train_hr, y_test_hr = train_test_split(X_train_hr, y_train_hr, test_size = 0.20, random_state = 0)

In [32]:
# DistilBertTokenizerFast for tokenize the text
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [33]:
# Encode the text into id vector with truncation and padding
train_encodings = tokenizer(X_train_hr, truncation=True, padding=True)
test_encodings = tokenizer(X_test_hr, truncation=True, padding=True)

In [34]:
# Making the training dataset using tensorflow
train_dataset_hr = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train_hr
))

test_dataset_hr = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test_hr
))

In [35]:
# Making Training Argument for model training
training_args_humor_rating = TFTrainingArguments(
    output_dir='./results_humor_rating',          
    num_train_epochs=2,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=16,  
    warmup_steps=500,               
    weight_decay=0.01,              
    logging_dir='./logs_humor_rating',
    logging_steps=10,
    eval_steps=64
)

In [36]:
# Use 'distilbert-base-uncased' as pre-trained model to get weights 
with training_args_humor_rating.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels = 1)

trainer = TFTrainer(
    model=model,                         
    args=training_args_humor_rating,     
    train_dataset=train_dataset_hr,      
    eval_dataset=test_dataset_hr         
)

trainer.train()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use i



In [37]:
# Evaluate the model for measuring loss
trainer.evaluate(test_dataset_hr)



{'eval_loss': 0.017048018593941967}

In [38]:
# Load the test dataset
df_test = pd.read_csv('/content/drive/MyDrive/NLP/gold_test.csv')
df_test['humor_rating'] = df_test['humor_rating'].fillna(0)
X_test=list(df_test['text'])
y_test=list(df_test['humor_rating'])

# Encode the test dataset
test_encodings = tokenizer(X_test, truncation=True, padding=True)

# make testdataset using tensorflow
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [39]:
# Test the model using test dataset
output=trainer.predict(test_dataset)[1]



In [40]:
# Calculate MSE
diff = 0
for i in range(output.shape[0]):
  if(output[i] != 'nan' or y_test[i]):
    diff += (abs(y_test[i] - output[i])**2)

print(diff/len(y_test))

1.9739289303990405e-15


In [41]:
# Save the model for future use
trainer.save_model('humor__rating_model')

## **TASK1(C)** Humor controversy detection

In [42]:
# Read the train data for Humor Controversy task and preprocess with imputing 0 in null cells
df = pd.read_csv('/content/drive/MyDrive/NLP/train.csv')
df['humor_controversy'] = df['humor_controversy'].fillna(0)
df['humor_controversy'] = df['humor_controversy'].apply(np.int64)
df = df[(df.is_humor == 1)]

In [43]:
# Train feature
X_train = list(df['text'])

In [44]:
# Train label
y_train=list(df['humor_controversy'])

In [45]:
# Spliting the data into training and testing the dataset
X_train_hc, X_test_hc, y_train_hc, y_test_hc = train_test_split(X_train, y_train, test_size = 0.20, random_state = 0)

In [46]:
# DistilBertTokenizerFast for tokenize the text
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [47]:
# Encode the text into id vector with truncation and padding
train_encodings = tokenizer(X_train_hc, truncation=True, padding=True)
test_encodings = tokenizer(X_test_hc, truncation=True, padding=True)

In [48]:
# Making the training dataset using tensorflow
train_dataset_hc = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train_hc
))

test_dataset_hc = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test_hc
))

In [49]:
# Making Training Argument for model training
training_args_hc = TFTrainingArguments(
    output_dir='./results_humor_controversy',          
    num_train_epochs=2,             
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=16,  
    warmup_steps=500,               
    weight_decay=0.01,              
    logging_dir='./logs_humor_controversy',
    logging_steps=10,
    eval_steps=16
)

In [50]:
# Use 'distilbert-base-uncased' as pre-trained model to get weights 
with training_args_hc.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = TFTrainer(
    model=model,                         
    args=training_args_hc,               
    train_dataset=train_dataset_hc,      
    eval_dataset=test_dataset_hc         
)

trainer.train()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_59', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i



In [51]:
# Evaluate the model for measuring loss
trainer.evaluate(test_dataset_hc)



{'eval_loss': 0.7450577520555065}

In [52]:
# Load the test dataset
df_test = pd.read_csv('/content/drive/MyDrive/NLP/gold_test.csv')
df_test['humor_controversy'] = df_test['humor_controversy'].fillna(0)
df_test['humor_controversy'] = df_test['humor_controversy'].apply(np.int64)
X_test=list(df_test['text'])
y_test=list(df_test['humor_controversy'])

# Encode the test dataset
test_encodings = tokenizer(X_test, truncation=True, padding=True)

# make testdataset using tensorflow
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [53]:
# Test the model using test dataset
output=trainer.predict(test_dataset)[1]



In [54]:
# Make confusion matrix
cm=confusion_matrix(y_test,output)
cm

array([[721,   0],
       [  0, 279]])

In [55]:
# Save the model for future use
trainer.save_model('humor_controversy_model')

## **TASK2** Humor offense rating

In [56]:
# Read the train data for Humor Offense Rating task
df = pd.read_csv('/content/drive/MyDrive/NLP/train.csv')
df = df[(df.is_humor == 1)]

In [57]:
# Train feature
X_train = list(df['text'])

In [58]:
# Train label
y_train=list(df['offense_rating'])

In [59]:
# Spliting the data into training and testing the dataset
X_train_or, X_test_or, y_train_or, y_test_or = train_test_split(X_train, y_train, test_size = 0.20, random_state = 0)

In [60]:
# DistilBertTokenizerFast for tokenize the text
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [61]:
# Encode the text into id vector with truncation and padding
train_encodings = tokenizer(X_train_or, truncation=True, padding=True)
test_encodings = tokenizer(X_test_or, truncation=True, padding=True)

In [62]:
# Making the training dataset using tensorflow
train_dataset_or = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train_or
))

test_dataset_or = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test_or
))

In [63]:
# Making Training Argument for model training
training_args_or = TFTrainingArguments(
    output_dir='./results_Offense_Rating',          
    num_train_epochs=2,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=16,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs_Offense_Rating',
    logging_steps=10,
    eval_steps=16
)

In [64]:
# Use 'distilbert-base-uncased' as pre-trained model to get weights 
with training_args_or.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = TFTrainer(
    model=model,                         
    args=training_args_or,          
    train_dataset=train_dataset_or,    
    eval_dataset=test_dataset_or       
)

trainer.train()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use i



In [65]:
# Evaluate the model for measuring loss
trainer.evaluate(test_dataset_or)



{'eval_loss': nan}

In [66]:
# Load the test dataset
df_test = pd.read_csv('/content/drive/MyDrive/NLP/gold_test.csv')
df_test['humor_controversy'] = df_test['humor_controversy'].fillna(0)
df_test['humor_controversy'] = df_test['humor_controversy'].apply(np.int64)
X_test=list(df_test['text'])
y_test=list(df_test['offense_rating'])

# Encode the test dataset
test_encodings = tokenizer(X_test, truncation=True, padding=True)

# make testdataset using tensorflow
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [67]:
# Test the model using test dataset
output=trainer.predict(test_dataset)[1]



In [68]:
# Calculate MSE
diff = 0
for i in range(output.shape[0]):
  diff += (abs(y_test[i] - output[i])**2)

print(diff/len(y_test))

5.770972587808648e-16


In [69]:
# Save the model for future use
trainer.save_model('offence_rating_model')

## References:

- https://huggingface.co/transformers/training.html
- https://youtu.be/V1-Hm2rNkik
- https://huggingface.co/transformers/model_doc/distilbert.html
