In [1]:
!pip install "transformers[sentencepiece]"

In [None]:
!pip install datasets

In [3]:
import numpy as np
import pandas as pd

In [25]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
df_train.head(2)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [5]:
print("Shape of the train data: ", df_train.shape)
print("shape of the test data: ", df_test.shape)


Shape of the train data:  (27481, 4)
shape of the test data:  (3534, 3)


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_val = train_test_split(df_train,test_size=0.1,random_state=42)

print("Shape of the train data: ", X_train.shape)
print("Shape of the validation data: ", X_val.shape)

Shape of the train data:  (24732, 4)
Shape of the validation data:  (2749, 4)


In [7]:
# filling na values with "" 
X_train.dropna(inplace=True)
X_val.dropna(inplace=True)

print("Shape of the train data: ", X_train.shape)
print("Shape of the validation data: ", X_val.shape)

Shape of the train data:  (24731, 4)
Shape of the validation data:  (2749, 4)


In [8]:
from transformers import AutoTokenizer

# using roberta-base tokenizer to tokenize the text into input IDs that model can make sense of.
model_checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [52]:
tokenizer.save_pretrained("roberta_base_tokenizer_new")

('roberta_base_tokenizer_new/tokenizer_config.json',
 'roberta_base_tokenizer_new/special_tokens_map.json',
 'roberta_base_tokenizer_new/vocab.json',
 'roberta_base_tokenizer_new/merges.txt',
 'roberta_base_tokenizer_new/added_tokens.json',
 'roberta_base_tokenizer_new/tokenizer.json')

In [9]:
from datasets import Dataset

## converting train and validation pandas dataset into huggingFace Dataset format.

X_train.reset_index(drop=True,inplace=True)
X_val.reset_index(drop=True,inplace=True)

train_data = Dataset.from_pandas(X_train)
validation_data = Dataset.from_pandas(X_val)

train_data

Dataset({
    features: ['textID', 'text', 'selected_text', 'sentiment'],
    num_rows: 24731
})

In [10]:
context = train_data[0]["text"]
question = train_data[0]["sentiment"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

'<s>positive</s></s>WTF facebook just cleared out my whole survey and i was on the last q, this night gets better and better  what else is next?</s>'

In [11]:
# function to tokenize each sample.
def preprocess(example):
  return tokenizer(
    example["sentiment"],
    example["text"],
    return_offsets_mapping = True
)

check_dataset = train_data.map(
    preprocess,
    batched=True,
)

check_dataset

  0%|          | 0/25 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'text', 'textID'],
    num_rows: 24731
})

In [12]:
## finding the maximum length of a sequece after tokenization
MAX_LENGTH = 0
for i in check_dataset:
  length = len(i["input_ids"])
  if length > MAX_LENGTH:
    MAX_LENGTH = length
    
print("Max length of a sequence after tokenization: ", MAX_LENGTH)

Max length of a sequence after tokenization:  105


In [13]:
def preprocess_training_examples(examples):
  inputs = tokenizer(
    examples["sentiment"],
    examples["text"],
    max_length = MAX_LENGTH,
    return_offsets_mapping = True,
    padding = "max_length",
    )
  start_positions = []
  end_positions = []
  #print(inputs["offset_mapping"])

  for i, offset in enumerate(inputs["offset_mapping"]):

    answer = examples["selected_text"][i]
    question = examples["sentiment"][i]
    context = examples["text"][i]
    
    # print("context: ", context)
    # print("Answer: ", answer)
    # finding the index of first character and the index of last character of answer in the context(tweet_text)
    
    start_char = 0
    end_char  = 0
    for idx,ch in enumerate(context):
      count = idx
      flag = True
      for j in answer:
        if context[count] == j:
          count +=1
        else:
          flag = False
          break
      if flag:
        start_char = idx
        break

    end_char = start_char + len(answer)
    # print((tokenizer.decode(inputs["input_ids"][i])))
    # print("*"*200)
    # print(len(inputs["input_ids"]))
    # print("*"*200)
    # print((inputs.sequence_ids(i)))

    # finding the start  and end of the context
    sequence_ids = inputs.sequence_ids(i)
    idx = 0
    try:
      while sequence_ids[idx]!=1:
        idx+=1
      context_start = idx

      while sequence_ids[idx] ==1:
        idx+=1
        if idx == len(sequence_ids):
          break
      context_end = idx-1
    except:
      print(sequence_ids)
    # finding the start position 
    # print("Shart Char: ", start_char)
    # print("End char: ", end_char)
    # print("Context Start: ", context_start)
    # print("Context end: ", context_end)
    
    idx = context_start
    while idx <= context_end and offset[idx][0] <= start_char:
      idx+=1
    start_positions.append(idx-1)
    
    idx = context_end -1
    while idx >= context_start and offset[idx][1] >= end_char:
      idx-=1
    end_positions.append(idx+1)
  
  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions
   
  return inputs

In [14]:
processed_train_data = train_data.map(preprocess_training_examples,batched=True)
processed_train_data

  0%|          | 0/25 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'start_positions', 'text', 'textID'],
    num_rows: 24731
})

In [15]:
idx = 20000
answer = processed_train_data[idx]["selected_text"]

start = processed_train_data[idx]["start_positions"]
end = processed_train_data[idx]["end_positions"]
labeled_answer = tokenizer.decode(processed_train_data[idx]["input_ids"][start : end +1 ])

print(f"Theoretical answer: {answer}")
print(f"labels give: {labeled_answer}")

Theoretical answer: I missed you today...I know you must feel very tired.
labels give:  I missed you today...I know you must feel very tired.


In [16]:
processed_val_data = validation_data.map(preprocess_training_examples, batched = True)
processed_val_data

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'start_positions', 'text', 'textID'],
    num_rows: 2749
})

In [17]:
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFRobertaForQuestionAnswering.

Some layers of TFRobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
tf_train_dataset = processed_train_data.to_tf_dataset(
    columns=[
        "input_ids",
        "start_positions",
        "end_positions",
        "attention_mask",
    ],
    dummy_labels=True,
    shuffle=True,
    batch_size=32,
)

In [19]:
tf_eval_dataset = processed_val_data.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    shuffle=False,
    batch_size=32,
)

In [20]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

num_train_epochs = 5
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.001,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
#tf.keras.mixed_precision.set_global_policy("mixed_float16")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as the 'labels' key of the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [21]:
import tensorflow as tf

earlyStop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss', min_delta=0, patience=1, verbose=0,
    mode='auto',baseline=None, restore_best_weights=True
    )
# We're going to do validation afterwards, so no validation mid-training
model.fit(tf_train_dataset, epochs=num_train_epochs,callbacks = [earlyStop_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7efd8911ded0>

In [26]:
model.fit(tf_train_dataset, epochs=10,initial_epoch = 5,callbacks = [earlyStop_callback])

Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.callbacks.History at 0x7eff124a8890>

In [29]:
model.save_pretrained("roberta_base_fine_tuned")

In [30]:
df_test.reset_index(drop=True,inplace=True)

test_data = Dataset.from_pandas(df_test)
test_data

Dataset({
    features: ['textID', 'text', 'sentiment'],
    num_rows: 3534
})

In [34]:
def post_porocess_data(examples):
  questions = examples["sentiment"]
  context = examples["text"]
  inputs = tokenizer(
      questions,
      context,
      max_length = MAX_LENGTH,
      padding="max_length",
      return_offsets_mapping = True,   
  )

  for i in range(len(inputs["input_ids"])):
    offset = inputs["offset_mapping"][i]
    sequence_ids = inputs.sequence_ids(i)
    inputs["offset_mapping"][i] = [
                                  o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
    ]
  return inputs

In [35]:
processed_val_data = validation_data.map(
    post_porocess_data,
    batched = True
)

processed_val_data

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'text', 'textID'],
    num_rows: 2749
})

In [36]:
tf_eval_dataset = processed_val_data.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    shuffle=False,
    batch_size=16,
)

In [37]:
processed_test_data = test_data.map(
    post_porocess_data,
    batched = True
)

processed_test_data

  0%|          | 0/4 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'offset_mapping', 'sentiment', 'text', 'textID'],
    num_rows: 3534
})

In [38]:
tf_test_dataset = processed_test_data.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    shuffle=False,
    batch_size=16,
)

In [39]:
outputs = model.predict(tf_eval_dataset)


In [40]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits

In [41]:
from tqdm.auto import tqdm
n_best = 20

def predict_answers(start_logits,end_logits, inputs, examples):
    predicted_answers = []
    for i in range(len(examples["textID"])):
        start_logit = start_logits[i]
        end_logit = end_logits[i]
        context = examples["text"][i]

        offset = inputs["offset_mapping"][i]
        start_indexes = np.argsort(start_logit)[-1: -n_best - 1:-1].tolist()
        end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()

        flag = False
        for start_index in start_indexes:
            for end_index in end_indexes:
                # skip answer that are not in the context.
                if offset[start_index] is None or offset[end_index] is None:
                    continue
                # skip answer with length that is either < 0
                if end_index < start_index:
                    continue

                flag = True
                answer = context[offset[start_index][0]: offset[end_index][1]]
                predicted_answers.append(answer)
                break
            if flag:
                break
        if not flag:
            predicted_answers.append(context)
    return predicted_answers


In [42]:
predicted_val_answers = predict_answers(start_logits, end_logits, processed_val_data, validation_data)

In [43]:
len(predicted_val_answers)

2749

In [44]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if (len(a)==0) & (len(b)==0): return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [45]:
theoritcal_answers = processed_val_data["selected_text"]

# calculating the jaccard score
score = 0
for i in range(len(theoritcal_answers)):
  score += jaccard(theoritcal_answers[i],predicted_val_answers[i])


In [46]:
score /= len(theoritcal_answers)
print("Jaccard score on validation data: ", score)

Jaccard score on validation data:  0.7180039345097456


In [47]:
tf_test_dataset

<PrefetchDataset shapes: {attention_mask: (None, None), input_ids: (None, None)}, types: {attention_mask: tf.int64, input_ids: tf.int64}>

In [48]:
outputs = model.predict(tf_test_dataset)

start_logits = outputs.start_logits
end_logits = outputs.end_logits


In [49]:
predicted_test_answers = predict_answers(start_logits, end_logits, processed_test_data, test_data)

len(predicted_test_answers)

3534