# <span style="color:red">Twitter Sentiment Extractor</span>



> This is an **extractive question-answering problem** where tweet_text is a context, sentiment is a question and selected_text is an answer.

In [1]:
## basic imports
import pandas as pd
import numpy as np

In [2]:
pip install "transformers[sentencepiece]"

In [3]:
pip install datasets

# 1. Data Prep

In [10]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_train.head(2)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [11]:
print("Shape of the train data: ", df_train.shape)
print("shape of the test data: ", df_test.shape)


Shape of the train data:  (27481, 4)
shape of the test data:  (3534, 3)


# 1. Train-Validation Split.

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_val = train_test_split(df_train,test_size=0.1,random_state=42)

print("Shape of the train data: ", X_train.shape)
print("Shape of the validation data: ", X_val.shape)

Shape of the train data:  (24732, 4)
Shape of the validation data:  (2749, 4)


In [13]:
# filling na values with "" 
X_train.dropna(inplace=True)
X_val.dropna(inplace=True)

print("Shape of the train data: ", X_train.shape)
print("Shape of the validation data: ", X_val.shape)

Shape of the train data:  (24731, 4)
Shape of the validation data:  (2749, 4)


# 2. Preparing the Data/ Preprocessing.

> The **text(context) field** and **sentiment(question) field** are very straightforward to use as input for the model. The **seleced_text( answer) field** is a bit trickier to use as the output.</span>
>
> We need to generate labels for the question's answer. And the **labels will be start and end position of the token** corresponding to the token inside the context. So, **labels will be index of token where the answer starts and index of the token where the answer ends.**
>
> And the model will be tasked to **predict one start and end logit per token** in the input context.

In [14]:
from transformers import AutoTokenizer

# using bert-base tokenizer to tokenize the text into input IDs that model can make sense of.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [15]:
from datasets import Dataset

## converting train and validation pandas dataset into huggingFace Dataset format.

X_train.reset_index(drop=True,inplace=True)
X_val.reset_index(drop=True,inplace=True)

train_data = Dataset.from_pandas(X_train)
validation_data = Dataset.from_pandas(X_val)

train_data

Dataset({
    features: ['textID', 'text', 'selected_text', 'sentiment'],
    num_rows: 24731
})

### NOTE: We can pass a question and the context together as this tokenizer will properly insert the special tokens to form a sentance that look like this:     
[CLS] question [SEP] context [SEP]

In [16]:
context = train_data[0]["text"]
question = train_data[0]["sentiment"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

'[CLS] positive [SEP] WTF facebook just cleared out my whole survey and i was on the last q, this night gets better and better what else is next? [SEP]'

In [17]:
# function to tokenize each sample.
def preprocess(example):
  return tokenizer(
    example["sentiment"],
    example["text"],
    return_offsets_mapping = True
)

check_dataset = train_data.map(
    preprocess,
    batched=True,
)

check_dataset

  0%|          | 0/25 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'text', 'textID', 'token_type_ids'],
    num_rows: 24731
})

In [18]:
## finding the maximum length of a sequece after tokenization
MAX_LENGTH = 0
for i in check_dataset:
  length = len(i["input_ids"])
  if length > MAX_LENGTH:
    MAX_LENGTH = length
    
print("Max length of a sequence after tokenization: ", MAX_LENGTH)

Max length of a sequence after tokenization:  112


In [20]:
def preprocess_training_examples(examples):
  inputs = tokenizer(
    examples["sentiment"],
    examples["text"],
    max_length = MAX_LENGTH,
    return_offsets_mapping = True,
    padding = "max_length",
    )
  start_positions = []
  end_positions = []

  for i, offset in enumerate(inputs["offset_mapping"]):

    answer = examples["selected_text"][i]
    question = examples["sentiment"][i]
    context = examples["text"][i]
    
    # finding the index of first character and the index of last character of answer in the context(tweet_text)
    
    start_char = 0
    end_char  = 0
    for idx,ch in enumerate(context):
      count = idx
      flag = True
      for j in answer:
        if context[count] == j:
          count +=1
        else:
          flag = False
          break
      if flag:
        start_char = idx
        break

    end_char = start_char + len(answer)

    # finding the start  and end of the context
    token_type_ids = inputs["token_type_ids"][i]

    idx = 0
    try:
      while token_type_ids[idx]!=1:
        idx+=1
      context_start = idx

      while token_type_ids[idx] ==1:
        idx+=1
        if idx == len(token_type_ids):
          break
      context_end = idx-1
    except:
      print(token_type_ids)
    # finding the start position 
    idx = context_start
    while idx <= context_end and offset[idx][0] <= start_char:
      idx+=1
    start_positions.append(idx-1)
    
    idx = context_end -1
    while idx >= context_start and offset[idx][1] >= end_char:
      idx-=1
    end_positions.append(idx+1)
  
  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions
   
  return inputs

In [21]:
processed_train_data = train_data.map(preprocess_training_examples,batched=True)
processed_train_data

  0%|          | 0/25 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'start_positions', 'text', 'textID', 'token_type_ids'],
    num_rows: 24731
})

In [22]:
idx = 2121
answer = processed_train_data[idx]["selected_text"]

start = processed_train_data[idx]["start_positions"]
end = processed_train_data[idx]["end_positions"]
labeled_answer = tokenizer.decode(processed_train_data[idx]["input_ids"][start : end +1 ])

print(f"Theoretical answer: {answer}")
print(f"labels give: {labeled_answer}")

Theoretical answer: bummed
labels give: bummed


In [23]:
processed_val_data = validation_data.map(preprocess_training_examples, batched = True)
processed_val_data

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'start_positions', 'text', 'textID', 'token_type_ids'],
    num_rows: 2749
})

## 3. Fine Tuning Model.

In [24]:
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)


Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

Some layers of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
tf_train_dataset = processed_train_data.to_tf_dataset(
    columns=[
        "input_ids",
        "start_positions",
        "end_positions",
        "attention_mask",
        "token_type_ids",
    ],
    dummy_labels=True,
    shuffle=True,
    batch_size=16,
)


In [27]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

num_train_epochs = 10
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
#tf.keras.mixed_precision.set_global_policy("mixed_float16")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as the 'labels' key of the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [28]:
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

earlyStop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss', min_delta=0, patience=1, verbose=0,
    mode='auto',baseline=None, restore_best_weights=True)

# We're going to do validation afterwards, so no validation mid-training
model.fit(tf_train_dataset, epochs=num_train_epochs,callbacks = [earlyStop_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4de6221490>

In [29]:
model.save_pretrained("bert_base_cased")

## 4. Processing the validation data.

In [31]:
df_test.reset_index(drop=True,inplace=True)

test_data = Dataset.from_pandas(df_test)
test_data

Dataset({
    features: ['textID', 'text', 'sentiment'],
    num_rows: 3534
})

In [32]:
def post_porocess_data(examples):
  questions = examples["sentiment"]
  context = examples["text"]
  inputs = tokenizer(
      questions,
      context,
      max_length = MAX_LENGTH,
      padding="max_length",
      return_offsets_mapping = True,   
  )

  for i in range(len(inputs["input_ids"])):
    offset = inputs["offset_mapping"][i]
    token_type_ids = inputs["token_type_ids"][i]
    inputs["offset_mapping"][i] = [
                                  o if token_type_ids[k] == 1 else None for k, o in enumerate(offset)
    ]
  return inputs

  

In [33]:
processed_val_data = validation_data.map(
    post_porocess_data,
    batched = True
)

processed_val_data

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'text', 'textID', 'token_type_ids'],
    num_rows: 2749
})

In [34]:
tf_eval_dataset = processed_val_data.to_tf_dataset(
    columns=["input_ids", "attention_mask", "token_type_ids"],
    shuffle=False,
    batch_size=16,
)

In [35]:
processed_test_data = test_data.map(
    post_porocess_data,
    batched = True
)

processed_test_data

  0%|          | 0/4 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'offset_mapping', 'sentiment', 'text', 'textID', 'token_type_ids'],
    num_rows: 3534
})

In [36]:
tf_test_dataset = processed_test_data.to_tf_dataset(
    columns=["input_ids", "attention_mask", "token_type_ids"],
    shuffle=False,
    batch_size=16,
)

## 5. Post-Processing.

In [37]:
outputs = model.predict(tf_eval_dataset)


In [38]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits

In [39]:
from tqdm.auto import tqdm
n_best = 20

def predict_answers(start_logits,end_logits, inputs, examples):
    predicted_answers = []
    for i in range(len(examples["textID"])):
        start_logit = start_logits[i]
        end_logit = end_logits[i]
        context = examples["text"][i]

        offset = inputs["offset_mapping"][i]
        start_indexes = np.argsort(start_logit)[-1: -n_best - 1:-1].tolist()
        end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()

        flag = False
        for start_index in start_indexes:
            for end_index in end_indexes:
                # skip answer that are not in the context.
                if offset[start_index] is None or offset[end_index] is None:
                    continue
                # skip answer with length that is either < 0
                if end_index < start_index:
                    continue

                flag = True
                answer = context[offset[start_index][0]: offset[end_index][1]]
                predicted_answers.append(answer)
                break
            if flag:
                break
        if not flag:
            predicted_answers.append(context)
    return predicted_answers


In [40]:
predicted_val_answers = predict_answers(start_logits, end_logits, processed_val_data, validation_data)

In [41]:
len(predicted_val_answers)

2749

In [42]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if (len(a)==0) & (len(b)==0): return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [43]:
theoritcal_answers = processed_val_data["selected_text"]

# calculating the jaccard score
score = 0
for i in range(len(theoritcal_answers)):
  score += jaccard(theoritcal_answers[i],predicted_val_answers[i])


In [44]:
score /= len(theoritcal_answers)
print("Jaccard score on validation data: ", score)

Jaccard score on validation data:  0.6825797196116131


In [None]:
tf_test_dataset

<PrefetchDataset shapes: {attention_mask: (None, None), input_ids: (None, None), token_type_ids: (None, None)}, types: {attention_mask: tf.int64, input_ids: tf.int64, token_type_ids: tf.int64}>

In [None]:
outputs = model.predict(tf_test_dataset)

start_logits = outputs.start_logits
end_logits = outputs.end_logits


In [None]:
predicted_test_answers = predict_answers(start_logits, end_logits, processed_test_data, test_data)

len(predicted_test_answers)

3534

In [None]:
submission_df = pd.DataFrame({"textID":test_data["textID"],"selected_text":predicted_test_answers})
submission_df.head(2)

Unnamed: 0,textID,selected_text
0,f87dea47db,Last session of the day
1,96d74cb729,exciting


In [None]:
submission_df.to_csv("submission_bert_base_cased.csv",index =False)

In [None]:
from prettytable import PrettyTable

x = PrettyTable()

x.field_names = ["Model", "Jaccard Score"]
x.add_row(["Bert-base-cased", 1295, 1158259, 600.5])
x.add_row(["Roberta-base-cased", 5905, 1857594, 1146.4])
x.add_row(["", 112, 120900, 1714.7])