In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
df_train.head(2)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [3]:
print("Shape of the train data: ", df_train.shape)
print("shape of the test data: ", df_test.shape)


Shape of the train data:  (27481, 4)
shape of the test data:  (3534, 3)


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_val = train_test_split(df_train,test_size=0.1,random_state=42)

print("Shape of the train data: ", X_train.shape)
print("Shape of the validation data: ", X_val.shape)

Shape of the train data:  (24732, 4)
Shape of the validation data:  (2749, 4)


In [5]:
# filling na values with "" 
X_train.dropna(inplace=True)
X_val.dropna(inplace=True)

print("Shape of the train data: ", X_train.shape)
print("Shape of the validation data: ", X_val.shape)

Shape of the train data:  (24731, 4)
Shape of the validation data:  (2749, 4)


In [6]:
from datasets import Dataset

## converting train and validation pandas dataset into huggingFace Dataset format.

X_train.reset_index(drop=True,inplace=True)
X_val.reset_index(drop=True,inplace=True)

train_data = Dataset.from_pandas(X_train)
validation_data = Dataset.from_pandas(X_val)

train_data

Dataset({
    features: ['textID', 'text', 'selected_text', 'sentiment'],
    num_rows: 24731
})

In [7]:
n_best = 20

def predict_answers(inputs):
    predicted_answer = []
    for i in range(len(inputs["offset_mapping"])):
        start_logit = inputs["start_logits"][i]
        end_logit = inputs["end_logits"][i]
        context = inputs["text"][i]
        offset = inputs["offset_mapping"][i]
        start_indexes = np.argsort(start_logit)[-1: -n_best - 1:-1].tolist()
        end_indexes = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()
        
        flag = False
        for start_index in start_indexes:
            for end_index in end_indexes:
                # skip answer that are not in the context.
                if offset[start_index] is None or offset[end_index] is None:
                    continue
                # skip answer with length that is either < 0
                if end_index < start_index:
                    continue
                flag = True
                answer = context[offset[start_index][0]: offset[end_index][1]]
                predicted_answer.append(answer)
                break
            if flag:
                break
        if not flag:
            predicted_answer.append(answer)
    return {"predicted_answer":predicted_answer}

In [8]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if (len(a)==0) & (len(b)==0): return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

## 1. Using Baseline Model: Bert Model

In [16]:
from transformers import AutoTokenizer
from transformers import TFAutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("saved_models/bert_base_cased/bert_tokenizer",local_files_only=True)

model = TFAutoModelForQuestionAnswering.from_pretrained("saved_models/bert_base_cased/bert_model",local_files_only=True)



All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at saved_models/bert_base_cased/bert_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [17]:
MAX_LENGTH = 112

def post_porocess_data(examples):
  questions = examples["sentiment"]
  context = examples["text"]
  inputs = tokenizer(
      questions,
      context,
      max_length = MAX_LENGTH,
      padding="max_length",
      return_offsets_mapping = True,   
  )

  for i in range(len(inputs["input_ids"])):
    offset = inputs["offset_mapping"][i]
    token_type_ids = inputs["token_type_ids"][i]
    inputs["offset_mapping"][i] = [
                                  o if token_type_ids[k] == 1 else None for k, o in enumerate(offset)
    ]
  return inputs

In [18]:
processed_val_data = validation_data.map(
    post_porocess_data,
    batched = True
)

processed_val_data

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'text', 'textID', 'token_type_ids'],
    num_rows: 2749
})

In [19]:
tf_eval_dataset = processed_val_data.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    shuffle=False,
    batch_size=16,
)

In [20]:
outputs = model.predict(tf_eval_dataset)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

In [21]:
processed_val_data.set_format("pandas")
processed_val_df =  processed_val_data[:]

processed_val_df["start_logits"] = start_logits.tolist()
processed_val_df["end_logits"] = end_logits.tolist()

processed_val_df["text"] = validation_data["text"]

bert_final_val_data = Dataset.from_pandas(processed_val_df)

bert_final_val_data = bert_final_val_data.map(predict_answers,batched=True)
bert_final_val_data

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'end_logits', 'input_ids', 'offset_mapping', 'predicted_answer', 'selected_text', 'sentiment', 'start_logits', 'text', 'textID', 'token_type_ids'],
    num_rows: 2749
})

In [22]:
theoritcal_answers = processed_val_data["selected_text"]

# calculating the jaccard score
score = 0
predicted_val_answers_by_bert = bert_final_val_data["predicted_answer"]
for i in range(len(theoritcal_answers)):
  score += jaccard(theoritcal_answers[i],predicted_val_answers_by_bert[i])

score /= len(theoritcal_answers)
print("Jaccard score on validation data: ", score)

Jaccard score on validation data:  0.6768815054600802


## 2. Using the best model: Roberta

In [23]:
tokenizer = AutoTokenizer.from_pretrained("saved_models/roberta-base/roberta_base_tokenizer",local_files_only=True)

model = TFAutoModelForQuestionAnswering.from_pretrained("saved_models/roberta-base/roberta_base",local_files_only=True)


All model checkpoint layers were used when initializing TFRobertaForQuestionAnswering.

All the layers of TFRobertaForQuestionAnswering were initialized from the model checkpoint at saved_models/roberta-base/roberta_base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForQuestionAnswering for predictions without further training.


In [1]:
MAX_LENGTH = 105

def post_porocess_data(examples):
  questions = examples["sentiment"]
  context = examples["text"]
  inputs = tokenizer(
      questions,
      context,
      max_length = MAX_LENGTH,
      padding="max_length",
      return_offsets_mapping = True,   
  )

  for i in range(len(inputs["input_ids"])):
    offset = inputs["offset_mapping"][i]
    sequence_ids = inputs.sequence_ids(i)
    inputs["offset_mapping"][i] = [
                                  o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
    ]
  return inputs

In [25]:
processed_val_data = validation_data.map(
    post_porocess_data,
    batched = True
)
processed_val_data

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'offset_mapping', 'selected_text', 'sentiment', 'text', 'textID'],
    num_rows: 2749
})

In [26]:
tf_eval_dataset = processed_val_data.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    shuffle=False,
    batch_size=16,
)

In [27]:
outputs = model.predict(tf_eval_dataset)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

In [28]:
processed_val_data.set_format("pandas")
processed_val_df =  processed_val_data[:]

processed_val_df["start_logits"] = start_logits.tolist()
processed_val_df["end_logits"] = end_logits.tolist()

processed_val_df["text"] = validation_data["text"]

roberta_final_val_data = Dataset.from_pandas(processed_val_df)

roberta_final_val_data = roberta_final_val_data.map(predict_answers,batched=True)
roberta_final_val_data

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'end_logits', 'input_ids', 'offset_mapping', 'predicted_answer', 'selected_text', 'sentiment', 'start_logits', 'text', 'textID'],
    num_rows: 2749
})

In [29]:
# calculating the jaccard score
score = 0
predicted_val_answers_by_roberta = roberta_final_val_data["predicted_answer"]
for i in range(len(theoritcal_answers)):
  score += jaccard(theoritcal_answers[i],predicted_val_answers_by_roberta[i])

score /= len(theoritcal_answers)
print("Jaccard score on validation data: ", score)

Jaccard score on validation data:  0.7180039345097456


## 3. Comparison on predictions between basline model-bert and the best model roberta.

In [30]:
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Model", "Jaccard score on validation data"]
x.add_row(["Bert-base-cased", 0.67])
x.add_row(["Roberta-base-cased", 0.71])
print(x)

+--------------------+----------------------------------+
|       Model        | Jaccard score on validation data |
+--------------------+----------------------------------+
|  Bert-base-cased   |               0.67               |
| Roberta-base-cased |               0.71               |
+--------------------+----------------------------------+


In [31]:
## prediction by bert 
for i in range(10):
    print("Ground truth result: ",theoritcal_answers[i] )
    print("Prediction by Bert model: ",predicted_val_answers_by_bert[i] )
    print("*"*50)

Ground truth result:  t?  lovelovelove
Prediction by Bert model:  lovelovelove<3
**************************************************
Ground truth result:  resting had a whole day of walking
Prediction by Bert model:  resting had a whole day of walking
**************************************************
Ground truth result:  was in Palawan a couple of days ago, i`ll try to post pictures tom.
Prediction by Bert model:  was in Palawan a couple of days ago, i`ll try to post pictures tom.
**************************************************
Ground truth result:  horrible.
Prediction by Bert model:  its horrible. DON`T TELL ON ME!
**************************************************
Ground truth result:  glad
Prediction by Bert model:  Glad
**************************************************
Ground truth result:  Are the drugs working?
Prediction by Bert model:  Are the drugs working?
**************************************************
Ground truth result:  if not idgaf
Prediction by Bert model:  if

In [32]:
## prediction by roberta 
for i in range(10):
    print("Ground truth result: ",theoritcal_answers[i] )
    print("Prediction by Robert model: ",predicted_val_answers_by_roberta[i] )
    print("*"*50)

Ground truth result:  t?  lovelovelove
Prediction by Robert model:  ?  lovelovelove
**************************************************
Ground truth result:  resting had a whole day of walking
Prediction by Robert model:  resting had a whole day of walking
**************************************************
Ground truth result:  was in Palawan a couple of days ago, i`ll try to post pictures tom.
Prediction by Robert model:  was in Palawan a couple of days ago, i`ll try to post pictures tom.
**************************************************
Ground truth result:  horrible.
Prediction by Robert model:  horrible.
**************************************************
Ground truth result:  glad
Prediction by Robert model:  Glad
**************************************************
Ground truth result:  Are the drugs working?
Prediction by Robert model:  Are the drugs working?
**************************************************
Ground truth result:  if not idgaf
Prediction by Robert model:  deleted
*