In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [12]:
# Read the competition's test and train CSVs, then preview the first two training rows.
df_test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
df_train = pd.read_csv("../input/tweet-sentiment-extraction/train.csv")
df_train.head(2)

In [13]:
# Report the (rows, columns) shape of each raw split.
for caption, frame in (
    ("Shape of the train data: ", df_train),
    ("shape of the test data: ", df_test),
):
    print(caption, frame.shape)


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val = train_test_split(df_train, random_state=42, test_size=0.1)

for caption, frame in (
    ("Shape of the train data: ", X_train),
    ("Shape of the validation data: ", X_val),
):
    print(caption, frame.shape)

In [15]:
# Drop every row that has a missing value in any column (nothing is filled in).
# Reassigning instead of using inplace=True keeps this cell safe to re-run and
# avoids the SettingWithCopyWarning pandas can raise when mutating frames that
# were sliced out of another frame by train_test_split.
X_train = X_train.dropna()
X_val = X_val.dropna()

print("Shape of the train data: ", X_train.shape)
print("Shape of the validation data: ", X_val.shape)

In [16]:
from datasets import Dataset

## Convert the train and validation pandas frames into Hugging Face Dataset objects.

# Renumber both frames from 0 first, discarding the old (post-split) index.
for frame in (X_train, X_val):
    frame.reset_index(drop=True, inplace=True)

validation_data = Dataset.from_pandas(X_val)
train_data = Dataset.from_pandas(X_train)

train_data

In [27]:
n_best = 20

def predict_answers(inputs):
    """Decode one answer span per example from start/end logits.

    Parameters
    ----------
    inputs : mapping of column name -> list, one entry per example, with keys
        "start_logits" / "end_logits" : per-token scores,
        "offset_mapping" : per-token (char_start, char_end) pairs, or None for
            tokens that must not be part of an answer,
        "text" : the string the character offsets index into.

    Returns
    -------
    dict
        {"predicted_answer": [str, ...]} with exactly one string per example.

    Notes
    -----
    Candidates are the `n_best` highest-scoring start tokens crossed with the
    `n_best` highest-scoring end tokens, visited in descending score order
    (starts in the outer loop). The first pair whose tokens both have offsets
    and whose end is not before its start is taken.

    If no pair qualifies, the full text is returned. Previously that branch
    appended the `answer` local, which was either unbound (UnboundLocalError on
    the first example) or left over from an earlier example in the batch.
    """
    predicted_answer = []
    for i in range(len(inputs["offset_mapping"])):
        context = inputs["text"][i]
        offset = inputs["offset_mapping"][i]
        # Token indices ordered from highest to lowest logit, capped at n_best.
        start_indexes = np.argsort(inputs["start_logits"][i])[-1: -n_best - 1: -1].tolist()
        end_indexes = np.argsort(inputs["end_logits"][i])[-1: -n_best - 1: -1].tolist()

        # Per-example fallback: the whole text, never a value from another example.
        answer = context
        found = False
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip candidates that touch a masked (non-context) token.
                if offset[start_index] is None or offset[end_index] is None:
                    continue
                # Skip spans that would have negative length.
                if end_index < start_index:
                    continue
                answer = context[offset[start_index][0]: offset[end_index][1]]
                found = True
                break
            if found:
                break
        predicted_answer.append(answer)
    return {"predicted_answer": predicted_answer}

In [18]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace into sets of words.
    Returns |intersection| / |union| as a float, or 0.5 when both strings
    contain no words at all.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = words1 & words2
    union_size = len(words1) + len(words2) - len(shared)
    return float(len(shared)) / union_size

## 1. Using Baseline Model: Bert Model

In [19]:
from transformers import AutoTokenizer
from transformers import TFAutoModelForQuestionAnswering

# Baseline checkpoint: tokenizer and question-answering model, both loaded
# strictly from local input directories (no download attempted).
tokenizer = AutoTokenizer.from_pretrained(
    "../input/tokenizerbert",
    local_files_only=True,
)
model = TFAutoModelForQuestionAnswering.from_pretrained(
    "../input/bert-base-qa-model",
    local_files_only=True,
)



In [20]:
MAX_LENGTH = 112

def post_porocess_data(examples):
    """Tokenise (sentiment, text) pairs and mask offsets outside the tweet text.

    Parameters
    ----------
    examples : batch mapping with "sentiment" and "text" columns; the sentiment
        is passed as the first sequence (question), the text as the second
        (context).

    Returns
    -------
    The tokenizer output, padded to MAX_LENGTH, whose "offset_mapping" keeps
    the (char_start, char_end) pair only for tokens of the second sequence and
    holds None everywhere else.

    Notes
    -----
    * The mask is built from `sequence_ids(i)`, which is None for special and
      padding tokens. The previous `token_type_ids == 1` test also kept the
      separator that closes the context, so it could be picked as an answer
      boundary.
    * `truncation="only_second"` keeps every row at exactly MAX_LENGTH tokens by
      trimming the text if a pair is too long; without it, `max_length` with
      `padding="max_length"` only pads and long rows come out longer.
    """
    inputs = tokenizer(
        examples["sentiment"],
        examples["text"],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation="only_second",
        return_offsets_mapping=True,
    )

    for i in range(len(inputs["input_ids"])):
        sequence_ids = inputs.sequence_ids(i)
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None
            for k, o in enumerate(inputs["offset_mapping"][i])
        ]
    return inputs

In [21]:
# Tokenise the validation split batch-by-batch with the function defined above.
processed_val_data = validation_data.map(post_porocess_data, batched=True)

processed_val_data

In [22]:
# Batched, unshuffled tf.data view so prediction order matches the row order.
# NOTE(review): only input_ids and attention_mask are forwarded; the
# token_type_ids produced by the tokenizer are not — confirm the checkpoint
# was fine-tuned the same way.
tf_eval_dataset = processed_val_data.to_tf_dataset(
    batch_size=16,
    shuffle=False,
    columns=["input_ids", "attention_mask"],
)

In [23]:
# Run inference over the whole validation set and pull out both logit arrays.
outputs = model.predict(tf_eval_dataset)
start_logits, end_logits = outputs.start_logits, outputs.end_logits

In [28]:
# Switch the dataset to pandas output so the logits can be attached as columns.
processed_val_data.set_format("pandas")
processed_val_df = processed_val_data[:].assign(
    start_logits=start_logits.tolist(),
    end_logits=end_logits.tolist(),
    text=validation_data["text"],
)

# Back to a Dataset, then decode one answer span per row.
bert_final_val_data = Dataset.from_pandas(processed_val_df).map(
    predict_answers, batched=True
)
bert_final_val_data

In [40]:
# Ground-truth spans and the baseline model's predictions, row-aligned.
theoritcal_answers = processed_val_data["selected_text"]
predicted_val_answers_by_bert = bert_final_val_data["predicted_answer"]

# Mean word-level Jaccard over the validation set.
score = sum(
    jaccard(truth, guess)
    for truth, guess in zip(theoritcal_answers, predicted_val_answers_by_bert)
) / len(theoritcal_answers)
print("Jaccard score on validation data: ", score)

## 2. Using the best model: Roberta

In [30]:
# Second checkpoint: rebinds `tokenizer` and `model` to the RoBERTa artefacts,
# again loaded strictly from local input directories.
tokenizer = AutoTokenizer.from_pretrained(
    "../input/robertatokenizer",
    local_files_only=True,
)
model = TFAutoModelForQuestionAnswering.from_pretrained(
    "../input/robertamodel",
    local_files_only=True,
)


In [31]:
MAX_LENGTH = 73

def post_porocess_data(examples):
    """Tokenise (sentiment, text) pairs and mask offsets outside the tweet text.

    This definition replaces the earlier function of the same name and reads
    the `tokenizer` and `MAX_LENGTH` that are current when it is called.

    Parameters
    ----------
    examples : batch mapping with "sentiment" and "text" columns; the sentiment
        is passed as the first sequence (question), the text as the second
        (context).

    Returns
    -------
    The tokenizer output, padded to MAX_LENGTH, whose "offset_mapping" keeps
    the (char_start, char_end) pair only for tokens whose sequence id is 1 and
    holds None everywhere else.

    Notes
    -----
    `truncation="only_second"` keeps every row at exactly MAX_LENGTH tokens by
    trimming the text if a pair is too long; without it, `max_length` with
    `padding="max_length"` only pads, so a long tweet would yield a longer row
    than the rest of the batch.
    """
    inputs = tokenizer(
        examples["sentiment"],
        examples["text"],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation="only_second",
        return_offsets_mapping=True,
    )

    for i in range(len(inputs["input_ids"])):
        sequence_ids = inputs.sequence_ids(i)
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None
            for k, o in enumerate(inputs["offset_mapping"][i])
        ]
    return inputs

In [32]:
# Re-tokenise the validation split with the tokenizer and function now in scope.
processed_val_data = validation_data.map(post_porocess_data, batched=True)
processed_val_data

In [33]:
# Batched, unshuffled tf.data view so prediction order matches the row order.
tf_eval_dataset = processed_val_data.to_tf_dataset(
    batch_size=16,
    shuffle=False,
    columns=["input_ids", "attention_mask"],
)

In [34]:
# Run inference over the whole validation set and pull out both logit arrays.
outputs = model.predict(tf_eval_dataset)
start_logits, end_logits = outputs.start_logits, outputs.end_logits

In [35]:
# Switch the dataset to pandas output so the logits can be attached as columns.
processed_val_data.set_format("pandas")
processed_val_df = processed_val_data[:].assign(
    start_logits=start_logits.tolist(),
    end_logits=end_logits.tolist(),
    text=validation_data["text"],
)

# Back to a Dataset, then decode one answer span per row.
roberta_final_val_data = Dataset.from_pandas(processed_val_df).map(
    predict_answers, batched=True
)
roberta_final_val_data

In [41]:
# Mean word-level Jaccard of the second model's predictions against the same
# ground-truth spans used for the baseline.
predicted_val_answers_by_roberta = roberta_final_val_data["predicted_answer"]
score = sum(
    jaccard(truth, guess)
    for truth, guess in zip(theoritcal_answers, predicted_val_answers_by_roberta)
) / len(theoritcal_answers)
print("Jaccard score on validation data: ", score)

## 3. Comparison of predictions between the baseline model (BERT) and the best model (RoBERTa)

In [47]:
from prettytable import PrettyTable
def _mean_jaccard(truths, predictions):
    """Average word-level Jaccard between paired ground-truth and predicted spans."""
    return sum(jaccard(t, p) for t, p in zip(truths, predictions)) / len(truths)

# Scores are recomputed from the predictions held in memory (rounded to two
# decimals) rather than typed in by hand, so the table cannot drift out of
# sync with the numbers printed by the scoring cells.
x = PrettyTable()
x.field_names = ["Model", "Jaccard score on validation data"]
x.add_row([
    "Bert-base-cased",
    round(_mean_jaccard(theoritcal_answers, predicted_val_answers_by_bert), 2),
])
x.add_row([
    "Roberta-base-cased",
    round(_mean_jaccard(theoritcal_answers, predicted_val_answers_by_roberta), 2),
])
print(x)

In [45]:
## First ten validation rows: ground truth next to the BERT prediction.
for idx in range(10):
    print("Ground truth result: ", theoritcal_answers[idx])
    print("Prediction by Bert model: ", predicted_val_answers_by_bert[idx])
    print("*" * 50)

In [46]:
## First ten validation rows: ground truth next to the RoBERTa prediction.
for idx in range(10):
    print("Ground truth result: ", theoritcal_answers[idx])
    print("Prediction by Robert model: ", predicted_val_answers_by_roberta[idx])
    print("*" * 50)