In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the competition's train and test CSVs, then preview the first two training rows.
df_train = pd.read_csv("../input/tweet-sentiment-extraction/train.csv")
df_test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
df_train.head(2)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val = train_test_split(df_train,test_size=0.1,random_state=42)

# Report how many rows/columns ended up in each split.
print("Shape of the train data: ", X_train.shape)
print("Shape of the validation data: ", X_val.shape)

In [4]:
# Drop every row that has a missing value in any column (nothing is filled in).
# Reassigning instead of using inplace=True avoids mutating the frames handed
# back by train_test_split in place, which pandas may flag with
# SettingWithCopyWarning, and keeps the cell safe to re-run.
X_train = X_train.dropna()
X_val = X_val.dropna()

# Shapes after the drop, for comparison with the shapes printed right after the split.
print("Shape of the train data: ", X_train.shape)
print("Shape of the validation data: ", X_val.shape)

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the ALBERT base v2 checkpoint; it turns text
# into the input IDs the model consumes. The same checkpoint name is reused when
# the model itself is loaded.
model_checkpoint = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
# Persist the tokenizer to the local directory "alberta_base_tokenizer".
tokenizer.save_pretrained("alberta_base_tokenizer")

In [9]:
from datasets import Dataset

## Convert the train and validation pandas frames into Hugging Face Datasets.

# Discard the old row labels so both frames carry a fresh 0..n-1 index before conversion.
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)

train_data, validation_data = (Dataset.from_pandas(frame) for frame in (X_train, X_val))

train_data

In [10]:
# Question-answering framing: the sentiment label plays the "question" and the tweet is the "context".
context = train_data[0]["text"]
question = train_data[0]["sentiment"]

# Encode the first training example as a (question, context) pair and decode it
# back to text to inspect the resulting sequence.
inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

In [11]:
# Tokenize a batch of (sentiment, text) pairs without padding.
def preprocess(example):
  """Encode sentiment/text pairs and keep the per-token character offsets.

  Args:
    example: batch mapping with "sentiment" and "text" entries.

  Returns:
    Whatever the module-level `tokenizer` returns for the pair, called with
    return_offsets_mapping=True.
  """
  sentiments = example["sentiment"]
  texts = example["text"]
  return tokenizer(sentiments, texts, return_offsets_mapping=True)

# Tokenize the whole training set in batches (no padding). In the cells shown
# here the result is only used to measure tokenized sequence lengths.
check_dataset = train_data.map(
    preprocess,
    batched=True,
)

check_dataset

In [12]:
## Longest tokenized sequence in the training set (0 if the set is empty);
## later cells pass it to the tokenizer as the padding length.
MAX_LENGTH = max((len(row["input_ids"]) for row in check_dataset), default=0)

print("Max length of a sequence after tokenization: ", MAX_LENGTH)

In [13]:
def _find_answer_char_span(context, answer):
  """Return the (start_char, end_char) span of `answer` inside `context`.

  When `answer` does not occur in `context` the span is anchored at 0, which
  is what the original hand-written scan produced when it found no match.
  """
  # str.find replaces a character-by-character scan that indexed past the end
  # of `context` (IndexError) whenever a partial match ran into the end of the text.
  start_char = context.find(answer)
  if start_char < 0:
    start_char = 0
  return start_char, start_char + len(answer)


def _context_token_range(token_type_ids):
  """Return (first, last) indices of the first run of tokens with type id 1."""
  first = next((k for k, t in enumerate(token_type_ids) if t == 1), None)
  if first is None:
    # Previously a bare `except:` printed the ids and carried on with undefined
    # (or stale) bounds; fail loudly instead of writing wrong labels.
    raise ValueError(
        "No token with token_type_id == 1 found; cannot locate the context segment."
    )
  last = first
  while last + 1 < len(token_type_ids) and token_type_ids[last + 1] == 1:
    last += 1
  return first, last


def preprocess_training_examples(examples):
  """Tokenize (sentiment, text) pairs and attach start/end token labels.

  Each example is encoded as the pair (sentiment, text), padded to MAX_LENGTH.
  The character span of `selected_text` inside `text` is then mapped onto
  token indices through the tokenizer's offset mapping.

  Args:
    examples: batch mapping with parallel lists "sentiment", "text" and
      "selected_text".

  Returns:
    The tokenizer output with two extra entries, "start_positions" and
    "end_positions", holding one token index per example.

  Raises:
    ValueError: if an encoded example contains no token with type id 1.
  """
  inputs = tokenizer(
    examples["sentiment"],
    examples["text"],
    max_length = MAX_LENGTH,
    return_offsets_mapping = True,
    padding = "max_length",
    )
  start_positions = []
  end_positions = []

  for i, offset in enumerate(inputs["offset_mapping"]):
    start_char, end_char = _find_answer_char_span(
        examples["text"][i], examples["selected_text"][i]
    )
    context_start, context_end = _context_token_range(inputs["token_type_ids"][i])

    # The last type-1 token is skipped by the end scan; presumably it is the
    # closing separator of the pair encoding, whose offset is not a real text
    # span (TODO confirm for other tokenizers). Use the same bound for the start
    # scan so an answer beginning in the final text token is not labelled on
    # that trailing token.
    last_text_token = max(context_end - 1, context_start)

    # Start label: the last text token whose start offset is <= start_char.
    idx = context_start
    while idx <= last_text_token and offset[idx][0] <= start_char:
      idx += 1
    start_positions.append(max(idx - 1, context_start))

    # End label: the left-most token (scanning from the right) whose end offset
    # still reaches end_char, clamped so it never lands past the text tokens.
    idx = last_text_token
    while idx >= context_start and offset[idx][1] >= end_char:
      idx -= 1
    end_positions.append(min(idx + 1, last_text_token))

  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions

  return inputs

In [14]:
# Tokenize the training set in batches and attach start/end token labels to every example.
processed_train_data = train_data.map(preprocess_training_examples,batched=True)
processed_train_data

In [15]:
# Spot-check one example: decoding the labelled token span should reproduce selected_text.
idx = 20000
sample = processed_train_data[idx]
answer = sample["selected_text"]

start, end = sample["start_positions"], sample["end_positions"]
# The end label is inclusive, hence the +1 on the slice.
labeled_answer = tokenizer.decode(sample["input_ids"][start : end + 1])

print(f"Theoretical answer: {answer}")
print(f"labels give: {labeled_answer}")

In [16]:
# Apply the same labelling preprocessing to the validation split.
# NOTE(review): `processed_val_data` is reassigned by a later cell that maps
# post_porocess_data over validation_data, so this labelled version is not the
# one used for prediction.
processed_val_data = validation_data.map(preprocess_training_examples, batched = True)
processed_val_data

In [17]:
from transformers import TFAutoModelForQuestionAnswering

# Load the TensorFlow question-answering model from the same checkpoint as the tokenizer.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [18]:
# Build shuffled batches of 32 from the labelled training set, keeping only the
# model inputs and the start/end position labels.
# NOTE(review): `dummy_labels` depends on the installed `datasets` version —
# confirm it is still accepted before upgrading the library.
tf_train_dataset = processed_train_data.to_tf_dataset(
    columns=[
        "input_ids",
        "start_positions",
        "end_positions",
        "attention_mask",
        "token_type_ids",
    ],
    dummy_labels=True,
    shuffle=True,
    batch_size=32,
)

In [19]:
from transformers import create_optimizer
import tensorflow as tf

# Total optimizer steps = batches per epoch * number of epochs; the learning-rate
# schedule is sized to this, with no warm-up steps.
num_train_epochs = 10
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.001,
)
# No loss is passed to compile — presumably the model's built-in loss is used; TODO confirm.
model.compile(optimizer=optimizer)

# Mixed-precision (float16) training is currently disabled; uncomment the line below to enable it.
#tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [20]:
import tensorflow as tf

# Early stopping watches the training loss (no validation data is passed to fit):
# with patience=1, training stops after one epoch without improvement, and
# restore_best_weights=True asks Keras to roll back to the best epoch's weights.
earlyStop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss', min_delta=0, patience=1, verbose=0,
    mode='auto',baseline=None, restore_best_weights=True
    )
# Validation is done in later cells, so none is run during training.
model.fit(tf_train_dataset, epochs=num_train_epochs,callbacks = [earlyStop_callback])

In [21]:
# Persist the fine-tuned model to the local directory "alberta_model".
model.save_pretrained("alberta_model")

In [22]:
def post_porocess_data(examples):
  """Tokenize (sentiment, text) pairs for inference and mask non-context offsets.

  The pairs are padded to MAX_LENGTH. In the returned offset mapping every
  token whose sequence id is not 1 has its offset replaced by None, so that
  only tokens of the second sequence (the text) keep a character span.

  Args:
    examples: batch mapping with parallel lists "sentiment" and "text".

  Returns:
    The tokenizer output with the masked "offset_mapping".
  """
  encoded = tokenizer(
      examples["sentiment"],
      examples["text"],
      max_length = MAX_LENGTH,
      padding="max_length",
      return_offsets_mapping = True,
  )

  n_rows = len(encoded["input_ids"])
  for row in range(n_rows):
    spans = encoded["offset_mapping"][row]
    seq_ids = encoded.sequence_ids(row)
    masked = []
    for pos, span in enumerate(spans):
      masked.append(span if seq_ids[pos] == 1 else None)
    encoded["offset_mapping"][row] = masked
  return encoded

In [23]:
# Re-tokenize the validation split for inference (offsets masked outside the text).
# This replaces the labelled `processed_val_data` built in an earlier cell.
processed_val_data = validation_data.map(
    post_porocess_data,
    batched = True
)

processed_val_data

In [24]:
# Evaluation batches of 16 with model inputs only. shuffle=False so the
# predictions come back in the same row order as processed_val_data.
tf_eval_dataset = processed_val_data.to_tf_dataset(
    columns=["input_ids", "attention_mask","token_type_ids"],
    shuffle=False,
    batch_size=16
)

In [25]:
# Give the test frame a fresh 0..n-1 index before converting it to a Dataset.
df_test.reset_index(drop=True,inplace=True)

test_data = Dataset.from_pandas(df_test)

# Same inference-time tokenization as the validation split.
processed_test_data = test_data.map(
    post_porocess_data,
    batched = True
)

processed_test_data

In [26]:
# Run the fine-tuned model over the validation batches.
outputs = model.predict(tf_eval_dataset)

# Start and end logits from the model output; later cells store them one row per validation example.
start_logits = outputs.start_logits
end_logits = outputs.end_logits

In [27]:
# Number of top-scoring start and end token candidates considered per example.
n_best = 20

def predict_answers(inputs):
    """Turn start/end logits into answer strings for a batch of examples.

    For every example the `n_best` highest-scoring start indices and end
    indices are paired up (start candidates in the outer loop, both in
    descending logit order). The first pair whose two tokens both have an
    offset (i.e. lie in the text) and whose end is not before its start is
    decoded through the character offsets.

    If no pair qualifies, the example's full text is returned. Previously the
    code appended `answer`, which was unbound on the first example
    (UnboundLocalError) or still held the previous example's prediction.

    Args:
        inputs: batch mapping with parallel lists "offset_mapping" (per-token
            character spans, None for tokens outside the text),
            "start_logits", "end_logits" and "text".

    Returns:
        {"predicted_answer": [...]} with one string per example.
    """
    predicted_answer = []
    for i in range(len(inputs["offset_mapping"])):
        context = inputs["text"][i]
        offset = inputs["offset_mapping"][i]
        # Indices of the n_best largest logits, highest first.
        start_indexes = np.argsort(inputs["start_logits"][i])[-1: -n_best - 1:-1].tolist()
        end_indexes = np.argsort(inputs["end_logits"][i])[-1: -n_best - 1: -1].tolist()

        # Lazily enumerate valid spans in the same order as the original nested loops.
        candidates = (
            context[offset[start_index][0]: offset[end_index][1]]
            for start_index in start_indexes
            for end_index in end_indexes
            # skip tokens outside the text and spans of negative length
            if offset[start_index] is not None
            and offset[end_index] is not None
            and end_index >= start_index
        )
        # Take the first valid span, or fall back to this example's own text.
        predicted_answer.append(next(candidates, context))
    return {"predicted_answer":predicted_answer}

In [30]:
# Number of validation examples.
len(validation_data["text"])

In [34]:
# Switch the dataset's output format to pandas and pull all rows out with a full slice.
processed_val_data.set_format("pandas")

processed_val_df =  processed_val_data[:]
# Attach the model's logits row by row; this relies on the predictions being in
# the same order as the dataset rows (the evaluation batches were built with shuffle=False).
processed_val_df["start_logits"] = start_logits.tolist()
processed_val_df["end_logits"] = end_logits.tolist()

# (Re)attach the raw tweet text that predict_answers slices the answer from.
processed_val_df["text"] = validation_data["text"]

processed_val_df.head(2)

In [36]:
# Back to a Hugging Face Dataset so predict_answers can be mapped over it in batches.
final_val_data = Dataset.from_pandas(processed_val_df)

# Adds a "predicted_answer" column with one extracted string per validation example.
final_val_data = final_val_data.map(predict_answers,batched=True)
final_val_data

In [37]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |shared words| / |union of words|. When both strings contain no words
    the score is defined as 0.5.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)

# Ground-truth spans and model predictions, aligned by row position.
theoritcal_answers = processed_val_data["selected_text"]
predicted_val_answers = final_val_data["predicted_answer"]

# Mean word-level Jaccard score over the validation set.
n_examples = len(theoritcal_answers)
score = sum(
    jaccard(theoritcal_answers[i], predicted_val_answers[i]) for i in range(n_examples)
) / n_examples
print("Jaccard score on validation data: ", score)