In [54]:
import pandas as pd

In [61]:
train_data = pd.read_json('raw_data/webis-clickbait-22/train.jsonl',lines=True)
validate_data = pd.read_json('raw_data/webis-clickbait-22/validation.jsonl',lines=True)


In [63]:
#filter only phrase tags for Q&A task
train_data['tags']=train_data['tags'].apply(lambda x:x[0])
train_data=train_data[train_data['tags']=='phrase'].reset_index(drop=True)

#filter only phrase tags for Q&A task
validate_data['tags']=validate_data['tags'].apply(lambda x:x[0])
validate_data=validate_data[validate_data['tags']=='phrase'].reset_index(drop=True)

In [73]:
# data processing to get context and spoiler start position
def create_df(dataset):

    col1=[]
    col2=[]

    for idx in range(len(dataset)):
        concatlen=0
        line_occuring=dataset['spoilerPositions'][idx][0][0][0]

        for i in range(line_occuring):
            concatlen+=len(dataset['targetParagraphs'][idx][i])


        starting_position=concatlen+dataset['spoilerPositions'][idx][0][0][1]

        col1.append({'answer_start': [starting_position], 'text': dataset['spoiler'][idx]})


        l=''
        for line in dataset['targetParagraphs'][idx]:
            l+=line
        
        col2.append(l)

    df = pd.DataFrame(list(zip(col1, col2)),
    columns =['answers', 'context'])
    df['question']=dataset['postText'].apply(lambda x: x[0])
    df['id']=dataset['postId']

    json_data=[]
    for index, row in df.iterrows():
        json_data.append({
            'question': row['question'],
            'context': row['context'],
            'answers': row['answers']
        })

    return json_data





In [76]:
import json

train_json=create_df(train_data)
validate_json=create_df(validate_data)

with open('train_json.json', 'w') as f:
    json.dump(train_json, f)

with open('validate_json.json', 'w') as f:
    json.dump(validate_json, f)

In [93]:
from datasets import load_dataset, DatasetDict
# Load the JSON file as a Dataset object
train_dataset = load_dataset('json', data_files='train_json.json')
validate_dataset = load_dataset('json', data_files='validate_json.json')



Found cached dataset json (C:/Users/MUTHUKUMAR S/.cache/huggingface/datasets/json/default-e53caa348637767f/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
100%|██████████| 1/1 [00:00<00:00, 501.41it/s]
Found cached dataset json (C:/Users/MUTHUKUMAR S/.cache/huggingface/datasets/json/default-ab100f04a2129cf5/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
100%|██████████| 1/1 [00:00<00:00, 500.75it/s]


In [94]:
# Create a DatasetDict object containing the Dataset

dataset_dict = DatasetDict({'train': train_dataset['train'],'validate':validate_dataset['train']})


In [95]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'answers'],
        num_rows: 1367
    })
    validate: Dataset({
        features: ['question', 'context', 'answers'],
        num_rows: 335
    })
})

In [96]:
from transformers import AutoTokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [97]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [99]:
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [101]:
features = prepare_train_features(dataset_dict['train'][:5])


In [102]:
tokenized_datasets = dataset_dict.map(prepare_train_features, batched=True, remove_columns=dataset_dict["train"].column_names)




# TO DO


In [103]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [105]:
batch_size=16

In [106]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [107]:
from transformers import default_data_collator

data_collator = default_data_collator

In [108]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core).
Your token has been saved to C:\Users\MUTHUKUMAR S\.cache\huggingface\token
Login successful


In [111]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validate"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

e:\USC\spring23\nlp\NLP-Final-BERT\Clickbait-BERT\distilbert-base-uncased-finetuned-squad is already a clone of https://huggingface.co/Lahen/distilbert-base-uncased-finetuned-squad. Make sure you pull the latest changes with `repo.git_pull()`.
  0%|          | 0/1104 [3:53:14<?, ?it/s]


In [114]:
trainer.train()


  0%|          | 0/150 [02:19<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 72.00 MiB (GPU 0; 4.00 GiB total capacity; 3.35 GiB already allocated; 0 bytes free; 3.42 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.save_model("test-squad-trained")
