In [11]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git



# Question answering

In [26]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [15]:
df = pd.read_csv("/content/output_curator.csv")

In [19]:
df = df[["question", "context", "answer"]]

In [28]:
# Split the DataFrame into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict
data = DatasetDict({"train": train_dataset, "test": test_dataset})

# Verify the DatasetDict
print(data)

DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'answer'],
        num_rows: 9
    })
    test: Dataset({
        features: ['question', 'context', 'answer'],
        num_rows: 3
    })
})


In [29]:
data["train"][0]

{'question': 'What is the target year for climate commitment?',
 'context': 'We continue to work towards delivering on our Net Carbon Footprint ambition to cut the intensity of the greenhouse gas emissions of the energy products we sell by about 50% by 2050, and 20% by 2035 compared to our 2016 levels, in step with society as it moves towards meeting the goals of the Paris Agreement. In 2019, we set shorter-term targets for 2021 of 2-3% lower than our 2016 baseline Net Carbon Footprint. In early 2020, we set a Net Carbon Footprint target for 2022 of 3-4% lower than our 2016 baseline. We will continue to evolve our approach over time.',
 'answer': '2050'}

There are several important fields here:

- `answers`: the starting location of the answer token and the answer text.
- `context`: background information from which the model needs to extract the answer.
- `question`: the question a model should answer.

## Preprocess

In [30]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [46]:
example = data["train"][0]
example

{'question': 'What is the target year for climate commitment?',
 'context': 'We continue to work towards delivering on our Net Carbon Footprint ambition to cut the intensity of the greenhouse gas emissions of the energy products we sell by about 50% by 2050, and 20% by 2035 compared to our 2016 levels, in step with society as it moves towards meeting the goals of the Paris Agreement. In 2019, we set shorter-term targets for 2021 of 2-3% lower than our 2016 baseline Net Carbon Footprint. In early 2020, we set a Net Carbon Footprint target for 2022 of 3-4% lower than our 2016 baseline. We will continue to evolve our approach over time.',
 'answer': '2050'}

In [None]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer


def preprocess_function(examples):
    questions = examples["question"]
    contexts = examples["context"]
    answers = examples["answer"]

    # Tokenize questions and contexts
    tokenized_inputs = tokenizer(
        questions, contexts, max_length=512, truncation=True, padding="max_length"
    )

    # Initialize lists to hold start and end positions
    start_positions = []
    end_positions = []

    # Loop through each example
    for i in range(len(questions)):
        # Get the answer text
        answer = answers[i]
        answer_start = contexts[i].find(answer)

        if answer_start == -1:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_positions.append(
                tokenizer.encode(
                    contexts[i][:answer_start], add_special_tokens=False
                ).__len__()
            )
            end_positions.append(
                tokenizer.encode(
                    contexts[i][: answer_start + len(answer)], add_special_tokens=False
                ).__len__()
                - 1
            )

    tokenized_inputs.update(
        {"start_positions": start_positions, "end_positions": end_positions}
    )

    return tokenized_inputs


# Apply the preprocessing function to the dataset
processed_datasets = data.map(preprocess_function, batched=True)

# Remove columns that are not needed
processed_datasets = processed_datasets.remove_columns(
    ["question", "context", "answer"]
)

# Verify the processed dataset
print(processed_datasets)

In [50]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

## Train

In [10]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    logging_dir="logs",  # Directory for logs
    logging_steps=10,  # Log every 10 steps
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,6.079496
2,No log,6.034035
3,No log,6.011786


TrainOutput(global_step=3, training_loss=6.016239166259766, metrics={'train_runtime': 156.7107, 'train_samples_per_second': 0.172, 'train_steps_per_second': 0.019, 'total_flos': 3527633700864.0, 'train_loss': 6.016239166259766, 'epoch': 3.0})

## Evaluate

In [None]:
eval_result = trainer.evaluate(processed_datasets["test"])
print("Evaluation results:")
for key, value in eval_result.items():
    print(f"{key}: {value}")

In [None]:
from transformers import Trainer
import numpy as np

# Predict labels for the evaluation dataset
predictions = trainer.predict(processed_datasets["test"])
start_logits = predictions.predictions[0]  # Start logits
end_logits = predictions.predictions[1]  # End logits

# Convert logits to start and end positions
predicted_starts = np.argmax(start_logits, axis=1)
predicted_ends = np.argmax(end_logits, axis=1)

# Extract true start and end positions from the dataset
true_starts = np.array(
    [example["start_positions"] for example in processed_datasets["test"]]
)
true_ends = np.array(
    [example["end_positions"] for example in processed_datasets["test"]]
)

# Calculate accuracy (you might want a different metric depending on your needs)
accuracy = np.mean((predicted_starts == true_starts) & (predicted_ends == true_ends))
print("Accuracy:", accuracy)

# Print inputs along with predicted and true answer spans
for i in range(len(processed_datasets["test"])):
    eva_data = processed_datasets["test"][i]
    input_ids = eva_data["input_ids"]
    true_start = true_starts[i]
    true_end = true_ends[i]
    predicted_start = predicted_starts[i]
    predicted_end = predicted_ends[i]

    input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    predicted_answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[predicted_start : predicted_end + 1])
    )
    true_answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[true_start : true_end + 1])
    )

    print(f"Input: {input_text}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    print()

# Save the model and tokenizer
model.save_pretrained("my_awesome_qa_model")
tokenizer.save_pretrained("my_awesome_qa_model")