In [3]:
import torch
import transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, metrics

In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.7 kB ? eta -:--:--
     ---------------------------------------- 43.7/43.7 kB ? eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.24.6-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ------- -------------------------------- 1.8/9.5 MB 37.3 MB/s eta 0:00:01
   ------------------- -------------------- 4.6/9.5 MB 49.0 MB/s eta 0:00:01
   ------------------------------- -------- 7.4/9.5 MB 52.1 MB/s eta 0:00:01
   -----------------


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-question-answering-dataset/train-v1.1.json
/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json


In [3]:
import json
data = json.load(open('/kaggle/input/stanford-question-answering-dataset/train-v1.1.json'))

In [4]:
listt = []
for i in data['data']:
    d = {}
    for para in i['paragraphs']:
            context = para['context']
            for qa in para['qas']:
                d['answer'] = qa['answers'][0]['answer_start']
                d['text'] = qa['answers'][0]['text']
                d['question'] = qa['question']
                d['context'] = context
    listt.append(d)


In [5]:
df = pd.DataFrame(listt)

In [6]:
df

Unnamed: 0,answer,text,question,context
0,1232,Jim Wetherbee,Which notable astronaut is known to have atten...,Notre Dame alumni work in various fields. Alum...
1,533,Salma Hayek and Frida Giannini,Who did Beyoncé work with in 2013 on the Chime...,"In December, Beyoncé along with a variety of o..."
2,926,"Great Falls, Lewistown, Cut Bank and Glasgow",Where were air bases built in Montana?,When the U.S. entered World War II on December...
3,995,humanizing a devalued group,What is one preventive effort in circumventing...,Other authors have focused on the structural c...
4,763,"Allan Coukell,",Who is a director at the Pew Charitable Trusts?,Possible improvements include clarification of...
...,...,...,...,...
437,456,establish a restaurant guest's identity and fo...,How could police help the owner when a restaur...,"In contrast, the police are entitled to protec..."
438,57,Ghazals and folk songs,What kind of music does Roshen Ara Begum perform?,"For the popular taste however, light music, pa..."
439,955,being bitten during a fight,How did tyrannosaurs become infected?,Evidence of infection in fossil remains is a s...
440,404,poaching,What else is partly to blame for the declining...,"In contrast, Botswana has recently been forced..."


In [7]:
config = {
    "max_length": 512,
    "model_path": "microsoft/xtremedistil-l6-h256-uncased",
    
    "output_dir": "./my-model",
    "train_batch_size": 64,
    "valid_batch_size": 64,
    "learning_rate": 3e-5,
    "epochs": 300,
    
    "debug": True,
}

In [8]:
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"])
class TextDataset:
    
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return self.data.shape[0]
    
    def preprocess_function(self,question, context, answer_start_char, answer_end_char):
        inputs = tokenizer(
            question,
            context,
            max_length=config["max_length"],
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )

        offset = inputs.pop("offset_mapping") 
        sequence_ids = inputs.sequence_ids()

        context_start, context_end = -1, -1


        # Add logic to find the token indices for context start and context end using `sequence_ids``.
        for index in range(len(sequence_ids)):
            i = sequence_ids[index]
            if context_start == -1:
                if i == 1:
                    context_start = index
            else:
                if i != 1:
                    context_end = index          

        context_offsets = offset[context_start: context_end]

        # Create a mapping of character index to token index.
        character_pos_to_token_pos = {}
        for token_pos, (char_start, char_end) in enumerate(context_offsets):
            token_pos1 = context_start + token_pos
            for i in range(char_start, char_end+1):
                character_pos_to_token_pos[i] = token_pos1

        start_pos = character_pos_to_token_pos.get(answer_start_char, 0)
        end_pos = character_pos_to_token_pos.get(
            answer_end_char - 1, 
            0 if start_pos == 0 else config['max_length'] - 1
        )

        inputs["start_positions"] = start_pos
        inputs["end_positions"] = end_pos

        return inputs
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        question = row['question']
        context = row['context']
        answer_start = row['answer']
        answer_end = answer_start + len(row['text'])
        
        return self.preprocess_function(question, context, answer_start,answer_end)

config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [9]:
dff = TextDataset(df)


In [10]:
def preprocess_function(question, context, answer_start_char, answer_end_char):
    inputs = tokenizer(
        question,
        context,
        max_length=config["max_length"],
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    offset = inputs.pop("offset_mapping") 
    sequence_ids = inputs.sequence_ids()
    
    context_start, context_end = -1, -1
    

    # Add logic to find the token indices for context start and context end using `sequence_ids``.
    for index in range(len(sequence_ids)):
        i = sequence_ids[index]
        print(i)
        if context_start == -1:
            if i == 1:
                context_start = index
        else:
            if i != 1:
                context_end = index          
    
    context_offsets = offset[context_start: context_end]
    
    # Create a mapping of character index to token index.
    character_pos_to_token_pos = {}
    for token_pos, (char_start, char_end) in enumerate(context_offsets):
        token_pos1 = context_start + token_pos
        for i in range(char_start, char_end+1):
            character_pos_to_token_pos[i] = token_pos1
            
    start_pos = charcter_pos_to_token_pos.get(answer_start_char, 0)
    end_pos = charcter_pos_to_token_pos.get(
        answer_end_char - 1, 
        0 if start_pos == 0 else config['max_length'] - 1
    )
        
    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos
        
    return inputs

In [11]:
train, valid = model_selection.train_test_split(
    df,
    test_size=0.2,
    shuffle=True
)

In [12]:
train_ds = TextDataset(train)
valid_ds = TextDataset(valid)

In [13]:
model = transformers.AutoModelForQuestionAnswering.from_pretrained(config["model_path"])

pytorch_model.bin:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import pickle
from datasets import load_metric
import numpy as np

def compute_metrics(eval_pred):
    # Load metrics
    exact_match_metric = load_metric("exact_match")
    f1_metric = load_metric("f1")

    start_logits, end_logits = eval_pred.predictions
    examples = eval_pred.label_ids
    
    # Extract information from examples
    example_ids = examples["example_id"]
    contexts = examples["context"]
    offset_mappings = examples["offset_mapping"]
    ground_truth_answers = examples["answers"]  # Assume this contains the true answers
    
    exact_match = 0
    f1 = 0
    
    for i in range(len(example_ids)):
        # Get the predicted start and end positions
        start_logit = start_logits[i]
        end_logit = end_logits[i]
        
        start_index = np.argmax(start_logit)
        end_index = np.argmax(end_logit)
        
        # Get the offset mapping for this example
        offsets = offset_mappings[i]
        
        # Get the predicted answer span
        pred_start_char = offsets[start_index][0]
        pred_end_char = offsets[end_index][1]
        
        pred_answer = contexts[i][pred_start_char:pred_end_char]
        
        # Get the ground truth answer
        ground_truth_answer = ground_truth_answers[i]
        
        # Compute exact match
        if pred_answer.strip() == ground_truth_answer.strip():
            exact_match += 1
        
        # Compute F1 score
        f1 += compute_f1(pred_answer, ground_truth_answer)
    
    total = len(example_ids)
    avg_exact_match = exact_match / total
    avg_f1 = f1 / total
    
    return {
        "exact_match": avg_exact_match,
        "f1": avg_f1
    }

def compute_f1(pred_answer, ground_truth_answer):
    # Function to compute F1 score
    from sklearn.metrics import f1_score
    
    # Tokenize answers for F1 score calculation
    pred_tokens = pred_answer.split()
    ground_truth_tokens = ground_truth_answer.split()
    
    # Handle cases where there are no tokens
    if not pred_tokens:
        return 0 if ground_truth_tokens else 1
    
    if not ground_truth_tokens:
        return 0
    
    return f1_score(ground_truth_tokens, pred_tokens, average="micro")



training_args = transformers.TrainingArguments(
     output_dir="./results",                      # Directory for storing results
    evaluation_strategy="steps",                 # Evaluate every few steps
    per_device_train_batch_size=config['train_batch_size'],              # Batch size per device during training
    per_device_eval_batch_size=config['train_batch_size'],               # Batch size per device during evaluation
    num_train_epochs=config['epochs'],                          # Total number of training epochs
    warmup_steps=500,                            # Number of warmup steps for learning rate scheduler
    save_total_limit=2,
    logging_dir=None,                            # Disable logging directory
    logging_strategy="no",
    report_to=[]# Limit the total amount of checkpoints`

)



In [15]:
trainer = transformers.Trainer(
    model=model,                                 # The model to be trained
    args=training_args,                          # The training arguments, defined above
    train_dataset=train_ds,                 # The training dataset
    eval_dataset=valid_ds,                   # The evaluation dataset
    tokenizer=tokenizer,                         # The tokenizer
    compute_metrics=compute_metrics, 
)

In [None]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss


  exact_match_metric = load_metric("exact_match")


Downloading builder script:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

In [None]:
results = trainer.evaluate()

# Print evaluation results
print(results)

In [None]:
trainer.save_state()

In [None]:
trainer.save_model()

In [None]:
from transformers import pipeline

# Load the QnA pipeline with a pre-trained model
qna_pipeline = pipeline("question-answering", model="microsoft/xtremedistil-l6-h256-uncased")

# Example input data
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."

# Perform inference
result = qna_pipeline(question=question, context=context)

print(result)
