In [1]:
!pip install transformers



In [2]:
!pip install accelerate -U



In [3]:
!pip install torch transformers bitsandbytes accelerate



In [4]:
!pip install --quiet bitsandbytes
!pip install --quiet --upgrade transformers
!pip install --quiet --upgrade accelerate
!pip install --quiet sentencepiece

In [5]:
!pip install optuna



In [6]:
!pip install datasets



In [7]:
import numpy as np
import pandas as pd
from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from datasets import Dataset, DatasetDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix

In [10]:
import pandas as pd
df = pd.read_csv('questions_answers_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Filename,context,question,answer
0,0,1426261.json,Cricket match played on 2024-04-09 at city Mo...,Who has played the match?,Sunrisers Hyderabad and Punjab Kings
1,1,1426261.json,Cricket match played on 2024-04-09 at city Mo...,Who won the match?,Sunrisers Hyderabad
2,2,1426261.json,Cricket match played on 2024-04-09 at city Mo...,Who won the player of the match?,Nithish Kumar Reddy
3,3,1426261.json,Cricket match played on 2024-04-09 at city Mo...,Who has scorred most runs in the match?,Nithish Kumar Reddy
4,4,1426261.json,Cricket match played on 2024-04-09 at city Mo...,How many runs has scorred by PJ Cummins?,3


In [11]:
df.shape

(8703, 5)

In [12]:
df.columns

Index(['Unnamed: 0', 'Filename', 'context', 'question', 'answer'], dtype='object')

In [13]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.shape

(8703, 4)

In [14]:
df.drop('Filename', axis=1, inplace=True)
df.shape

(8703, 3)

In [15]:
len(df['context'][3])

1091

In [16]:
df.head(2)

Unnamed: 0,context,question,answer
0,Cricket match played on 2024-04-09 at city Mo...,Who has played the match?,Sunrisers Hyderabad and Punjab Kings
1,Cricket match played on 2024-04-09 at city Mo...,Who won the match?,Sunrisers Hyderabad


In [17]:
dataset = Dataset.from_pandas(df)

In [18]:
dataset

Dataset({
    features: ['context', 'question', 'answer'],
    num_rows: 8703
})

In [19]:
train_df = df.sample(frac=0.8, random_state=42)
eval_df = df.drop(train_df.index)

train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'eval': eval_dataset
})

In [20]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer', '__index_level_0__'],
        num_rows: 6962
    })
    eval: Dataset({
        features: ['context', 'question', 'answer', '__index_level_0__'],
        num_rows: 1741
    })
})

In [21]:
model_name = "distilbert-base-uncased"

In [22]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# from transformers import LongformerTokenizer, LongformerForQuestionAnswering

# model_name = "allenai/longformer-base-4096"
# tokenizer = LongformerTokenizer.from_pretrained(model_name)
# model = LongformerForQuestionAnswering.from_pretrained(model_name)

In [24]:
def split_into_chunks(context, max_length, overlap):
    tokens = tokenizer.tokenize(context)
    chunks = []
    for i in range(0, len(tokens), max_length - overlap):
        chunk = tokens[i:i + max_length]
        chunks.append(tokenizer.convert_tokens_to_string(chunk))
        if i + max_length >= len(tokens):
            break
    return chunks

In [25]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples['question']]
    contexts = [c.strip() for c in examples['context']]
    answers = [{'text': a, 'answer_start': c.find(a)} for a, c in zip(examples['answer'], contexts)]

    tokenized_examples = {
        'input_ids': [],
        'attention_mask': [],
        'start_positions': [],
        'end_positions': []
    }

    max_length = 512
    overlap = 50

    for i in range(len(questions)):
        question = questions[i]
        context = contexts[i]
        answer = answers[i]

        chunks = split_into_chunks(context, max_length=max_length, overlap=overlap)

        for chunk in chunks:
            encoded = tokenizer.encode_plus(question, chunk, max_length=max_length, truncation=True, padding='max_length', return_offsets_mapping=True)
            start_char = context.find(answer['text'])
            end_char = start_char + len(answer['text'])

            offset_mapping = encoded['offset_mapping']
            start_position = None
            end_position = None
            for idx, (start, end) in enumerate(offset_mapping):
                if start <= start_char < end:
                    start_position = idx
                if start < end_char <= end:
                    end_position = idx
                    break

            if start_position is not None and end_position is not None:
                tokenized_examples['input_ids'].append(encoded['input_ids'])
                tokenized_examples['attention_mask'].append(encoded['attention_mask'])
                tokenized_examples['start_positions'].append(start_position)
                tokenized_examples['end_positions'].append(end_position)

    return tokenized_examples

In [26]:
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True, remove_columns=dataset_dict['train'].column_names)
tokenized_datasets

Map:   0%|          | 0/6962 [00:00<?, ? examples/s]

Map:   0%|          | 0/1741 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 3763
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 954
    })
})

In [27]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
import numpy as np

def compute_metrics(p):
    start_preds = np.argmax(p.predictions[0], axis=1)
    end_preds = np.argmax(p.predictions[1], axis=1)

    start_true = p.label_ids[0]
    end_true = p.label_ids[1]

    precision, recall, f1, _ = precision_recall_fscore_support(
        np.concatenate((start_true, end_true)),
        np.concatenate((start_preds, end_preds)),
        average='macro'
    )
    acc = accuracy_score(
        np.concatenate((start_true, end_true)),
        np.concatenate((start_preds, end_preds))
    )

    start_conf_matrix = confusion_matrix(start_true, start_preds)
    end_conf_matrix = confusion_matrix(end_true, end_preds)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [28]:
import tempfile
temp_output_dir = tempfile.mkdtemp()

In [29]:
training_args = TrainingArguments(
    output_dir=temp_output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    compute_metrics=compute_metrics
)



In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,5.9984,5.737653,0.115828,0.030742,0.031875,0.025321
2,4.2468,3.773542,0.219602,0.024127,0.051471,0.031442
3,2.7764,2.510608,0.430294,0.177999,0.1819,0.163906


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=177, training_loss=4.69492785135905, metrics={'train_runtime': 531.5154, 'train_samples_per_second': 21.239, 'train_steps_per_second': 0.333, 'total_flos': 1474942846261248.0, 'train_loss': 4.69492785135905, 'epoch': 3.0})

In [31]:
eval_results = trainer.evaluate()
eval_results

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.510608196258545,
 'eval_accuracy': 0.430293501048218,
 'eval_precision': 0.17799904632405134,
 'eval_recall': 0.1819000649816704,
 'eval_f1': 0.16390648627295698,
 'eval_runtime': 14.4691,
 'eval_samples_per_second': 65.934,
 'eval_steps_per_second': 4.147,
 'epoch': 3.0}

In [32]:
first_model = "question_answer_model"
model.save_pretrained(first_model)
tokenizer.save_pretrained(first_model)

('question_answer_model/tokenizer_config.json',
 'question_answer_model/special_tokens_map.json',
 'question_answer_model/vocab.txt',
 'question_answer_model/added_tokens.json',
 'question_answer_model/tokenizer.json')

In [33]:
import shutil
from IPython.display import FileLink

shutil.make_archive(first_model, 'zip', first_model)
display(FileLink(f'{first_model}.zip'))

In [None]:
#Checking Results

In [34]:
from transformers import pipeline
qa_pipeline_context = pipeline('question-answering', model=first_model, tokenizer=first_model)

In [38]:
context = " Cricket match played on 2024-04-09 at city Mohali between Sunrisers Hyderabad (SH) and Punjab Kings (PK), toss is won by Punjab Kings and they have decided to field Winner of the match is Sunrisers Hyderabad they won by 2 runs, and Player of the match is Nithish Kumar Reddy.. First Inning is played by Sunrisers Hyderabad and they have set the target of 183. First Inning batsman has scorred runs as follows:  TM Head has scored 21 runs,  Abhishek Sharma has scored 16 runs,  AK Markram has scored 0 runs,  Nithish Kumar Reddy has scored 64 runs,  RA Tripathi has scored 11 runs,  H Klaasen has scored 9 runs,  Abdul Samad has scored 25 runs,  Shahbaz Ahmed has scored 14 runs,  PJ Cummins has scored 3 runs,  B Kumar has scored 6 runs,  JD Unadkat has scored 6 runs, . Second Inning batsman has scorred runs as follows:  S Dhawan has scored 14 runs,  JM Bairstow has scored 0 runs,  P Simran Singh has scored 4 runs,  SM Curran has scored 29 runs,  Sikandar Raza has scored 28 runs,  Shashank Singh has scored 46 runs,  JM Sharma has scored 19 runs,  Ashutosh Sharma has scored 33 runs, ."

In [43]:
question = "Who won the player of the match?"
result = qa_pipeline_context({'question': question, 'context': context})
result

{'score': 0.08385574072599411,
 'start': 247,
 'end': 263,
 'answer': 'match is Nithish'}

In [44]:
result['answer']

'match is Nithish'

### Retrivial Model

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

In [46]:
vectorizer = TfidfVectorizer().fit(df['context'])
context_vectors = vectorizer.transform(df['context'])

In [47]:
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(context_vectors, 'context_vectors.pkl')

['context_vectors.pkl']

In [48]:
model_2 = AutoModelForQuestionAnswering.from_pretrained(first_model)
tokenizer_2 = AutoTokenizer.from_pretrained(first_model)

In [49]:
from transformers import pipeline
qa_pipeline = pipeline('question-answering', model=model_2, tokenizer=tokenizer_2)

In [50]:
vectorizer = joblib.load('tfidf_vectorizer.pkl')
context_vectors = joblib.load('context_vectors.pkl')

In [51]:
def retrieve_context(question, top_n=1):
    question_vector = vectorizer.transform([question])
    similarities = cosine_similarity(question_vector, context_vectors).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    return [df['context'].iloc[i] for i in top_indices]

def get_answer(question, context):
    result = qa_pipeline({'question': question, 'context': context})
    return result['answer']

def answer_question(question):
    retrieved_contexts = retrieve_context(question)
    if not retrieved_contexts:
        return "No relevant context found."
    answer = get_answer(question, retrieved_contexts[0])
    return answer

In [52]:
question = "Who won the player of the match?"
answer = answer_question(question)
answer

'between Delhi Daredevils'

### Hyper parameter tuning

In [58]:
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import ParameterSampler
import numpy as np


param_grid = {
    'learning_rate': [1e-5, 3e-5, 5e-5, 7e-5],
    'per_device_train_batch_size': [8, 16, 32, 64],
    'num_train_epochs': [3, 4, 5],
    'weight_decay': [0.0, 0.01, 0.1],
    'warmup_steps': [0, 100, 500],
    'adam_epsilon': [1e-8, 1e-7, 1e-6]
}

In [59]:
param_combinations = list(ParameterSampler(param_grid, n_iter=20, random_state=42))

In [60]:
param_combinations

[{'weight_decay': 0.01,
  'warmup_steps': 100,
  'per_device_train_batch_size': 32,
  'num_train_epochs': 4,
  'learning_rate': 3e-05,
  'adam_epsilon': 1e-07},
 {'weight_decay': 0.01,
  'warmup_steps': 100,
  'per_device_train_batch_size': 32,
  'num_train_epochs': 3,
  'learning_rate': 7e-05,
  'adam_epsilon': 1e-08},
 {'weight_decay': 0.01,
  'warmup_steps': 500,
  'per_device_train_batch_size': 32,
  'num_train_epochs': 5,
  'learning_rate': 5e-05,
  'adam_epsilon': 1e-07},
 {'weight_decay': 0.0,
  'warmup_steps': 100,
  'per_device_train_batch_size': 32,
  'num_train_epochs': 4,
  'learning_rate': 3e-05,
  'adam_epsilon': 1e-06},
 {'weight_decay': 0.0,
  'warmup_steps': 500,
  'per_device_train_batch_size': 8,
  'num_train_epochs': 5,
  'learning_rate': 5e-05,
  'adam_epsilon': 1e-06},
 {'weight_decay': 0.1,
  'warmup_steps': 100,
  'per_device_train_batch_size': 16,
  'num_train_epochs': 3,
  'learning_rate': 3e-05,
  'adam_epsilon': 1e-06},
 {'weight_decay': 0.01,
  'warmup_step

In [61]:
import tempfile
temp_output_dir = tempfile.mkdtemp()

In [62]:
def hyperparameter_tuning(params):
    training_args = TrainingArguments(
        output_dir=temp_output_dir,
        evaluation_strategy="epoch",
        learning_rate=params['learning_rate'],
        per_device_train_batch_size=params['per_device_train_batch_size'],
        per_device_eval_batch_size=params['per_device_train_batch_size'],
        num_train_epochs=params['num_train_epochs'],
        weight_decay=params['weight_decay'],
        warmup_steps=params['warmup_steps'],
        adam_epsilon=params['adam_epsilon'],
        logging_dir='./logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['eval'],
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results['eval_f1']


In [None]:
best_f1 = 0
best_params = None

for params in param_combinations:
    f1 = hyperparameter_tuning(params)
    print(f"Params: {params}, F1: {f1}")
    if f1 > best_f1:
        best_f1 = f1
        best_params = params

print(f"Best Params: {best_params}, Best F1: {best_f1}")



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7057,1.459476,0.609015,0.358123,0.368742,0.33975
2,1.0306,1.020572,0.713836,0.441898,0.459515,0.42725
3,0.7383,0.825727,0.756289,0.523328,0.529623,0.508965
4,0.6914,0.75189,0.79717,0.561534,0.570452,0.551645


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.01, 'warmup_steps': 100, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'learning_rate': 3e-05, 'adam_epsilon': 1e-07}, F1: 0.5516446688831579


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7176,0.788159,0.768344,0.548906,0.53785,0.524898
2,0.526,0.642309,0.843291,0.641804,0.646092,0.627247
3,0.3015,0.658559,0.853249,0.696977,0.665135,0.665603


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.01, 'warmup_steps': 100, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'learning_rate': 7e-05, 'adam_epsilon': 1e-08}, F1: 0.6656033581496726


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.189,0.692263,0.851153,0.692106,0.659164,0.65618
2,0.2341,0.82892,0.839099,0.660995,0.629522,0.62882
3,0.3029,0.75982,0.842243,0.671958,0.639213,0.6361
4,0.3571,0.618037,0.860063,0.666878,0.647773,0.641803
5,0.2347,0.697728,0.863732,0.714787,0.692403,0.686751


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.01, 'warmup_steps': 500, 'per_device_train_batch_size': 32, 'num_train_epochs': 5, 'learning_rate': 5e-05, 'adam_epsilon': 1e-07}, F1: 0.6867506755002364


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1202,0.894466,0.854298,0.694627,0.675751,0.663542
2,0.0933,0.99997,0.861635,0.717358,0.680686,0.677008
3,0.0569,1.030588,0.871593,0.704513,0.681494,0.673031
4,0.0185,1.03328,0.874214,0.725355,0.696708,0.692529


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.0, 'warmup_steps': 100, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'learning_rate': 3e-05, 'adam_epsilon': 1e-06}, F1: 0.6925292245049892


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2304,0.866747,0.857442,0.645962,0.617261,0.609541
2,0.3724,0.954626,0.868973,0.64323,0.653827,0.628222
3,0.0708,1.128837,0.875262,0.692752,0.655992,0.654275
4,0.0854,1.10279,0.883124,0.712428,0.684075,0.677527
5,0.0267,1.17911,0.887841,0.733439,0.699005,0.699488


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.0, 'warmup_steps': 500, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'learning_rate': 5e-05, 'adam_epsilon': 1e-06}, F1: 0.6994877264933655


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0471,1.03027,0.880503,0.712081,0.684167,0.680794
2,0.048,1.215953,0.880503,0.735832,0.683585,0.689485
3,0.0027,1.242429,0.886268,0.73092,0.691563,0.69282


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.1, 'warmup_steps': 100, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'adam_epsilon': 1e-06}, F1: 0.6928201001018514


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0005,1.321223,0.885744,0.736921,0.704192,0.69979
2,0.0031,1.27482,0.882075,0.736377,0.691827,0.693434
3,0.0003,1.313568,0.886268,0.724691,0.690303,0.687567
4,0.0019,1.299516,0.886792,0.743588,0.695934,0.699263


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.01, 'warmup_steps': 0, 'per_device_train_batch_size': 16, 'num_train_epochs': 4, 'learning_rate': 1e-05, 'adam_epsilon': 1e-07}, F1: 0.6992629486990901


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0001,1.335956,0.890461,0.746768,0.699152,0.703418
2,0.0562,1.291952,0.870021,0.699897,0.657789,0.657232
3,0.0143,1.431532,0.879455,0.69746,0.664858,0.66216
4,0.0215,1.289768,0.887317,0.729678,0.710872,0.699028


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.1, 'warmup_steps': 500, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'learning_rate': 1e-05, 'adam_epsilon': 1e-06}, F1: 0.68818622080592


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0588,1.339324,0.885744,0.723913,0.695801,0.691421
2,0.0028,1.355672,0.886268,0.711817,0.685759,0.681633
3,0.0001,1.358498,0.888365,0.721941,0.686557,0.686968
4,0.0,1.376077,0.885744,0.722752,0.687066,0.686103
5,0.0001,1.373269,0.885744,0.726206,0.687499,0.686601


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.01, 'warmup_steps': 100, 'per_device_train_batch_size': 8, 'num_train_epochs': 5, 'learning_rate': 1e-05, 'adam_epsilon': 1e-08}, F1: 0.6866010019679454


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0787,1.391382,0.884696,0.706149,0.680198,0.67444
2,0.1196,1.330507,0.892558,0.712687,0.701426,0.690614
3,0.0424,1.367736,0.883648,0.706896,0.676748,0.67911
4,0.0094,1.384613,0.881551,0.727996,0.693925,0.692851
5,0.005,1.351433,0.890985,0.724339,0.699094,0.695206


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Params: {'weight_decay': 0.01, 'warmup_steps': 500, 'per_device_train_batch_size': 16, 'num_train_epochs': 5, 'learning_rate': 5e-05, 'adam_epsilon': 1e-07}, F1: 0.695206274075355


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [64]:
best_params

{'weight_decay': 0.0,
 'warmup_steps': 500,
 'per_device_train_batch_size': 8,
 'num_train_epochs': 5,
 'learning_rate': 5e-05,
 'adam_epsilon': 1e-06}

In [65]:
training_args = TrainingArguments(
    output_dir=temp_output_dir,
    evaluation_strategy="epoch",
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=best_params['per_device_train_batch_size'],
    num_train_epochs=best_params['num_train_epochs'],
    weight_decay=best_params['weight_decay'],
    warmup_steps=best_params['warmup_steps'],
    adam_epsilon=best_params['adam_epsilon'],
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    compute_metrics=compute_metrics
)



In [66]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0017,1.702566,0.870021,0.676278,0.643162,0.640692
2,0.174,1.431884,0.868449,0.695807,0.646748,0.652862
3,0.0417,1.504858,0.88522,0.728639,0.696555,0.692899
4,0.0001,1.527325,0.887841,0.708209,0.69063,0.67798
5,0.0003,1.587592,0.888365,0.72353,0.694559,0.689488


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=2355, training_loss=0.04829610299107609, metrics={'train_runtime': 949.8197, 'train_samples_per_second': 19.809, 'train_steps_per_second': 2.479, 'total_flos': 2458238077102080.0, 'train_loss': 0.04829610299107609, 'epoch': 5.0})

In [67]:
eval_results = trainer.evaluate()
eval_results

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.5875920057296753,
 'eval_accuracy': 0.8883647798742138,
 'eval_precision': 0.7235303439720586,
 'eval_recall': 0.6945594087253724,
 'eval_f1': 0.6894881633359431,
 'eval_runtime': 14.2854,
 'eval_samples_per_second': 66.782,
 'eval_steps_per_second': 8.4,
 'epoch': 5.0}

In [68]:
final_model_name = "question_and_answer_from_document"

In [69]:
model.save_pretrained(final_model_name)
tokenizer.save_pretrained(final_model_name)

('question_and_answer_from_document/tokenizer_config.json',
 'question_and_answer_from_document/special_tokens_map.json',
 'question_and_answer_from_document/vocab.txt',
 'question_and_answer_from_document/added_tokens.json',
 'question_and_answer_from_document/tokenizer.json')

In [70]:
import shutil
from IPython.display import FileLink

shutil.make_archive(final_model_name, 'zip', final_model_name)
display(FileLink(f'{final_model_name}.zip'))

##Answer from context

In [71]:
from transformers import pipeline
qa_pipeline_context = pipeline('question-answering', model=final_model_name, tokenizer=final_model_name)

In [72]:
context = "Indian Premier League's match number 23 of season 2023 played on 2023-04-16 in the stadium Narendra Modi Stadium, Ahmedabad at city Ahmedabad between Gujarat Titans (GT) and Rajasthan Royals (RR), toss is won by Rajasthan Royals and they have decided to field. Players for Gujarat Titans (GT) are Noor Ahmad, WP Saha, Shubman Gill, B Sai Sudharsan, HH Pandya, DA Miller, A Manohar, R Tewatia, Rashid Khan, AS Joseph, Mohammed Shami, MM Sharma and players for Rajasthan Royals (RR) are YS Chahal, YBK Jaiswal, JC Buttler, D Padikkal, SV Samson, R Parag, SO Hetmyer, Dhruv Jurel, R Ashwin, TA Boult, Sandeep Sharma, A Zampa. Winner of the match is Rajasthan Royals they won by 3 wickets, and Player of the match is SO Hetmyer. First Inning is played by Gujarat Titans and they have set the target of 178. First Inning batsman has scorred runs as follows:  WP Saha has scored 4 runs,  B Sai Sudharsan has scored 20 runs,  Shubman Gill has scored 45 runs,  HH Pandya has scored 28 runs,  DA Miller has scored 46 runs,  A Manohar has scored 27 runs,  Rashid Khan has scored 1 runs,  R Tewatia has scored 1 runs, . Second Inning batsman has scorred runs as follows:  YBK Jaiswal has scored 1 runs,  JC Buttler has scored 0 runs,  D Padikkal has scored 26 runs,  SV Samson has scored 60 runs,  R Parag has scored 5 runs,  SO Hetmyer has scored 56 runs,  Dhruv Jurel has scored 18 runs,  R Ashwin has scored 10 runs,  TA Boult has scored 0 runs, ."

In [73]:
question = "Who won the player of the match?"
result = qa_pipeline_context({'question': question, 'context': context})
result

{'score': 0.10221756249666214, 'start': 281, 'end': 287, 'answer': 'Titans'}

In [74]:
result['answer']

'Titans'

### Retrive answers

In [75]:
final_model = AutoModelForQuestionAnswering.from_pretrained(final_model_name)
final_tokenizer = AutoTokenizer.from_pretrained(final_model_name)

In [76]:
qa_pipeline = pipeline('question-answering', model=final_model, tokenizer=final_tokenizer)

In [77]:
vectorizer = joblib.load('tfidf_vectorizer.pkl')
context_vectors = joblib.load('context_vectors.pkl')

In [78]:
def retrieve_context(question, top_n=1):
    question_vector = vectorizer.transform([question])
    similarities = cosine_similarity(question_vector, context_vectors).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    return [df['context'].iloc[i] for i in top_indices]

def get_answer(question, context):
    result = qa_pipeline({'question': question, 'context': context})
    return result['answer']

def answer_question(question):
    retrieved_contexts = retrieve_context(question)
    if not retrieved_contexts:
        return "No relevant context found."
    answer = get_answer(question, retrieved_contexts[0])
    return answer

In [79]:
question = "How many runs has scorred by WP Sahaon 2022-04-27, where the match has played in between Sunrisers Hyderabad and Gujarat Titans?"
answer = answer_question(question)
answer

'68'

In [80]:
question = "Who played the match on 2024-04-09?"
answer = answer_question(question)
answer

'between Sunrisers Hyderabad'

In [81]:
question = "Who won the match between Punjab Kings and Mumbai Indians played on 2022-04-13?"
answer = answer_question(question)
answer

') and Punjab'

In [82]:
question = "Who won the player of the match between Punjab Kings and Mumbai Indians played on 2022-04-13?"
answer = answer_question(question)
answer

'match is KL'

In [83]:
question = "Who played the match on 2024-04-23?"
answer = answer_question(question)
answer

'between Mumbai Indians'

In [84]:
question = "How many runs has scorred by Q de Kockon 2021-09-19, where the match has played in between Chennai Super Kings and Mumbai Indians?"
answer = answer_question(question)
answer

'runs'

In [85]:
question = "Who won the match between Chennai Super Kings and Lucknow Super Giants played on 2022-03-31?"
answer = answer_question(question)
answer

') and Chennai Super'

In [86]:
question = "Who has scorred most runs in the match between Kolkata Knight Riders and Delhi Capitals played on 2021-04-29?"
answer = answer_question(question)
answer

'match is'

In [87]:
question = "Who played the match on 2023-04-16?"
answer = answer_question(question)
answer

'target'