In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

!apt install git-lfs

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m307.2/542.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting xx

In [None]:
import pandas as pd
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Windowing configuration passed to the tokenizer call in preprocess_examples.
max_length = 384  # maximum number of tokens per tokenized window
stride = 128      # value forwarded as the tokenizer's `stride` for overflow windows

# Read the raw CSV and rename its columns to the names the preprocessing expects.
dataset_path = '/data/train.csv'  # Update with the actual path to your dataset
df = pd.read_csv(dataset_path).rename(
    columns={'Question': 'question', 'Answer': 'context'}  # Adjust column names as necessary
)

# Tokenizer belonging to the checkpoint that is fine-tuned further down.
model_checkpoint = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_examples(examples):
    """Tokenize a batch of question/context pairs and label the answer span.

    The whole context is treated as the answer to its question, so the target
    span runs from the first to the last context token. Long contexts are split
    into overlapping windows (``max_length`` / ``stride``); a window that does
    not contain both the first and the last character of the context is
    labelled (0, 0).

    Relies on the notebook-level ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: Batch mapping with 'question' and 'context' columns, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (without 'offset_mapping' and
        'overflow_to_sample_mapping') extended with 'start_positions' and
        'end_positions', one entry per tokenized window.
    """
    questions = [str(q) for q in examples['question']]  # Ensure questions are treated as strings
    contexts = [str(c) for c in examples['context']]    # Ensure contexts are treated as strings

    tokenized_inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_inputs.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Use the exact string that was tokenized: the offsets index into it,
        # and lowercasing could change its length for some characters.
        context = contexts[sample_mapping[i]]

        # The entire context is the answer.
        start_char = 0
        end_char = len(context)

        # Offsets of question tokens are relative to the question string, so
        # only tokens belonging to the second sequence (the context) may be
        # matched against context character positions.
        sequence_ids = tokenized_inputs.sequence_ids(i)

        start_pos = None
        end_pos = None

        for idx, (token_start, token_end) in enumerate(offsets):
            if sequence_ids[idx] != 1:
                continue  # special token, padding, or question token
            if token_start <= start_char < token_end:
                start_pos = idx
            if token_start < end_char <= token_end:
                end_pos = idx

        # The window does not hold the complete answer: fall back to (0, 0).
        if start_pos is None or end_pos is None:
            start_pos = 0
            end_pos = 0

        start_positions.append(start_pos)
        end_positions.append(end_pos)

    tokenized_inputs.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })

    return tokenized_inputs


from datasets import DatasetDict

# Convert the DataFrame to a Hugging Face dataset and tokenize it. The raw
# columns are removed so only the tokenizer outputs and span labels remain.
dataset = Dataset.from_pandas(df)
dataset = dataset.map(preprocess_examples, batched=True, remove_columns=dataset.column_names)

# Hold out 10% of the tokenized windows for validation. The fixed seed keeps
# the split identical across kernel restarts.
# NOTE(review): the split happens after windowing, so windows that come from
# the same source row can end up on both sides of the split.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})

# Load the checkpoint once, right before it is handed to the Trainer.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="no"  # no checkpoints are written during training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],  # Pass the validation dataset for evaluation
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Prediction example function
def predict_answer(question, context):
    """Extract the answer to ``question`` from ``context`` with the trained model.

    Uses the notebook-level ``model``, ``tokenizer`` and ``max_length``.

    Args:
        question: Question text.
        context: Passage the answer is extracted from.

    Returns:
        The decoded text between the most likely start token and the most
        likely end token (inclusive). Empty when the predicted end precedes
        the predicted start.
    """
    # Local import: the cell that defines this function does not import torch.
    import torch

    inputs = tokenizer(question, context, return_tensors="pt", max_length=max_length, truncation=True)

    # After training the model may live on the GPU while the tokenizer returns
    # CPU tensors; feeding them directly raises a device-mismatch RuntimeError.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    model.eval()  # disable dropout so predictions are deterministic
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # Get the most likely beginning of answer with the argmax of the score
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1  # +1 makes the slice end inclusive

    # Convert tokens to answer
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
    return answer

# Example usage: smoke-test the model on a hand-written question/context pair.
# This runs in the same cell as trainer.train(), so predict_answer sees the
# notebook-level `model` and `tokenizer` as they are right after training.
sample_question = "What is the main theme?"
sample_context = "The main theme of the text is about the challenges of space exploration."
predicted_answer = predict_answer(sample_question, sample_context)
print("Predicted Answer:", predicted_answer)




Map:   0%|          | 0/503 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,0.206779
2,No log,0.342818
3,No log,0.20603


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
import torch

def predict_answer(question, context, model, tokenizer):
    """Extract the answer to ``question`` from ``context``.

    The model is moved to the GPU when one is available (otherwise the CPU)
    and switched to evaluation mode. Inputs are truncated to the
    notebook-level ``max_length``.

    Args:
        question: Question text.
        context: Passage the answer is extracted from.
        model: Question-answering model exposing ``start_logits`` and
            ``end_logits`` on its output.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded text between the most likely start token and the most
        likely end token (inclusive). Empty when the predicted end precedes
        the predicted start.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # A model that has just been trained may still be in training mode;
    # evaluation mode disables dropout so the prediction is deterministic.
    model.eval()

    # A single example needs no padding. Leaving it out also keeps the argmax
    # below from ever landing on a pad position.
    inputs = tokenizer(
        question, context, add_special_tokens=True, return_tensors="pt",
        max_length=max_length, truncation=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1  # +1 makes the slice end inclusive

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
    return answer

# Example usage: query the model with a medical question and a matching
# passage. `model` and `tokenizer` are the notebook-level objects, passed in
# explicitly to the four-argument predict_answer defined in this cell.
sample_question = "Who is at risk for Lymphocytic Choriomeningitis (LCM)?"
sample_context = "LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents. Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation."
predicted_answer = predict_answer(sample_question, sample_context, model, tokenizer)
print("Predicted Answer:", predicted_answer)

Predicted Answer: LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents.


In [None]:
!pip install PyMuPDF transformers


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.2-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.1 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.2 PyMuPDFb-1.24.1


In [None]:
!pip install PyMuPDF transformers




In [None]:
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Function to extract text from the entire PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A single string holding the text of all pages.
    """
    document = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        return ''.join(page.get_text() for page in document)
    finally:
        # Release the file handle even if text extraction fails on some page.
        document.close()

# Function to chunk text into manageable parts
def chunk_text(text, max_length=3000, overlap=0):
    """Split ``text`` into consecutive character chunks.

    Args:
        text: String to split.
        max_length: Maximum number of characters per chunk (must be positive).
        overlap: Number of characters each chunk shares with the previous
            one. With the default of 0 the chunks are disjoint. A positive
            overlap keeps a passage that straddles a chunk boundary intact in
            at least one chunk, as long as it is no longer than ``overlap``.

    Returns:
        List of chunks in order; empty for empty ``text``.

    Raises:
        ValueError: If ``max_length`` is not positive or ``overlap`` is not in
            ``[0, max_length)``.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    step = max_length - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + max_length])
        # Stop once the end of the text is covered; with an overlap the next
        # start would only yield a fragment already contained in this chunk.
        if start + max_length >= len(text):
            break
    return chunks

def find_answer_in_chunks(question, chunks, model, tokenizer):
    """Run extractive QA over every chunk and return the best-scoring answer.

    Each chunk is scored by the sum of its highest start logit and highest
    end logit; the decoded span of the best-scoring chunk is returned.

    Args:
        question: Question text.
        chunks: Iterable of context strings.
        model: Question-answering model exposing ``start_logits`` and
            ``end_logits`` on its output.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Decoded text of the best span, or "" when ``chunks`` is empty or no
        chunk yields a usable span.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Determine if we are using GPU or CPU
    model.to(device)  # Move the model to the appropriate device
    model.eval()  # make sure dropout is off during inference
    best_answer = ""
    highest_score = float('-inf')

    for chunk in chunks:
        # Encode the question and the chunk of text. No padding is needed for
        # a single example, and without it the argmax cannot pick a pad token.
        # NOTE(review): chunk text beyond 512 tokens is truncated away here;
        # confirm the chunk size used by the caller fits this budget.
        inputs = tokenizer(question, chunk, add_special_tokens=True, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Ensure all input tensors are moved to the correct device

        with torch.no_grad():
            outputs = model(**inputs)

        start_logits = outputs.start_logits[0]
        end_logits = outputs.end_logits[0]

        # Most likely first and last token (inclusive) of the answer.
        start_idx = int(torch.argmax(start_logits))
        end_idx = int(torch.argmax(end_logits))

        if end_idx < start_idx:
            continue  # inverted span: nothing to decode
        if start_idx == 0 and end_idx == 0:
            # SQuAD 2.0-style models point both heads at the first token to
            # signal "no answer in this passage"; do not let that win.
            continue

        # Score with the logits of the selected tokens themselves. Indexing
        # the end logits at end_idx + 1 reads the wrong token and runs past
        # the sequence when the best end token is the last one.
        score = float(start_logits[start_idx] + end_logits[end_idx])

        if score > highest_score:
            highest_score = score
            tokens = inputs['input_ids'][0, start_idx:end_idx + 1]
            best_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(tokens))

    return best_answer


# Main execution block: answer one question from the text of a PDF.
if __name__ == '__main__':
    # Load tokenizer and model from Hugging Face.
    # NOTE: this rebinds the notebook-level `model` and `tokenizer` to the
    # checkpoint as published, replacing whatever model an earlier cell left
    # in those names.
    model_checkpoint = "deepset/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

    # Specify the PDF file path and extract text
    pdf_path = '/content/Medical_Chatbot.pdf'
    extracted_text = extract_text_from_pdf(pdf_path)

    # Break the extracted text into chunks (chunk_text's default size is used)
    chunks = chunk_text(extracted_text)

    # Example: Ask a question and find the answer
    user_question = "What allergic rhinitis"
    best_answer = find_answer_in_chunks(user_question, chunks, model, tokenizer)

    print("Best Answer:", best_answer)



Best Answer:  seasonal and
perennial


In [None]:
# Main execution block: (re)load the model and the PDF chunks without asking
# a question. It repeats the loading steps of the previous cell and leaves
# `model`, `tokenizer`, `extracted_text` and `chunks` bound for later cells.
if __name__ == '__main__':
    # Load tokenizer and model from Hugging Face
    model_checkpoint = "deepset/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

    # Specify the PDF file path and extract text
    pdf_path = '/content/Medical_Chatbot.pdf'
    extracted_text = extract_text_from_pdf(pdf_path)

    # Break the extracted text into chunks (chunk_text's default size is used)
    chunks = chunk_text(extracted_text)



Best Answer: Intact D&X


In [None]:
    # Example: Ask a question and find the answer.
    # Uses `chunks`, `model` and `tokenizer` bound by an earlier cell.
    # NOTE(review): these lines are indented as if they still sat inside an
    # `if __name__ == '__main__':` block, but no such header is present in
    # this cell — presumably the notebook front end dedents cell input;
    # consider removing the indentation.
    user_question = "How to diagnose Parasites - Cysticercosis"
    best_answer = find_answer_in_chunks(user_question, chunks, model, tokenizer)

    print("Best Answer:", best_answer)

Best Answer:  to make
normal breasts larger for cosmetic purposes


In [None]:
# Preview only the beginning of the extracted text; printing the whole
# document would flood the notebook output.
print("Extracted Text:", extracted_text[:500])  # Print first 500 characters of the extracted text
