In [127]:
!pip install PyPDF2 pdfplumber pytesseract pillow nltk




In [128]:
# Import required libraries
import PyPDF2
import pdfplumber
import pytesseract
from PIL import Image
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [129]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [130]:
# Function to extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

In [131]:
# Function to extract text from a scanned PDF or image using OCR (Tesseract)
def extract_text_from_scanned_pdf(pdf_file):
    text = ""
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Extract the page image
            img = page.to_image()
            img = img.original
            # Convert to PIL image
            pil_img = Image.fromarray(img)
            # Use Tesseract to do OCR on the image
            page_text = pytesseract.image_to_string(pil_img)
            text += page_text
    return text

In [132]:
# Function to preprocess the extracted text
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    return words

In [133]:
# Example usage for extracting text from a standard PDF
pdf_file = 'Think-And-Grow-Rich.pdf'
pdf_text = extract_text_from_pdf(pdf_file)

In [134]:
# Preprocess the text
preprocessed_text = preprocess_text(pdf_text)

In [135]:

# Show how the preprocessed text looks like
print(preprocessed_text[:100])

['think', 'grow', 'rich', 'free', 'digital', 'download', 'pdf', 'ebook', 'edition', 'republished', 'wwwthinkandgrowrichebookcom', 'think', 'grow', 'rich', 'legal', 'notice', 'disclaimer', 'digital', 'download', 'pdf', 'ebook', 'edition', 'related', 'web', 'site', 'prepared', 'approved', 'licensed', 'endorsed', 'sponsored', 'otherwise', 'affiliated', 'napoleon', 'hill', 'family', 'heirs', 'napoleon', 'hill', 'foundation', 'ralston', 'society', 'past', 'present', 'publishers', 'book', 'web', 'site', 'ebook', 'dedicated', 'classic', 'work', 'think', 'grow', 'rich', 'written', 'napoleon', 'hill', '1937', 'electronic', 'ebook', 'edition', 'published', '2007', 'thinkandgrowrichebookcom', 'reproduction', 'complete', '1937', 'version', 'originally', 'published', 'ralston', 'society', 'public', 'domain', 'think', 'grow', 'rich', 'registered', 'trademark', 'property', 'napoleon', 'hill', 'foundation', 'book', 'title', 'think', 'grow', 'rich', 'used', 'digital', 'ebook', 'related', 'web', 'site',

# Fine-Tuning BERT for Masked Language Modeling (MLM)

In [136]:
!pip install transformers torch datasets



In [137]:
# Import required libraries
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
from datasets import Dataset
import torch

In [138]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [139]:
# Prepare the preprocessed text data for MLM (Masked Language Modeling)
preprocessed_text = [" ".join(preprocessed_text)]

In [140]:
# Create a dataset from the preprocessed text
def tokenize_function(examples):
    # Tokenize the input text
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    # Create labels identical to input_ids
    tokenized_inputs['labels'] = tokenized_inputs['input_ids'].copy()

    return tokenized_inputs

In [141]:
dataset = Dataset.from_dict({"text": preprocessed_text})


In [142]:
# If the dataset has more than one sample, split it; otherwise, use the entire dataset for training
if len(dataset) > 1:
    train_test_split = dataset.train_test_split(test_size=0.1)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']
else:
    train_dataset = dataset
    eval_dataset = None  # No evaluation dataset if we only have one sample


In [143]:
# Map the tokenization function to the dataset
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
if eval_dataset:
    tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [144]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch" if eval_dataset else "no",  # Disable evaluation if there's no eval dataset
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [145]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset if eval_dataset else None,  # Only provide eval dataset if it exists
    tokenizer=tokenizer,
)

In [146]:
# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=3, training_loss=0.45960354804992676, metrics={'train_runtime': 45.112, 'train_samples_per_second': 0.067, 'train_steps_per_second': 0.067, 'total_flos': 197403609600.0, 'train_loss': 0.45960354804992676, 'epoch': 3.0})

In [147]:
# Save the trained model
model.save_pretrained("./trained_bert_mlm")
tokenizer.save_pretrained("./trained_bert_mlm")

('./trained_bert_mlm/tokenizer_config.json',
 './trained_bert_mlm/special_tokens_map.json',
 './trained_bert_mlm/vocab.txt',
 './trained_bert_mlm/added_tokens.json')

# Use Pre-trained QA Model

In [148]:
!pip install transformers torch



In [149]:
# Import required libraries
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

In [150]:
# Load pre-trained QA model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [151]:
# Example context (replace with your preprocessed text or a relevant passage)
context = """Think and Grow Rich is a book written by Napoleon Hill. It was published in 1937 and is considered a personal development and self-help book."""

In [152]:
# Function to get the answer from the model
def get_answer(question, context):
    # Encode the inputs
    inputs = tokenizer(question, context, return_tensors="pt")

    # Get the model's output
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the answer by taking the argmax of the start and end logits
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Decode the answer from the input IDs
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs.input_ids[0][answer_start:answer_end]))
    return answer


In [153]:
# Main loop to get user input and provide an answer
def main():
    while True:
        # Get the question from the user
        question = input("Enter your question (or type 'exit' to quit): ")
        if question.lower() == 'exit':
            break

        # Get and print the answer
        answer = get_answer(question, context)
        print(f"Answer: {answer}")

if __name__ == "__main__":
    main()

Enter your question (or type 'exit' to quit): book name
Answer: think and grow rich
Enter your question (or type 'exit' to quit): book category?
Answer: personal development and self - help
Enter your question (or type 'exit' to quit): exit
