In [19]:
!pip install PyPDF2 pdfplumber transformers torch nltk

Collecting pdfplumber
  Downloading pdfplumber-0.11.3-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m738.9 kB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.3-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-py3-n

In [20]:
import PyPDF2

def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Extract text from the PDF
pdf_file = 'Think-And-Grow-Rich.pdf'
pdf_text = extract_text_from_pdf(pdf_file)


In [21]:
import re
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Tokenize
    words = word_tokenize(text)
    # Join tokens back into a single string
    return ' '.join(words)

# Preprocess the extracted text
preprocessed_text = preprocess_text(pdf_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer and set padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the text
tokens = tokenizer(preprocessed_text, return_tensors='pt', truncation=True, padding=True)


In [23]:
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")


In [24]:
from torch.utils.data import DataLoader, Dataset

class BookDataset(Dataset):
    def __init__(self, tokens):
        self.input_ids = tokens['input_ids']
        self.attn_masks = tokens['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attn_masks[idx]
        }

dataset = BookDataset(tokens)


In [25]:
from transformers import AdamW

dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(5):  # Increased number of epochs for better fine-tuning
    for batch in dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['input_ids'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")




Epoch: 1, Loss: 3.23862624168396
Epoch: 2, Loss: 2.9155731201171875
Epoch: 3, Loss: 2.8220036029815674
Epoch: 4, Loss: 2.6728200912475586
Epoch: 5, Loss: 2.503666639328003


In [26]:
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")


('fine_tuned_gpt2/tokenizer_config.json',
 'fine_tuned_gpt2/special_tokens_map.json',
 'fine_tuned_gpt2/vocab.json',
 'fine_tuned_gpt2/merges.txt',
 'fine_tuned_gpt2/added_tokens.json')

In [28]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("fine_tuned_gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_gpt2")

def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_length=100,  # Adjust this as needed
        temperature=0.7,  # Lower temperature for more focused output
        top_k=50,  # Consider only top-k predictions
        top_p=0.9,  # Use nucleus sampling for more diversity
        repetition_penalty=1.2,  # Penalize repetition
        do_sample=True  # Allow sampling for more variety in output
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Loop to keep prompting the user until they type 'exit'
while True:
    user_question = input("Please enter your question (type 'exit' to stop): ")

    if user_question.lower() == "exit":
        print("Exiting the program.")
        break

    # Generate and print the response based on the user's question
    response = generate_text(user_question)
    print("\nResponse:")
    print(response)
    print("\n")  # Add a blank line for better readability


Please enter your question (type 'exit' to stop): what is book name?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Response:
what is book name?
- A collection of short stories from a wide range, including: The Tales From the Earthsea and its inhabitants. (The first two books were published in 2006.) This includes some very popular tales that have been translated into English or other languages by authors such as Gail Collins, Edith Waugh, Anne Siegel, Mark Twain. Many more titles are available online but please visit our website at www.[list]com for further information about what's on offer if


Please enter your question (type 'exit' to stop): who write think and grow rich book?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Response:
who write think and grow rich book? Or is this a game of chess?"
The answer, for me, was "no". I had to be prepared. In the end, though my brain wasn't as good as it should have been (as we've seen with books), there were two things that kept coming back: 1) The power involved in writing novels became overwhelming – which meant you couldn"t keep up your productivity; 2] You needed something else more than just reading fiction or


Please enter your question (type 'exit' to stop): which book is this ? do you know


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Response:
which book is this? do you know what it does?)
The only thing I can say about the story which has been published so far (and that's saying a lot) are its many flaws. It was originally conceived as an attempt to explain how and why humans evolved into mammals, but later on changed their appearance for better science fiction or fantasy stories with new concepts added in such cases! The main reason we continue writing novels now even though there have never really existed any sort of scientific basis upon


Please enter your question (type 'exit' to stop): whats your name?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Response:
whats your name?


[A: I've been using the term "Powers" since at least 2012.] You know, it's a word that comes up quite often in discussions of women. It means 'the power to change'. If you're not making any progress on anything and someone says something like this or does an interview for me then we'll get kicked out from our community because there is no one who can help us with things! (laughs) So how do they feel


Please enter your question (type 'exit' to stop): exit
Exiting the program.
