### Custom_GPT2_TextGeneration

In [25]:
# -Import the required libraries-
import os
import re
import glob
import torch
import nltk
import numpy as np
import pandas as pd
import PyPDF2
import requests
from io import BytesIO
import torch
import string
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


In [26]:
# -Tokenizer for GPT-2-
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [27]:
# -Fetch text from pdf by link-
def fetch_text_from_pdf(pdf_link):
    try:
        # -Download the PDF file from the provided link-
        response = requests.get(pdf_link)
        response.raise_for_status()

        # -Check if the response content type is PDF-
        if response.headers.get('content-type') == 'application/pdf':
            # -You have successfully fetched the PDF content-
            pdf_content = response.content

            # -Create a BytesIO stream from the PDF content-
            pdf_stream = BytesIO(pdf_content)

            # -Create a PDF reader object-
            pdf_reader = PyPDF2.PdfFileReader(pdf_stream)

            # -Initialize a variable to store the extracted text-
            extracted_text = ""

            # -Extract text from each page of the PDF-
            for page_num in range(pdf_reader.numPages):
                page = pdf_reader.getPage(page_num)
                extracted_text += page.extractText()
            # -Return the extracted text-
            return extracted_text 

        else:
            print("The fetched content is not a PDF.")
            return None

    except Exception as e:
        print(f"Error fetching text from PDF: {e}")
        return None

def preprocess_text(text):
    if text is None:
        # -Return an empty string if text is None-
        return ""  

    # -Remove non-printable characters and Unicode escape sequences-
    text = text.encode('ascii', 'ignore').decode('utf-8')
    # -TODO: Add preprocess steps as per data, Convert the text to lowercase-
    text = text.lower()
    # -Tokenize the text into individual words-
    tokens = word_tokenize(text)
    # -Remove stopwords and punctuation from the tokens-
    stop_words = set(stopwords.words('english'))
    # -Access the punctuation characters-
    punctuation = set(string.punctuation)  
    tokens = [token for token in tokens if token not in stop_words and token not in punctuation]
    return " ".join(tokens)

# -Collect and preprocess data from the PDFs-

# -Use a list to store preprocessed text for each book-
corpus = []  

# -List of books on Chanakya Neeti with their PDF links-
books = [
    {"title": "Language Models are Few-Shot Learners", "author": "Tom B. Brown", 
     "pdf_link": "https://arxiv.org/pdf/2005.14165.pdf"},
    {"title": "Explorations in Artificial Intelligence and Machine Learning", 
     "author": "Prof. Roberto V. Zicari", "pdf_link": "https://www.routledge.com/rsc/downloads/AI_FreeBook.pdf"},
    {"title": "Artificial Intelligence A Modern Approach Third Edition",
     "author": "Stuart J. Russell and Peter Norvig",
     "pdf_link": "https://people.engr.tamu.edu/guni/csce421/files/AI_Russell_Norvig.pdf"}
    # -Add more books to the list-
]

for book in books:
    pdf_link = book["pdf_link"]
    text = fetch_text_from_pdf(pdf_link)
    
    if text is not None:
        processed_text = preprocess_text(text)
        # -Append the preprocessed text for each book to the corpus list-
        corpus.append(processed_text)  

# -Print the preprocessed data-
for i, book in enumerate(books):
    print(f"Book {i + 1} - Title: {book['title']}, Author: {book['author']}")
    # -You can print the preprocessed text for each book-
    # print(corpus[i]) 
    print("\n")




Book 1 - Title: Language Models are Few-Shot Learners, Author: Tom B. Brown


Book 2 - Title: Explorations in Artificial Intelligence and Machine Learning, Author: Prof. Roberto V. Zicari


Book 3 - Title: Artificial Intelligence A Modern Approach Third Edition, Author: Stuart J. Russell and Peter Norvig




In [28]:
# -Tokenize the preprocessed text using the GPT-2 tokenizer-
tokenized_corpus = [tokenizer(text, return_tensors="pt") for text in corpus]

# -Create a custom dataset for training-
class CustomTextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer, block_size):
        self.texts = texts
        self.tokenizer = tokenizer
        self.block_size = block_size

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(
            text,
            max_length=self.block_size,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(),  # -Remove the extra dimension-
            "attention_mask": inputs["attention_mask"].squeeze(),
        }

block_size = 128  # -This can be changed as per requirement-
custom_dataset = CustomTextDataset(corpus, tokenizer, block_size)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (41381 > 1024). Running this sequence through the model will result in indexing errors


In [29]:
# Initialize the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Training arguments
training_args = TrainingArguments(
    output_dir="./custom_gpt2_textgeneration",
    overwrite_output_dir=True,
    num_train_epochs=20,  # Adjust the number of training epochs
    per_device_train_batch_size=32,  # Adjust batch size based on your GPU memory
    save_steps=10_000,   # Adjust this as needed
    save_total_limit=2,  # Only keep the last two checkpoints
    learning_rate=2e-4,  # Adjust the learning rate
    logging_dir="./logs",
    logging_steps=100,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=custom_dataset,
)

# Train the model
trainer.train()

# Save the trained model
model.save_pretrained("./custom_gpt2_textgeneration")
tokenizer.save_pretrained("./custom_gpt2_textgeneration")


Step,Training Loss


('./custom_gpt2_textgeneration/tokenizer_config.json',
 './custom_gpt2_textgeneration/special_tokens_map.json',
 './custom_gpt2_textgeneration/vocab.json',
 './custom_gpt2_textgeneration/merges.txt',
 './custom_gpt2_textgeneration/added_tokens.json')

In [30]:
# Provide a prompt
prompt = "What is Artificial Neural Network"

# Generate text
generated_text = model.generate(
    input_ids=tokenizer.encode(prompt, return_tensors="pt"),
    max_length=100,  # Adjust the length as needed
    num_return_sequences=1,  # Number of text sequences to generate
    no_repeat_ngram_size=2,  # Avoid repeating n-grams in the output
    top_k=50,  # Limit the choice of next tokens
    top_p=0.95,  # Control randomness in the output
    temperature=0.7,  # Adjust the temperature for randomness
)

# Decode the generated text
generated_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [31]:
print(generated_text)

What is Artificial Neural Network learning machine learning 2nd ed. machine vision 3rd eduplnick tom b. brownbeard machine code prof. roberto v. zicari 1st ednicholas j. ansicommon lisp 2 bayesian networks ariel m. jared bernie profanez alicher tom henighan rewon child aditya ramesh daniel mccain wu clemens winter christopher hesse mark chen er
