In [None]:
import os

# Hugging Face access token. Keep this empty in shared or committed copies and
# export HF_TOKEN before starting the kernel instead; paste a value here only
# for throwaway local runs.
huggingface_token = ''

# Only export a real value: writing the empty placeholder into the environment
# would overwrite a token that is already configured for this session.
if huggingface_token:
    os.environ["HF_TOKEN"] = str(huggingface_token)

In [None]:
import fitz  # PyMuPDF
import pandas as pd
import os
from transformers import LlamaTokenizer

# Model identifier passed to from_pretrained below to select the tokenizer.
# NOTE(review): presumably relies on the HF_TOKEN set in the first cell for
# access to this checkpoint - confirm.
baseline_model_path = "meta-llama/Llama-2-7b-chat-hf"


# Load the Llama tokenizer once at module level; the chunking function defined
# below reads this global rather than taking a tokenizer argument.
tokenizer = LlamaTokenizer.from_pretrained(baseline_model_path)

def tokenize_and_chunk(text, chunk_size=512):
    """Split ``text`` into pieces of at most ``chunk_size`` tokens each.

    The text is encoded with the module-level ``tokenizer``, the id sequence
    is cut into consecutive windows of ``chunk_size`` ids, and every window is
    decoded back into a string.

    Windows are cut purely by token count, so a boundary may fall in the
    middle of a word or sentence.

    Args:
        text: The text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of decoded chunk strings in document order. The list is empty
        when the text produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # Encode without special tokens. With the default setting the tokenizer
    # adds its special ids (for Llama, a leading BOS), which would be decoded
    # into the first chunk as a literal marker and use up one slot of the
    # chunk budget.
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Slice the ids into fixed-size windows and decode each one back to text.
    return [
        tokenizer.decode(tokens[start:start + chunk_size], clean_up_tokenization_spaces=True)
        for start in range(0, len(tokens), chunk_size)
    ]

# Function to read PDF and tokenize text
def process_pdf(file_path):
    """Extract the text of a PDF and return it as token-bounded chunks.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.

    Returns:
        The result of ``tokenize_and_chunk`` applied to the text of all pages
        concatenated in page order.
    """
    # The context manager closes the document even if extraction fails
    # part-way through; the original never closed it.
    with fitz.open(file_path) as doc:
        # join() builds the full text in one pass instead of re-creating a
        # growing string for every page.
        text = "".join(page.get_text() for page in doc)
    return tokenize_and_chunk(text)

# Source books, processed in the order listed
pdf_paths = [
    "/content/Book1.pdf",
    "/content/Book2.pdf",
    "/content/Book3.pdf",
    "/content/Book4.pdf",
    "/content/Book5.pdf",
    "/content/Book6.pdf",
    "/content/Book7.pdf",
]

# Chunk every book and gather the chunks in one flat list
all_texts = []
for path in pdf_paths:
    print(path)  # progress marker: one line per book
    all_texts += process_pdf(path)

# Persist the chunks as a single-column CSV
csv_path = "Books.csv"
df = pd.DataFrame(all_texts, columns=['text'])
df.to_csv(csv_path, index=False)

# Last expression of the cell: displays where the CSV was written
csv_path


/content/Book1.pdf
/content/Book2.pdf
/content/Book3.pdf
/content/Book4.pdf
/content/Book5.pdf
/content/Book6.pdf
/content/Book7.pdf


'Books.csv'

In [None]:
# Spot-check one stored chunk (row label 11 of the 'text' column)
df.loc[11, 'text']

'?”\n      “I’ve come to bring Harry to his aunt and uncle. They’re the only family\nhe has left now.”\n      “You don’t mean – you can’t mean the people who live here?” cried\nProfessor McGonagall, jumping to her feet and pointing at number four.\n“Dumbledore — you can’t. I’ve been watching them all day. You couldn’t find\ntwo people who are less like us. And they’ve got this son — I saw him kicking\nhis mother all the way up the street, screaming for sweets. Harry Potter come\nand live here!”\n      “It’s the best place for him,” said Dumbledore firmly. “His aunt and\nuncle will be able to explain everything to him when he’s older. I’ve written\nthem a letter.”\n      “A letter?” repeated Professor McGonagall faintly, sitting back down on\nthe wall. “Really, Dumbledore, you think you can explain all this in a letter?\nThese people will never understand him! He’ll be famous — a legend — I\nwouldn’t be surprised if today was known as Harry Potter day in the future —\nthere will be book

In [None]:
import fitz  # PyMuPDF
import pandas as pd
from transformers import LlamaTokenizer

# Load the Llama tokenizer by its model identifier and rebind the module-level
# `tokenizer` name that the functions defined below read.
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

def preprocess_text(text):
    """Flatten extracted text onto a single line.

    A hyphen immediately followed by a newline is removed together with the
    newline, so the two fragments around it are joined. Every remaining
    newline becomes a single space.
    """
    # Step 1: drop "hyphen + line break" pairs
    without_hyphen_breaks = text.replace('-\n', '')
    # Step 2: turn the remaining line breaks into spaces
    single_line = without_hyphen_breaks.replace('\n', ' ')
    return single_line

def tokenize_and_chunk(text, chunk_size=512):
    """Clean ``text`` and split it into chunks of at most ``chunk_size`` tokens.

    The text is first passed through ``preprocess_text``, then tokenized with
    the module-level ``tokenizer`` (no special tokens added), cut into
    consecutive windows of ``chunk_size`` ids, and each window is decoded back
    into a string.

    Windows are cut purely by token count, so a boundary may fall in the
    middle of a word or sentence.

    Args:
        text: Raw text to clean and split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        A list of decoded chunk strings in document order. The list is empty
        when the text produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    preprocessed_text = preprocess_text(text)

    # Work on the plain list of ids. The earlier tensor + squeeze() form
    # collapsed a one-token input to a 0-dim tensor, on which size(0) fails.
    token_ids = tokenizer(preprocessed_text, add_special_tokens=False)['input_ids']

    # Slice the ids into fixed-size windows and decode each one back to text.
    return [
        tokenizer.decode(token_ids[start:start + chunk_size], clean_up_tokenization_spaces=True)
        for start in range(0, len(token_ids), chunk_size)
    ]

def process_pdf(file_path):
    """Read every page of a PDF and return its text as token-bounded chunks.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.

    Returns:
        The result of ``tokenize_and_chunk`` applied to the text of all pages
        concatenated in page order.
    """
    # Open through the context manager so the document is closed even when
    # text extraction raises; the original left it open.
    with fitz.open(file_path) as doc:
        # One join() instead of growing a string page by page.
        text = "".join(page.get_text() for page in doc)
    return tokenize_and_chunk(text)

# Source books, processed in the order listed
pdf_paths = [
    "/content/Book1.pdf",
    "/content/Book2.pdf",
    "/content/Book3.pdf",
    "/content/Book4.pdf",
    "/content/Book5.pdf",
    "/content/Book6.pdf",
    "/content/Book7.pdf",
]

# Chunk each book in turn, appending its chunks to one flat list
all_texts = []
for path in pdf_paths:
    print(path)  # progress marker: one line per book
    all_texts += process_pdf(path)

# Write the decoded chunks to a single-column CSV
csv_path = "Books.csv"
df = pd.DataFrame(all_texts, columns=['text'])
df.to_csv(csv_path, index=False)

print(f"Saved tokenized texts to {csv_path}")


/content/Book1.pdf
/content/Book2.pdf
/content/Book3.pdf
/content/Book4.pdf
/content/Book5.pdf
/content/Book6.pdf
/content/Book7.pdf
Saved tokenized texts to Books.csv


In [None]:
# Sanity check: re-tokenize one saved chunk and count its tokens
len(
    tokenizer(
        df['text'][12],
        add_special_tokens=False,
        return_tensors='pt',
    )['input_ids'].squeeze()
)

512

In [None]:
# Total number of chunks written to the 'text' column
df['text'].size

3583