In [1]:
import os
import pypdfium2 as pdfium
import matplotlib.pyplot as plt

from PIL import Image
from io import BytesIO
from pytesseract import image_to_string

In [2]:
def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF and return each page as JPEG bytes.

    Args:
        file_path: Path of the PDF handed to ``pdfium.PdfDocument``.
        scale: Render scale forwarded to pdfium's renderer. The default
            ``300/72`` presumably targets 300 DPI relative to PDF's
            72-points-per-inch unit -- confirm against the pypdfium2 docs.

    Returns:
        A list with one single-entry dict per page, in page order. Each dict
        maps the zero-based page index to that page's JPEG-encoded bytes.
    """
    pdf_file = pdfium.PdfDocument(file_path)
    try:
        page_indices = list(range(len(pdf_file)))
        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=page_indices,
            scale=scale,
        )

        list_final_images = []
        for page_index, image in zip(page_indices, renderer):
            image_byte_array = BytesIO()
            image.save(image_byte_array, format='jpeg', optimize=True)
            list_final_images.append({page_index: image_byte_array.getvalue()})
        return list_final_images
    finally:
        # Every page has already been encoded to bytes at this point, so the
        # document handle can be released even if rendering raised part-way.
        pdf_file.close()

def display_images(list_dict_final_images):
    """Show each rendered page in its own matplotlib figure.

    Args:
        list_dict_final_images: List of single-entry dicts whose first value is
            the encoded image bytes of a page (the shape returned by
            ``convert_pdf_to_images``).

    Pages are titled by their position in the list (starting at 1), not by the
    dict key.
    """
    # Pull the image bytes out of every entry before drawing anything.
    page_payloads = []
    for page_entry in list_dict_final_images:
        page_payloads.append(list(page_entry.values())[0])

    for index in range(len(page_payloads)):
        image = Image.open(BytesIO(page_payloads[index]))
        # figsize is given in inches; dividing the pixel size by 100 presumably
        # assumes matplotlib's default 100 DPI -- confirm if rcParams change it.
        width_in = image.width / 100
        height_in = image.height / 100
        plt.figure(figsize=(width_in, height_in))
        plt.title(f"----- Page Number {index+1} -----")
        plt.imshow(image)
        plt.axis("off")
        plt.show()

def extract_text_with_pytesseract(list_dict_final_images):
    """OCR every page image and return the text joined by newlines.

    Args:
        list_dict_final_images: List of single-entry dicts whose first value is
            the encoded image bytes of a page.

    Returns:
        One string: the ``image_to_string`` result of each page, in list order,
        separated by ``"\\n"``. An empty list yields an empty string.
    """
    page_payloads = [
        list(page_entry.values())[0] for page_entry in list_dict_final_images
    ]

    def _ocr_page(jpeg_bytes):
        # Decode the bytes with PIL, then hand the image to Tesseract.
        return str(image_to_string(Image.open(BytesIO(jpeg_bytes))))

    return "\n".join(map(_ocr_page, page_payloads))

In [3]:
# Single-document walkthrough: render one PDF to page images, then OCR them.
# `file_path` is also read by the later LlamaParse cell.
file_path = '1706.03762v7.pdf'
# Despite the singular name, `image_file` is a list of {page_index: jpeg_bytes} dicts.
image_file = convert_pdf_to_images(file_path)
# All pages OCR'd and joined with newlines into one string.
text = extract_text_with_pytesseract(image_file)

In [4]:
# Echo the raw OCR string so the extraction quality can be inspected.
text

'1706.03762v7 [cs.CL] 2 Aug 2023\n\narXiv\n\nProvided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\n\nAttention Is All You Need\n\nAshish Vaswani* Noam Shazeer* Niki Parmar* Jakob Uszkoreit*\nGoogle Brain Google Brain Google Research Google Research\navaswani@google.com noam@google.com nikip@google.com usz@google.com\n\nLlion Jones* Aidan N. Gomez* ¢ Lukasz Kaiser*\nGoogle Research University of Toronto Google Brain\nllion@google.com aidan@cs.toronto.edu lukaszkaiser@google.com\n\nIllia Polosukhin* #\nillia.polosukhin@gmail.com\n\nAbstract\n\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on atten

In [5]:
# Persist the Tesseract OCR output as UTF-8 text.
# NOTE(review): the file is named 'pypdf.txt' although the text comes from
# pypdfium2 + pytesseract -- confirm the name is intentional.
output_file1 = 'pypdf.txt'
with open(output_file1, 'w', encoding='utf-8') as f:
    f.write(text)

In [None]:
dir_path = './docs'

# Sort the listing so index 0 refers to the same PDF on every run;
# os.listdir returns entries in arbitrary order.
pdf_paths = [
    os.path.join(dir_path, filename)
    for filename in sorted(os.listdir(dir_path))
    if filename.endswith('.pdf')
]

# A comprehension keeps its variable local, so the notebook-level `file_path`
# (read again by the LlamaParse cell further down) is not overwritten here.
pdf_to_images = [convert_pdf_to_images(pdf_path) for pdf_path in pdf_paths]

# Preview the pages of the first PDF; report instead of raising IndexError
# when the folder contains no PDFs.
if pdf_to_images:
    display_images(pdf_to_images[0])
else:
    print(f"No PDF files found in {dir_path}")

In [None]:
# One OCR string per PDF, in the same order as `pdf_to_images`. The
# comprehension variable stays local, so the notebook-level `image_file`
# from the single-document cell is not overwritten.
images_to_text = [
    extract_text_with_pytesseract(page_images) for page_images in pdf_to_images
]

# Print (rather than echo) so the newlines render; guard against an empty batch.
if images_to_text:
    print(images_to_text[0])
else:
    print("No OCR results to show")

In [7]:
import os
import nest_asyncio
from llama_parse import LlamaParse

# Patch asyncio to allow nested event loops. Presumably required because
# LlamaParse drives its own loop inside Jupyter's already-running one -- confirm.
nest_asyncio.apply()

In [8]:
# Requires LLAMAINDEX_API_KEY in the environment (raises KeyError otherwise).
parser = LlamaParse(api_key=os.environ['LLAMAINDEX_API_KEY'], result_type="markdown")
# NOTE(review): `file_path` is whatever an earlier cell assigned last; the
# directory loop above also assigns it, so run order decides which PDF is parsed.
# This rebinds `text` from the OCR string to the list of Documents returned here.
text = parser.load_data(file_path)
text

Started parsing the file under job_id 2c15a3b4-fdc5-4ca1-8897-27de375211c8


[Document(id_='bd994ac4-5f8a-4bf6-83d0-27cdbad3a12a', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# Attention Is All You Need\n\nAshish Vaswani*\n\nGoogle Brain\n\navaswani@google.com\n\nNoam Shazeer*\n\nGoogle Brain\n\nnoam@google.com\n\nNiki Parmar*\n\nGoogle Research\n\nnikip@google.com\n\nJakob Uszkoreit*\n\nGoogle Research\n\nusz@google.com\n\nLlion Jones*\n\nGoogle Research\n\nllion@google.com\n\nAidan N. Gomez*†\n\nUniversity of Toronto\n\naidan@cs.toronto.edu\n\nŁukasz Kaiser*\n\nGoogle Brain\n\nlukaszkaiser@google.com\n\nIllia Polosukhin*‡\n\nillia.polosukhin@gmail.com\n\n# Abstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on atten

In [9]:
# Join the markdown of every returned Document, separated by a blank line.
combined_text = '\n\n'.join(doc.text for doc in text)
combined_text


'# Attention Is All You Need\n\nAshish Vaswani*\n\nGoogle Brain\n\navaswani@google.com\n\nNoam Shazeer*\n\nGoogle Brain\n\nnoam@google.com\n\nNiki Parmar*\n\nGoogle Research\n\nnikip@google.com\n\nJakob Uszkoreit*\n\nGoogle Research\n\nusz@google.com\n\nLlion Jones*\n\nGoogle Research\n\nllion@google.com\n\nAidan N. Gomez*†\n\nUniversity of Toronto\n\naidan@cs.toronto.edu\n\nŁukasz Kaiser*\n\nGoogle Brain\n\nlukaszkaiser@google.com\n\nIllia Polosukhin*‡\n\nillia.polosukhin@gmail.com\n\n# Abstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while bein

In [10]:
# Persist the LlamaParse markdown as UTF-8 so it can be compared with the OCR output file.
output_file2 = 'llamaParser.txt'
with open(output_file2, 'w', encoding='utf-8') as f:
    f.write(combined_text)

In [None]:
dir_path = './docs'
# Requires LLAMAINDEX_API_KEY in the environment (raises KeyError otherwise).
parser = LlamaParse(api_key=os.environ['LLAMAINDEX_API_KEY'], result_type="markdown")

# One list of parsed Documents per PDF. The listing is sorted so that
# `pdf_to_text[0]` is the same file on every run (os.listdir order is
# arbitrary), and the comprehension avoids overwriting the notebook-level
# `file_path`.
pdf_to_text = [
    parser.load_data(os.path.join(dir_path, filename))
    for filename in sorted(os.listdir(dir_path))
    if filename.endswith('.pdf')
]

In [6]:
# List the PDF files available in the input folder, in directory order.
dir_path = './docs'
for filename in os.listdir(dir_path):
    if not filename.endswith('.pdf'):
        continue
    print(filename)

2022 Q3 AMZN.pdf
2023 Q1 AAPL.pdf
2023 Q3 AAPL.pdf
2022 Q3 MSFT.pdf
2022 Q3 INTC.pdf
2023 Q2 AAPL.pdf
2023 Q1 AMZN.pdf
2022 Q3 NVDA.pdf
2023 Q3 MSFT.pdf
2023 Q2 INTC.pdf
2023 Q3 NVDA.pdf
2023 Q1 INTC.pdf
2022 Q3 AAPL.pdf
2023 Q1 MSFT.pdf
2023 Q3 AMZN.pdf
2023 Q2 NVDA.pdf
2023 Q2 MSFT.pdf
2023 Q2 AMZN.pdf
2023 Q1 NVDA.pdf
2023 Q3 INTC.pdf


In [None]:
# Inspect the parse result of the first PDF (a list of Documents from load_data).
print(pdf_to_text[0])

In [5]:
import pickle
# Flatten the per-PDF Document lists into one list of text strings.
arr = [item.text for doc in pdf_to_text for item in doc]

# Cache the extracted text on disk so the parsing step need not be repeated.
with open('data.pkl', 'wb') as file:
    pickle.dump(arr, file)

In [11]:
import pickle
# Reload the cached text list. pickle.load can execute arbitrary code, so
# only load files this notebook wrote itself.
with open('data.pkl', 'rb') as file:
    brr = pickle.load(file)

print(len(brr))
# Optional debugging aid (disabled): print the length of every entry.
# for entry in brr:
    # print(len(entry))

1037


In [None]:
# Import the submodule explicitly: a bare `import llama_index` binds the
# package but does not guarantee `llama_index.core.schema` has been loaded,
# so the attribute lookup below could raise AttributeError on a fresh kernel.
import llama_index.core.schema

# Sanity-check the shape of one parse result: a list of llama_index Documents.
output = pdf_to_text[0]
if not isinstance(output, list):
    print("Unexpected structure:", type(output))
else:
    for item in output:
        if isinstance(item, llama_index.core.schema.Document):
            print(item.text)
        else:
            print(f"Non-Document element found: {type(item)}")


In [1]:
import pandas as pd
import networkx as nx
import google.generativeai as genai

from langchain_community.graphs.networkx_graph import NetworkxEntityGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.documents import Document
from langchain.chains import GraphQAChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain import HuggingFacePipeline

2024-10-11 21:53:54.997540: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-11 21:53:55.228511: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-11 21:53:55.228562: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-11 21:53:55.228989: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-11 21:53:55.294148: I tensorflow/core/platform/cpu_feature_g

In [None]:
# Read the key once; requires GOOGLE_API_KEY in the environment.
google_api_key = os.environ['GOOGLE_API_KEY']
genai.configure(api_key=google_api_key)
llm = GoogleGenerativeAI(model='gemini-1.5-pro', google_api_key=google_api_key)

# Show which models this key can access.
available_models = [m.name for m in genai.list_models()]
print('Available base models:', available_models)

Available base models: ['models/chat-bison-001', 'models/text-bison-001', 'models/embedding-gecko-001', 'models/gemini-1.0-pro-latest', 'models/gemini-1.0-pro', 'models/gemini-pro', 'models/gemini-1.0-pro-001', 'models/gemini-1.0-pro-vision-latest', 'models/gemini-pro-vision', 'models/gemini-1.5-pro-latest', 'models/gemini-1.5-pro-001', 'models/gemini-1.5-pro-002', 'models/gemini-1.5-pro', 'models/gemini-1.5-pro-exp-0801', 'models/gemini-1.5-pro-exp-0827', 'models/gemini-1.5-flash-latest', 'models/gemini-1.5-flash-001', 'models/gemini-1.5-flash-001-tuning', 'models/gemini-1.5-flash', 'models/gemini-1.5-flash-exp-0827', 'models/gemini-1.5-flash-002', 'models/gemini-1.5-flash-8b', 'models/gemini-1.5-flash-8b-001', 'models/gemini-1.5-flash-8b-latest', 'models/gemini-1.5-flash-8b-exp-0827', 'models/gemini-1.5-flash-8b-exp-0924', 'models/embedding-001', 'models/text-embedding-004', 'models/aqa']


In [2]:
# Small sample passage used to try out graph extraction on a known input.
text = """
Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
"""

# Character count of the sample, including the leading and trailing newlines.
len(text)

554

In [None]:
# Replace `text` with the text of every parsed Document across all PDFs,
# joined by single spaces (needs `pdf_to_text` from the batch LlamaParse cell).
text = ' '.join(item.text for doc in pdf_to_text for item in doc)

In [None]:
# Load a Hugging Face causal LM and wrap it for LangChain.
# NOTE(review): this rebinds `llm`; if the Gemini cell ran earlier, its `llm`
# is replaced from here on -- run only the backend you intend to use.
model_name = "huggyllama/llama-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipe)

In [4]:
# Wrap the current `text` in one Document and build the graph extractor
# around whichever `llm` was assigned last.
documents = [Document(page_content=text)]
llm_transformer = LLMGraphTransformer(llm=llm)

In [None]:
# Check which Document class the list holds.
type(documents[0])

In [None]:
# Number of Documents LlamaParse returned for the first PDF.
len(pdf_to_text[0])

In [42]:
from langchain_core.documents.base import Document

def split_documents_into_chunks(documents, chunk_size=2048, overlap_size=20):
    """Split each document into fixed-size, overlapping slices.

    Args:
        documents: Iterable of sliceable documents (typically strings). A
            single string is treated as one document rather than being
            iterated character by character.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap_size: Number of trailing items of one chunk repeated at the
            start of the next; must satisfy ``0 <= overlap_size < chunk_size``.

    Returns:
        A flat list of chunks in document order. Consecutive chunks of one
        document start ``chunk_size - overlap_size`` items apart; the last
        chunk of a document may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap_size`` is
            outside ``[0, chunk_size)``.
    """
    if isinstance(documents, str):
        # Iterating a str yields characters, which would produce
        # one-character "documents"; wrap it instead.
        documents = [documents]
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap_size < chunk_size:
        # A zero or negative stride would make range() fail or yield nothing.
        raise ValueError(
            f"overlap_size must satisfy 0 <= overlap_size < chunk_size, "
            f"got overlap_size={overlap_size}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap_size
    return [
        document[start:start + chunk_size]
        for document in documents
        for start in range(0, len(document), step)
    ]

# One text string per Document returned by LlamaParse, across all PDFs.
docs = [item.text for doc in pdf_to_text for item in doc]

# Chunk the list of strings (not the single joined `text` string), so each
# parsed document is split on its own and chunks never straddle two documents.
chunks = split_documents_into_chunks(docs)

# Wrap each chunk in a Document whose metadata id is its position in `chunks`.
documents = [
    Document(page_content=chunk, metadata={"id": str(i)})
    for i, chunk in enumerate(chunks)
]

In [None]:
# Echo the Document list for inspection.
documents

In [45]:
# Rebuild the graph extractor so it uses the current `llm`.
llm_transformer = LLMGraphTransformer(llm=llm)

In [5]:
# Run the LLM extraction over `documents`. The result is indexed per document
# by later cells, so despite the singular name it is a sequence.
graph_document = llm_transformer.convert_to_graph_documents(documents)

In [None]:
from langchain_community.callbacks import get_openai_callback

# NOTE(review): this repeats the graph conversion from the previous cell.
# The callback is OpenAI-specific, while `llm` in this notebook is Gemini or a
# Hugging Face pipeline, so the printed usage is presumably empty -- confirm.
with get_openai_callback() as cb:
    graph_document = llm_transformer.convert_to_graph_documents(documents)

print(cb)

In [6]:
# Preview the relationships of the first graph document as an edge table.
# Records are collected first and the frame is built once: concatenating
# inside the loop copies the whole frame on every iteration.
edge_records = [
    {'node1': edge.source.id, 'node2': edge.target.id, 'relation': edge.type}
    for edge in graph_document[0].relationships
]
# Passing `columns` keeps the three headers even when there are no edges.
df = pd.DataFrame(edge_records, columns=['node1', 'node2', 'relation'])

df

Unnamed: 0,node1,node2,relation
0,Marie Curie,physicist,OCCUPATION
1,Marie Curie,chemist,OCCUPATION
2,Marie Curie,Nobel Prize,AWARDED
3,Marie Curie,Nobel Prize,AWARDED
4,Marie Curie,Nobel Prize,AWARDED
5,Marie Curie,Pierre Curie,SPOUSE
6,Pierre Curie,Nobel Prize,AWARDED
7,Marie Curie,professor,POSITION
8,Marie Curie,University of Paris,WORKS_FOR


In [7]:
graph = NetworkxEntityGraph()

# Merge every graph document into one graph. Reading only index 0 would drop
# the entities and relations extracted from all other input documents.
for graph_doc in graph_document:
    for node in graph_doc.nodes:
        graph.add_node(node.id)

    for edge in graph_doc.relationships:
        # Edges are added on the wrapped networkx graph so the relation type
        # is stored as the `relation` edge attribute.
        graph._graph.add_edge(edge.source.id, edge.target.id, relation=edge.type)

# Persist the merged graph in GML format.
nx.write_gml(graph._graph, 'data.gml')

In [10]:
# Usage tracking is disabled here (kept for reference):
# with get_openai_callback() as cb:
# Build a QA chain that answers from the entity graph, using the current `llm`.
chain = GraphQAChain.from_llm(llm=llm, graph=graph, verbose=True)

# verbose=True prints the extracted entities and the graph context used for the answer.
question = """Who is Marie Curie?"""
chain.invoke(question)

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


Entities Extracted:
[32;1m[1;3mMarie Curie 
[0m
Full Context:
[32;1m[1;3mMarie Curie OCCUPATION physicist
Marie Curie OCCUPATION chemist
Marie Curie AWARDED Nobel Prize
Marie Curie SPOUSE Pierre Curie
Marie Curie POSITION professor
Marie Curie WORKS_FOR University of Paris[0m

[1m> Finished chain.[0m


{'query': 'Who is Marie Curie?',
 'result': 'Marie Curie was a physicist and chemist, known for winning the Nobel Prize. She was a professor at the University of Paris and was married to Pierre Curie. \n'}