In [5]:
from dataclasses import dataclass
from functools import cached_property

from pathlib import Path
import os

# Load the OpenAI key from the `keys` module, which is presumably resolved from the
# parent directory through the current-working-directory entry on sys.path
# (NOTE(review): confirm), and expose it to the OpenAI client via the environment.
old_dir = Path().absolute()
os.chdir('..')
try:
    from keys import API_KEY
finally:
    # Always restore the working directory, even when the import fails; otherwise
    # every later relative path in the notebook (e.g. 'papers') would silently
    # resolve against the parent directory.
    os.chdir(old_dir)
os.environ['OPENAI_API_KEY'] = API_KEY


import fitz
import pandas as pd


from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

import logging

# Show INFO-level records so the progress messages logged by the classes below are visible.
logging.basicConfig(level=logging.INFO)


# OpenAI chat model names.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
models = ['gpt-4', 'gpt-3.5-turbo', 'gpt-3.5-turbo-16k']




def make_latex_document_str(header, data_description, main_text):
    """Build the source of a minimal LaTeX article with a Data and a Summary section.

    Args:
        header: Document title, placed inside ``\\title{...}``.
        data_description: Text of the "Data" section.
        main_text: Text of the "Summary" section.

    Returns:
        The complete LaTeX document as a single string.

    In both section texts every blank line (``"\\n\\n"``) is turned into two forced
    LaTeX line breaks. ``%`` is escaped in the title and in both section texts;
    without the escape LaTeX treats the rest of the line as a comment, which for the
    title would swallow the closing brace of ``\\title{...}``.

    NOTE(review): other LaTeX special characters (``&``, ``_``, ``#``, ``$``) are
    passed through unchanged.
    """
    paragraph_break = " \\\\\n\\\\\n"

    def escape_percent(text):
        return text.replace('%', r'\%')

    title = escape_percent(header)
    data_section = escape_percent(data_description.replace("\n\n", paragraph_break))
    summary_section = escape_percent(main_text.replace("\n\n", paragraph_break))

    # The trailing spaces and empty entries reproduce the original layout exactly.
    document_lines = [
        r"\documentclass{article}",
        r"\title{" + title + "}",
        r"\author{ChatGPT}",
        r"\begin{document} ",
        "",
        r"\maketitle",
        r"\section{Data}",
        data_section + " ",
        r"\section{Summary}",
        summary_section + " ",
        "",
        r"\end{document}",
    ]
    return "\n".join(document_lines)

@dataclass
class PdfLLM:
    """Summarise a PDF and answer questions about it with an OpenAI chat model.

    Constructor fields:
        filepath: Path of the PDF; handed to ``PyPDFLoader``.
        model_name: Chat model name handed to ``ChatOpenAI``.
        temperature: Sampling temperature handed to ``ChatOpenAI``.

    Attributes created in ``__post_init__``:
        llm: The ``ChatOpenAI`` instance used by every chain.
        detailed_prompt_template / concise_prompt_template: First-pass summary prompts.
        refine_template: Prompt used to refine an existing summary with more text.
        data_question: Question asked to fill the "Data" section of the LaTeX report.
        logger: Logger named ``'<filepath>.log'``.

    Summaries, loaded pages and the vector store are cached per instance.
    """
    filepath: str
    model_name: str = 'gpt-3.5-turbo-16k'  # alternative: 'gpt-4'
    temperature: float = 0

    def __post_init__(self):
        """Create the chat model, the prompt templates and the logger."""
        # Honour the `temperature` field (it used to be ignored in favour of a literal 0).
        self.llm = ChatOpenAI(temperature=self.temperature, model_name=self.model_name)

        # Adjacent literals are concatenated verbatim, so each sentence needs its
        # own trailing space to avoid prompts like "text.The summary".
        self.detailed_prompt_template = (
            "Write a detailed summary of the text. "
            "The summary should be up to 2000 words. "
            "Do not summarise any graphs, charts or exhibits. "
            "Do not summarise the list of references. "
            "Do not summarise the appendix. "
            "Make sure that it is clear what you are summarising in each step. "
            "Be concise, but don't miss details. "
            "The text is:\n"
            "{text}\n\n"
            "DETAILED SUMMARY:")

        self.concise_prompt_template = (
            "Write a concise summary of the following:\n"
            "{text}\n\n"
            "CONCISE SUMMARY:")

        self.refine_template = (
            "Your job is to produce a final summary\n"
            "We have provided an existing summary up to a certain point: {existing_answer}\n"
            "We have the opportunity to refine the existing summary "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{text}\n"
            "------------\n"
            "Given the new context, refine the original summary. "
            "If the context isn't useful, return the original summary."
        )

        self.data_question = 'What data is used in the study, what countries are they using and what is the timeframe? '
        self.logger = logging.getLogger(f'{self.filepath}.log')

    def save_latex_summary(self, title, filename: str = None):
        """Write a LaTeX report (data description + detailed summary) to disk.

        Args:
            title: Document title; also used as the file name when ``filename``
                is empty or ``None``.
            filename: Target path. ``'.tex'`` is appended unless the path already
                ends with it.

        The target directory is created if it does not exist and the file is
        written as UTF-8.
        """
        filename = str(filename or title)
        # Suffix check, not substring check: 'my.texts' must still get '.tex' appended.
        if not filename.endswith('.tex'):
            filename += '.tex'
        summary_text = self.detailed_summary
        data_description = self.ask_question(self.data_question)
        latex_str = make_latex_document_str(title, data_description, summary_text)
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding: model output may contain non-ASCII characters that the
        # platform default encoding cannot represent.
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(latex_str)

        self.logger.info(f'FILE: {filename} CREATED')

    @cached_property
    def _vectorstore(self):
        """Chroma store over 1000-character chunks (50 overlap) of the PDF pages.

        Built once per instance so that repeated questions do not split and embed
        the whole document again.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=50
        )
        splits = text_splitter.split_documents(self.pages)
        return Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

    def ask_question(self, question):
        """Answer ``question`` with a ``RetrievalQA`` chain over the PDF's chunks.

        Returns the ``'result'`` entry of the chain output.
        """
        qa_chain = RetrievalQA.from_chain_type(self.llm, retriever=self._vectorstore.as_retriever())
        return qa_chain({"query": question})['result']

    @cached_property
    def concise_summary(self):
        """Summary produced with ``concise_prompt_template`` (computed once)."""
        return self.get_summary(self.concise_prompt_template)

    @cached_property
    def detailed_summary(self):
        """Summary produced with ``detailed_prompt_template`` (computed once)."""
        return self.get_summary(self.detailed_prompt_template)

    def get_summary(self, promt_template):
        """Run the summarisation chain over all pages and return its output text.

        Args:
            promt_template: Prompt template string containing a ``{text}`` placeholder.
        """
        chain = self.get_chain(promt_template)
        pages = self.pages
        self.logger.info('RUNNING CHAIN..')
        result = chain({"input_documents": pages}, return_only_outputs=True)
        self.logger.info('SUMMARY FINISHED')
        return result['output_text']

    @cached_property
    def pages(self):
        """Documents returned by ``PyPDFLoader(self.filepath).load()`` (loaded once)."""
        loader = PyPDFLoader(self.filepath)
        pages = loader.load()
        self.logger.info('LOADED PAGES')
        return pages

    def get_chain(self, prompt_template):
        """Build a "refine" summarisation chain.

        ``prompt_template`` is used as the question prompt and ``refine_template``
        as the refine prompt.
        """
        prompt = PromptTemplate.from_template(prompt_template)
        refine_prompt = PromptTemplate.from_template(self.refine_template)
        chain = load_summarize_chain(
            llm=self.llm,
            chain_type="refine",
            question_prompt=prompt,
            refine_prompt=refine_prompt,
            return_intermediate_steps=False,
            input_key="input_documents",
            output_key="output_text",
        )
        self.logger.info('CREATED CHAIN')
        return chain



class PdfLLMStuff(PdfLLM):
    """``PdfLLM`` variant that summarises through a ``StuffDocumentsChain``.

    Construction is inherited unchanged from ``PdfLLM``; only the chain that is
    built and the way it is invoked differ.
    """

    def get_chain(self, prompt_template):
        """Wrap an ``LLMChain`` for ``prompt_template`` in a ``StuffDocumentsChain``.

        The documents are bound to the template's ``text`` variable.
        """
        stuff_chain = StuffDocumentsChain(
            llm_chain=LLMChain(
                llm=self.llm,
                prompt=PromptTemplate.from_template(prompt_template),
            ),
            document_variable_name="text",
        )
        self.logger.info('CREATED CHAIN')
        return stuff_chain

    def get_summary(self, promt_template):
        """Run the stuff chain over all pages and return what ``chain.run`` gives back."""
        summariser = self.get_chain(promt_template)
        documents = self.pages
        self.logger.info('RUNNING CHAIN..')
        summary = summariser.run(documents)
        self.logger.info('SUMMARY FINISHED')
        return summary



In [2]:
# Absolute path of every entry in ./papers, in the order os.listdir returns them.
paths = [Path.cwd() / 'papers' / paper for paper in os.listdir('papers')]
# NOTE(review): os.listdir order is arbitrary, so index 7 is not guaranteed to
# pick the same paper on every machine/run.
path = paths[7]

# Same paper, two strategies: refine chain on gpt-3.5-turbo, stuff chain on gpt-4.
pdfllm = PdfLLM(filepath=str(path), model_name='gpt-3.5-turbo')
pdfllmstuff = PdfLLMStuff(filepath=str(path), model_name='gpt-4')


In [52]:
# Write the report to summaries/gpt_3_2_<name up to the first dot>_summary(.tex).
pdfllm.save_latex_summary(
    'Factor Momentum Everywhere',
    str(Path('summaries') / f"gpt_3_2_{path.name.split('.')[0]}_summary"),
)

INFO:/Users/niklasgaertner/Desktop/coding/gpt_research/papers/factor_momentum_everywhere.pdf.log:CREATED CHAIN
INFO:/Users/niklasgaertner/Desktop/coding/gpt_research/papers/factor_momentum_everywhere.pdf.log:LOADED PAGES
INFO:/Users/niklasgaertner/Desktop/coding/gpt_research/papers/factor_momentum_everywhere.pdf.log:RUNNING CHAIN..
INFO:/Users/niklasgaertner/Desktop/coding/gpt_research/papers/factor_momentum_everywhere.pdf.log:SUMMARY FINISHED
INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
INFO:/Users/niklasgaertner/Desktop/coding/gpt_research/papers/factor_momentum_everywhere.pdf.log:FILE: summaries/gpt_3_2_factor_momentum_everywhere_summary.tex CREATED


In [121]:
def get_title(path, non_title_words=('draft', 'journal'), n_largest_sizes=4):
    """Guess the title of a PDF from the size, position and page of its text blocks.

    Heuristic: collect every text block, drop blocks containing one of
    ``non_title_words``, keep the blocks whose font size is among the
    ``n_largest_sizes`` largest distinct sizes, and return the one with the
    smallest ``location_y`` on the first page that has such a block. If that text
    is shorter than three characters, the first remaining candidate in document
    order is returned instead.

    Args:
        path: PDF path, passed to ``fitz.open``.
        non_title_words: Lower-case substrings; any block containing one of them
            (case-insensitive, literal match) is excluded.
        n_largest_sizes: How many of the largest distinct font sizes qualify a
            block as a title candidate. Must be at least 1.

    Returns:
        The text of the selected block, with non-ASCII characters removed.

    Raises:
        ValueError: If ``n_largest_sizes`` is below 1, or if the document has no
            text block left after filtering.
    """
    if n_largest_sizes < 1:
        # A slice of [-0:] would silently select every size.
        raise ValueError("n_largest_sizes must be at least 1")

    records = []
    with fitz.open(path) as doc:
        for page_no, page in enumerate(doc.pages()):
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a 'lines' entry contribute no text and are skipped.
                spans = [span for line in block.get('lines', []) for span in line['spans']]
                block_str = ''.join(span['text'] for span in spans)
                if not block_str.strip():
                    continue
                # NOTE(review): position and size are taken from the block's LAST
                # span (as before), so a trailing footnote marker in a smaller
                # font determines the size recorded for the whole block.
                last_span = spans[-1]
                records.append((
                    page_no,
                    block['number'],
                    block_str,
                    last_span['origin'][0],
                    last_span['origin'][1],
                    last_span['size'],
                ))

    if not records:
        raise ValueError(f"no text blocks found in {path!r}")

    pdf_details = pd.DataFrame(
        records,
        columns=['page', 'block', 'text', 'location_x', 'location_y', 'size'],
    ).set_index(['page', 'block'])
    # Strip characters that cannot be represented in ASCII.
    pdf_details['text'] = pdf_details['text'].apply(
        lambda x: x.encode('ascii', 'ignore').decode('ascii'))

    pdf_details = pdf_details.loc[pdf_details['text'].str.strip() != '']
    for no_word in non_title_words:
        # regex=False: the words are plain substrings, not patterns.
        contains_word = pdf_details['text'].str.lower().str.contains(no_word, regex=False)
        pdf_details = pdf_details.loc[~contains_word]

    if pdf_details.empty:
        raise ValueError(f"no candidate title text left in {path!r} after filtering")

    largest_sizes = sorted(pdf_details['size'].unique())[-n_largest_sizes:]
    candidates = pdf_details.loc[pdf_details['size'].isin(largest_sizes)]

    # Use the first page that actually has a candidate; indexing page 0
    # unconditionally raised KeyError when all large text sat on a later page.
    first_page = int(candidates.index.get_level_values('page').min())
    title = candidates.loc[first_page].sort_values('location_y')['text'].iat[0]
    if len(title) < 3:
        title = candidates.iloc[0]['text']

    return title


    

In [108]:
# NOTE(review): `details_dict` is not defined in any cell visible here, so this cell
# fails on a fresh kernel — confirm where it is built. The key is an absolute path
# on the author's machine.
pdf_details = details_dict['/Users/niklasgaertner/Desktop/coding/gpt_research/notebooks/papers/a_five_factor_asset_pricing_model.pdf']



In [114]:
# Keep only the blocks whose size is one of the three largest distinct sizes,
# then display them ordered by location_y.
three_largest = pdf_details[
    pdf_details['size'].isin(sorted(pdf_details['size'].unique())[-3:])
]
three_largest.sort_values(by='location_y')

Unnamed: 0_level_0,Unnamed: 1_level_0,text,location_x,location_y,size
page,block,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,34,Journal of Financial Economics,194.264496,133.619019,14.223766
0,2,A five-factor asset pricing model$,273.407806,205.505615,9.698017
0,5,a r t i c l e i n f o,62.850399,289.524475,9.143793
0,9,a b s t r a c t,223.007797,289.524475,9.143793


page  block
0     2        False
      5        False
      9        False
      34        True
Name: text, dtype: bool