In [8]:
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain

from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor


from keys import API_KEY

import os 
os.environ['OPENAI_API_KEY'] = API_KEY


In [9]:
paths = [Path().cwd().joinpath('papers').joinpath(paper) for paper in os.listdir('papers')]
path = str(paths[0])


# Questions

### 1. Load

In [3]:
loader = PyPDFLoader(path)
docs = loader.load()

### 2. Split

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 50
)

splits = text_splitter.split_documents(docs)


### 3. Store

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 50
)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
question = "What approach is the author using?"
docs = vectorstore.similarity_search(question)

### 4. Retrieve

In [6]:
question = "What approach is the author using?"
docs = vectorstore.similarity_search(question)

### 5. Generate

In [7]:


llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever())
qa_chain({"query": question})

{'query': 'What approach is the author using?',
 'result': 'The author is comparing and evaluating two different approaches: the constant volatility scaling (CVS) approach and the dynamic volatility scaling (DVS) approach.'}

# Summaries

### Simple Summarize Chain

In [40]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.chains.summarize import load_summarize_chain

loader = PyPDFLoader(path)
pages = loader.load()

llm = ChatOpenAI(temperature=0, model_name="gpt-4")
chain = load_summarize_chain(llm, chain_type="stuff")

In [None]:
summary = chain.run(pages)
print(summary)

In [46]:
summary = 'The study "Risk adjusted momentum strategies: a comparison between constant and dynamic volatility scaling approaches" by Fan, Li, and Liu (2018) investigates the efficacy of two volatility scaling methods in momentum strategies: the constant volatility scaling (CVS) approach of Barroso and Santa-Clara (2015), and the dynamic volatility scaling (DVS) method of Daniel and Moskowitz (2016). These methods are designed to reduce the risks associated with momentum strategies, which, while capable of generating consistent abnormal returns, are also susceptible to significant crashes. \n\nThe researchers applied these strategies to a diversified portfolio of 55 global liquid futures contracts, spanning from June 1986 to May 2017. The CVS-based momentum strategy was found to be the most efficient, yielding an annual return of 15.3%. The study further segments the sample period into three sub-periods: pre-crisis, crisis, and post-crisis for a comprehensive analysis. The CVS-based strategy outperformed the DVS-based strategy in the pre-crisis period, but this superiority became statistically insignificant during and after the crisis. \n\nThe study also examines the impact of ex-ante realized volatility and bear market indicators on WML return using the method of Daniel and Moskowitz (2016). They found that high volatility lowers market expectation and vice versa. The volatility scaled strategies outperformed unscaled strategies, with the CVS-based strategy remaining the most profitable among all. \n\nThe study concludes that the CVS approach is a more efficient volatility scaling method for momentum strategies in futures markets, despite incorporating relatively larger risk and drawdown. It ends up with the highest cumulative returns, outperforming all other strategies including the equally weighted buy-and-hold, TSMOM and XSMOM as well as scaled buy-and-hold and TSMOM strategies. However, the study also acknowledges that the CVS approach displays higher risks compared to other volatility scaling approaches, which might affect its profitability in times of uncertainty. The researchers suggest that future research could focus on investigating the source of this risk and how to alleviate it. \n\nThe study, published in Research in International Business and Finance, is openly accessible via the Queen\'s University Belfast Research Portal. The authors retain copyright, but the work is available under the CC-BY-NC-ND 4.0 license, allowing noncommercial distribution and reproduction with proper citation.'

In [48]:
with open(f'summariessummary__{paths[0].name.split(".pdf")[0]}.tex', 'w') as file:
    file.write(latex_document_str(summary))


### Stuff Summarise Chain

In [144]:
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

# Define prompt USE THIS TO CUSTOMISE!!!
prompt_template = """Write a detailed summary of the following:
"{text}"
CONCISE SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

# Define LLM chain
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Define StuffDocumentsChain
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain, document_variable_name="text"
)

docs = loader.load()
print(stuff_chain.run(docs))

The study compares the performance of two volatility scaling methods, constant volatility scaling (CVS) and dynamic volatility scaling (DVS), in momentum strategies. The researchers perform momentum strategies based on these two approaches in a diversified portfolio consisting of 55 global liquid futures contracts. They also compare the results to time series momentum and buy-and-hold strategies. 

The findings show that the momentum strategy based on the constant volatility scaling method (CVS) is the most efficient approach with an annual return of 15.3%. The CVS approach outperforms the DVS approach in terms of alpha, or excess return, with a statistically significant difference. 

The study also identifies a momentum crash in futures markets during the 2009-2013 period, which is attributed to the 2007-2008 global financial crisis. 

Overall, the study concludes that the CVS approach is more efficient and profitable than the DVS approach in momentum strategies. However, the superior

In [146]:
len(docs)

24

### Map Reduce Summarisation

In [83]:
llm = ChatOpenAI(temperature=0)

# Map
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce
reduce_template = """The following is set of summaries:
{doc_summaries}
Take these and distill it into a final, consolidated summary of the main themes. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(docs)

This does not perform as good as the normal stuff summarisation (but could be very useful for other areas)

refine chain is the best option (But expensive)

In [75]:
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI

loader = PyPDFLoader(path)
pages = loader.load()

llm = ChatOpenAI(temperature=0, model_name="gpt-4")

prompt_template = """Write a detailed summary of the following:
{text}\n
DETAILED SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

refine_template = (
    "Your job is to produce a final summary\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
    "If the context isn't useful, return the original summary."
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=False,
    input_key="input_documents",
    output_key="output_text",
)
result = chain({"input_documents": pages}, return_only_outputs=True)

In [134]:
with open('test.tex', 'w') as file:
    file.write(latex_document_str('Test Header', result['output_text']))

In [135]:
print(latex_document_str('Test Header', result['output_text']))

\documentclass{article}
\title{Test Header}
\begin{document} 

\maketitle

The research paper "Risk adjusted momentum strategies: a comparison between constant and dynamic volatility scaling approaches" by Fan, Li, and Liu (2018) explores the effectiveness of risk-adjusted momentum strategies using two different volatility scaling approaches: constant and dynamic. The study uses data from the US stock market and a diversified portfolio of 55 global liquid futures contracts, including commodities, sovereign bonds, currencies, and equity index contracts from various exchanges such as COMEX and TOCOM.\\
\\
The constant volatility scaling approach assumes that the volatility of the stock returns is constant over time, while the dynamic approach allows for changes in volatility over time. The study finds that the dynamic volatility scaling approach outperforms the constant volatility scaling approach in terms of risk-adjusted returns, as it provides a more accurate measure of risk, leading 

In [140]:
from dataclasses import dataclass
from functools import cached_property

from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate

def latex_document_str(header, main_text):
    main_text = main_text.replace("\n\n", " \\\\\n\\\\\n").replace('%', r'\%')
    string = (
        r"\documentclass{article}"+"\n"
        r"\title{"+header+"}"+"\n"
        r"\begin{document} "+"\n\n"
        r"\maketitle"+"\n\n"
        r""+main_text+" \n\n"
        r"\end{document}"
    )
    return string    

@dataclass
class PdfLLM:
    filepath: str

    def __post_init__(self):

        self.llm = ChatOpenAI(temperature=0, model_name="gpt-4")

        self.detailed_prompt_template = (
            "Write a detailed summary of the following:\n"
            "{text}\n\n"
            "DETAILED SUMMARY:")

        self.concise_prompt_template = (
            "Write a concise summary of the following:\n"
            "{text}\n\n"
            "CONCISE SUMMARY:")
        
        self.refine_template = (
            "Your job is to produce a final summary\n"
            "We have provided an existing summary up to a certain point: {existing_answer}\n"
            "We have the opportunity to refine the existing summary"
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{text}\n"
            "------------\n"
            "Given the new context, refine the original summary."
            "If the context isn't useful, return the original summary."
        )

    def save_latex_summary(self):
        pass

    def ask_question(self, question):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size = 1000,
            chunk_overlap = 50
        )
        splits = text_splitter.split_documents(self.pages)
        vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
        llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
        qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever())
        return qa_chain({"query": question})['result']

    @cached_property
    def concise_summary(self):
        return self.get_summary(self.concise_prompt_template)
    
    @cached_property
    def detailed_summary(self):
        return self.get_summary(self.detailed_prompt_template)

    def get_summary(self, promt_template):
        chain = self.get_chain(promt_template)
        result = chain({"input_documents": self.pages}, return_only_outputs=True)
        return result['output_text']
    
    @cached_property
    def pages(self):
        loader = PyPDFLoader(self.filepath)
        pages = loader.load()
        return pages 

    def get_chain(self, promt_template):
        prompt = PromptTemplate.from_template(prompt_template)
        refine_prompt = PromptTemplate.from_template(self.refine_template)
        chain = load_summarize_chain(
            llm=self.llm,
            chain_type="refine",
            question_prompt=prompt,
            refine_prompt=refine_prompt,
            return_intermediate_steps=False,
            input_key="input_documents",
            output_key="output_text",
        )
        return chain


In [141]:
pdfllm = PdfLLM(path)

In [143]:
pdfllm.ask_question('What is the header? Return only the header:')

'Bloomberg Ticker Sector Start End Mean SD'

In [None]:
prompt = PromptTemplate.from_template(prompt_template)

refine_prompt = PromptTemplate.from_template(refine_template)
