In [4]:
from openai import InvalidRequestError
from keys import API_KEY
import PyPDF2
from tqdm import tqdm
import pandas as pd
import re
import math
import fitz
from pathlib import Path

In [5]:
from functools import lru_cache



@lru_cache
def query_gpt(text, model='gpt-3.5-turbo', **kwargs):

    import openai
    openai.api_key = API_KEY

    response = openai.ChatCompletion.create(
        messages=[
            {"role": "system", "content": "You are a helpful research assistant."},
            {"role": "user", "content": text},
        ],
        model=model,
        **kwargs
    )
    return response["choices"][0]["message"]["content"]


def query_gpt_with_context_concise(text, context, model='gpt-3.5-turbo', **kwargs):
    import openai
    openai.api_key = API_KEY

    response = openai.ChatCompletion.create(
        messages=[
            {"role": "system", "content": f"Context: {context} \n\n You are a helpful research assistant. Please answer as concisely as possible and use the context to inform the answer. If the context does not have enough information, reply with 'NA'"},
            {"role": "user", "content": text},
        ],
        model=model,
        **kwargs
    )
    return response["choices"][0]["message"]["content"]
    
def query_gpt_with_context(text, context, model='gpt-3.5-turbo', **kwargs):

    import openai
    openai.api_key = API_KEY

    response = openai.ChatCompletion.create(
        messages=[
            {"role": "system", "content": f"Context: {context} \n\n You are a helpful research assistant. Please use the context to inform the answer. If the context does not have enough information, reply with 'NA'"},
            {"role": "user", "content": text},
        ],
        model=model,
        **kwargs
    )
    return response["choices"][0]["message"]["content"]

@lru_cache
def query_gpt_with_context_detailed(text, context, model='gpt-3.5-turbo', **kwargs):

    import openai
    openai.api_key = API_KEY

    response = openai.ChatCompletion.create(
        messages=[
            {"role": "system", "content": f"Context: {context} \n\n You are a helpful research assistant. Please answer detailed and use the context to inform the answer. If the context does not have enough information, reply with 'NA'"},
            {"role": "user", "content": text},
        ],
        model=model,
        **kwargs
    )
    return response["choices"][0]["message"]["content"]



def get_summary(text):

    import openai
    openai.api_key = API_KEY
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful research assistant."},
            {"role": "user", "content": f"Summarize this: {text}"},
        ],
    )
    return response["choices"][0]["message"]["content"]



def flags_decomposer(flags):
    """Make font flags human readable."""
    l = []
    if flags & 2 ** 0:
        l.append("superscript")
    if flags & 2 ** 1:
        l.append("italic")
    if flags & 2 ** 2:
        l.append("serifed")
    else:
        l.append("sans")
    if flags & 2 ** 3:
        l.append("monospaced")
    else:
        l.append("proportional")
    if flags & 2 ** 4:
        l.append("bold")
    return ", ".join(l)

In [6]:
def get_summaries(text, chapters):

    text_by_chapter = {}
    summary_by_chapter = {}
    statistical_tools = {}
    data = {}
    for chapter, next_chapter in tqdm(zip(chapters, chapters[1:] + ['end'])):
        chapter_loc = text.lower().find(chapter.lower())
        next_chapter_loc = text[chapter_loc:].lower().find(next_chapter.lower()) + chapter_loc
        text_by_chapter[chapter] = text[chapter_loc:next_chapter_loc]
        data[chapter] = [chapter_loc, next_chapter_loc]
        try: 
            summary_by_chapter[chapter] = query_gpt(f'Summarise this chapter of a research paper called "{chapter}": {text_by_chapter[chapter]}')
            # statistical_tools[chapter] = query_gpt(f'What statistical tools are the authors using in this chapter? Return just a list of tools. No text, just list. Use "*" as bullet points: {text_by_chapter[chapter]}')
        except InvalidRequestError as e:
            e = str(e)
            num_location = e.find('in ')
            num = int(e[num_location+3: num_location+8])
            num_calls = math.ceil((num/4000))
            split_text = [text_by_chapter[chapter][i*4000: (i+1)*4000] for i in range(num_calls)]
            summary_by_chapter[chapter] = ' '.join([query_gpt(f'Summarise this part of a chapter of a research paper: {text}') for text in split_text])
            # statistical_tools[chapter] = ' '.join([query_gpt(f'What statistical tools are the authors using in this part? Return just a list of tools. No text, just list. Use "*" as bullet points: {text}') for text in split_text])

    return summary_by_chapter, statistical_tools

In [7]:
import logging
from typing import Dict, List
from dataclasses import dataclass
from enum import Enum
from functools import lru_cache
import os

class GPTDetail(str, Enum):
    concise = 'concise'
    detailed = 'detailed'
    

@dataclass
class PDFAnalyser:
    file_path: str = None
    logger = logging.getLogger('log')

    def load_pdf_detailed(self) -> pd.DataFrame:

        pdf_details_list = []
        with fitz.open(self.file_path) as doc:

            for page_no, page in enumerate(doc.pages()):
                blocks = page.get_text("dict")["blocks"]

                for block in blocks: 
                    block_no = block['number']
                    block_str = ''
                    block_bold = []
                    block_italic = []
                    if 'lines' in block.keys():
                        for line in block['lines']:
                            for span in line['spans']:
                                block_str += span['text']
                                flags = flags_decomposer(span['flags'])
                                block_bold.append('bold' in flags)
                                block_italic.append('italic' in flags)
                                location = span['origin'][0]
                                size = span['size']

                    
                    # add block to pdf_details if block has text
                    if len(block_str.strip()) > 0:
                        pdf_details_list.append([page_no, block_no ,block_str, all(block_bold), all(block_italic), location, size])

        pdf_details = pd.DataFrame(pdf_details_list, columns=['page', 'block', 'text', 'bold', 'italic', 'location', 'size']).set_index(['page', 'block'])
        pdf_details['formatting'] = pdf_details['bold'] | pdf_details['italic']
        pdf_details['text'] = pdf_details['text'].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii')) #remove weird encoding

        self._full_text = ' '.join(list(pdf_details['text']))
        self._pages = {page: ' '.join(list(pdf_details.loc[pdf_details.index.get_level_values(0)==page, 'text'])) for page in pdf_details.index.get_level_values(0).unique()}
        
        return pdf_details
    
    def create_toc(self, pdf_details:pd.DataFrame, formatting:str='bold', variance_minimum:float=5, remove_keywords:List=['exhibit', 'table']) -> pd.DataFrame:
        
        # get unique locations of blocks and number of ocurrences 
        locations = pdf_details.loc[pdf_details[formatting]].groupby('location').count()['text'].sort_values()
        
        # go through each location and determine if it should be part of TOC (part of location, formatting, variance of sentence length within group)
        toc_options = pd.DataFrame()
        for i in range(1, len(locations)):    
            n_most_location_bold = pdf_details.loc[(pdf_details.location.round(2) == round(locations.index[-i], 2)) & pdf_details[formatting]].copy()
            n_most_location_bold['variance'] = n_most_location_bold['text'].apply(lambda x: len(x)).var()
            toc_options = pd.concat([toc_options, n_most_location_bold])
        
        if len(toc_options.index) == 0:
            raise ValueError('TOC could not be generated, try a different pdf')

        
        toc = toc_options.loc[
            (toc_options.variance > variance_minimum) 
            & ~(toc_options.variance.isna())].drop_duplicates().sort_index() #remove those that are not relevant
        
        # remove keywords that don't belong in TOC
        for keyword in remove_keywords:
            toc = toc.loc[toc.text.apply(lambda x: keyword not in x.replace(' ', '').lower())].copy()
        
        if len(toc.index) == 0:
            raise ValueError('TOC could not be generated, try a different pdf')

        # If the word intro does not exist, set the start of the second page as first chapter
        if 'intro' not in toc.iloc[0].at['text'].lower():
            toc.loc[(-1,0), 'text'] = pdf_details.loc[(1,)].iloc[0].at['text']
        
        toc = toc.sort_index()
        toc = list(toc['text'])

        # try to remove everything after conclusion/concluding remarks etc.
        try:
            toc = toc[:toc.index([i for i in toc if 'concl' in i.lower()][0])+1]
        except Exception as e:
            print(f'toc has no conclusion: {e}')

        concl_loc = pdf_details.index.get_loc(tuple(pdf_details.loc[pdf_details.text == toc[-1]].index)[0])
        post_conc_loc = pdf_details.index.get_loc(tuple(pdf_details.iloc[concl_loc+1:].loc[pdf_details.bold].iloc[0].name))
        toc.append(pdf_details.iloc[post_conc_loc].loc['text'])

        return toc
    
    def get_chapter_text(self, text, toc) -> dict:

        next_chapter_loc = 0
        text_by_chapter = {}

        for chapter, next_chapter in tqdm(zip(toc[:-1], toc[1:])):
            chapter_loc = text[next_chapter_loc:].lower().find(chapter.lower()) + next_chapter_loc
            next_chapter_loc = text[chapter_loc:].lower().find(next_chapter.lower()) + chapter_loc
            chapter_text = text[chapter_loc:next_chapter_loc]
            text_by_chapter[chapter] = chapter_text
        
        return text_by_chapter 
    
    def summarise_parts(self, parts:dict) -> str:
        summaries = {}
        for part_name, part in (pbar := tqdm(parts.items())):
            pbar.set_description(f'Summarising {part_name}: ')
            summaries[part_name] = self.summarise_text(part)
        return summaries

    def summarise_text(self, text:str) -> str:
        
        try:
            # print(text+'\n\n\n')
            summary = query_gpt(f'Summarise this: {text}')
        except InvalidRequestError as e:
            max_length, current_length = (int(s) for s in str(e).split() if s.isdigit())
            ratio = current_length / max_length
            if ratio % 1 < 0.8: num_parts = int(ratio) + 1
            else: num_parts = int(ratio) + 2
            part_length = int(len(text) / num_parts ) + 1
            split_text = [text[i*part_length: i+1*part_length] for i in range(num_parts)]
            summary = '\n'.join([query_gpt(f'Summarise this part of a chapter of a research paper: {text}') for text in split_text])
        return summary

    def run_page_summary(self) -> str:
        return self.prettify_part_summary(self.summarise_parts(self.pages))        

    def run_chapter_summary(self) -> str:
        return self.prettify_part_summary(self.summarise_parts(self.chapters))        

    def prettify_part_summary(self, summaries:dict):
        return '\n'.join([str(part_name)+'\n'+'='*len(str(part_name))+'\n'+part+'\n\n' for part_name, part in summaries.items()])

    @property
    def full_text(self) -> str:
        if not hasattr(self, '_full_text'):
            self.load_pdf_detailed()
        return self._full_text

    @property
    def pages(self) -> dict:
        if not hasattr(self, '_pages'):
            self.load_pdf_detailed()
        return self._pages
    
    @property
    def toc(self) -> list:
        if not hasattr(self, '_toc'):
            pdf_details = self.load_pdf_detailed()
            self._toc = self.create_toc(pdf_details)
        return self._toc
    
    @property
    def chapters(self) -> dict:
        if not hasattr(self, '_chapters'):
            self._chapters = self.get_chapter_text(self.full_text, self.toc)
        return self._chapters
        


In [13]:
paths = [Path().cwd().joinpath('papers').joinpath(paper) for paper in os.listdir('papers')]
Path('outputs').mkdir(exist_ok=True)
for path in paths:
    name = path.stem
    print(name, '\n', len(name)*'=')
    try:
        analyser = PDFAnalyser(path)
        chapter_summary = analyser.run_chapter_summary()
        structured_summary = query_gpt_with_context_detailed(f'Write a detailed summary of the context with this structure: \n\n1. Introduction \n2. Data & Methodology \n3. Results.', chapter_summary)
        with open(f'outputs/{name}_summary.txt', 'w') as file:
            file.writelines(structured_summary)
    except ValueError as e:
        print(e, '\n')
    print('\n')


comparison_between_constant_and_dynamic_vol_scaling 


7it [00:00, 20956.55it/s]
Summarising 5. Conclusion: : 100%|██████████| 7/7 [00:00<00:00, 13.99it/s]                                                                                   




betting_against_beta 
TOC could not be generated, try a different pdf 



short_and_long_horizon_behavioural 


7it [00:00, 6499.92it/s]
Summarising 6. Conclusion: : 100%|██████████| 7/7 [00:01<00:00,  6.07it/s]                                         




qual_min_junk 


7it [00:00, 7909.52it/s]
Summarising 6. Conclusion : : 100%|██████████| 7/7 [00:00<00:00, 25.66it/s]                                                       




a_five_factor_asset_pricing_model 
TOC could not be generated, try a different pdf 



momentum_and_the_cross_section_of_volatility 


9it [00:00, 12033.39it/s]
Summarising 7. Conclusion: : 100%|██████████| 8/8 [00:00<00:00, 11.78it/s]                                        




factor_momentum_and_momentum_factor 


25it [00:00, 12037.38it/s]
Summarising 6Conclusion: : 100%|██████████| 25/25 [01:39<00:00,  3.98s/it]                                                                 




my_factor_phil 


5it [00:00, 6142.80it/s]
Summarising VI. Conclusion : : 100%|██████████| 5/5 [00:29<00:00,  5.96s/it]                                                                                                                       




momentum_has_its_moments 
TOC could not be generated, try a different pdf 



stock_return_predictability_is_it_there 


31it [00:00, 6555.58it/s]
Summarising 6Bias, Size and Power: :  68%|██████▊   | 21/31 [02:23<01:08,  6.85s/it]                                            


ServiceUnavailableError: The server is overloaded or not ready yet.

In [81]:
def get_overview(summary):
    full_text = ''
    for key, value in summary.items():
        full_text += key.upper()+'\n'
        full_text += '='*len(key)+'\n'
        full_text += value+'\n\n'

    overview_string = ''
    overview_string += 'HYPOTHESIS\n'+query_gpt(f'What is the authors hypothesis?: \n{full_text}')+'\n\n'
    overview_string += 'DATE RANGE\n'+query_gpt(f'What is the date range used in this study? : \n{full_text}')+'\n\n'
    overview_string += 'UNIVERSE\n'+query_gpt(f'What country are the authors analysing? Answer with just the country: \n{full_text}')+'\n\n'
    overview_string += 'CONCLUSION\n'+query_gpt(f'What is the authors conclusion?: \n{full_text}')+'\n\n'
        
    return full_text, overview_string
    

In [82]:
def get_references(pdf_details, words_to_try='bold'):
    for word in words_to_try:
        try:
            page_num = tuple(pdf_details.loc[(pdf_details.text.apply(lambda x: word in x.lower()))].index[0])
            pdf_details.loc[pdf_details.index.get_level_values(0) >= page_num[0]]
            references_until_end = pdf_details.iloc[pdf_details.index.get_loc(page_num)+1:]
            try:
                end_of_references = references_until_end.index.get_loc(references_until_end.loc[references_until_end.formatting].iloc[0].name)
                return references_until_end.iloc[:end_of_references]
            except IndexError as e:
                print(e)
                return references_until_end
        except Exception as e:
            return e

In [153]:
filepaths = []

paths = [Path().cwd().joinpath('papers').joinpath(paper) for paper in os.listdir('papers')]

pdf_detailss = []
references = []
overviews = {}
texts = {}

for path in paths[:2]:
    try:
        print(path, '\n')
        
        chapters, total_text, pdf_details = get_text_toc(path)
        pdf_detailss.append(pdf_details)
        references.append(get_references(pdf_details))
        print(chapters, '\n')

        summary, tools = get_summaries(total_text, chapters)
        text, overview = get_overview(summary)
        overviews[path] = overview
        texts[path] = text
        print(overview)
        print('\n\n\n')
    except Exception as e:
        print(path, '\n')
        print(e)


/Users/niklasgaertner/Desktop/coding/gpt_research/papers/comparison_between_constant_and_dynamic_vol_scaling.pdf 

['Risk adjusted momentum strategies: a comparisonbetween constant and dynamic volatility scalingapproaches', 'Abstract', '1. Introduction', '2. Data', '3. Methodology', '4. Empirical results', '5. Conclusion'] 



0it [00:02, ?it/s]


KeyboardInterrupt: 

In [297]:
refs = references[-1]
reference_text = ''
for text in refs.text:
    reference_text += text + '\n'

In [298]:
print(reference_text)

Ang, A. and G. Bekaert (2007). Stock return predictability: Is it there?The Review of Financial Stud-
ies 20(3), 651707.
Ang, A., R. J. Hodrick, Y. Xing, and X. Zhang (2006). The cross-section of volatility and expected returns.
The Journal of Finance 61(1), 259299.
Arnott, R. D., M. Clements, V. Kalesnik, and J. T. Linnainmaa (2019). Factor momentum. Working paper.
University of Southern California.
Asness, C. S. (2016a). My factor philippic. Working paper. AQR Capital Management.
Asness, C. S. (2016b).The siren song of factor timing.The Journal of Portfolio Management Special
Issue(1).
Asness, C. S., A. Frazzini, and L. H. Pedersen (2019). Quality minus junk. Review of Accounting Stud-
ies 24(1), 34112.
Banz, R. W. (1981).The relationship between return and market value of common stocks.Journal of
Financial Economics 9(1), 318.
Barr Rosenberg, K. R. and R. Lanstein (1984). Persuasive evidence of market ineciency. The Journal of
Portfolio Management 11, 917.
Barroso, P. and P. Santa-C

In [157]:
question = 'What?'

In [200]:
i = 3
answers = []
for i in range(0,len(chapters)):
    context = total_text[total_text.find(chapters[i-1]):total_text.find(chapters[i])]

    
    answers.append(query_gpt_with_context_detailed(text=question, context=context[:4000]))
context = '\n'.join(answers)
query_gpt_with_context_detailed(text='summarise the relevant parts of the context', context=context)

'The author is investigating the issue of risk-adjusted momentum strategies and specifically comparing the performance of constant and dynamic volatility scaling approaches.\n\nThe author is investigating the performance of two volatility scaling methods in momentum strategies, specifically comparing the constant volatility scaling approach of Barroso and Santa-Clara (2015) to the dynamic volatility scaling method of Daniel and Moskowitz (2016).\n\nThe author is investigating the issue of momentum crashes in momentum strategies. Specifically, they are examining the implementation of volatility scaling methods to address the risk of momentum crashes in futures markets across different asset classes.\n\nThe author is investigating the properties and characteristics of various global liquid futures instruments, including commodities, sovereign bonds, currencies, and equity index contracts, along with the returns and volatility associated with these asset classes.\n\nThe author is investig

In [205]:
len(context)

15030

In [202]:
context = total_text[total_text.find(chapters[i-1]):total_text.find(chapters[i])]

In [206]:
query_gpt(f'Summarise this:  {context[:12000]}')

'The empirical results in this research study focus on momentum crashes in futures markets and the comparison between constant and dynamic volatility scaling approaches. Momentum crashes occur when the cumulative returns of the bottom decile are significantly higher than the top decile, typically during times of financial stress. The study identifies momentum crashes in futures markets during the 2007-2008 financial crisis.\n\nThe comparison between constant volatility scaling (CVS) and dynamic volatility scaling (DVS) approaches shows that CVS outperforms DVS in terms of alpha (excess return) during the entire sample period. However, during the financial crisis period, the performance of both CVS and DVS is negatively impacted and the difference in alphas between the two approaches becomes insignificant. The study also compares the scaled XSMOM (Cross-sectional Momentum) strategies with benchmarks such as buy-and-hold and time series momentum (TSMOM) strategies. The scaled XSMOM strat

In [216]:
try: 
    query_gpt_with_context_detailed('summarise the context', context)
except InvalidRequestError as e:
    a = e

In [229]:
ratio = current_length / max_length

if ratio % 1 < 0.8:
    num_parts = int(ratio) + 1
else:
    num_parts = int(ratio) + 2


In [241]:
num_parts

3

In [232]:
int(ratio)

1

In [230]:
ratio % 1

0.0973883329265317

In [228]:
ratio = current_length / max_length
if ratio < 1.5:
    #split string in half 
elif ratio > 2:
    #split current string by ratio 
else:
    

1.0973883329265317

In [209]:
query_gpt_with_context_concise('summarise the context', context[:12000])

'The context is focused on empirical results related to momentum crashes in futures markets. The study examines the performance of constant and dynamic volatility scaling approaches in reducing momentum losses during times of financial stress. The analysis includes the identification of momentum crashes caused by the 2007-2008 financial crisis, as well as a comparison between the two scaling approaches. Overall, the constant volatility scaling approach outperforms the dynamic volatility scaling approach, but this superiority is diminished during the financial crisis period.'

In [210]:
query_gpt_with_context('summarise the context', context[:12000])

'The context is focused on the empirical results of momentum crashes in futures markets and the comparison between constant volatility scaling (CVS) and dynamic volatility scaling (DVS) approaches. Momentum crashes are periods when the cumulative returns of the bottom decile are significantly higher than the cumulative returns of the top decile. The study identifies a momentum crash caused by the 2007-2008 financial crisis in futures markets. The comparison between CVS and DVS approaches reveals that CVS is more efficient overall, but the superiority is almost eliminated during times of financial crisis. The study also includes a cross-strategy comparison, considering buy-and-hold, time series momentum (TSMOM), and cross-sectional momentum (XSMOM) strategies. The scaled XSMOM strategies outperform the benchmark strategies.'