In [8]:
import fitz
import pandas as pd
import re

In [69]:
# Open the PDF that every later cell reads from.
pdf_document = fitz.open("li.pdf")
# Table of contents in "simple" form; the table displayed below shows three fields per
# entry, used later as toc[i][1] (title) and toc[i][2] (page number, 1-based).
toc = pdf_document.get_toc(simple=True)

In [70]:
# Raise pandas' display limits so long titles are not truncated and every TOC row is shown.
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 1000

# Last expression of the cell: render the TOC as a table.
pd.DataFrame(toc)

Unnamed: 0,0,1,2
0,1,1 Introduction,9
1,2,1.1 Calcium Imaging,10
2,3,1.1.1 Intracellular calcium dynamics and its modeling,11
3,3,1.1.2 Calcium indicators,13
4,3,1.1.3 Data analysis and its difficulties,15
5,2,1.2 Neural coding: from firing rate to correlation,16
6,3,1.2.1 Correlation quantification,17
7,2,1.3 Question statement,18
8,1,2 Materials and Methods,19
9,2,2.1 Software and hardware,19


In [71]:
def extract_image_from_page(page, page_number):
    """Extract every image referenced by ``page`` and save each one in the working directory.

    Parameters
    ----------
    page : fitz.Page
        Page whose images are extracted. The owning document is taken from
        ``page.parent`` so the function does not depend on a notebook-level global.
    page_number : int
        0-based page index (fitz convention); file names use the 1-based PDF page number.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(images_name, info_names)``: the names of the files written and, in the same
        order, the image reference names (item 7 of each ``get_images(full=True)`` entry),
        which can be passed to ``page.get_image_bbox``. Both lists are empty when the page
        has no images, so callers can always unpack two values.

    Notes
    -----
    Files are named ``image_<page>_<index>.<ext>`` where ``<ext>`` is the format reported
    by ``extract_image`` (falling back to ``png``); the bytes are written unchanged, so a
    fixed ``.png`` suffix would mislabel e.g. JPEG data.
    """
    images_name = []
    info_names = []
    document = page.parent

    # `or []` keeps the return type stable even if no image list is reported.
    for img_index, img_info in enumerate(page.get_images(full=True) or []):
        xref = img_info[0]
        image_info_name = img_info[7]
        base_image = document.extract_image(xref)
        image_bytes = base_image["image"]
        extension = base_image.get("ext") or "png"
        # page number starts from 0 in fitz and 1 in pdf
        image_filename = f"image_{page_number + 1}_{img_index + 1}.{extension}"
        with open(image_filename, "wb") as image_file:
            image_file.write(image_bytes)
        images_name.append(image_filename)
        info_names.append(image_info_name)

    return images_name, info_names

In [72]:
def separate_header(text):
    """Split a page's text into its running header and the remaining body.

    The header is the first line anywhere in ``text`` of the form
    "<number> <anything> <number>". Everything after that line is returned as the
    body. When no such line exists the header is '' and the body is ``text`` unchanged.

    Returns
    -------
    tuple[str, str]
        ``(header, main_text)``.
    """
    # number, whitespace, arbitrary text, whitespace, number -- anchored per line
    # add here other header regex patterns!!!
    header_pattern = r'^\d+\s.*\s\d+$'

    found = re.search(header_pattern, text, re.MULTILINE)
    if found is None:
        return '', text

    # Body starts right after the matched header line (text before it is dropped).
    return found.group(0), text[found.end():]

In [73]:
def find_element_by_title(data_list, title):
    """Return the first item of ``data_list`` whose 5th field (index 4) contains ``title``.

    ``data_list`` is a sequence of tuples such as those produced by
    ``page.get_text("blocks")``. Returns None when no item matches.
    """
    matches = (entry for entry in data_list if title in entry[4])
    return next(matches, None)

In [74]:
def _heading_top(page_blocks, heading):
    """Return y0 of the first text block containing ``heading``, or None if there is none."""
    block = find_element_by_title(page_blocks, heading)
    return None if block is None else block[1]


# Split the document into one record per TOC entry: title, page range, text and images.
parts_list = []
last_toc_index = len(toc) - 1

for i in range(len(toc)):  # every entry, including the last one
    title = toc[i][1]
    start_page = toc[i][2] - 1  # page number starts from 1 in the TOC and 0 in fitz

    if i != last_toc_index:
        next_title = toc[i + 1][1]
        next_title_start_page = toc[i + 1][2] - 1
        # A section runs up to (and including) the page on which the next heading appears.
        end_page = next_title_start_page
    else:
        # Last TOC entry: there is no following heading, so it runs to the end of the file.
        # NOTE: pages that belong to nothing listed in the TOC end up in this last part.
        next_title = None
        next_title_start_page = None
        end_page = len(pdf_document) - 1

    part_text = []
    part_images = []
    for j in range(start_page, end_page + 1):
        page = pdf_document.load_page(j)
        extracted = extract_image_from_page(page, j)
        page_images, info_names = extracted if extracted else ([], [])

        if page_images:
            # Vertical limits of this section on the current page. A limit of None means
            # "no restriction" (middle page, or the heading could not be located).
            page_blocks = page.get_text("blocks")
            top_limit = _heading_top(page_blocks, title) if j == start_page else None
            bottom_limit = (
                _heading_top(page_blocks, next_title)
                if next_title is not None and j == next_title_start_page
                else None
            )

            # Keep only the images placed below this heading and above the next one.
            for info_name, image_name in zip(info_names, page_images):
                image_y0 = page.get_image_bbox(info_name)[1]
                above_section = top_limit is not None and image_y0 <= top_limit
                below_section = bottom_limit is not None and image_y0 >= bottom_limit
                if not (above_section or below_section):
                    part_images.append(image_name)

        _, page_text = separate_header(page.get_text("text"))
        part_text.append(page_text)

    if part_text:
        # Drop the text of the previous part that precedes this heading on the first page.
        _, found_title, after_title = part_text[0].partition(title)
        if found_title:
            part_text[0] = after_title
        else:
            print(f"Warning: Title '{title}' not found in the beginning of its section.")

        # Drop the text of the next part that follows its heading on the last page.
        if next_title is not None:
            part_text[-1] = part_text[-1].partition(next_title)[0]

    part_dict = {
        'title': title,
        'pages': f'{start_page+1}-{end_page+1}',  # page number starts from 0 in fitz and 1 in pdf
        'text': part_text,
        'images': part_images
    }

    parts_list.append(part_dict)

#pdf_document.close()

In [75]:
# Allow very long cells so the extracted section text is shown without truncation.
pd.options.display.max_colwidth = 10000

# Last expression of the cell: one row per extracted part.
pd.DataFrame(parts_list)

Unnamed: 0,title,pages,text,images
0,1 Introduction,9-10,"[\nRecording neural activity is critical for neuroscience research. Neural activity sculpts\nhow neurons transfer information and interact with each other. Different markers or\nindicators can mediate it. Nowadays, a variety of techniques have been developed to\nmeasure brain activity, such as single-channel recording, multi-channel recording, lo-\ncal field potentials (LFPs), electroencephalogram (EEG), magnetoencephalography\n(MEG), functional magnetic resonance imaging (fMRI), positron emission tomog-\nraphy (PET), calcium imaging, optical imaging of intrinsic signals (OIS), volage-\nsensitive dye imaging (VSDI). These techniques can be classified into two main cat-\negories: electrophysiology and imaging techniques. Electrophysiology (patch clamp,\nmulti-channel recording, LFPs, EEG, MEG) measures and manipulates neuronal\ndynamics such as spike trains, EPSC, EPSP, and LFP. Imaging techniques (fMRI,\nPET, calcium imaging, OIS, VSDI) can capture neuronal activities and identify\nstructures and connections at different scales, from cortex areas and neurons to\ndendrites and spines. Each technology has advantages and disadvantages regarding\ninvasiveness and temporal and spatial resolution. Therefore, combining two or more\ntechnologies to reveal the complex spatiotemporal patterns of neural activity from\na single neuron to the population level is common. 
Figure 1 gives an overview of\ntemporal resolution and spatial scale.\n, \nFigure 1:\nTemporal resolution and spatial scale of imaging and electrophysi-\nology techniques.\nThe mesoscopic scale is represented by the oval shaded area\n[1].\nINTRA: intracellular recording, SU/MU: single-unit/multi-unit recording,\nLFP: local field potential, MEA: multi-electrode array, VSDI: voltage-sensitive dye\nimaging, OIS: optical imaging of intrinsic signals, EEG/MEG: electroencephalog-\nraphy/magnetoencephalography, fMRI/PET: functional magnetic resonance imag-\ning/poistron emission tompgraphy.\n]",[image_10_1.png]
1,1.1 Calcium Imaging,10-11,"[\nAs one of the imaging techniques, calcium imaging has become more popular in the\nseveral decades. Neuronal spiking is assessed by measuring changes in fluorescence.\nCalcium imaging together with recombinant DNA technologies and microscopy is\na powerful tool in modulating and visualizing cellular and molecular interactions.\nIt can be used not only to observe neuronal activity but also to associate behav-\nioral characterization with physiological states. Under the field of view (FOV), it is\npossible to record the activities of hundreds of neurons. It also enables recording free-\nmoving animals in in-vivo experiments. Two-photon microscopy has excellent spatial\nresolution, but because the calcium signal is slow, two-photon calcium imaging has\nvery good spatial resolution but low temporal resolution. Although experiments’\nsetup and execution could be demanding, calcium imaging can be combined with\n, \nelectrophysiology to compensate the low temporal resolution. Figure 2 is an exam-\nple. Intra-cellular recording directly measures neuronal activities, usually taken as\nground truth. Calcium imaging records fluorescence signals as an indicator of neu-\nronal spiking activity, which is indirect. Fluorescence signals and neuronal spikes\ndiffer from each other. Figure 3 shows single action potential and its evoked calcium\nsignal, obtained from recording rats’ calyx of Held - a large synaptic terminal in\nbrainstem slice. We see action potential is quick (in ms) with a short decay, while\nfluorescence signal (in this case, Fura-2) is slow in several seconds with a longer\ndecay. The fluorescence signal is much noisier than the action potential. The cal-\ncium indicator Section 1.1.2 is one key point to facilitate functional imaging with\nalive animals. 
For example, with a cranial window over visual cortex of a transgenic\nmouse, it is able to image large scale of neuronal activities on layer 2/3 and layer\n4.\nFigure 2:\nIn-vivo calcium imaging accompanied with simultaneous electrophys-\niology recording on genetically modified mice to produce a calcium indicator\n(GCaMP6s, GCaMP6f) in neurons of the primary visual cortex, adapted from [2].\nLeft: Experimental design that recruits two-photon imaging and cell-attached record-\ning simultaneously. Right: recorded fluorescence trace (top) and membrane potential\ntrace (bottom). afu: arbitrary fluorescence units.\n]","[image_11_1.png, image_11_2.png]"
2,1.1.1 Intracellular calcium dynamics and its modeling,11-13,"[\nCalcium imaging visualizes calcium signaling within neurons based on intracellular\ncalcium dynamics. The interaction of calcium ions between the extracellular milieu\nand intracellular space generates the dynamics of calcium activity. There are mainly\nthree types of calcium channels on the membrane. See Figure 4. The first is voltage-\ngated calcium channels (VGCC); the second is receptor-operated calcium channels\n(ROC); the third is store-operated calcium channels (SOC). VGCC responds to elec-\ntrical signals induced by calcium binding. It helps to propagate electrical impulses.\n, \nFigure 3: Fluorescence signal evoked by a single action potential, adapted from [3].\nA: intracellular recording. B: Calcium imaging of A. Squares indicate regions selected\nin different types of fluorescence measurements.\nC: Presynaptic action potential\nevoked by orthodromic stimulation. D: Presynaptic fluorescence signals evoked by\na single action potential in cells loaded with Fura-2.\nROC is a calcium-permeable receptor channel. They are open when binding with\nexcitatory neurotransmitters, e.g., glutamate, AmPAR, NMDA. Besides, the endo-\nplasmic reticulum (ER) and mitochondria can release calcium intracellularly. SOC\nis activated by stromal interaction molecule (STIM) proteins accumulated at ER\nand plasma membrane (PM) junctions [4]. In excitatory neurons, when membrane\nis depolarized, VGCC open and calcium enormously flux in. Therefore calcium con-\ncentration is associated with action potential events. Helmchen 2012 [5] summarized\ncalcium dynamics as four processes: calcium binding, influx, extrusion, and diffusion.\nCalcium influx will increase the intracellular concentration. Extrusion is, therefore,\nimportant to keep a low intracellular calcium concentration at a balanced level. In\naddition, free calcium ions, as well as calcium-binding molecules, e.g., most cal-\ncium indicators, can diffuse. 
Mathematical equations were used to describe the four\nprocesses. There are various models for them. For example calcium microdomain\nmodels high calcium concentration in small spatial scale beneath the membrane due\nto opening of calcium-permeable channel pores (diffusion process). An 1-dimensional\ndiffusion model to depict the radial distance dependent calcium redistribution pro-\ncesses. A leaky ”chemical” cable analogue can model calcium diffusion, extrusion\nand buffering. In a simplified single-compartment model, an exponential function\ncan be used to describe the kinetics of the binding process. The calcium signal from\na neuron is a convolution of the spikes with this exponential function as proposed [5].\nIn biological calcium imaging experiments, fluorescent calcium indicators is needed\n, \nto serve as a proxy for the change of calcium concentration during binding process.\nFigure 4: Intracelluar calcium dynamics [4].\nFigure 5: GCaMP working mechanism [4].\n]","[image_12_1.png, image_13_1.png, image_13_2.png]"
3,1.1.2 Calcium indicators,13-15,"[\nThe development of optical reporters promotes the imaging techniques. The first\ncalcium indicator is Aequorin found in Aequorea victoria in 1962. Nowadays, the\nindicators used in calcium imaging can be classified into two categories: organic dyes\nand genetically-encoded calcium indicators(GECIs). The former are small organic\n, \nmolecule indicators,e.g. Oregon Green BAPTA-1. The latter are calcium-sensitive\nfluorescent proteins, which further have two major classes: FRET-based indicators\nand single protein indicator. Fluorescence resonance energy transfer (FRET) hap-\npens on the distance change between a donor fluorescent protein and an acceptor\nfluorescent protein, which can be induced by a conformation change via calcium\nbinding to the linker. FRET leads to fluorescence emissions so that the intensity\ncan be recorded. Single protein indicator e.g. the GCaMP family, are circularly per-\nmutated protein that can sense the conformational change made by calcium binding,\nmediated by the fluorescence intensity of chromophore inside [5]. GCaMP is a syn-\nthetic protein containing green fluorscent protein (GFP), calmodulin and a peptide\nsequence M13 (Figure 5). When bounding to calcium, it emits green luminance\nwith a peak excitation wavelength of 480 nm and a peak emission wavelength of\n510 nm. Comparing with organic dyes, GECIs has an advantage of being able to\ntarget on location in the brain, specific neuron types and subcellular compartments\nsuch as soma or axons. GECIs also has higher signal-to-noise ratio. To be noticed\nis, calcium indicator itself has an influence on the intracellular calcium dynamics,\nwhich can inference the readout information. Besides that, different indicators have\ndifferent response times, which is an important factor of data modeling. 
In a recent\nwork [6], with a large dataset of curated ground truth recordings on zebrafish and\nmouse, covering different calcium indicators and induction methods, excitatory and\ninhibitory neurons, different sampling frequency and different brain regions, using\nregularized deconvolution to generate the linear kernels of fluorescence transient.\nThe kernels are optimized such that the trace ∆F/F is maximal approximated by\nconvolving the ground truth spike train with the kernel. It was found that depending\non animal types, brain regions, neuron types and calcium indicators, the area under\nthe kernel curve varied. Figure 6 shows different peak amplitudes and decay times\nwith different calcium indicators. Even for neurons within same dataset, the kernels\nappear in diversity as well. This suggests that single neuron responds differently\nwhen underlying action potential goes on.\n, \nFigure 6:\nKernels with different calcium indicator, adapted from [6].\nAcross\ndatasets as well as within datasets, kernels vary in terms of amplitude and shape.\nm: mouse, zf: zebrafish.\n]",[image_15_1.png]
4,1.1.3 Data analysis and its difficulties,15-16,"[\nCalcium imaging throws light on studying the activities of a large neuronal popu-\nlation. Together with genetic encoding techniques, it enables in-vivo experiments\nwith head-fixed or even free-moving awake animals, which is a big advance in sci-\nentific research. In the meantime, analyzing fluorescence signals is more complex\nthan analyzing signals from electrophysiology. The raw data of calcium imaging\nis movie or image stacks. A common benchmark of data analysis involves motion\ncorrection, source extraction, cell registration, spike inference, and further analysis\nof spike trains. Spike inference is very challenging among these steps. Assuming\nfluorescence signal is the result of action potentials convolving a response function.\nTo infer the underlying spike train is a so-called ""inverse problem."" Various methods\nhave been developed to solve the inverse problem: deconvolution techniques [7, 8],\nmaximum a posteriori (MAP) principle [9], template-matching-based approach [10],\napproximate Bayesian inference [11]. Applications and packages are available online:\nCellSort [12], CaImAn [13], Suite2P [14], MLspike [15], CASCADE [6] and so on. It\nhas been great progress with these works. However, general difficulties in terms of\nspike inference still exist. The main aspects are outlined below.\n, \n• Overexpression.\nFluorescence overexpression potentially leads to toxicity.\nNeuron nuclear is bright due to the accumulation of calcium indicators.\n• Photobleaching. The fluorescence signal fades during the experiment due to\nthe photochemical destruction of the fluorophore.\n• Nonlinearity of fluorescence. The saturation curve of the calcium-binding\nratio can be linearly approximated only within a limited range. Therefore the\namplitude of observed fluorescence signals does not always linearly represent\nthe number of underlying action potentials.\n• Sample resolution. 
With high microscopy resolution, the single action po-\ntential is more visible. The low resolution of imaging could lead to missing\naction potentials.\n• Noise. Deconvolution is very sensitive to noise. Fluorescence signals contain\nmuch noise from experimental settings, background activity, interference of\nthe calcium-binding domain, and calcium buffering with the cellular spiking\nprocess.\n• Neuron type. Different neuron type demonstrates different response kernels.\nE.g., inhibitory neurons tend to have shorter decay.\n• High computation cost.\nDeconvolution is a computation intensive tech-\nnique.\nApplying it to a population of neurons needs high computation re-\nsources.\nDeconvolution has to be performed to extract spike trains from fluorescence signals.\nThis method, however, is prone to be numerically unstable and noise sensitive. Fo-\ncusing on correlation analysis and to avoid the deconvolution problem, in this thesis,\na new approach is proposed to indirectly infer the cross-correlation of underlying\nspike trains.\n]",[]
5,1.2 Neural coding: from firing rate to correlation,16-17,"[\nIn characterizing single neuron activity, the firing rate is the main indicator.\nIt\ngives information on whether the neuron is sensitive or not involved in a certain\n, \nactivity. Especially when the subject is experiencing some events on behavior level,\nby observing single neuron activity or population activity, researchers can attribute\nsingle neuron event to that even in macro-world, for example, ""Grandmother"" neuron.\nBesides single neurons, neuronal population activity is also of interest. To describe\na population of neurons, mean firing rate, coefficient of variation (CV) of inter-spike-\ninterval can be applied. From single neuron to neuronal population, on the one hand,\nprecise spiking time is important for neural coding. On the other hand, the neuron\ndoes not fire alone. The coordinated activities of neurons reflect a certain response\npattern. There are assumptions about the role of correlated neuronal activities, such\nas correlations as a side-effect of synaptic interaction, as an additional dimension\nof coding in the brain, or as it regulates the flow of information [16]. No doubt,\ncorrelation is the essential information and modulates the firing rate. Neurons in\nthe network are sensitive to this correlated activity as input [17]. Only synchronized\ninput can trigger spikes in feed-forward inhibition networks, resulting in sparse and\nprecise response [18].\nSources of correlations can be shared input to neurons or\ndirect synaptic connections. Correlation itself does not mean causation. Correlated\nactivities of neurons do not indicate which neuron drives which neuron. Correlations\nhave something to do with time scale.\nOn different time scales, there might be\ndifferent correlations.\n]",[]
6,1.2.1 Correlation quantification,17-18,"[\nIn this thesis, we mainly discuss the correlation between paired neurons. There are\ndifferent ways to quantify correlations of neuronal activity, among which the most\nwell-known are Pearson correlation coefficient, correlation function, and joint peri-\nstimulus time scatter diagram (JPSTH) [19]. The Pearson correlation coefficient\nis a linear correlation coefficient with normalized value scope [-1, 1]. Negative val-\nues mean negative correlations, while positive values indicate positive correlations.\nAbsolute value towards 1 indicates strong relations, while around zero means two\nvariables are hardly linearly correlated. The Pearson correlation coefficient cannot\ncapture a nonlinear relationship between two variables. With JPSTH it is to create\na two-dimensional scatter diagram of the firings of the two neurons relative to each\nstimulus onset. Each dot in Figure 7 right panel is a coincidence of the two spike\ntrains during one stimulus period. Repeating this for each trial, dot density is built\non the diagonal. With subsequent dynamic correction, including subtraction and\nscaling, reintegration along the diagonal generates the ultimate JPSTH. The cross-\n, \ncorrelation function is commonly used to measure the similarity of two signals. A\ncross-correlation function is generated by shifting one signal from the other and cal-\nculating the expectation value of the dot product of two signals within the overlap.\nProminence peaks in the curve demonstrate correlations between the two signals\nwith certain time lags (Figure 7, middle panel).\nPearson correlation\nCorrelation function\nJoint JPSTH\nFigure 7: Correlation quantification. Left: Pearson correlation with a positive\ncorrelation coefficient. Middle: correlation function. Right: JPSTH [19].\n]","[image_18_1.png, image_18_2.png, image_18_3.png]"
7,1.3 Question statement,18-19,"[\nCalcium imaging with high-resolution microscopy is a powerful tool in neuroscience\nresearch. A reliable analysis of fluorescence signals is important to correctly reveal\nthe biological meaning on a neuronal level. Among all analyses, inferring spike trains\nfrom fluorescence signals is challenging but a prerequisite of other steps. On the\nother hand, neuronal information lies in firing rates and is delivered by correlations.\nA common methodology for correlation analysis is first to extract discrete spikes\nor spike rates via deconvolution and then calculate neuronal correlations. However,\nthe reconstruction of spike trains by deconvolution is very sensitive to noise, not\nstable, and linked with high computational costs. In this thesis, we propose and\ndemonstrate a new method to obtain spike train correlations by deconvolution of\nthe correlations of fluorescence signals, avoiding the reconstruction of spike trains\naltogether.\n, \n]",[]
8,2 Materials and Methods,19-19,[\n],[]
9,2.1 Software and hardware,19-19,"[\nThis thesis proposes a new method to solve the ""inverse problem"" - get spike train\ncorrelation from recorded fluorescence signals. Implementation of the method, sur-\nrogate data modeling, and experimental data analysis were completed using Python\n(version 3.8) in Spyder (version 4.0.1) on a desktop with Intel Core i5-6500 CPU @\n3.2GHz and 16GB RAM. Experimental data (*.tiff) was pre-processed in Suite2P\n(version 0.10.1) [14], exported as a python data file (*.npy), and then loaded into\nSpyder for further processing. Simulation of neurons was carried out with NEST 3.0\n[20].\n]",[]


In [15]:
# Print an aligned overview of the TOC: index, title, first page, and the page on which
# the following entry starts.
max_title_length = max(len(entry[1]) for entry in toc)

# Each entry is bounded by the start page of the next one; the last by the page count.
following_pages = [entry[2] for entry in toc[1:]] + [len(pdf_document)]

for i, (entry, end_page) in enumerate(zip(toc, following_pages)):
    title, start_page = entry[1], entry[2]
    print(f"{i:2}  {title:{max_title_length}}  {start_page:2}  {end_page:2}")


 0  1 Introduction                                          9  10
 1  1.1 Calcium Imaging                                    10  11
 2  1.1.1 Intracellular calcium dynamics and its modeling  11  13
 3  1.1.2 Calcium indicators                               13  15
 4  1.1.3 Data analysis and its difficulties               15  16
 5  1.2 Neural coding: from firing rate to correlation     16  17
 6  1.2.1 Correlation quantification                       17  18
 7  1.3 Question statement                                 18  19
 8  2 Materials and Methods                                19  19
 9  2.1 Software and hardware                              19  19
10  2.2 Fluorescence signal modeling                       19  20
11  2.2.1 Point process                                    20  22
12  2.2.2 Kernel                                           22  24
13  2.2.3 Exact integration as fast convolution            24  25
14  2.3 Signal characterization and correlation inference  25  25
15  2.3.1 

In [None]:
def extract_image_from_page(page, page_number, title_no):
    """Save the images on ``page`` that lie between TOC heading ``title_no`` and the next one.

    NOTE: this draft has the same name as the two-argument version defined earlier in the
    notebook but a different signature and return value; running this cell replaces it.

    Parameters
    ----------
    page : fitz.Page
        Page to scan. Images are read from the notebook-level ``pdf_document``.
    page_number : int
        0-based page index (fitz convention); file names use the 1-based PDF page number.
    title_no : int
        Index into the notebook-level ``toc`` of the section the images should belong to.

    Returns
    -------
    list[str]
        Names of the image files written (empty when nothing qualifies). An image
        qualifies when its top edge is below the section heading and above the next
        heading; a heading that does not occur on this page imposes no limit.
    """
    images_name = []
    image_list = page.get_images(full=True)
    if not image_list:
        return images_name

    def heading_top(toc_index):
        # y0 of the first hit of the heading on this page, or None when the index is
        # outside the TOC or the heading text is not found on the page.
        if not 0 <= toc_index < len(toc):
            return None
        hits = page.search_for(toc[toc_index][1])
        return hits[0][1] if hits else None

    # The headings do not depend on the image, so look them up once per page.
    title_coor_y1 = heading_top(title_no)
    next_title_coor_y1 = heading_top(title_no + 1)

    for img_index, img_info in enumerate(image_list):
        image_coor_y1 = page.get_image_bbox(img_info)[1]

        if title_coor_y1 is not None and image_coor_y1 <= title_coor_y1:
            continue  # image sits above this section's heading
        if next_title_coor_y1 is not None and image_coor_y1 >= next_title_coor_y1:
            continue  # image sits below the next section's heading

        xref = img_info[0]
        base_image = pdf_document.extract_image(xref)
        image_bytes = base_image["image"]
        # Bytes are written unchanged, so name the file after the format fitz reports.
        extension = base_image.get("ext") or "png"
        # page number starts from 0 in fitz and 1 in pdf
        image_filename = f"image_{page_number + 1}_{img_index + 1}.{extension}"
        with open(image_filename, "wb") as image_file:
            image_file.write(image_bytes)
        images_name.append(image_filename)

    return images_name

In [None]:
# Scratch cell: inspect the image entries of one page and the bounding box of its first image.
test_page = pdf_document.load_page(10)  # fitz index 10
image_list = test_page.get_images(full=True)
#image_list
# Raises IndexError if the page has no images.
a = test_page.get_image_bbox(image_list[0])

# Print each image entry together with its position in the list.
for i, j in enumerate(image_list):
    print('iii:', i, 'jjj:', j)

# Last expression of the cell: show the bounding box computed above.
a

In [None]:
# Locate a TOC title on a page and print its bounding box and the text inside that box.
# Reuses `pdf_document` and `toc` from the loading cell at the top of the notebook instead
# of importing fitz and opening the file a second time (which would leave the first
# handle open and rebind the name used by every other cell).
page_number = 9  # 0-based fitz index, i.e. page 10 of the PDF
title_index = 1  # index into `toc` of the title to look up

# Load the specified page
page = pdf_document.load_page(page_number)

# Title text as stored in the TOC
title = toc[title_index][1]

# search_for returns one rectangle per occurrence; the list is empty when the title text
# is not present on this page.
text_block = page.search_for(title)

if text_block:
    # Use the first occurrence; it holds (x0, y0, x1, y1) of the title on the page.
    title_coordinates = text_block[0]
    print("Title Coordinates:", title_coordinates)

    # Text found inside that rectangle, as a cross-check of the hit.
    title_text = page.get_text("text", clip=title_coordinates)
    print("Title Text:", title_text)
else:
    print(f"Title {title!r} not found on page {page_number + 1}")


In [None]:
# The following version only accepts a header on the FIRST line of the page text instead
# of searching through all lines. NOTE: it has the same name as the search-based version
# defined earlier in the notebook, so running this cell replaces that one.
def separate_header(text):
    """Split a page's text into a leading header line and the remaining body.

    A header is recognised only when ``text`` starts with a line of the form
    "<number> <anything> <number>".

    Returns
    -------
    tuple[str, str]
        ``(header, main_text)``. ``main_text`` has leading/trailing whitespace removed;
        ``header`` is '' when the text does not start with a header line.
    """
    # Define a regex pattern to match the modified header format
    header_pattern = r'^\d+\s.*\s\d+$'  # Matches lines with a number, text, number

    # re.match anchors at the start of the text. re.MULTILINE is required so that `$`
    # can match at the end of the first line; without it `$` only matches at the end of
    # the whole string and a header followed by body text is never detected.
    match = re.match(header_pattern, text, re.MULTILINE)

    if match:
        header = match.group(0)
        header_end_index = match.end()
        main_text = text[header_end_index:].strip()  # Remove header and any leading/trailing whitespace
    else:
        # If no header pattern found, return the entire text as main text
        header = ''
        main_text = text.strip()  # Remove any leading/trailing whitespace

    return header, main_text