# **Nougat** : Neural Optical Understanding for Academic Documents

## Lukas Blecher et al. [Paper](https://arxiv.org/pdf/2308.13418.pdf), [Project](https://facebookresearch.github.io/nougat/)



In [None]:
# Mount Google Drive at /content/drive; a later cell reads its input PDF from under this path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Installing the required libraries

In [None]:
# Install the nougat-ocr package; -q keeps pip's output quiet.
# NOTE(review): the version is unpinned, so the CLI flags used further down may differ between releases — consider pinning.
!pip install nougat-ocr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.2/82.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.5/431.5 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.7/138.7 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.3/276.3 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

### Downloading a sample PDF file

In [None]:
import os
import requests


def get_pdf(pdf_link, output_path="input/nougat.pdf", timeout=30):
    """Download a PDF from a given link and save it to a local file.

    Args:
        pdf_link: URL of the PDF to fetch.
        output_path: Where the downloaded bytes are written. Defaults to
            ``"input/nougat.pdf"``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None. A status message is printed for both the success and the
        failure (non-200 response) case.
    """
    # A timeout keeps the cell from hanging forever on an unresponsive host.
    response = requests.get(pdf_link, timeout=timeout)

    if response.status_code == 200:
        # Make sure the destination folder exists so open() cannot fail on it,
        # regardless of whether the folder-creating statement has been run.
        target_dir = os.path.dirname(output_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # Save the PDF content to the local file
        with open(output_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print("PDF downloaded successfully.")
    else:
        print("Failed to download the PDF.")

# Create the download folder up front; exist_ok makes re-running the cell harmless.
os.makedirs("input", exist_ok=True)

### Downloading model weights

In [None]:
# Fetch the nougat checkpoint and keep the value returned by get_checkpoint;
# nougat_ocr (defined in a later cell) forwards it to the CLI via --checkpoint.
from nougat.utils.checkpoint import get_checkpoint
CHECKPOINT = get_checkpoint('nougat')

downloading nougat checkpoint version 0.1.0-small to path nougat


config.json: 100%|██████████| 557/557 [00:00<00:00, 780kb/s]
pytorch_model.bin: 100%|██████████| 956M/956M [00:58<00:00, 17.1Mb/s]
special_tokens_map.json: 100%|██████████| 96.0/96.0 [00:00<00:00, 161kb/s]
tokenizer.json: 100%|██████████| 2.04M/2.04M [00:00<00:00, 18.4Mb/s]
tokenizer_config.json: 100%|██████████| 106/106 [00:00<00:00, 263kb/s]


### Writing inference functions

In [None]:
import os
import re
import subprocess
import uuid

import requests

def nougat_ocr(file_name):
    """Run the nougat OCR command-line tool on the given PDF file.

    The CLI is asked to write into the ``output`` directory and is given the
    module-level ``CHECKPOINT`` through ``--checkpoint``.

    Args:
        file_name: Path of the PDF to process.

    Raises:
        RuntimeError: If the ``nougat`` process exits with a non-zero status.
            The captured stderr is included in the message so the failure is
            visible here rather than surfacing later as a missing output file.
    """
    cli_command = [
        'nougat',
        '--out', 'output',
        # NOTE(review): the literal 'pdf' token precedes the file path —
        # confirm against the installed nougat CLI's positional arguments.
        'pdf', file_name,
        '--checkpoint', CHECKPOINT,
        '--markdown'
    ]
    result = subprocess.run(
        cli_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    # The output is captured, so a failure would otherwise pass unnoticed.
    if result.returncode != 0:
        raise RuntimeError(
            f"nougat exited with status {result.returncode} for {file_name!r}:\n"
            f"{result.stderr}"
        )


def paper_read(pdf_file_name):
    """Run nougat OCR on a PDF and return the text of the generated file.

    Args:
        pdf_file_name: Path of the PDF to convert.

    Returns:
        The contents of ``output/<stem>.mmd``, where ``<stem>`` is the PDF's
        file name without its directory and without its final extension.
    """
    nougat_ocr(pdf_file_name)

    # Derive the stem from the path itself instead of slicing off a fixed
    # four characters, so the extension length does not matter.
    file_name = os.path.splitext(os.path.basename(pdf_file_name))[0]
    with open(f'output/{file_name}.mmd', 'r', encoding='utf-8') as file:
        content = file.read()

    return content


In [None]:
# OCR one PDF from the mounted Drive and keep the text of the generated .mmd file.
# NOTE(review): absolute, user-specific path — adjust before re-running elsewhere.
content = paper_read('/content/drive/My Drive/AI/Franc/dataset/D1/raw/C24.pdf')

In [None]:
# Display the OCR text returned by paper_read.
content

'## Chapter 2 Increasing VO2max\n\nMost runners and running coaches believe that it is essential to build a base of strength, aerobic capacity, and fitness prior to embarking on a rigorous training program. A popular conception is that this base should include gradually increasing distance, most of which is conducted at easy to moderate tempos. It is generally believed that a runner\'s body is not yet ready for high-quality work during an early, base portion of the training year and thus must be gradually acclimated to higher volume and intensity. The easy running is thought to provide a foundation of strength and serve as an upgrade of aerobic fitness that helps smooth the transition into higher-quality effort.[1] However, evidence shows that this traditional approach and its alleged benefits may not be optimal for improving VO2max and running-specific strength.\n\n## 3 Weaknesses of Traditional Approaches to Base Training\n\nIn their book _Better Training for Distance Runners_, David

In [None]:
# Install transformers, imported in a later cell for the LED tokenizer and model.
!pip install transformers



In [None]:
# Download the en_core_web_lg spaCy model that spacy.load("en_core_web_lg") loads below.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.6.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
from typing import List

import spacy

# Module-level spaCy pipeline shared by the helper functions in this cell.
# Alternative kept for reference: nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_lg")


# def remove_stop_words(text: str) -> str:
#     doc = nlp(text)
#     text_parts = [token.text for token in doc if not token.is_stop]
#     return " ".join(text_parts)


def split_sentences(text: str) -> List[str]:
    """Return the sentences of ``text`` as plain strings.

    Sentence boundaries come from the module-level spaCy pipeline ``nlp``.
    """
    return [span.text for span in nlp(text).sents]


def group_sentences_semantically(sentences: List[str], threshold: float) -> List[str]:
    """Merge consecutive sentences into segments by similarity.

    Each sentence is parsed with the module-level ``nlp`` pipeline. A segment
    starts at an anchor sentence; every following sentence whose similarity to
    that anchor is at least ``threshold`` is appended to the segment. The
    first sentence that falls below the threshold closes the segment and
    becomes the anchor of the next one.

    Args:
        sentences: Sentences in document order.
        threshold: Minimum ``anchor.similarity(candidate)`` for a sentence to
            stay in the current segment.

    Returns:
        The segments, each one the space-joined text of its sentences. An
        empty input yields an empty list.
    """
    # Guard: the anchor lookup below would raise IndexError on an empty list.
    if not sentences:
        return []

    docs = [nlp(sentence) for sentence in sentences]
    segments: List[str] = []

    anchor = docs[0]
    segment = [sentences[0]]

    for sentence, doc in zip(sentences[1:], docs[1:]):
        # Similarity is always measured against the segment's first sentence,
        # not against the immediately preceding one.
        if anchor.similarity(doc) >= threshold:
            segment.append(sentence)
        else:
            segments.append(" ".join(segment))
            anchor = doc
            segment = [sentence]

    # The segment still being built always holds at least one sentence here.
    segments.append(" ".join(segment))

    return segments


def split_text(text: str, threshold: float = 0.9) -> List[str]:
    """Split ``text`` into sentences and group them into similar segments.

    Args:
        text: The text to segment.
        threshold: Similarity threshold forwarded to
            ``group_sentences_semantically``. Defaults to ``0.9``.

    Returns:
        The list of segments produced by ``group_sentences_semantically``.
    """
    sentences = split_sentences(text)

    return group_sentences_semantically(sentences, threshold)


In [None]:
# Segment the OCR text into similarity-grouped chunks.
chunks = split_text(content)

In [None]:
# Report how many chunks were produced and the character length of the longest one.
print(len(chunks))
max_length = max(map(len, chunks))
print(max_length)


118
778


In [None]:
# Print a handful of chunks by index to inspect the segmentation.
for sample_index in (80, 13, 27, 76, 7):
    print(chunks[sample_index])

With their emphasis on increased volume, traditional base periods may actually increase injury rates.


Coach Arthur Lydiard was also a proponent of base periods that keep intensity at a moderate level while gradually increasing mileage.[4]
Although new capillary growth is often considered to be one of the slower adaptive processes associated with endurance training, an interesting finding was that capillaries began to proliferate _around_ muscles even before the sinews exhibited increased concentrations of _intracellular_
It's clear from such research that traditional base training is an inefficient way to build an aerobic base, that is, to expand VO2max.

## 4 Comparing High-Intensity and Traditional Base Training

It might be argued that conventional base training is nonetheless an effective way to boost running strength and thus decrease the risk of subsequent injury during more intense phases of training, but this contention ignores the fact that the strength gained in training is

In [None]:
from typing import List
import torch
import spacy
from transformers import LEDForConditionalGeneration, LEDTokenizer

# Loads the same en_core_web_lg pipeline that an earlier cell already bound to `nlp`.
nlp = spacy.load("en_core_web_lg")


def initial_chunking(text, tokenizer, max_length=256):
    """Greedily pack space-separated words into chunks under a token budget.

    Words are appended to the current chunk for as long as
    ``len(tokenizer.encode(chunk))`` stays below ``max_length``. The word that
    would reach the limit starts the next chunk.

    Args:
        text: Text to split; words are delimited by single spaces.
        tokenizer: Object exposing ``encode(str)`` that returns a sized
            sequence of token ids.
        max_length: Exclusive upper bound on the encoded length of a chunk.

    Returns:
        The list of chunk strings. No chunk starts with a separator space and
        no empty chunk is emitted, so empty ``text`` yields ``[]``.
    """
    words = text.split(' ')
    chunks = []
    chunk = ""

    for word in words:
        # Only join with a space when there is something to join to, so the
        # first word of a chunk is not prefixed with a stray blank.
        candidate = f"{chunk} {word}" if chunk else word

        if len(tokenizer.encode(candidate)) < max_length:
            chunk = candidate
        else:
            # Skip the flush when nothing has been accumulated yet, e.g. when
            # the very first word already reaches the limit.
            if chunk:
                chunks.append(chunk)
            chunk = word

    if chunk:
        chunks.append(chunk)
    return chunks


def split_sentences(text: str) -> List[str]:
    """Break ``text`` into sentence strings using the module-level ``nlp`` pipeline."""
    sentence_texts = []
    for sentence in nlp(text).sents:
        sentence_texts.append(sentence.text)
    return sentence_texts


def group_sentences_with_overlap(sentences: List[str], size: int, overlap: int) -> List[str]:
    """Join sentences into sliding windows that share ``overlap`` sentences.

    Windows start every ``size - overlap`` sentences and contain up to
    ``size`` sentences. The last window may be shorter, so the tail of the
    input is always covered and inputs shorter than ``size`` still produce
    one group.

    Args:
        sentences: Sentences in document order.
        size: Maximum number of sentences per group; must be positive.
        overlap: Number of sentences shared by consecutive groups; must be
            smaller than ``size`` so that the window advances.

    Returns:
        The groups, each one the space-joined text of its sentences. An empty
        input yields an empty list.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap >= size``.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    if overlap >= size:
        # A non-positive step would never advance the window.
        raise ValueError("overlap must be smaller than size")

    step = size - overlap
    total = len(sentences)
    groups = []

    for start in range(0, total, step):
        groups.append(" ".join(sentences[start:start + size]))
        # Stop once a window has reached the final sentence; later windows
        # would only repeat sentences that are already covered.
        if start + size >= total:
            break

    return groups


def summarize(chunk, model, tokenizer, max_length):
    """Generate a summary of ``chunk`` with a seq2seq model.

    Args:
        chunk: Text to summarize.
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Callable returning an object with ``input_ids`` when given
            the text and ``return_tensors="pt"``; also provides
            ``batch_decode``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        A one-element list whose item is the list of strings returned by
        ``tokenizer.batch_decode``. The nesting is kept for compatibility.
    """
    # Follow the model's device instead of assuming CUDA is available.
    input_ids = tokenizer(chunk, return_tensors="pt").input_ids.to(model.device)

    # Global attention is enabled for the first token only.
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1

    sequences = model.generate(
        input_ids,
        global_attention_mask=global_attention_mask,
        max_length=max_length,
    )
    # NOTE(review): decoding keeps special tokens in the text; consider
    # skip_special_tokens=True if callers do not rely on them.
    summary = tokenizer.batch_decode(sequences)

    return [summary]

# Load the LED tokenizer and model onto the GPU.
# Checkpoints noted for this step:
#   allenai/led-large-16384-arxiv
#   patrickvonplaten/led-large-16384-pubmed  (the one loaded here)
LED_CHECKPOINT = 'patrickvonplaten/led-large-16384-pubmed'

longformer_tokenizer = LEDTokenizer.from_pretrained(LED_CHECKPOINT)
longformer_model = LEDForConditionalGeneration.from_pretrained(LED_CHECKPOINT)
longformer_model = longformer_model.to("cuda")

# Build overlapping sentence groups and summarize only the first one.
sentences = split_sentences(content)
groups = group_sentences_with_overlap(sentences, 5, 2)
first_group = groups[0]
chunk = summarize(first_group, longformer_model, longformer_tokenizer, 1024)


In [None]:
# Report how many sentence groups exist and the character length of the longest one.
print(len(groups))
max_length = max(map(len, groups))
print(max_length)


45
1268


In [None]:
# Display the summary generated for the first sentence group.
chunk

[['</s><s><s> easy running is thought to provide a foundation of strength and serve as an upgrade of aerobic fitness that helps smooth the transition into higher-quality effort. \n however, evidence shows that this traditional approach and its alleged benefits may not be optimal for improving VO2max and running-specific strength. in this article \n, we review the literature on the benefits of easy running in terms of increasing running strength, aerobic capacity, and fitness prior to embarking on a rigorous training program. </s>']]