# PDF to Text

In [1]:
!pip install pdfplumber



In [2]:
import pdfplumber

In [3]:
file = "sample_pdfs/MutualInformation.pdf"
pdf = pdfplumber.open(file)
parsed_text = "\n\n".join([page.extract_text() for page in pdf.pages])
print(parsed_text)

1
PatternRecognitionLetters
journalhomepage: www.elsevier.com
Fast computation of mutual information in the frequency domain with applications to
global multimodal image alignment
JohanO¨fverstedta,∗∗,JoakimLindblada,NatasˇaSladojea
1 aDepartmentofInformationTechnology,UppsalaUniversity,La¨gerhyddsva¨gen2,75237Uppsala,Sweden
2
0
2 ABSTRACT
 
n
Multimodalimagealignmentistheprocessofﬁndingspatialcorrespondencesbetweenimagesformed
u
J bydiﬀerentimagingtechniquesorunderdiﬀerentconditions, tofacilitateheterogeneousdatafusion
8  andcorrelativeanalysis.Theinformation-theoreticconceptofmutualinformation(MI)iswidelyused
2 as a similarity measure to guide multimodal alignment processes, where most works have focused
  on local maximization of MI that typically works well only for small displacements; this points to
 
]
a need for global maximization of MI, which has previously been computationally infeasible due to
V
the high run-time complexity of existing algorithms. We propose an eﬃcient algo

# Extract all URLs in a website

In [4]:
import requests
from bs4 import BeautifulSoup
from time import time
from tqdm import tqdm

In [5]:
class URLExplorer:

    def __init__(self, base_url, is_interesting_url):
        self.reference_dag = {}
        self.base_url = base_url
        self.unexplored_urls = [base_url]
        self.is_interesting_url = is_interesting_url
        self.timings = {}
        self.failed_urls = []

    def get_referenced_urls(self, url):
        request = requests.get(url)
        soup = BeautifulSoup(request.text, "html.parser")
        return [
            link.get("href")
            for link in soup.find_all("a")
            if link.get("href") and self.is_interesting_url(link.get("href"))
        ]

    def explore_url(self, url):
        if url in self.failed_urls:
            return []

        try:
            start = time()
            referenced_urls = self.get_referenced_urls(url)
            end = time()
            self.timings[url] = end - start

        except:
            self.failed_urls.append(url)
            return []

        self.reference_dag[url] = referenced_urls
        self.unexplored_urls += [
            url for url in referenced_urls if url not in self.reference_dag.keys()
        ]
        if url in self.unexplored_urls:
            self.unexplored_urls.remove(url)

    def explore_full_website(self, limit=None):
        with tqdm() as pbar:
            while self.unexplored_urls:
                n_failed = len(self.failed_urls)
                n_explored = len(self.reference_dag.keys()) + n_failed
                n_unexplored = len(self.unexplored_urls)
                n_total = n_explored + n_unexplored
                text = f"We have currently explored {n_explored}/{n_total} URLs ({n_failed} failures"
                if n_failed > 0:
                    text += f",namely {self.failed_urls[-1]})"
                else:
                    text += ")"
                pbar.set_description(text)
                pbar.update(1)
                self.explore_url(self.unexplored_urls.pop())
                if len(self.reference_dag.keys()) > limit:
                    return

    def number_of_connections(self):
        return sum([len(v) for v in self.reference_dag.values()])

    def __repr__(self):
        return f"{self.base_url} Explorer. {len(self.reference_dag.keys())} nodes explored with {self.number_of_connections()} connections"

In [10]:
def is_interesting_url(url):
    return all(
        [
            url.startswith("https://arxiv.org"),
            "ct?url=" not in url,
            not url.startswith("https://arxiv.org/search"),
            not url.startswith("https://arxiv.org/format"),
        ]
    )


explorer = URLExplorer(
    base_url="https://arxiv.org/", is_interesting_url=is_interesting_url
)
explorer

https://arxiv.org/ Explorer. 0 nodes explored with 0 connections

In [11]:
explorer.explore_full_website(limit=30)
explorer

We have currently explored 30/445 URLs (0 failures): : 49it [00:59,  1.21s/it]


https://arxiv.org/ Explorer. 31 nodes explored with 731 connections

In [12]:
sorted(explorer.timings.items(), key=lambda x: x[1], reverse=True)

[('https://arxiv.org/pdf/2203.03556', 30.036967039108276),
 ('https://arxiv.org/covid19search', 1.6784424781799316),
 ('https://arxiv.org/', 0.9685404300689697),
 ('https://arxiv.org/support/moderation_help', 0.9072434902191162),
 ('https://arxiv.org/corr/subjectclasses', 0.7228176593780518),
 ('https://arxiv.org/about', 0.7123024463653564),
 ('https://arxiv.org/corr', 0.5931897163391113),
 ('https://arxiv.org/help/submit', 0.5829927921295166),
 ('https://arxiv.org/help/endorsement#request', 0.5820391178131104),
 ('https://arxiv.org/help', 0.580359697341919),
 ('https://arxiv.org/login', 0.5691497325897217),
 ('https://arxiv.org/help/contact', 0.568894624710083),
 ('https://arxiv.org/about/people/developers', 0.5512149333953857),
 ('https://arxiv.org/help/policies/privacy_policy', 0.5424740314483643),
 ('https://arxiv.org/help/web_accessibility', 0.5417683124542236),
 ('https://arxiv.org/help/license', 0.5416324138641357),
 ('https://arxiv.org/help/view', 0.5414862632751465),
 ('https:

In [9]:
[print(x) for x in explorer.reference_dag.keys()]

https://arxiv.org/
https://arxiv.org/help/web_accessibility
https://arxiv.org/help/policies/privacy_policy
https://arxiv.org/help/license
https://arxiv.org/help/subscribe
https://arxiv.org/help/contact
https://arxiv.org/help
https://arxiv.org/about
https://arxiv.org/login
https://arxiv.org/user/register?submit=Register+for+the+first+time
https://arxiv.org/user/lost_password
https://arxiv.org/support/moderation_help
https://arxiv.org/licenses/nonexclusive-distrib/1.0/
https://arxiv.org/cookies
https://arxiv.org/help/view
https://arxiv.org/help/email-protection
https://arxiv.org/help/endorsement#request
https://arxiv.org/help/moderation
https://arxiv.org/help/policies/instructions_for_submission
https://arxiv.org/help/policies/submission_agreement
https://arxiv.org/licenses/nonexclusive-distrib/1.0/license.html
https://arxiv.org/help/registerhelp
https://arxiv.org/help/submit
https://arxiv.org/corr/subjectclasses
https://arxiv.org/corr
https://arxiv.org/archive/cs
https://arxiv.org/covid

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]