In [1]:
"""
gazpacho is a web scraping library that replaces BeautifulSoup + requests for most projects. It combines
HTTP requests and HTML parsing in a single package, and supports partial string matching.

https://pypi.org/project/gazpacho/
"""
from gazpacho import get, Soup

In [2]:
"""
scrape.world is a tutorial website created by the course instructor, who is also the creator of gazpacho
"""
url = "https://scrape.world/soup"
# Retrieve the raw HTML from the tutorial site.
# Roughly what requests.get(url).text would give you -- NOTE(review): confirm against the gazpacho docs.
html = get(url)
# Wrap the HTML in a Soup object so it can be searched with .find()
soup = Soup(html)

In [5]:
"""
Here, we are attempting to retrieve all links associated with figures of speech.
Inspection of the site shows that all figures-of-speech links live in the div
with CSS class 'section-speech'.
"""
# Locate the <div> whose class matches 'section-speech'; the figures-of-speech links are nested in it.
figures_of_speech = soup.find(
    "div",
    {"class": "section-speech"},
)

In [12]:
"""
We return all <a> tags as a list.

NOTE: by default, find() returns a single tag when only one element matches;
otherwise, it returns a list.
"""

# mode='all' makes find() return a list no matter how many <a> tags match.
# With the default mode a lone match comes back as a bare tag rather than a
# one-element list, which would break the indexing and iteration in later cells.
fos_link_tags = figures_of_speech.find('a', mode='all')
fos_link_tags

[<a href="https://en.wikipedia.org/wiki/Alphabet_soup_(linguistics)" title="Alphabet soup (linguistics)">Alphabet soup</a>,
 <a href="https://en.wikipedia.org/wiki/Alphabet" title="Alphabet">alphabet</a>,
 <a href="https://en.wiktionary.org/wiki/from_soup_to_nuts" class="extiw" title="wikt:from soup to nuts">From soup to nuts</a>,
 <a href="#cite_note-19">[19]</a>,
 <a href="https://en.wikipedia.org/wiki/Abiogenesis" title="Abiogenesis">Primordial soup</a>,
 <a href="https://en.wikipedia.org/wiki/Soup_kitchen" title="Soup kitchen">Soup kitchen</a>,
 <a href="https://en.wikipedia.org/wiki/Stone_soup" class="mw-redirect" title="Stone soup">Stone soup</a>,
 <a href="https://en.wikipedia.org/wiki/Souperism" title="Souperism">Souperism</a>,
 <a href="https://en.wikipedia.org/wiki/Great_Famine_(Ireland)" title="Great Famine (Ireland)">Irish Great
                     Famine</a>,
 <a href="https://en.wikipedia.org/wiki/Tag_soup" title="Tag soup">Tag soup</a>,
 <a href="https://en.wikipedia.or

In [9]:
"""
We can retrieve specific attributes of each element via its attrs dictionary.
So, for example, the first element's link can be retrieved with the below
"""

# attrs is a dict of the tag's HTML attributes; 'href' holds the link target
fos_link_tags[0].attrs["href"]

'https://en.wikipedia.org/wiki/Alphabet_soup_(linguistics)'

In [11]:
"""
So we can grab all of the links via the below
"""
# Pull the href out of every anchor tag, keeping the original order.
fos_links = [anchor.attrs["href"] for anchor in fos_link_tags]
fos_links

['https://en.wikipedia.org/wiki/Alphabet_soup_(linguistics)',
 'https://en.wikipedia.org/wiki/Alphabet',
 'https://en.wiktionary.org/wiki/from_soup_to_nuts',
 '#cite_note-19',
 'https://en.wikipedia.org/wiki/Abiogenesis',
 'https://en.wikipedia.org/wiki/Soup_kitchen',
 'https://en.wikipedia.org/wiki/Stone_soup',
 'https://en.wikipedia.org/wiki/Souperism',
 'https://en.wikipedia.org/wiki/Great_Famine_(Ireland)',
 'https://en.wikipedia.org/wiki/Tag_soup',
 'https://en.wikipedia.org/wiki/HTML']

In [14]:
# Inspect every attribute on the first tag (a plain dict of attribute name -> value)
fos_link_tags[0].attrs

{'href': 'https://en.wikipedia.org/wiki/Alphabet_soup_(linguistics)',
 'title': 'Alphabet soup (linguistics)'}

In [15]:
"""
And all titles via the below
"""
# .get() is used instead of [] because some anchors (e.g. the citation link) carry no title.
fos_titles = [anchor.attrs.get("title") for anchor in fos_link_tags]
fos_titles

['Alphabet soup (linguistics)',
 'Alphabet',
 'wikt:from soup to nuts',
 None,
 'Abiogenesis',
 'Soup kitchen',
 'Stone soup',
 'Souperism',
 'Great Famine (Ireland)',
 'Tag soup',
 'HTML']

In [17]:
"""
Further, we can clean out the internal citation link with string matching
"""
# Keep only absolute web URLs. Checking the prefix (rather than testing whether
# 'https' appears anywhere in the string) avoids false positives such as a
# relative link with 'https' in its path or query string, and .get() skips
# anchors that have no href at all instead of raising KeyError.
cleaned_links = [
    href
    for href in (tag.attrs.get("href", "") for tag in fos_link_tags)
    if href.startswith(("http://", "https://"))
]
cleaned_links

['https://en.wikipedia.org/wiki/Alphabet_soup_(linguistics)',
 'https://en.wikipedia.org/wiki/Alphabet',
 'https://en.wiktionary.org/wiki/from_soup_to_nuts',
 'https://en.wikipedia.org/wiki/Abiogenesis',
 'https://en.wikipedia.org/wiki/Soup_kitchen',
 'https://en.wikipedia.org/wiki/Stone_soup',
 'https://en.wikipedia.org/wiki/Souperism',
 'https://en.wikipedia.org/wiki/Great_Famine_(Ireland)',
 'https://en.wikipedia.org/wiki/Tag_soup',
 'https://en.wikipedia.org/wiki/HTML']