In [158]:
from bs4 import BeautifulSoup

In [159]:
html = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>First HTML Page</title>
</head>
<body>
  <div id="first">
    <h3 data-example="yes">hi</h3>
    <p>more text.</p>
  </div>
  <ol>
    <li class="special">This list item is special.</li>
    <li class="special">This list item is also special.</li>
    <li>This list item is not special.</li>
  </ol>
  <div data-example="yes">bye</div>
</body>
</html>
"""

In [160]:
html

'\n<!DOCTYPE html>\n<html lang="en">\n<head>\n  <meta charset="UTF-8">\n  <title>First HTML Page</title>\n</head>\n<body>\n  <div id="first">\n    <h3 data-example="yes">hi</h3>\n    <p>more text.</p>\n  </div>\n  <ol>\n    <li class="special">This list item is special.</li>\n    <li class="special">This list item is also special.</li>\n    <li>This list item is not special.</li>\n  </ol>\n  <div data-example="yes">bye</div>\n</body>\n</html>\n'

In [161]:
soup = BeautifulSoup(html, 'html.parser')
soup


<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>First HTML Page</title>
</head>
<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>
</html>

In [162]:
print(soup.body)

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>


In [163]:
print(soup.body.div)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


In [164]:
print(soup.find('div'))

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


In [165]:
print(soup.find_all('div'))

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>, <div data-example="yes">bye</div>]


In [166]:
for element in soup.find_all('div'):
    print(element)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<div data-example="yes">bye</div>


In [167]:
soup.find_all(class_='special')

[<li class="special">This list item is special.</li>,
 <li class="special">This list item is also special.</li>]

In [168]:
soup.find(attrs={'data-example': 'yes'})

<h3 data-example="yes">hi</h3>

In [169]:
soup.find_all(attrs={'data-example': 'yes'})

[<h3 data-example="yes">hi</h3>, <div data-example="yes">bye</div>]

In [170]:
soup.select('div')

[<div id="first">
 <h3 data-example="yes">hi</h3>
 <p>more text.</p>
 </div>,
 <div data-example="yes">bye</div>]

In [171]:
soup.select('#first')

[<div id="first">
 <h3 data-example="yes">hi</h3>
 <p>more text.</p>
 </div>]

In [172]:
soup.select('.special')

[<li class="special">This list item is special.</li>,
 <li class="special">This list item is also special.</li>]

In [173]:
soup.select('[data-example]')

[<h3 data-example="yes">hi</h3>, <div data-example="yes">bye</div>]

# teksto isgavimas

In [174]:
element = soup.select('.special')[1]
print(element.get_text())


This list item is also special.


In [175]:
ol_li = soup.select('ol li')
ol_li

[<li class="special">This list item is special.</li>,
 <li class="special">This list item is also special.</li>,
 <li>This list item is not special.</li>]

In [176]:
ol_li[0].get_text()

'This list item is special.'

In [177]:
for item in ol_li:
    print(item.get_text())

This list item is special.
This list item is also special.
This list item is not special.


In [178]:
items = []
for item in ol_li:
    items.append(item.get_text())
items

['This list item is special.',
 'This list item is also special.',
 'This list item is not special.']

In [179]:
hi_bye = soup.select('[data-example]')
for item in hi_bye:
    print(item.get_text())

hi
bye


In [180]:
for item in hi_bye:
    print(item.attrs['data-example'])

yes
yes


In [181]:
for item in hi_bye:
    print(item.name, item.attrs['data-example'])

h3 yes
div yes


In [182]:
soup.find('div')['id']

'first'

In [183]:
soup.find('div').attrs['id']

'first'

In [184]:
soup.div.contents

['\n', <h3 data-example="yes">hi</h3>, '\n', <p>more text.</p>, '\n']

In [185]:
li = soup.find('li')
li

<li class="special">This list item is special.</li>

In [186]:
li.next_sibling.next_sibling

<li class="special">This list item is also special.</li>

In [187]:
li.next_element

'This list item is special.'

In [188]:
[element in li.parent.next_elements]

[True]

In [189]:
for element in li.parent.next_elements:
    print(element)



<li class="special">This list item is special.</li>
This list item is special.


<li class="special">This list item is also special.</li>
This list item is also special.


<li>This list item is not special.</li>
This list item is not special.




<div data-example="yes">bye</div>
bye








In [190]:
li.parent.parent

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>

In [191]:
li.find_next_siblings()

[<li class="special">This list item is also special.</li>,
 <li>This list item is not special.</li>]

In [192]:
li.find_next_siblings(class_='')

[<li>This list item is not special.</li>]

In [193]:
li.find_parent().find_previous_sibling().attrs['id']

'first'

In [194]:
soup.body.next_element.next_element.next_element.next_element.attrs['data-example']

'yes'

In [195]:
import requests 

In [196]:
google = requests.get('https://google.com/')
google

<Response [200]>

In [197]:
google.status_code

200

In [198]:
google.content

b'<!DOCTYPE html><html lang="lt" dir="ltr"><head><style nonce="YsHj0J0Mzqf0LUIQzFCKEQ">\na, a:link, a:visited, a:active, a:hover {\n  color: #1a73e8;\n  text-decoration: none;\n}\nbody {\n  font-family: Roboto,RobotoDraft,Helvetica,Arial,sans-serif;\n  text-align: center;\n  -ms-text-size-adjust: 100%;\n  -moz-text-size-adjust: 100%;\n  -webkit-text-size-adjust: 100%;\n}\n.box {\n  border: 1px solid #dadce0;\n  box-sizing: border-box;\n  border-radius: 8px;\n  margin: 24px auto 5px auto;\n  max-width: 800px;\n  padding: 24px;\n}\nh1 {\n  color: #2c2c2c;\n  font-size: 24px;\n  hyphens: auto;\n  margin: 24px 0;\n}\n.icaCallout {\n  background-color: #f8f9fa;\n  padding: 12px 16px;\n  border-radius: 10px;\n  margin-bottom: 10px;\n}\np, .sub, .contentText, .icaCallout {\n  color: #5f6368;;\n  font-size: 14px;\n  line-height: 20px;\n  letter-spacing: 0.2px;\n  text-align: left;\n}\n.signin {\n  text-align: right;\n}\n.saveButtonContainer,\n.saveButtonContainerNarrowScreen {\n  width: 100%;\

In [199]:
if google.status_code == 200:
    google_soup = BeautifulSoup(google.content, "html.parser")
else:
    google_soup = google.status_code

In [200]:
google_soup.body.select('.box')[0].select_one('img').attrs['src']

'//www.gstatic.com/images/branding/googlelogo/1x/googlelogo_color_68x28dp.png'

In [201]:
python = requests.get('https://python.org')
python.status_code

200

In [202]:
pysuop = BeautifulSoup(python.content, "html.parser")
pysuop.body

<body class="python home" id="homepage">
<div id="touchnav-wrapper">
<div class="do-not-print" id="nojs">
<p><strong>Notice:</strong> While JavaScript is not essential for this website, your interaction with the content will be limited. Please turn JavaScript on for the full experience. </p>
</div>
<!--[if lte IE 8]>
            <p>
                <strong>Notice:</strong> Your browser is <em>ancient</em>. Please
                <a href="http://browsehappy.com/">upgrade to a different browser</a> to experience a better web.
            </p>
        </div>
        <![endif]-->
<!-- Sister Site Links -->
<div class="top-bar do-not-print" id="top">
<nav class="meta-navigation container" role="navigation">
<div class="skip-link screen-reader-text">
<a href="#content" title="Skip to content">Skip to content</a>
</div>
<a aria-hidden="true" class="jump-link" href="#python-network" id="close-python-network">
<span aria-hidden="true" class="icon-arrow-down"><span>▼</span></span> Close
        

In [203]:
pymenu = pysuop.select_one('.menu').select('li')
pymenu

[<li class="python-meta current_item selectedcurrent_branch selected">
 <a class="current_item selectedcurrent_branch selected" href="/" title="The Python Programming Language">Python</a>
 </li>,
 <li class="psf-meta">
 <a href="/psf-landing/" title="The Python Software Foundation">PSF</a>
 </li>,
 <li class="docs-meta">
 <a href="https://docs.python.org" title="Python Documentation">Docs</a>
 </li>,
 <li class="pypi-meta">
 <a href="https://pypi.org/" title="Python Package Index">PyPI</a>
 </li>,
 <li class="jobs-meta">
 <a href="/jobs/" title="Python Job Board">Jobs</a>
 </li>,
 <li class="shop-meta">
 <a href="/community-landing/">Community</a>
 </li>]

In [204]:
for item in pymenu:
    link = item.select_one('a')
    # print(f"{link.attrs['title']} = {link.get_text()}")
    if 'title' in link.attrs:
        print(link.attrs['title'], end=" = ")
    print(link.get_text(), end=" goes to ")
    print(link.attrs['href'])


The Python Programming Language = Python goes to /
The Python Software Foundation = PSF goes to /psf-landing/
Python Documentation = Docs goes to https://docs.python.org
Python Package Index = PyPI goes to https://pypi.org/
Python Job Board = Jobs goes to /jobs/
Community goes to /community-landing/


In [205]:
r_prusija = requests.get('https://lt.wikipedia.org/wiki/Pr%C5%ABsija')
if r_prusija.status_code == 200:
    prusija = BeautifulSoup(r_prusija.content, "html.parser")
else:
    prusija = r_prusija.status_code
prusija

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="lt">
<head>
<meta charset="utf-8"/>
<title>Prūsija – Vikipedija</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":[",\t."," \t,"],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","sausio","vasario","kovo","balandžio","gegužės","birželio","liepos","rugpjūčio","rugsėjo","spalio","lapkričio","gruodžio"],"wgRequestId":"ebf83f1b-622a-4496-847d-3022a0d45242","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Prūsija","wgTitle":"Prūsija","wgCurRevisionId":6720439,"wgRevisionId":6720439,"wgArticleId":653,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with non-numeric formatnum arguments","Straipsniai be šaltinių nuo 2004 m. sausio","Straipsniai be šaltinių pažymėti nuo 2020 m. lapkričio","Straipsniai 

In [210]:
prusija_info = {}
#headings = []
for heading in prusija.select('h2'):
    title = heading.select_one('span').get_text()
    print(title)
    content = heading.find_next('p').get_text()
    print(content)
    prusija_info[title] = content
for heading in prusija.select('h3'):
    title = heading.select_one('.mw.headline').get_text()
    content = heading.find_next('p').get_text()
    prusija_info[title] = content

for title, content in prusija_info.items():
    print(f"---{title}---")
    print(f'{content}\n')
    

Geografija
Prūsijos regionas užėmė teritoriją, šiuo metu padalintą tarp trijų valstybių. Šiaurėje jis buvo ribojamas Baltijos jūros, ir krantas čia yra labai sudėtingas, turi keletą didelių lagūnų (Kuršių marios, Aistmarės), nerijų. Rytuose Prūsija ribojosi su Lietuva, pietuose – su Mazovija, pietryčiuose – su Palenke, o vakaruose – su Pomerelija (nuo kurios skyrė Vysla). Pastarasis, dar vadinamas Vakarų Prūsija, gali būti laikoma išplėstinio Prūsijos regiono dalimi.

Istorija
Regionas patyrė labai sudėtingą istoriją, čia du kartus visiškai keitėsi etninė sudėtis.



AttributeError: 'NoneType' object has no attribute 'get_text'

In [208]:
import requests
from bs4 import BeautifulSoup
from random import shuffle

html = requests.get('http://delfi.lt').text
soup = BeautifulSoup(html, "html.parser")

title_tags = soup.select('.CBarticleTitle')
titles = [i.get_text() for i in title_tags]
bad_words = ['COVID', 'mirt', 'NVSC', 'skiep']

first_parts = []
second_parts = []
for title in titles:
     if ':' in title:
         if not any(word in title for word in bad_words):
             splitted = title.split(":")
             first_parts.append(splitted[0])
             second_parts.append(splitted[1])

shuffle(second_parts)

for i in range(len(first_parts)):
    print(first_parts[i], ":", second_parts[i])

Garsiam profesoriui – neigiamas teismo sprendimas :  ekspertai Beno Gudelio retoriką smerkia, bet pripažįsta – vartotojai už tai nebaudžia
Erdoganas :  nerimą kelia duomenys iš Rusijos kaimynių – pokytis staigus ir netikėtas
Paleckio bendražygio išpuolis Antakalnio kapinėse :  darbai, kurių neatlikus, kyla pavojus jūsų sveikatai ir net gyvybei
Šimonytė – apie į skandalą patekusią Bilotaitę :  darbai, kurių neatlikus, kyla pavojus jūsų sveikatai ir net gyvybei
Kvepia žiūrėjimu paromis :  kodėl žmonėms vis sunkiau kam nors paskambinti telefonu
Šildymo, geriamojo ir karšto vandens išlaidų kompensacijos :  spekuliantai jau trina rankomis
Karas Ukrainoje. Zelenskis :  po antrojo smūgio Krymo tiltui bus dar įdomiau
Neįprastas maršrutas :  transliacijų platformos lapkritį atgaivino legendinius serialus bei personažus ir pažėrė naujų filmų
Jei tai žinotumėte, gerai pagalvotumėte, prieš darydamosi ilgalaikį makiažą :  uogienės ir degtinė brangs
Ukrainos ekonomika be paramos neišsivers :  uždrau