In [22]:
import requests
from bs4 import BeautifulSoup
import json
import re

# Page to scrape. The commented-out lines are alternative targets.
url = 'https://en.wikipedia.org/wiki/Pakistan'
#url = 'https://www.youtube.com/results?search_query=integrate+python+with+html'
#url = 'https://www.w3schools.com/python/'

# requests has no default timeout; without one an unresponsive server would
# block this script forever.
response = requests.get(url, timeout=30)
# Raise on 4xx/5xx responses so an error page is never parsed and saved as if
# it were the requested document.
response.raise_for_status()

# Parse the downloaded HTML with the standard-library based parser.
soup = BeautifulSoup(response.text, 'html.parser')

def filter_links(href):
    """Tell whether *href* starts with ``'http'``.

    A falsy *href* (``None`` or an empty string) is returned unchanged;
    otherwise the result is the boolean from ``href.startswith('http')``.
    """
    if not href:
        return href
    return href.startswith('http')

# href of every <a>; anchors without an href contribute None (null in JSON).
links = [link.get('href') for link in soup.find_all('a')]
# Restrict the search to <img> tags that carry a src attribute: indexing
# image['src'] on a tag without one raises KeyError and aborts the scrape.
images = [image['src'] for image in soup.find_all('img', src=True)]
# Whitespace-trimmed text of every <p>.
paragraphs = [paragraph.text.strip() for paragraph in soup.find_all('p')]
# One entry per <table>: a list of rows, each row being the list of stripped
# texts of its <td>/<th> cells.
tables = [
    [
        [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        for row in table.find_all('tr')
    ]
    for table in soup.find_all('table')
]

# One entry per <ul>: the stripped text of each <li> found inside it.
lists = [
    [item.text.strip() for item in ul.find_all('li')]
    for ul in soup.find_all('ul')
]

# One record per <form>: its action, its method, and a single-pair
# {name: value} dict for every <input> inside it.
forms = []
for form_tag in soup.find_all('form'):
    fields = []
    for field in form_tag.find_all('input'):
        fields.append({field.get('name'): field.get('value')})
    forms.append({
        "action": form_tag.get('action'),
        "method": form_tag.get('method'),
        "inputs": fields,
    })

# One record per <select>: its name attribute and the stripped text of each
# <option> it contains.
options = [
    {
        "name": select.get('name'),
        "options": [choice.text.strip() for choice in select.find_all('option')],
    }
    for select in soup.find_all('select')
]

# Stripped text of every <button>.
buttons = [button.text.strip() for button in soup.find_all('button')]

# Stripped text of every <label>.
labels = [label.text.strip() for label in soup.find_all('label')]

# Heading texts grouped by level. HTML defines six heading levels and range()
# excludes its end value, so the end must be 7 for <h6> to be collected too.
h_tags = {f"h{i}": [tag.text.strip() for tag in soup.find_all(f'h{i}')] for i in range(1, 7)}

# Each <meta> tag kept as its full HTML markup string.
meta_tags = [str(tag) for tag in soup.find_all('meta')]


# Gather everything that was extracted into one JSON-serializable mapping.
data = dict(
    headings=h_tags,
    links=links,
    images=images,
    paragraphs=paragraphs,
    tables=tables,
    lists=lists,
    forms=forms,
    options=options,
    buttons=buttons,
    labels=labels,
    meta_tags=meta_tags,
)

# Serialize with non-ASCII characters left as-is (ensure_ascii=False) and
# write the result out as UTF-8.
with open('scraped_data.json', 'w', encoding='utf-8') as file:
    file.write(json.dumps(data, ensure_ascii=False, indent=4))

# Count how many times each tag name occurs in the parsed document; keys end
# up in order of first appearance.
tag_counts = {}
for element in soup.find_all():
    name = element.name
    if name in tag_counts:
        tag_counts[name] += 1
    else:
        tag_counts[name] = 1

# Report one "name: count" line per tag.
print("Tag Counts:")
for name, total in tag_counts.items():
    print(f"{name}: {total}")


Tag Counts:
html: 1
head: 1
meta: 20
title: 1
script: 5
link: 824
body: 1
a: 6383
div: 625
header: 2
nav: 11
input: 8
label: 6
span: 3984
button: 17
ul: 312
li: 2199
img: 221
form: 1
h2: 16
main: 1
h1: 1
style: 25
p: 174
table: 53
tbody: 53
tr: 308
td: 408
b: 712
i: 755
th: 239
sup: 859
br: 124
audio: 1
source: 6
track: 12
abbr: 41
h3: 35
figure: 55
figcaption: 55
h4: 14
video: 1
blockquote: 3
cite: 691
small: 39
map: 1
area: 8
h5: 3
ol: 3
bdi: 177
q: 99
code: 8
dl: 1
dd: 1
noscript: 1
footer: 1
