## Matnni tozalash va tahrirlash

### HTML teglari, tinish belgilari hamda raqamlarni matndan olib tashlash

In [48]:
import re

text = "<p>Bu mening bosh <b>sahifam</b> unda matn hamda 12345... kabi raqamlar bor!!!</p>"

# Drop HTML tags: the lazy quantifier ends each match at the nearest '>'
text = re.compile(r'<.*?>').sub('', text)

text

'Bu mening bosh sahifam unda matn hamda 12345... kabi raqamlar bor!!!'

In [49]:
# Remove punctuation: delete every character that is neither a word
# character (\w) nor whitespace (\s)
text = re.compile(r'[^\w\s]').sub('', text)

text

'Bu mening bosh sahifam unda matn hamda 12345 kabi raqamlar bor'

In [50]:
# Remove digits from the text.
# The pattern is a raw string so that "\d" reaches the regex engine as-is
# instead of being treated as an (invalid) Python string escape.
text = re.sub(r'\d+', '', text)

text

'Bu mening bosh sahifam unda matn hamda  kabi raqamlar bor'

In [51]:
# Sample containing repeated spaces and several consecutive newlines
text = 'Bu mening    yangi matnim unda \nbir necha \n\nkabi belgilar\n\n\nbor'

# Show the raw sample under a heading line
print('Bu mening asosiy matnim:', text, sep='\n')

Bu mening asosiy matnim:
Bu mening    yangi matnim unda 
bir necha 

kabi belgilar


bor


In [52]:
# Collapse every run of whitespace (spaces, newlines) into a single space.
# The pattern is a raw string so that "\s" reaches the regex engine as-is
# instead of being treated as an (invalid) Python string escape.
cleaned_text = re.sub(r'\s+', ' ', text)
cleaned_text

'Bu mening yangi matnim unda bir necha kabi belgilar bor'

## Matnni to'liq kichik harflarga o'tkazish

In [53]:
text = "Bu MENing TurLI KatTaLIKDAgi harflARGA eGA matniM"

# Normalise the mixed-case sample to all-lowercase
cleaned_text = str.lower(text)
cleaned_text

'bu mening turli kattalikdagi harflarga ega matnim'

### Emaillar bilan ishlash

In [54]:
text = "Biz bilan bog'lanish uchun support@example.com yoki miu@gmail.com ga yozing"

# Collect every e-mail-like token: local part, '@', domain, and a
# top-level domain of at least two letters
cleaned_text = re.compile(
    r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
).findall(text)
cleaned_text

['support@example.com', 'miu@gmail.com']

### Telefon raqamlari bilan ishlash

In [55]:
text = "Mening telefon raqamim +998 88 008 45 06 dir"

# Find phone-number-like sequences: optional '+', then digit groups that may
# be separated by '-', '.' or whitespace; the second group may be parenthesised
cleaned_text = re.compile(
    r"\+?\d{1,3}[-.\s]?\(?\d{1,3}\)?[-.\s]?\d{2,4}[-.\s]?\d{2,4}[-.\s]?\d{0,4}"
).findall(text)
cleaned_text

['+998 88 008 45 06']

### Lotin alifbosida mavjud bo'lmagan belgilarni olib tashlash

In [56]:
# Sample mixing ASCII with Latin-extended, Cyrillic and Japanese characters.
# (The literal had been corrupted into mojibake by a wrong-encoding
# round-trip; the intended characters are restored here.)
text = "Hello Wørld! Привет! こんにちは!"

# Delete every character outside the 7-bit ASCII range (0x00-0x7F)
cleaned_text = re.sub(r"[^\x00-\x7F]", '', text)
cleaned_text

'Hello Wrld! ! !'

### HTML faylni yuklash

In [57]:
# Name of the HTML file that the following cell opens and reads
file = "mening_html_sahifam.html"

In [58]:
# Read the whole HTML page into memory as UTF-8 text.
# The handle gets its own name so that `file` keeps holding the path; binding
# the handle to `file` would leave a closed file object behind and make this
# cell fail with a TypeError when it is run a second time.
with open(file, "r", encoding="utf-8") as html_file:
    html_content = html_file.read()
html_content

'<!DOCTYPE html>\n<html lang="uz">\n<head>\n    <meta charset="UTF-8" />\n    <meta http-equiv="X-UA-Compatible" content="IE=edge" />\n    <meta name="viewport" content="width=device-width, initial-scale=1.0" />\n    <meta name="author" content="kunu.uz">\n    <meta name="description" content="23 sentabrda Toshkent havoning ifloslanish darajasi bo‚Äòyicha dunyoning yirik shaharlari orasida 6-o‚Äòrinni egallab turibdi.">\n<meta name="keywords" content="Yangiliklar,xabarlar,eng so‚Äònggi yangiliklar, Dunyo Sport">\n<meta property="og:title" content="Toshkent havosining iflosligi bo‚Äòyicha dunyoda 6-o‚Äòringa tushdi">\n<meta property="og:description" content="23 sentabrda Toshkent havoning ifloslanish darajasi bo‚Äòyicha dunyoning yirik shaharlari orasida 6-o‚Äòrinni egallab turibdi.">\n<meta property="og:type" content="article">\n<meta property="og:url" content="https://kun.uz/news/2025/09/23/toshkent-havosining-iflosligi-boyicha-dunyoda-6-oringa-tushdi">\n<meta property="og:locale" con

In [59]:
from bs4 import BeautifulSoup
import re

In [60]:
def cleaning_and_preparing_text(file):
    """Read an HTML file and return its text content, cleaned and lowercased.

    The file is opened as UTF-8 and parsed with BeautifulSoup's
    ``"html.parser"``. The extracted text then goes through these steps,
    in order:

    1. remove every character that is neither a word character (``\\w``)
       nor whitespace (note that ``\\w`` includes the underscore, so
       underscores are kept);
    2. remove all runs of digits;
    3. collapse every run of whitespace into a single space, lowercase
       the result and strip leading/trailing whitespace.

    Parameters
    ----------
    file : str or path-like
        Path of the HTML file; passed directly to ``open()``.

    Returns
    -------
    str
        The cleaned, lowercased, single-spaced text.

    Raises
    ------
    OSError
        Propagated from ``open()`` if the file cannot be opened.
    UnicodeDecodeError
        Propagated from ``read()`` if the content is not valid UTF-8.
    """
    # Load the file; the handle has its own name so the `file` argument
    # is not shadowed inside the function
    with open(file, 'r', encoding='utf-8') as html_file:
        html_content = html_file.read()

    # Parse the HTML markup with Beautiful Soup
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract the text, joining the individual pieces with a space
    text = soup.get_text(separator=' ', strip=True)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits and numbers (raw string avoids an invalid "\d" escape)
    text = re.sub(r'\d+', '', text)

    # Collapse extra spaces and line breaks (raw string avoids an invalid
    # "\s" escape), then lowercase and trim
    text = re.sub(r'\s+', ' ', text).lower().strip()

    return text

In [62]:
# Name of the HTML file to run through the cleaning function
file = "regex.html"

# Pass the file to cleaning_and_preparing_text and keep the returned text
cleaned_text = cleaning_and_preparing_text(file)

In [63]:
# Display the text returned by cleaning_and_preparing_text for "regex.html"
cleaned_text

'python regex tutorials references exercises certificates menu search field see more sign in get certified upgrade for teachers spaces get certified upgrade for teachers spaces my wschools tutorials references exercises certificates spaces get certified plus academy all our services logout tutorials tutorials filter input html and css learn html tutorial reference learn css tutorial reference learn rwd tutorial learn bootstrap overview learn wcss tutorial reference learn sass tutorial reference learn colors tutorial reference learn icons tutorial reference learn svg tutorial reference learn canvas tutorial reference learn graphics tutorial learn utf and emojis reference learn how to tutorial data analytics learn ai tutorial learn generative ai tutorial learn chatgpt tutorial learn chatgpt tutorial learn google bard tutorial learn machine learning tutorial learn dsa tutorial learn data science tutorial learn numpy tutorial learn pandas tutorial learn scipy tutorial learn matplotlib tuto