In [23]:
import httpx
import json
import os

In [24]:
docs_config_dir = "doc_base/RuBQ_2.0/"
test_queries_path = os.path.join(docs_config_dir, 'RuBQ_2.0_test.json')
dev_queries_path = os.path.join(docs_config_dir, 'RuBQ_2.0_dev.json')
paragraphs_path = os.path.join(docs_config_dir, 'RuBQ_2.0_paragraphs.json')

In [25]:
with open(test_queries_path, 'r', encoding='utf-8') as f:
    test_queries = json.load(f)

with open(dev_queries_path, 'r', encoding='utf-8') as f:
    dev_queries = json.load(f)

with open(paragraphs_path, 'r', encoding='utf-8') as f:
    paragraphs = json.load(f)

In [26]:
test_queries

[{'uid': 0,
  'question_text': 'Что может вызвать цунами?',
  'query': 'SELECT ?answer \nWHERE {\n  wd:Q8070 wdt:P828 ?answer\n}',
  'answer_text': 'Землетрясение',
  'question_uris': ['http://www.wikidata.org/entity/Q8070'],
  'question_props': ['wdt:P828'],
  'answers': [{'type': 'uri',
    'label': 'землетрясение',
    'value': 'http://www.wikidata.org/entity/Q7944',
    'wd_names': {'ru': ['землетрясение', 'җир тетрәве'],
     'en': ['seism',
      'earthquake',
      'seismic activity',
      'fore shocks',
      'tremor',
      'earthquakes',
      'earth quake',
      'earthtemblor',
      'foreshock',
      'aftershock',
      'quake',
      'temblor',
      'earth temblor',
      'foreshocks',
      'after shocks',
      'earth quakes',
      'after shock',
      'earthtremor',
      'convulsion',
      'earth tremor',
      'shock',
      'fore shock',
      'aftershocks']},
    'wp_names': ['землетрясениям']},
   {'type': 'uri',
    'label': 'метеорит',
    'value': 'http://

In [27]:
paragraphs

[{'uid': 0,
  'ru_wiki_pageid': 58311,
  'text': 'ЦСКА — советский и российский профессиональный хоккейный клуб из Москвы, выступающий в Континентальной хоккейной лиге. Основан в 1946 году под названием ЦДКА (Центральный дом Красной Армии). В 1951 году переименован в ЦДСА (Центральный дом Советской Армии), а в 1954 в ЦСК МО (Центральный спортивный клуб Министерства обороны), под которым выступал до 1959 года, и с тех пор носит название ЦСКА (Центральный Спортивный Клуб Армии).'},
 {'uid': 1,
  'ru_wiki_pageid': 58311,
  'text': 'В первом сезоне в составе Континентальной хоккейной лиги ЦСКА выиграл дивизион Тарасова, но в плей-офф с трудом обыграл «Ладу» (3-2 по сумме встреч) и всухую проиграл «Динамо» (0-3). В конце сезона тренерский тандем Быков-Захаркин покинул команду, аргументировав своё решение желанием сосредоточиться на работе в сборной России, однако уже через несколько недель подписали контракт с командой «Салават Юлаев», таким образом продолжив совмещать работу в сборной и в 

### Загрузка вики-страниц для параграфов

In [38]:
from typing import List
import sys
from tqdm import tqdm
from bs4 import BeautifulSoup
import re


def extract_table_data(soup_object, table_format='md'):
    tables = soup_object.find_all('table')
    tables_data = []
    
    for table in tables:
        headers = []
        dashes = []

        # build headers:
        for th in table.find_all('th'):
            item = ' ' + th.text.strip() + ' '
            headers.append(item)
            dashes.append('-' * len(item))

        headers = [' ' + th.text.strip() + ' ' for th in table.find_all('th')]
        rows = []

        # build table data:
        for row in table.find_all('tr'):
            row = [' ' + td.text.strip() + ' '  for td in row.find_all('td')]
            if row:
                rows.append(row)
        table_data = '|' + '|'.join(headers) + '|'
        cells_data = []

        for row in rows:
            cells_data.append('|' + '|'.join(row) + '|')

        # join to single string:
        if table_format == 'md':
            dashes_line = '|' + '|'.join(dashes) + '|'
            table_data = table_data + '\n' + dashes_line + '\n' + '\n'.join(cells_data)
        else:
            table_data = table_data + '\n' + '\n'.join(cells_data)

        tables_data.append(table_data)
        new_div = soup_object.new_tag('div')
        new_div.string = table_data

        table.replace_with(new_div)
        

def clear_html_doc(data: str):
    def drop_html_artifacts(text: str) -> str:
        text = re.sub(r'\xa0', ' ', text)
        text = re.sub(r'\t', ' ', text)
        text = re.sub(r'(\s){2,}', r'\1', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()
    
    soup = BeautifulSoup(data, 'lxml')  # lxml parser for performance
    # drop garbage
    unwanted_selectors = [
        'script', 'style', 'noscript',  # Scripts and styles
        '.mw-jump-link',  # Skip to content links
        '#mw-navigation', '#p-search', '#p-tb', '#p-lang',  # Navigation elements
        '.navbox', '.navigation-box',  # Navigation boxes
        '.printfooter',  # Print footer
        '.catlinks',  # Category links
        '.mw-editsection',  # Edit section links
        '.reference',  # Reference links (we'll handle these differently)
        '.mw-cite-backlink',  # Citation backlinks
        '#coordinates',  # Coordinate info
        '.metadata',  # Metadata
        '.dablink',  # Disambiguation links
        '.hatnote',  # Hat notes
        '.ambox',  # Article message boxes
        '#toc',  # Table of contents
    ]
    
    # Remove unwanted elements:
    for selector in unwanted_selectors:
        for element in soup.select(selector):
            element.decompose()
    
    # extract table data:
    extract_table_data(soup)

    content_area = soup.find('body')

    if not content_area:
        return ""
    
    # Configurable rules
    skip_section_keywords = {
        "см. также", "см также", "примечания", "ссылки", "источники", "литература"
    }
    skip_class_prefixes = ("navbox", "navigation", "mw-", "toc", "references")
    min_length = {"p": 10, "div": 50}  # thresholds for short junk filtering

    content_elements = []
    skip_sections = False    
    clean_text = lambda x: x.get_text(strip=True)

    for el in content_area.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                     'p', 'div', 'li', 'dd', 'dt']):
        tag = el.name
        text = clean_text(el) 

        if not text:
            continue

        # headers:
        if tag.startswith("h"):
            text_clean = text.lower()

            if any(text_clean.startswith(skip_section) for skip_section in skip_section_keywords):
                skip_sections = True
                continue

            if any(x in text_clean for x in ("править", "edit", "содержание")): # TODO: maybe fix 'править'
                continue

            level = int(tag[1])
            content_elements.append(f"{'#' * level} {text}")
            skip_sections = False  # reset at new valid section
            continue

        # Skip other tags if in skipped section
        if skip_sections:
            continue

        # Paragraphs
        if tag == "p":
            if len(text) > min_length["p"]:
                content_elements.append(text)
        elif tag in ("li", "dd"):
            content_elements.append(f"* {text}")

        # Divs (with filtering)
        elif tag == "div":
            classes = el.get("class", [])
            el_ids = el.get("id", "").split()

            if "formatted-table" in classes:
                content_elements.append(text)
            elif (len(text) > min_length["div"]
                  and not any(cl.startswith(prefix) for cl in classes + el_ids for prefix in skip_class_prefixes)):
                content_elements.append(text)
          
    return drop_html_artifacts(content_area.get_text())    


def load_source_pages(paragraphs: List[dict], dst_dir: str='doc_base/wiki_docs/'):
    if not os.path.isdir(dst_dir):
        os.mkdir(dst_dir)
    meta_data = dict()
    error_pages = []
    pages_set = set([item['ru_wiki_pageid'] for item in paragraphs])
    pbar = tqdm(total=len(pages_set))

    for item in paragraphs:
        wiki_page_id = item["ru_wiki_pageid"]

        if wiki_page_id in meta_data:
            # page exists - update meta data adding new paragraph
            meta_data[wiki_page_id].append(item['uid'])
        else:
            # page is not loaded yet
            meta_data[wiki_page_id] = [item['uid']]
            wiki_url = f'https://ru.wikipedia.org/w/index.php?curid={wiki_page_id}'

            try:
                resp = httpx.get(wiki_url, timeout=20)
                resp.raise_for_status()
                data = resp.text
                data = clear_html_doc(data)

                with open(f'{wiki_page_id}.txt', 'w', encoding='utf-8') as f:
                    f.write(data)
                
            except httpx.RequestError as req_exc:
                msg = f'request error for url={wiki_url}: {str(req_exc)}'
                print(msg, file=sys.stderr)
                error_pages.append({'url': wiki_url, 'exc_type': 'request error', 'msg': msg})
            except httpx.HTTPStatusError as st_exc:
                msg = f'invalid status for url={wiki_url}: {str(st_exc)}'
                print(msg, file=sys.stderr)
                error_pages.append({'url': wiki_url, 'exc_type': 'status error', 'msg': msg})
            pbar.update(1)

        break # TODO: remove later

    # write exceptions info:
    with open('load_errors.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(error_pages))

    # write metadata info:
    meta_data_json = []
    for key, val in meta_data.items():
        wiki_url = f'https://ru.wikipedia.org/w/index.php?curid={key}'
        item = {'url': wiki_url, 'page_id': key, 'paragraphs_ids': val}
        meta_data_json.append(item)

    with open('meta_data.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(meta_data_json))

load_source_pages(paragraphs)



  0%|          | 1/9105 [00:00<2:21:35,  1.07it/s]


In [None]:
# import the necessary libraries
from bs4 import BeautifulSoup
import requests 

# make a GET request to the target website
response = requests.get("https://www.scrapingcourse.com/table-parsing")
# retrieve the response
html = response.text

# create a BeautifulSoup object
soup = BeautifulSoup(html, 'lxml')

# select the table element
table = soup.find('table')

# extract headers 
headers = [th.text.strip() for th in table.find_all('th')]

# extract table body
rows = []
for row in table.find_all('tr')[1:]:  
    cells = [td.text.strip() for td in row.find_all('td')]
    rows.append(cells)

# log data
print("Headers:", headers)
print("table_body:")
for row in rows:
    print(row)

Headers: ['Product ID', 'Name', 'Category', 'Price', 'In Stock']
table_body:
[]
['001', 'Laptop', 'Electronics', '$999.99', 'Yes']
['002', 'Smartphone', 'Electronics', '$599.99', 'Yes']
['003', 'Headphones', 'Audio', '$149.99', 'No']
['004', 'Coffee Maker', 'Appliances', '$79.99', 'Yes']
['005', 'Running Shoes', 'Sports', '$89.99', 'Yes']
['006', 'Smart Watch', 'Electronics', '$249.99', 'Yes']
['007', 'Blender', 'Appliances', '$39.99', 'No']
['008', 'Yoga Mat', 'Sports', '$29.99', 'Yes']
['009', 'Wireless Mouse', 'Electronics', '$24.99', 'Yes']
['010', 'Desk Lamp', 'Home', '$34.99', 'Yes']
['011', 'Portable Speaker', 'Audio', '$79.99', 'No']
['012', 'Electric Toothbrush', 'Personal Care', '$49.99', 'Yes']
['013', 'Backpack', 'Accessories', '$59.99', 'Yes']
['014', 'Air Purifier', 'Home', '$129.99', 'No']
['015', 'Gaming Console', 'Electronics', '$399.99', 'Yes']


In [None]:
extract_table_data(soup)

 Product ID | Name | Category | Price | In Stock  |0|1|5| 
 |G|a|m|i|n|g| |C|o|n|s|o|l|e| 
 |E|l|e|c|t|r|o|n|i|c|s| 
 |$|3|9|9|.|9|9| 
 |Y|e|s| 


In [20]:
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="ie=edge" http-equiv="X-UA-Compatible"/>
<title>Table Parsing Challenge - ScrapingCourse.com</title>
<!-- Bootstrap CSS -->
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" rel="stylesheet"/>
<script async="" defer="" src="https://challenges.cloudflare.com/turnstile/v0/api.js"></script>
<!-- Google tag (gtag.js) -->
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-NZGD14H87G"></script>
<script>
        window.dataLayer = window.dataLayer || [];
        function gtag(){dataLayer.push(arguments);}
        gtag('js', new Date());
        gtag('config', 'G-NZGD14H87G');
    </script>
<link as="style" href="https://www.scrapingcourse.com/build/assets/app-5Cdbk7yA.css" rel="preload"/><link href="https://www.scrapingcourse.com/build/assets/app-D2jpX1vH.js" rel="modulepreload"/><link href="h