In [1]:
# Columns to add to the data:

# posts
# total wikipedia mentions
# total mentions in URLs
# total replies
# total reply score

In [None]:
# new tables - post URLs, comment urls

# columns: post_id, created_at, updated_at, raw url, parsed url, domain, parsed title, processed title, article IDs???, 
# columns: comment_id, post_id, parent_id, created_at, last_modfied_at, raw url, parsed url, domain, article, redirecttitel, article IDs???

In [1]:
import markdown
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
import re
import pandas as pd
import requests
import asyncio
import aiohttp
from unicodedata import category
import time
import pickle
import os
import datetime

In [2]:
HEADERS = {'user-agent': 'wikireddit p.gildersleve@exeter.ac.uk'}
MAX_CONCURRENCY = 100

if os.path.exists('.temp/rd_url_cache_p.pkl'):
    with open('.temp/rd_url_cache_p.pkl', 'rb') as f:
        RD_URL_CACHE_P = pickle.load(f)
else:
    RD_URL_CACHE_P = {True: {}, False: {}}
    
if os.path.exists('.temp/rd_url_cache_tp.pkl'):
    with open('.temp/rd_url_cache_tp.pkl', 'rb') as f:
        RD_URL_CACHE_TP = pickle.load(f)
else:
    RD_URL_CACHE_TP = {}

# --- Helper Functions ---

def clean_wikipedia_domain(link):
    if 'wikipedia.org' not in link:
        return None
    try:
        parsed = urlparse(link)
    except ValueError as ex:
        print(ex)
        return None
    domain = parsed.netloc.lower()
    while domain and category(domain[-1])[0]=='P':
        domain = domain[:-1]
    # get subdomain
    # subdomain = domain.split('wikipedia.org')[0]
    # if (subdomain not in subdomains) and subdomain:
    #     return None
    # Domain must be exactly 'wikipedia.org' or end with '.wikipedia.org'
    if domain == 'wikipedia.org' or domain.endswith('.wikipedia.org'):
        return urlunparse(('https', domain, parsed.path, parsed.params, parsed.query, parsed.fragment))
    else:
        # print(link, 'not wikipedia')
        return None

URL_REGEX = re.compile(
    r'(https?://[^\s<>\[\]{}|]+(?:wikipedia\.org)[^\s<>\[\]{}|]*)|((?:[a-z0-9-]+\.)*wikipedia\.org[^\s<>\[\]{}|]*)',
    re.IGNORECASE
)

def extract_plain_links(text):
    plain_links = []
    for match in URL_REGEX.finditer(text):
        candidate = match.group(1) if match.group(1) else match.group(2)
        plain_links.append(candidate.strip())
    return plain_links

def normalize_links(links):
    normalized = []
    for link in links:
        if not re.match(r'^https?://', link, re.IGNORECASE):
            link = 'https://' + link
        normalized.append(link)
    return normalized

def filter_wikipedia_links(links):
    wiki_links = set()
    for link in links:
        cleanlink = clean_wikipedia_domain(link)
        if cleanlink:
            wiki_links.add(cleanlink)
    return wiki_links

def extract_links_from_text(text):
    # Convert Markdown to HTML and extract markdown-parsed URLs
    html = markdown.markdown(text, extensions=['extra'])
    soup = BeautifulSoup(html, 'html.parser')
    a_tags = soup.find_all('a', href=True)
    markdown_links = [y.strip().replace(' ', '_') for a in a_tags for y in re.split(r'(?<=\s[^\w\s])|(?<=[^\w\s])\s', a['href'].strip())]
    markdown_links = [y for x in markdown_links for y in extract_plain_links(x)]

    # Remove these <a> tags to avoid double counting
    for a_tag in a_tags:
        a_tag.decompose()
    cleaned_text = soup.get_text()

    # Extract plain links
    plain_links = extract_plain_links(cleaned_text)

    # Combine, normalize, and deduplicate
    all_links = set(normalize_links(markdown_links + plain_links))

    # Filter to Wikipedia links
    wiki_links = filter_wikipedia_links(all_links)
    return list(wiki_links)

def reextract_links(text):
    plain_links = extract_plain_links(text)
    all_links = set(normalize_links(plain_links))
    wiki_links = filter_wikipedia_links(all_links)
    return list(wiki_links)

# --- Async Validation Functions ---
async def async_validate_link(session, url, timeout=5, retries=10, allow_redirects=False):
    try:
        async with session.head(url, allow_redirects=allow_redirects, timeout=timeout) as r:
            if 200 <= r.status < 400:
                return True, r.status, str(r.url)
            elif r.status == 429 and retries > 0:
                await asyncio.sleep(1)  # Wait for 1 second before retrying
                return await async_validate_link(session, url, timeout, retries - 1, allow_redirects=allow_redirects)
            else:
                async with session.get(url, allow_redirects=allow_redirects, timeout=timeout) as r2:
                    if 200 <= r2.status < 400:
                        return True, r2.status, str(r2.url)
                    else:
                        return False, r2.status, str(r2.url)
    except Exception as ex:
        print(ex)
    return False, -1, None

async def async_validate_url_with_punctuation(session, url, timeout=5, retries=10, allow_redirects=False):
    is_valid, status, processed_url = await async_validate_link(session, url, timeout, retries, allow_redirects)
    if is_valid:
        return is_valid, status, processed_url

    # Try removing trailing punctuation
    while url and category(url[-1])[0]=='P':
        url = url[:-1]
        is_valid, status, processed_url = await async_validate_link(session, url, timeout, retries, allow_redirects)
        if is_valid:
            return is_valid, status, processed_url

    return False, status, processed_url

async def async_validate_url_with_textpunctuation(session, url, timeout=5):
    is_valid, status, processed_url = await async_validate_link(session, url, timeout)
    if is_valid:
        return is_valid, status, processed_url

    # Remove trailing alphanumeric and punctuation until punctuation is hit
    original_processed_url = processed_url
    # print('av', url)
    url = url.rstrip('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:\'"_')
    if url[-5:] != 'wiki/' and url[-5:] != '.org/':
        is_valid, status, processed_url = await async_validate_url_with_punctuation(session, url, timeout=5)
        if is_valid:
            return is_valid, status, processed_url

    return False, status, original_processed_url

async def validate_urls_main(urls, retries=10, allow_redirects=False, max_concurrency=MAX_CONCURRENCY):
    semaphore = asyncio.Semaphore(max_concurrency)
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        async def validate(url):
            if url in RD_URL_CACHE_P[allow_redirects]:
                return url, RD_URL_CACHE_P[allow_redirects][url]  # Return cached result
            
            # ṬODO, add str vaidatopm here for domain

            async with semaphore:
                result = await async_validate_url_with_punctuation(session, url,
                                                                      retries=retries,
                                                                      allow_redirects=allow_redirects)
                if result[1] != 429:
                    RD_URL_CACHE_P[allow_redirects][url] = result  # Cache result
                return url, result
        tasks = [validate(u) for u in urls]
        results = await asyncio.gather(*tasks)
        return results

async def validate_urls_main2(urls, max_concurrency=MAX_CONCURRENCY):
    semaphore = asyncio.Semaphore(max_concurrency)
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        async def validate(url):
            if url in RD_URL_CACHE_TP:
                return url, RD_URL_CACHE_TP[url]  # Return cached result
            async with semaphore:
                result = await async_validate_url_with_textpunctuation(session, url)
                if result[1] != 429:
                    RD_URL_CACHE_TP[url] = result  # Cache result
                return url, result
        tasks = [validate(u) for u in urls]
        results = await asyncio.gather(*tasks)
        return results

# --- Processing ---

async def get_links_df(posts_df, column='body'):

    # print(len(posts_df))
    posts_df['extracted_links'] = posts_df[column].dropna().apply(extract_links_from_text)
    # print(posts_df['extracted_links'].iloc[0])
    # print(posts_df['extracted_links'].iloc[-1])
    links_df = posts_df.explode('extracted_links').rename(columns={'extracted_links':'extracted_url'}).reset_index(drop=True)
    links_df = links_df[~links_df['extracted_url'].isna()].drop_duplicates().copy()
    # print(len(links_df))
    # print('egl', links_df)

    return await process_links(links_df)

async def process_links(links_df, retry_count=0, max_retries=5, backoff_factor=3):
    chop_index = list(links_df.columns).index('extracted_url') + 1
    # 1) Initial validation
    urls = links_df['extracted_url'].unique()
    # print(urls)
    results = await validate_urls_main(urls)
    # print(len(results))
    df_results = pd.DataFrame([[x[0], *x[1]] for x in results],
                              columns=['extracted_url', 'valid_1', 'status_1', 'processed_url_1']).drop_duplicates('extracted_url')
    # print(len(df_results))
    # links_df.to_hdf('data/links_df_test.h5', key='df', mode='w')
    # df_results.to_hdf('data/df_results_test.h5', key='df', mode='w')
    links_df = links_df.merge(df_results, on='extracted_url', how='left').drop_duplicates()
    # links_df.to_hdf('data/links_df_test1.h5', key='df', mode='w')
    # print(len(links_df))
    # 2) Reextract for invalid URLs
    invalid_urls = links_df.loc[links_df['valid_1']==False, 'extracted_url'].unique()

    if len(invalid_urls) > 0:
        # Create a mapping DataFrame of extracted_url -> reextracted_url
        invalid_urls_series = pd.Series(invalid_urls, name='extracted_url')
        reextracted_list = []
        for u in invalid_urls_series:
            reex_urls = reextract_links(u)
            for rurl in reex_urls:
                reextracted_list.append((u, rurl))

        if reextracted_list:
            reextracted_df = pd.DataFrame(reextracted_list, columns=['extracted_url', 'reextracted_url']).drop_duplicates('extracted_url')
            # reextracted_df.to_hdf('data/reextracted_df_test.h5', key='df', mode='w')
            # print('re', reextracted_df)
            # Validate reextracted URLs
            re_urls = reextracted_df['reextracted_url'].unique()
            # print('ru', re_urls)
            results2 = await validate_urls_main(re_urls)
            df_results2 = pd.DataFrame([[x[0], *x[1]] for x in results2],
                                       columns=['reextracted_url', 'valid_2', 'status_2', 'processed_url_2']).drop_duplicates('reextracted_url')
            # print('r2', df_results2)
            # Merge second attempt results into reextracted_df
            # df_results2.to_hdf('data/df_results2_test.h5', key='df', mode='w')
            reextracted_df = reextracted_df.merge(df_results2, on='reextracted_url', how='left').drop_duplicates()
            # print('re2', reextracted_df)
            # Now merge reextracted results back to links_df using 'extracted_url'
            # prin t('l0', links_df)
            # reextracted_df.to_hdf('data/reextracted_df_test2.h5', key='df', mode='w')
            links_df = links_df.merge(reextracted_df, on='extracted_url', how='left', suffixes=('', '_second')).drop_duplicates()
            # print('lr', links_df)
            # Check still invalid after reextraction
            # Consider invalid if original attempt and reattempt still fail
            # links_df.to_hdf('data/links_df_test2.h5', key='df', mode='w')
            # print('ldf', len(links_df))
            # print('re', len(re_urls))
            # print('r2', len(results2))
            # print('rdf', len(reextracted_df))
            still_invalid_df = links_df.loc[
                (links_df['valid_1']==False) &
                ((links_df['valid_2'].isna()) | (links_df['valid_2']==False))
            ][['reextracted_url']].drop_duplicates().copy()
            # print('si', still_invalid_df)
            if len(still_invalid_df) > 0:
                # Validate still invalid URLs with textpunctuation
                # We need to look up the reextracted URLs again for these still invalid URLs
                # or possibly re-run the logic if reextraction is required again.
                # For now, let's assume we can directly use 'reextracted_df' to get them.

                # Filter reextracted_df for those matching still_invalid original URLs
                re_urls_third = still_invalid_df['reextracted_url'].unique()
                # print('r3', re_urls_third)
                results3 = await validate_urls_main2(re_urls_third)
                df_results3 = pd.DataFrame([[x[0], *x[1]] for x in results3],
                                           columns=['reextracted_url', 'valid_3', 'status_3', 'processed_url_3']).drop_duplicates('reextracted_url')

                # Merge third attempt results
                still_invalid_df = still_invalid_df.merge(df_results3, on='reextracted_url', how='left').drop_duplicates()
                
                # Merge back into links_df by original_url
                links_df = links_df.merge(still_invalid_df, on='reextracted_url', how='left', suffixes=('', '_third')).drop_duplicates()

    # Now links_df contains:
    # - original extracted_url from the posts
    # - initial validation results (valid, status, processed_url)
    # - second attempt validation results (valid_re, status_re, processed_url_re) via reextraction
    # - third attempt validation results (valid_third, status_third, processed_url_third)

    # You can now decide on final validity and merge results back into posts if needed.

    # add cols to links_df, if not present
    cols = ['extracted_url', 'valid_1', 'status_1',
       'processed_url_1', 'reextracted_url', 'valid_2', 'status_2', 'processed_url_2',
       'valid_3', 'status_3', 'processed_url_3']
    for col in cols:
        if col not in links_df.columns:
            links_df[col] = None

    # get final validity, prefer the last valid url
    if len(links_df) > 0:
        links_df['end_processed_valid'] = links_df['valid_3'].fillna(
            links_df['valid_2'].fillna(
                links_df['valid_1']
            )
        )

        links_df['end_processed_status'] = links_df['status_3'].fillna(
            links_df['status_2'].fillna(
                links_df['status_1']
            )
        )
        
        links_df['end_processed_url'] = links_df['processed_url_3'].fillna(
            links_df['processed_url_2'].fillna(
                links_df['processed_url_1']
            )
        )
        
        # print('l', links_df)
        # get redirects for 3xx errors
        error3xxs = links_df[(links_df['status_1']>=300)&(links_df['status_1']<400)|
                             (links_df['status_2']>=300)&(links_df['status_2']<400)|
                             (links_df['status_3']>=300)&(links_df['status_3']<400)
                             ][['end_processed_url']].drop_duplicates().copy()

        rd_results = await validate_urls_main(error3xxs['end_processed_url'], retries=100000000, allow_redirects=True)
        rd_df = pd.DataFrame([[x[0], *x[1]] for x in rd_results],
                             columns=['end_processed_url', 'valid_rd', 'status_rd', 'redirected_url']
                             ).drop_duplicates('end_processed_url')
        # print('rd', rd_df)
                             
        links_df = links_df.merge(rd_df, on='end_processed_url', how='left').drop_duplicates()

        #  rerun for 429 errors
        error429s = links_df[(links_df['status_1']==429)|
                                (links_df['status_2']==429)|
                                (links_df['status_3']==429)|
                                (links_df['status_rd']==429)
                                ][links_df.columns[:chop_index]].drop_duplicates().copy()
        links_df = links_df[(links_df['status_1']!=429)&
                            (links_df['status_2']!=429)&
                            (links_df['status_3']!=429)&
                            (links_df['status_rd']!=429)
                            ].drop_duplicates()

        if len(error429s) > 0:
            if retry_count < max_retries:
                delay = backoff_factor ** retry_count
                print(f"429 errors: {len(error429s)}. Retrying after {delay} seconds...")
                await asyncio.sleep(delay)
                # Retry recursively with increased retry_count
                new_links = await process_links(error429s.copy(), retry_count=retry_count + 1, max_retries=max_retries, backoff_factor=backoff_factor)
                links_df = pd.concat([links_df, new_links]).drop_duplicates()
            else:
                print(f"Max retries reached for {len(error429s)} URLs. Skipping...")
                error429s_extra = links_df[(links_df['status_1']==429)|
                                (links_df['status_2']==429)|
                                (links_df['status_3']==429)|
                                (links_df['status_rd']==429)
                                ][links_df.columns[:chop_index]].drop_duplicates().copy()
                error429s = pd.concat([error429s, error429s_extra])
                links_df = links_df[(links_df['status_1']!=429)&
                            (links_df['status_2']!=429)&
                            (links_df['status_3']!=429)&
                            (links_df['status_rd']!=429)
                            ].drop_duplicates()
                # print(f"429 errors: {len(l1429)} -> {len(error429s)}")
                # save 429s
                if len(error429s) > 0:
                    print("Saving 429s")
                    error429s.to_hdf('data/429s.h5', key='df', mode='a')


        links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
        links_df['final_status'] = links_df['status_rd'].fillna(links_df['end_processed_status'])
        links_df['final_url'] = links_df['redirected_url'].fillna(links_df['end_processed_url'])

        print("Saving cache. Total size: ", len(RD_URL_CACHE_P[True]) + len(RD_URL_CACHE_P[False])
                + len(RD_URL_CACHE_TP))
        with open('.temp/rd_url_cache_p.pkl', 'wb') as f:
            pickle.dump(RD_URL_CACHE_P, f)
        with open('.temp/rd_url_cache_tp.pkl', 'wb') as f:
            pickle.dump(RD_URL_CACHE_TP, f)

        return links_df.drop_duplicates().reset_index(drop=True)
    else:
        return links_df.drop_duplicates().reset_index(drop=True)


In [3]:
# RD_URL_CACHE_P = {True: {}, False: {}}
# RD_URL_CACHE_TP = {}

# print("Saving cache. Total size: ", len(RD_URL_CACHE_P[True]) + len(RD_URL_CACHE_P[False])
#         + len(RD_URL_CACHE_TP))
# with open('.temp/rd_url_cache_p.pkl', 'wb') as f:
#     pickle.dump(RD_URL_CACHE_P, f)
# with open('.temp/rd_url_cache_tp.pkl', 'wb') as f:
#     pickle.dump(RD_URL_CACHE_TP, f)


Saving cache. Total size:  0


In [3]:
# For example:
posts = pd.read_hdf('data/posts.h5').reset_index(drop=True)
ptest = posts.iloc[:1000].copy()  # limit for testing

# Run the async function
links_df = await get_links_df(ptest, column='body')
links_df[['id', 'subreddit_id', 
       'created_at', 'updated_at', 'extracted_url', 'valid_1', 'status_1',
       'processed_url_1', 'reextracted_url', 'valid_2', 'status_2',
       'processed_url_2', 'valid_3', 'status_3', 'processed_url_3',
       'end_processed_valid', 'end_processed_status', 'end_processed_url',
       'valid_rd', 'status_rd', 'redirected_url', 'final_valid',
       'final_status', 'final_url']]

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2909113


Unnamed: 0,id,subreddit_id,created_at,updated_at,extracted_url,valid_1,status_1,processed_url_1,reextracted_url,valid_2,...,processed_url_3,end_processed_valid,end_processed_status,end_processed_url,valid_rd,status_rd,redirected_url,final_valid,final_status,final_url
0,t3_evva62,t5_2qh16,2020-01-29 23:16:54.860,2020-01-29 23:38:45.353244,https://en.wikipedia.org/wiki/Unit_731,True,200,https://en.wikipedia.org/wiki/Unit_731,,,...,,True,200.0,https://en.wikipedia.org/wiki/Unit_731,,,,True,200.0,https://en.wikipedia.org/wiki/Unit_731
1,t3_ini49k,t5_2qh4r,2020-09-06 07:48:47.668,2020-09-06 13:53:40.374281,https://en.m.wikipedia.org/wiki/Shanghaiing,True,200,https://en.m.wikipedia.org/wiki/Shanghaiing,,,...,,True,200.0,https://en.m.wikipedia.org/wiki/Shanghaiing,,,,True,200.0,https://en.m.wikipedia.org/wiki/Shanghaiing
2,t3_g7m4zn,t5_2qhhq,2020-04-25 02:51:48.375,2020-05-10 12:06:23.576164,https://en.wikipedia.org/wiki/List_of_public_c...,True,200,https://en.wikipedia.org/wiki/List_of_public_c...,,,...,,True,200.0,https://en.wikipedia.org/wiki/List_of_public_c...,,,,True,200.0,https://en.wikipedia.org/wiki/List_of_public_c...
3,t3_htkzqv,t5_2qiu7,2020-07-18 18:00:51.386,2020-11-16 19:25:23.710963,https://en.wikipedia.org/wiki/Space_suit,True,200,https://en.wikipedia.org/wiki/Space_suit,,,...,,True,200.0,https://en.wikipedia.org/wiki/Space_suit,,,,True,200.0,https://en.wikipedia.org/wiki/Space_suit
4,t3_g8zoux,t5_2qixm,2020-04-27 12:22:58.413,2020-09-10 08:46:05.205640,https://en.wikipedia.org/wiki/Captain_Ron,True,200,https://en.wikipedia.org/wiki/Captain_Ron,,,...,,True,200.0,https://en.wikipedia.org/wiki/Captain_Ron,,,,True,200.0,https://en.wikipedia.org/wiki/Captain_Ron
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1777,t3_et72k8,t5_12s4iu,2020-01-24 07:25:18.972,1970-01-01 00:00:00.000000,https://en.wikipedia.org/wiki/Dog-whistle_poli...,True,200,https://en.wikipedia.org/wiki/Dog-whistle_poli...,,,...,,True,200.0,https://en.wikipedia.org/wiki/Dog-whistle_poli...,,,,True,200.0,https://en.wikipedia.org/wiki/Dog-whistle_poli...
1778,t3_gz0b53,t5_26d5z6,2020-06-08 14:26:24.555,2020-06-09 15:04:15.331131,https://en.wikipedia.org/wiki/Wayside_Inn_(Arl...,True,200,https://en.wikipedia.org/wiki/Wayside_Inn_(Arl...,,,...,,True,200.0,https://en.wikipedia.org/wiki/Wayside_Inn_(Arl...,,,,True,200.0,https://en.wikipedia.org/wiki/Wayside_Inn_(Arl...
1779,t3_f0k46o,t5_2dar36,2020-02-08 00:59:57.389,2020-02-15 19:02:45.696495,https://en.m.wikipedia.org/wiki/Deutsche_Welle,True,200,https://en.m.wikipedia.org/wiki/Deutsche_Welle,,,...,,True,200.0,https://en.m.wikipedia.org/wiki/Deutsche_Welle,,,,True,200.0,https://en.m.wikipedia.org/wiki/Deutsche_Welle
1780,t3_jyu3i1,t5_2qh0z,2020-11-22 11:18:13.206,2021-01-20 13:10:09.555905,https://en.m.wikipedia.org/wiki/List_of_presid...,True,200,https://en.m.wikipedia.org/wiki/List_of_presid...,,,...,,True,200.0,https://en.m.wikipedia.org/wiki/List_of_presid...,,,,True,200.0,https://en.m.wikipedia.org/wiki/List_of_presid...


In [4]:
for n, x in enumerate(['valid_1', 'valid_2', 'valid_3']):
    for m, y in enumerate(['valid_1', 'valid_2', 'valid_3']):
        if m > n:
            print(x, y)
            display(links_df[links_df[x] & links_df[y]])

valid_1 valid_2


Unnamed: 0,id,subreddit_id,title,body,url,author_id,nsfw,score,upvote_ratio,distinguished,...,processed_url_3,end_processed_valid,end_processed_status,end_processed_url,valid_rd,status_rd,redirected_url,final_valid,final_status,final_url


valid_1 valid_3


Unnamed: 0,id,subreddit_id,title,body,url,author_id,nsfw,score,upvote_ratio,distinguished,...,processed_url_3,end_processed_valid,end_processed_status,end_processed_url,valid_rd,status_rd,redirected_url,final_valid,final_status,final_url


valid_2 valid_3


Unnamed: 0,id,subreddit_id,title,body,url,author_id,nsfw,score,upvote_ratio,distinguished,...,processed_url_3,end_processed_valid,end_processed_status,end_processed_url,valid_rd,status_rd,redirected_url,final_valid,final_status,final_url


In [5]:
posts = pd.read_hdf('data/posts.h5').reset_index(drop=True)

In [11]:
#  run on titles

outcols = ['id', 'extracted_url', 'valid_1', 'status_1',
       'processed_url_1', 'reextracted_url', 'valid_2', 'status_2',
       'processed_url_2', 'valid_3', 'status_3', 'processed_url_3',
       'end_processed_valid', 'end_processed_url',
       'valid_rd', 'status_rd', 'redirected_url', 'final_valid',
       'final_status', 'final_url']

size = len(posts)//10
titlelinks = []
for batch in range(0, len(posts), size):
    print(f"Batch {batch//size + 1}", size)
    try:
        titlelinks.append(pd.read_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}'))
    except (FileNotFoundError, KeyError) as ex:
        print(ex)
        links_df = await get_links_df(posts.iloc[batch:batch+size].copy(), column='title')
        if len(links_df) > 0:
            print('Saving...')
            links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')
            titlelinks.append(links_df[outcols])

titlelinks = pd.concat(titlelinks).reset_index(drop=True)
if os.path.exists('data/titlelinks.h5'):
    os.remove('data/titlelinks.h5')
titlelinks.to_hdf('data/titlelinks.h5', key='df', mode='w')

Batch 1 33589
File data/titlelinks_batches.h5 does not exist


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Saving cache. Total size:  1956
Saving...
Batch 2 33589
'No object named /batch_2 in the file'
Cannot connect to host wikipediaen.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host wikipediaen.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host wikipediaen.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host wikipediaen.wikipedia:443 ssl:default [Name or service not known]


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Saving cache. Total size:  2162
Saving...
Batch 3 33589
'No object named /batch_3 in the file'
Cannot connect to host imagenl.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host imagenl.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host imagenl.m.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host imagenl.m.wikipedia:443 ssl:default [Name or service not known]


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Saving cache. Total size:  2327
Saving...
Batch 4 33589
'No object named /batch_4 in the file'


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Saving cache. Total size:  2515
Saving...
Batch 5 33589
'No object named /batch_5 in the file'


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Saving cache. Total size:  2707
Saving...
Batch 6 33589
'No object named /batch_6 in the file'
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.m.wikipe

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Saving cache. Total size:  2877
Saving...
Batch 7 33589
'No object named /batch_7 in the file'


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Saving cache. Total size:  3042
Saving...
Batch 8 33589
'No object named /batch_8 in the file'
Cannot connect to host windowen.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host windowen.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host windowen.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host windowen.wikipedia:443 ssl:default [Name or service not known]


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Saving cache. Total size:  3193
Saving...
Batch 9 33589
'No object named /batch_9 in the file'
Cannot connect to host trends.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host trends.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host trends.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host trends.wikipedia:443 ssl:default [Name or service not known]


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Saving cache. Total size:  3336
Saving...
Batch 10 33589
'No object named /batch_10 in the file'


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


Saving cache. Total size:  3464
Saving...
Batch 11 33589
'No object named /batch_11 in the file'


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/titlelinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  titlelinks.to_hdf('data/titlelinks.h5'

In [2]:
# problem IDs t3_udac4z, t3_ud2feb

In [2]:
posts = pd.read_hdf('data/posts.h5').reset_index(drop=True)

In [6]:
# run on bodies
outcols = ['id', 'extracted_url', 'valid_1', 'status_1',
       'processed_url_1', 'reextracted_url', 'valid_2', 'status_2',
       'processed_url_2', 'valid_3', 'status_3', 'processed_url_3',
       'end_processed_valid', 'end_processed_url',
       'valid_rd', 'status_rd', 'redirected_url', 'final_valid',
       'final_status', 'final_url']
size = len(posts)//10

bodylinks = []
for batch in range(0, len(posts), size):
   print(f"Batch {batch//size + 1}", size)
   try:
      bodylinks.append(pd.read_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}'))
   except (FileNotFoundError, KeyError) as ex:
      print(ex)
      links_df = await get_links_df(posts.iloc[batch:batch+size].copy(), column='body')
      if len(links_df) > 0:
         print('saving')
         links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')
         bodylinks.append(links_df[outcols])

bodylinks = pd.concat(bodylinks).reset_index(drop=True)
if os.path.exists('data/bodylinks.h5'):
   os.remove('data/bodylinks.h5')
bodylinks.to_hdf('data/bodylinks.h5', key='df', mode='w')

Batch 1 33589
Batch 2 33589
'No object named /batch_2 in the file'
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]

Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]

label empty or too long
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
label empty or too long


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(




429 errors: 161. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2924344


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2924344
saving
Batch 3 33589
'No object named /batch_3 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host imple.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host imageen.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]

Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host imageen.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host imple.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host imageen.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host imageen.wikipedia:44

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(





429 errors: 257. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2939313


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2939313
saving
Batch 4 33589
'No object named /batch_4 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Cannot connect to host restorehealthen.wikipedia.org:443 ssl:default [Name or service not known]
label empty or too long
Cannot connect to host restorehealthen.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host restorehealthen.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host restorehealthen.wikipedia:443 ssl:default [Name or service not known]
label empty or too long


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(



429 errors: 834. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2956212


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2956212
saving
Batch 5 33589
'No object named /batch_5 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')



Cannot connect to host nds.nl.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host nds.nl.m.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host nds.nl.m.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host nds.nl.m.wikipedia:443 ssl:default [Name or service not known]


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(





429 errors: 1125. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2971014


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2971014
saving
Batch 6 33589
'No object named /batch_6 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Cannot connect to host 2fen.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]

Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host 2fen.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host 2fen.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host 2fen.wikipedia:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:44

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


429 errors: 1264. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2987446


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  2987446
saving
Batch 7 33589
'No object named /batch_7 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Cannot connect to host \(language).wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host \(language).wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host \(language).wikipedia.org:443 ssl:default [Name or service not known]
label empty or too long

Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]





Cannot connect to host \(language).wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host \(language).wikipedia.org:443 ssl:default [Name or service not known]
label empty or too long
Cannot connect to host \(language).wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host \(language).wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host \(language).wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host \(language).wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to hos

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(



429 errors: 108. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  3002677


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  3002677
saving
Batch 8 33589
'No object named /batch_8 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]

Cannot connect to host en.www.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.w

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


429 errors: 1109. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  3017330


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  3017330
saving
Batch 9 33589
'No object named /batch_9 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')




Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host r.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host r.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host apc.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host r.wikipedia.org:443 

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(



429 errors: 1305. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  3033074


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  3033074
saving
Batch 10 33589
'No object named /batch_10 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Cannot connect to host donatecrypto.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(



429 errors: 1247. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  3047686


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  3047686
saving
Batch 11 33589
'No object named /batch_11 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')
  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])
  links_df['final_status'] = links_df['status_rd'].fillna(links_df['end_processed_status'])


Saving cache. Total size:  3047691
saving


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'status_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/bodylinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')
  bodylinks = pd.concat(bodylinks).reset_index(drop=True)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  bodylinks.to_hdf('data/bodylinks.h5', key='df', mode='w')

In [7]:
# confirm only link info is included

for i in bodylinks.loc[bodylinks['extracted_url'].str.len().nlargest(50).index, 'extracted_url']:
    print(i)

https://ru.wikipedia.org/wiki/%D0%A2%D0%B5%D0%BE%D1%80%D0%B8%D1%8F_%D0%BF%D1%80%D0%B5%D0%B4%D1%81%D1%82%D0%B0%D0%B2%D0%BB%D0%B5%D0%BD%D0%B8%D0%B9#:~:text=%D0%A2%D0%B5%D0%BE%D1%80%D0%B8%D1%8F%20%D0%BF%D1%80%D0%B5%D0%B4%D1%81%D1%82%D0%B0%D0%B2%D0%BB%D0%B5%D0%BD%D0%B8%D0%B9%20%E2%80%94%20%D1%80%D0%B0%D0%B7%D0%B4%D0%B5%D0%BB%20%D0%BC%D0%B0%D1%82%D0%B5%D0%BC%D0%B0%D1%82%D0%B8%D0%BA%D0%B8%2C%20%D0%B8%D0%B7%D1%83%D1%87%D0%B0%D1%8E%D1%89%D0%B8%D0%B9,%D0%B2%D0%B8%D0%B4%D0%B5%20%D0%BB%D0%B8%D0%BD%D0%B5%D0%B9%D0%BD%D1%8B%D1%85%20%D0%BF%D1%80%D0%B5%D0%BE%D0%B1%D1%80%D0%B0%D0%B7%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B9%20%D0%B2%D0%B5%D0%BA%D1%82%D0%BE%D1%80%D0%BD%D1%8B%D1%85%20%D0%BF%D1%80%D0%BE%D1%81%D1%82%D1%80%D0%B0%D0%BD%D1%81%D1%82%D0%B2.&text=%D0%9D%D0%B0%D0%B8%D0%B1%D0%BE%D0%BB%D0%B5%D0%B5%20%D0%B8%D0%B7%D0%B2%D0%B5%D1%81%D1%82%D0%BD%D0%BE%D0%B9%20(%D0%B8%20%D0%B8%D1%81%D1%82%D0%BE%D1%80%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%20%D0%B2%D0%BE%D0%B7%D0%BD%D0%B8%D0%BA%D1%88%D0%B5%D0%B9%20%D0%BF%D0%B5%D1

In [4]:
comments = pd.concat([pd.read_hdf(f'data/comments_{x}.h5') for x in range(1,5)]).reset_index(drop=True)

In [None]:
# run on comments
outcols = ['id', 'extracted_url', 'valid_1', 'status_1',
         'processed_url_1', 'reextracted_url', 'valid_2', 'status_2',
         'processed_url_2', 'valid_3', 'status_3', 'processed_url_3',
         'end_processed_valid', 'end_processed_url',
         'valid_rd', 'status_rd', 'redirected_url', 'final_valid',
         'final_status', 'final_url']
size = len(comments)//100

commentlinks = []
for batch in range(0, len(comments), size):
    print(f"Batch {batch//size + 1}", size)
    try:
        commentlinks.append(pd.read_hdf(f'data/commentlinks_batches.h5', key=f'/batch_{batch//size + 1}'))
    except (FileNotFoundError, KeyError) as ex:
        print(ex)
        while True:
            try:
                links_df = await get_links_df(comments.iloc[batch:batch+size].copy(), column='body')
                MAX_CONCURRENCY = min(200, int(MAX_CONCURRENCY*(2**0.5)))
                break
            except MemoryError as ex:
                print(ex)
                MAX_CONCURRENCY = MAX_CONCURRENCY//2
        if len(links_df) > 0:
            print('saving')
            links_df[outcols].to_hdf(f'data/commentlinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')
            commentlinks.append(links_df[outcols])

commentlinks = pd.concat(commentlinks).reset_index(drop=True)
if os.path.exists('data/commentlinks.h5'):
    os.remove('data/commentlinks.h5')
commentlinks.to_hdf('data/commentlinks.h5', key='df', mode='w')

Batch 1 102643
Batch 2 102643
Batch 3 102643
Batch 4 102643
Batch 5 102643
Batch 6 102643
Batch 7 102643
Batch 8 102643
Batch 9 102643
Batch 10 102643
Batch 11 102643
Batch 12 102643
Batch 13 102643
Batch 14 102643
Batch 15 102643
Batch 16 102643
Batch 17 102643
Batch 18 102643
Batch 19 102643
Batch 20 102643
Batch 21 102643
Batch 22 102643
Batch 23 102643
Batch 24 102643
Batch 25 102643
'No object named /batch_25 in the file'
Invalid IPv6 URL

label empty or too long
Cannot connect to host en.www.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]


Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host en.www.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host en.www.wikipedia.org:443 ssl:default [Name or service not know

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


429 errors: 2665. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


429 errors: 184. Retrying after 3 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1416190


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1416190


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1416190
saving
Batch 26 102643
'No object named /batch_26 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/commentlinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')






Cannot connect to host ww.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
label empty or too long
label empty or too long



Cannot connect to host ww.wikipedia.org:443 ssl:default [Name or service not known]

Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
label empty or too long
label empty or too long
label empty or too long
label empty or too long


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


429 errors: 2908. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


429 errors: 104. Retrying after 3 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1451645


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1451645


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1451645
saving
Batch 27 102643
'No object named /batch_27 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/commentlinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


'Paddy Cap' does not appear to be an IPv4 or IPv6 address
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host en.www.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host em.wikipedia.org:443 ssl:default [Name or service not known]

label empty or too long

Cannot connect to host imageen.m.wikipedia.org:443 ssl:default [Name or service not known]

Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]

Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host en.www.wikipedia.org:443 ssl:default [Name or service not known]

Cannot connect to host em.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host em.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host em.wikipedia.org:443 ssl:default [Name or service not known]
label empty or too long
label empty or too lon

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(






429 errors: 2688. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(



429 errors: 111. Retrying after 3 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


429 errors: 32. Retrying after 9 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['status_2'].fillna(
  links_df['end_processed_status'] = links_df['status_3'].fillna(


Saving cache. Total size:  1483090


  links_df = pd.concat([links_df, new_links]).drop_duplicates()
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1483090


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1483090


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1483090
saving
Batch 28 102643
'No object named /batch_28 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/commentlinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')



Cannot connect to host e.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host diafr.wikipedia.org:443 ssl:default [Name or service not known]
label empty or too long
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]


Cannot connect to host en.www.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host n.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host e.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host e.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host e.wikipedia:443 ssl:default [Name or service not known]
Cannot connect to host diafr.wikipedia.org:443 ssl:default [Name or service not known]
Cannot connect to host diafr.wikipedia.:443 ssl:default [Name or service not known]
Cannot connect to host diafr.wikipedia:443 ssl:default [Name or service not known]
label empty or too long
Cannot connect to host n.wiki

  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(





429 errors: 2512. Retrying after 1 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(




429 errors: 101. Retrying after 3 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


429 errors: 23. Retrying after 9 seconds...


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(
  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1516321


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1516321


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1516321


  links_df['final_valid'] = links_df['valid_rd'].fillna(links_df['end_processed_valid'])


Saving cache. Total size:  1516321
saving
Batch 29 102643
'No object named /batch_29 in the file'


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'extracted_url', 'processed_url_1', 'reextracted_url', 'valid_2',
       'processed_url_2', 'valid_3', 'processed_url_3', 'end_processed_url',
       'valid_rd', 'redirected_url', 'final_url'],
      dtype='object')]

  links_df[outcols].to_hdf(f'data/commentlinks_batches.h5', key=f'/batch_{batch//size + 1}', mode='a')


Invalid IPv6 URL
Invalid IPv6 URL
Invalid IPv6 URL
'this guy' does not appear to be an IPv4 or IPv6 address

label empty or too long
label empty or too long

Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]
label empty or too long
label empty or too long
label empty or too long
Cannot connect to host www.en.wikipedia.org:443 ssl:default [Name or service not known]


  links_df['valid_2'].fillna(
  links_df['end_processed_valid'] = links_df['valid_3'].fillna(


In [8]:
commentlinks = pd.concat([pd.read_hdf(f'data/commentlinks_batches.h5', key=f'/batch_{x}') for x in range(1,5)]).reset_index(drop=True)

In [9]:
# confirm only link info is included

for i in commentlinks['end_processed_url'].loc[commentlinks['end_processed_url'].str.len().nlargest(50).index]:
    print(i)

https://ru.wikipedia.org/wiki/%D0%A6%D0%B5%D0%BD%D1%82%D1%80_%D0%B2%D0%BE%D0%B5%D0%BD%D0%BD%D0%BE-%D1%82%D0%B5%D1%85%D0%BD%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D1%85_%D0%BF%D1%80%D0%BE%D0%B1%D0%BB%D0%B5%D0%BC_%D0%B1%D0%B0%D0%BA%D1%82%D0%B5%D1%80%D0%B8%D0%BE%D0%BB%D0%BE%D0%B3%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%BE%D0%B9_%D0%B7%D0%B0%D1%89%D0%B8%D1%82%D1%8B_%D0%9D%D0%98%D0%98_%D0%BC%D0%B8%D0%BA%D1%80%D0%BE%D0%B1%D0%B8%D0%BE%D0%BB%D0%BE%D0%B3%D0%B8%D0%B8_%D0%9C%D0%9E_%D0%A0%D0%A4
https://uk.wikipedia.org/wiki/%D0%86%D0%BD%D1%81%D1%82%D0%B8%D1%82%D1%83%D1%82_%D0%BF%D0%BE%D0%BB%D1%96%D1%82%D0%B8%D1%87%D0%BD%D0%B8%D1%85_%D1%96_%D0%B5%D1%82%D0%BD%D0%BE%D0%BD%D0%B0%D1%86%D1%96%D0%BE%D0%BD%D0%B0%D0%BB%D1%8C%D0%BD%D0%B8%D1%85_%D0%B4%D0%BE%D1%81%D0%BB%D1%96%D0%B4%D0%B6%D0%B5%D0%BD%D1%8C_%D1%96%D0%BC%D0%B5%D0%BD%D1%96_%D0%86._%D0%A4._%D0%9A%D1%83%D1%80%D0%B0%D1%81%D0%B0_%D0%9D%D0%90%D0%9D_%D0%A3%D0%BA%D1%80%D0%B0%D1%97%D0%BD%D0%B8
https://ru.wikipedia.org/wiki/%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D1%