In [1]:
import urllib
import requests

In [2]:
from urllib.parse import urlparse
from urllib.parse import urldefrag
import time

In [3]:
from urllib.request import urlopen

def download_from_the_internet(url):
    try:
        return urlopen(url).read().decode('utf-8')
    except KeyboardInterrupt:
        raise
    except urllib.error.HTTPError as e:
        code = e.code
        if code != 404:
            print(e)
        return code
    except Exception as e:
        print(e)

In [4]:
urlparse('https://simple.wikipedia.org/wiki/Main_Page')

ParseResult(scheme='https', netloc='simple.wikipedia.org', path='/wiki/Main_Page', params='', query='', fragment='')

В file_storage.py определил операторы in и len для удобства:

```python
def __contains__(self, key):
    return key in self._dict

def __len__(self):
    return len(self._dict)
```

In [5]:
from file_storage import FileStorage

file_storage = FileStorage('storage')

In [6]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_links_from_html(url, html):
    parser = BeautifulSoup(html)
    return [urljoin(url, link.get('href')) for link in parser.findAll('a')]

In [7]:
import pickle
state_file_path = 'state.pickle'

def dump_state(iteration, urls_to_visit):
    with open(state_file_path, 'wb') as state_file:
        pickle.dump((iteration, urls_to_visit), state_file)
        print('state saved, iteration {}'.format(iteration))

def load_state():
    with open(state_file_path, 'rb') as state_file:
        return pickle.load(state_file)

In [8]:
urls_to_visit = {'https://simple.wikipedia.org/wiki/Main_Page'}
iteration = 0

In [9]:
beginning = 'https://simple.wikipedia.org/wiki/'
ban_patterns = [
    'Help', 'Wikipedia', 'Special', 'File', 'Template', 'Talk',
    'Template_talk', 'User_talk', 'User', 'Meta', 'user', 'MediaWiki',
    'Wikipedia_talk', 'Category_talk', 'Module', 'Media'
]
ban_patterns = [pattern + ':' for pattern in ban_patterns]
ban_patterns.append('Requests_for_comment')

def filter_url(url):
    if not url.startswith(beginning):
        return True
    url_end = url[len(beginning):]
    return any(url_end.startswith(ban_pattern) for ban_pattern in ban_patterns)

In [10]:
from multiprocessing.pool import Pool

In [11]:
import os
cpu_count = os.cpu_count()

В начале я почему-то думал, что там 1,4кк страниц, а не 140к. Поэтому немного заморочился и сделал скачивание и парсинг в несколько потоков. На утро понял, что затупил и можно было проще)

In [12]:
def parse_url(url):
    for attempt in range(2):
        html = download_from_the_internet(url)
        time.sleep(0.2)
        if (isinstance(html, int) and html == 404):
            return None
        elif html is not None:
            break
    else:
        return None

    urls = extract_links_from_html(url, html)
    return url, html, urls


def manage_parsing_result(parsing_result, file_storage, iteration, urls_to_visit):
    new_url, new_html, new_urls = parsing_result
    if new_url in file_storage:
        return iteration
    
    file_storage.write(new_url, new_html)
    defraged_urls = (urldefrag(url).url for url in new_urls)
    fresh_urls = [
        url for url in defraged_urls
        if url not in file_storage and not filter_url(url)
    ]
    urls_to_visit.update(fresh_urls)
    iteration += 1
    dump_state(iteration, urls_to_visit)
    return iteration


def download_wiki(file_storage, iteration, urls_to_visit):
    results = []
    with Pool() as pool:
        while urls_to_visit or results:
            new_url = urls_to_visit.pop()
            if new_url in file_storage:
                continue
            results.append(pool.apply_async(parse_url, (new_url,)))
            for ind, result in enumerate(results):
                if result.ready():
                    completed_result = result.get()
                    results.pop(ind)
                    break
            else:
                if len(results) < cpu_count - 1 and urls_to_visit:
                    continue
                else:
                    parsing_result = None
                    while parsing_result is None and results:
                        parsing_result = results[0].get()
                        results.pop(0)
                    if parsing_result is None:
                        continue
            iteration = manage_parsing_result(parsing_result, file_storage, iteration, urls_to_visit)

In [13]:
iteration, urls_to_visit = load_state()
download_wiki(file_storage, iteration, urls_to_visit)

In [14]:
len(file_storage)

193170

In [16]:
len(file_storage) - len([key for key in file_storage.keys() if key[len(beginning):].startswith('Category:')])

151127

In [26]:
download_from_the_internet('https://simple.wikipedia.org/wiki/R50/53:_Very_toxic_to_aquatic_organisms,_may_cause_long-term_adverse_effects_in_the aquatic_environment')

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of R-phrases - Simple English Wikipedia, the free encyclopedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_R-phrases","wgTitle":"List of R-phrases","wgCurRevisionId":4482389,"wgRevisionId":4482389,"wgArticleId":269558,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Safety","Chemistry lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June

In [44]:
def redirect_url(page):
    if 'wgRedirectedFrom"' in page:
        page_name_pattern = 'wgPageName":"'
        begin_ind = page.index(page_name_pattern)
        begin_ind += len(page_name_pattern)
        end_ind = page.index('"', begin_ind)
        return beginning + page[begin_ind:end_ind]

In [54]:
def is_article(page):
    pattern = '"wgIsArticle":'
    begin_ind = page.index(pattern)
    begin_ind += len(pattern)
    end_ind = page.index(',', begin_ind)
    answer = page[begin_ind:end_ind]
    if answer == "true":
        return True
    elif answer == 'false':
        return False
    else:
        return answer

In [65]:
is_article(download_from_the_internet('https://simple.wikipedia.org/wiki/Special:CategoryTree'))

False

In [64]:
is_article(file_storage.read('https://simple.wikipedia.org/wiki/Category:Establishments_in_France'))

True

In [68]:
is_article(file_storage.read('https://simple.wikipedia.org/wiki/Cat'))

True

Wow, wiki считает категории статьями?

In [58]:
for url, page in file_storage.items():
    a = is_article(page)
    if isinstance(a, str):
        print(a, url)
        break
    elif not a:
        print(url)

Жаль, тут ничего лишнего не нашли(

Уберем редиректы

In [72]:
file_storage_without_redirects = FileStorage('without_redirects')

In [73]:
for url, page in file_storage.items():
    redirect = redirect_url(page)
    if redirect is not None:
        if redirect not in file_storage:
            print(redirect)
            file_storage_without_redirects.write(redirect, page)
    else:
        file_storage_without_redirects.write(url, page)

https://simple.wikipedia.org/wiki/Polish–Soviet_War
https://simple.wikipedia.org/wiki/Centralism
https://simple.wikipedia.org/wiki/Niccolò_Machiavelli
https://simple.wikipedia.org/wiki/Sinéad_O'Connor
https://simple.wikipedia.org/wiki/Chloë_Moretz
https://simple.wikipedia.org/wiki/Pago_Pago,_American_Samoa
https://simple.wikipedia.org/wiki/Njord
https://simple.wikipedia.org/wiki/Bandage
https://simple.wikipedia.org/wiki/Gdańsk
https://simple.wikipedia.org/wiki/Saikaidō_Seamounts
https://simple.wikipedia.org/wiki/Motörhead
https://simple.wikipedia.org/wiki/Árpád_Göncz
https://simple.wikipedia.org/wiki/Diego_Velázquez
https://simple.wikipedia.org/wiki/Bahá'í_Faith
https://simple.wikipedia.org/wiki/Görlitz
https://simple.wikipedia.org/wiki/Wilhelm_Röntgen
https://simple.wikipedia.org/wiki/12_Super_Éxitos
https://simple.wikipedia.org/wiki/Jehovah's_Witnesses
https://simple.wikipedia.org/wiki/National_Chiao_Tung_University
https://simple.wikipedia.org/wiki/Encyclopædia_Britannica
https://si

https://simple.wikipedia.org/wiki/Folie_à_Deux_(album)
https://simple.wikipedia.org/wiki/Diet_(nutrition)
https://simple.wikipedia.org/wiki/1997–98_NHL_season
https://simple.wikipedia.org/wiki/Musa_ibn_Jafar
https://simple.wikipedia.org/wiki/People's_Party_for_Freedom_and_Democracy
https://simple.wikipedia.org/wiki/Marián_Gáborík
https://simple.wikipedia.org/wiki/Zürich
https://simple.wikipedia.org/wiki/Élisabeth_of_France_(1764–1794)
https://simple.wikipedia.org/wiki/History_of_the_United_States_(1789–1849)
https://simple.wikipedia.org/wiki/Iran–Contra_affair
https://simple.wikipedia.org/wiki/Mineral_exploration
https://simple.wikipedia.org/wiki/Barthélemy_Dumortier
https://simple.wikipedia.org/wiki/Song_Ikp'il
https://simple.wikipedia.org/wiki/Qur'an
https://simple.wikipedia.org/wiki/Sarton
https://simple.wikipedia.org/wiki/Valparaíso
https://simple.wikipedia.org/wiki/Ole_Gunnar_Solskjær
https://simple.wikipedia.org/wiki/Ion_Creangă
https://simple.wikipedia.org/wiki/Adolfo_Suárez
htt

https://simple.wikipedia.org/wiki/Ratchet_\u0026_Clank_(series)
https://simple.wikipedia.org/wiki/Hartsfield–Jackson_Atlanta_International_Airport
https://simple.wikipedia.org/wiki/Valence,_Drôme
https://simple.wikipedia.org/wiki/Pak_Chŏng_Hŭi
https://simple.wikipedia.org/wiki/Planetshine
https://simple.wikipedia.org/wiki/Portland_Winterhawks
https://simple.wikipedia.org/wiki/Gnome
https://simple.wikipedia.org/wiki/The_National_Grange_of_the_Order_of_Patrons_of_Husbandry
https://simple.wikipedia.org/wiki/Giorgio_Ardisson
https://simple.wikipedia.org/wiki/Austrian_People's_Party
https://simple.wikipedia.org/wiki/Ii
https://simple.wikipedia.org/wiki/Tromsø
https://simple.wikipedia.org/wiki/Saarbrücken_(district)
https://simple.wikipedia.org/wiki/Upton_Park
https://simple.wikipedia.org/wiki/King's_Cross_St._Pancras_tube_station
https://simple.wikipedia.org/wiki/Bee_hummingbird
https://simple.wikipedia.org/wiki/Takeshi_Ono
https://simple.wikipedia.org/wiki/Blue_Öyster_Cult
https://simple.w

https://simple.wikipedia.org/wiki/UTC+01:00
https://simple.wikipedia.org/wiki/Radiography
https://simple.wikipedia.org/wiki/Volapük
https://simple.wikipedia.org/wiki/Sumatran_tiger
https://simple.wikipedia.org/wiki/Córdoba
https://simple.wikipedia.org/wiki/King's_Sutton_railway_station
https://simple.wikipedia.org/wiki/Paul_Cézanne
https://simple.wikipedia.org/wiki/Young's_double-slit_experiment
https://simple.wikipedia.org/wiki/Legendary_Pokémon
https://simple.wikipedia.org/wiki/Aşgabat
https://simple.wikipedia.org/wiki/Orgy
https://simple.wikipedia.org/wiki/Johann_Friedrich_Blumenbach
https://simple.wikipedia.org/wiki/Canon_(official)
https://simple.wikipedia.org/wiki/Spanish–American_War
https://simple.wikipedia.org/wiki/Iðunn
https://simple.wikipedia.org/wiki/Obsessive–compulsive_disorder
https://simple.wikipedia.org/wiki/Bogotá
https://simple.wikipedia.org/wiki/Thomas_Müller
https://simple.wikipedia.org/wiki/St_Stephen's_Cathedral,_Vienna
https://simple.wikipedia.org/wiki/Martial_

https://simple.wikipedia.org/wiki/Nîmes
https://simple.wikipedia.org/wiki/A_Midsummer_Night's_Dream
https://simple.wikipedia.org/wiki/Gameplay_of_Pokémon
https://simple.wikipedia.org/wiki/Esox
https://simple.wikipedia.org/wiki/Regierender_Bürgermeister
https://simple.wikipedia.org/wiki/Queen's_University
https://simple.wikipedia.org/wiki/Glaze
https://simple.wikipedia.org/wiki/Sir_David's_Long-beaked_Echidna
https://simple.wikipedia.org/wiki/Sněžka
https://simple.wikipedia.org/wiki/Pokémon
https://simple.wikipedia.org/wiki/1980–81_NHL_season
https://simple.wikipedia.org/wiki/Besham
https://simple.wikipedia.org/wiki/Qur'an
https://simple.wikipedia.org/wiki/Brain_stem
https://simple.wikipedia.org/wiki/Magan_(civlization)
https://simple.wikipedia.org/wiki/Dáil_Éireann
https://simple.wikipedia.org/wiki/Höchstetten
https://simple.wikipedia.org/wiki/Marie-Antoine_Carême
https://simple.wikipedia.org/wiki/Louise_Françoise_de_Bourbon,_Duchess_of_Bourbon
https://simple.wikipedia.org/wiki/Cupa_Ro

https://simple.wikipedia.org/wiki/Wrocław
https://simple.wikipedia.org/wiki/Cardinal_number
https://simple.wikipedia.org/wiki/Zsa_Zsa_Gábor
https://simple.wikipedia.org/wiki/January–February_2019_North_American_cold_wave
https://simple.wikipedia.org/wiki/Marie_Thérèse_of_Austria
https://simple.wikipedia.org/wiki/1810–1819_Atlantic_hurricane_seasons
https://simple.wikipedia.org/wiki/Pimlico_tube_station
https://simple.wikipedia.org/wiki/Torrent_salamander
https://simple.wikipedia.org/wiki/Marley_\u0026_Me_(movie)
https://simple.wikipedia.org/wiki/Hawker
https://simple.wikipedia.org/wiki/Twice-cooked_pork
https://simple.wikipedia.org/wiki/Idrissa_Ouédraogo
https://simple.wikipedia.org/wiki/Côte_d'Ivoire
https://simple.wikipedia.org/wiki/Céline_Dion
https://simple.wikipedia.org/wiki/Legendary_Pokémon
https://simple.wikipedia.org/wiki/Jorge_Batlle_Ibáñez
https://simple.wikipedia.org/wiki/Estrildid_finch
https://simple.wikipedia.org/wiki/Jorge_González
https://simple.wikipedia.org/wiki/Fath

https://simple.wikipedia.org/wiki/2010–2011_Queensland_floods
https://simple.wikipedia.org/wiki/Saints_Cyril_and_Methodius
https://simple.wikipedia.org/wiki/Carlo_Maderno
https://simple.wikipedia.org/wiki/Hooke's_law
https://simple.wikipedia.org/wiki/Earth's_magnetic_field
https://simple.wikipedia.org/wiki/Liège
https://simple.wikipedia.org/wiki/Stanisław_Ulam
https://simple.wikipedia.org/wiki/Yūki_Ōgimi
https://simple.wikipedia.org/wiki/Military–industrial_complex
https://simple.wikipedia.org/wiki/Ricardo_Domínguez
https://simple.wikipedia.org/wiki/Rivers_of_Côte_d'Ivoire
https://simple.wikipedia.org/wiki/Amborella
https://simple.wikipedia.org/wiki/Mülheim
https://simple.wikipedia.org/wiki/Villeneuve-d'Ascq
https://simple.wikipedia.org/wiki/León_(province)
https://simple.wikipedia.org/wiki/Ardèche
https://simple.wikipedia.org/wiki/Bouïra_Province
https://simple.wikipedia.org/wiki/Canton_of_Zürich
https://simple.wikipedia.org/wiki/The_Mamas_\u0026_the_Papas
https://simple.wikipedia.org

https://simple.wikipedia.org/wiki/Help:Contents
https://simple.wikipedia.org/wiki/CW_Leonis
https://simple.wikipedia.org/wiki/Drum_stick
https://simple.wikipedia.org/wiki/Príncipe
https://simple.wikipedia.org/wiki/Pokémon
https://simple.wikipedia.org/wiki/António_Egas_Moniz
https://simple.wikipedia.org/wiki/Fédération_Internationale_de_l'Automobile
https://simple.wikipedia.org/wiki/1830–1839_Atlantic_hurricane_seasons
https://simple.wikipedia.org/wiki/Dorsal
https://simple.wikipedia.org/wiki/People's_Choice_Award
https://simple.wikipedia.org/wiki/Meroë
https://simple.wikipedia.org/wiki/Barnes_\u0026_Noble
https://simple.wikipedia.org/wiki/Jiří_Fischer
https://simple.wikipedia.org/wiki/Kingdom_of_Italy_(1861–1946)
https://simple.wikipedia.org/wiki/Cliché
https://simple.wikipedia.org/wiki/Gabriel_García_Márquez
https://simple.wikipedia.org/wiki/Ta'if
https://simple.wikipedia.org/wiki/President_of_China
https://simple.wikipedia.org/wiki/Paris_Opéra
https://simple.wikipedia.org/wiki/Sächsi

https://simple.wikipedia.org/wiki/Legendary_Pokémon
https://simple.wikipedia.org/wiki/ABS-CBN_Corporation
https://simple.wikipedia.org/wiki/Mogilyov
https://simple.wikipedia.org/wiki/Post_(album)
https://simple.wikipedia.org/wiki/Onryō
https://simple.wikipedia.org/wiki/Beekeeping
https://simple.wikipedia.org/wiki/Legendary_Pokémon
https://simple.wikipedia.org/wiki/St._George's_Chapel_at_Windsor_Castle
https://simple.wikipedia.org/wiki/4×4=12
https://simple.wikipedia.org/wiki/Thames_Embankment
https://simple.wikipedia.org/wiki/Víctor_Jara
https://simple.wikipedia.org/wiki/AirAsia
https://simple.wikipedia.org/wiki/Françoise_d'Aubigné,_Marquise_de_Maintenon
https://simple.wikipedia.org/wiki/Robert_Baker
https://simple.wikipedia.org/wiki/1983_Invasion_of_Grenada
https://simple.wikipedia.org/wiki/Sächsische_Schweiz_(tourism)
https://simple.wikipedia.org/wiki/Seattle–Tacoma_International_Airport
https://simple.wikipedia.org/wiki/Prince_du_sang
https://simple.wikipedia.org/wiki/King's_Cross_s

https://simple.wikipedia.org/wiki/Cosimo_II_de'_Medici,_Grand_Duke_of_Tuscany
https://simple.wikipedia.org/wiki/Joaquín_Turina
https://simple.wikipedia.org/wiki/Graubünden
https://simple.wikipedia.org/wiki/Woodward's_wallaroo
https://simple.wikipedia.org/wiki/Spiritual_séance
https://simple.wikipedia.org/wiki/Children's_literature
https://simple.wikipedia.org/wiki/Legendary_Pokémon
https://simple.wikipedia.org/wiki/Railway_track
https://simple.wikipedia.org/wiki/Jan_Purkyně
https://simple.wikipedia.org/wiki/Aesch,_Zürich
https://simple.wikipedia.org/wiki/Saatly_Rayon
https://simple.wikipedia.org/wiki/Russian_Women's_Football_Championship
https://simple.wikipedia.org/wiki/Snell's_law
https://simple.wikipedia.org/wiki/Osoyoos,_British_Columbia
https://simple.wikipedia.org/wiki/Niccolò_Machiavelli
https://simple.wikipedia.org/wiki/The_Strand
https://simple.wikipedia.org/wiki/Eid_festival
https://simple.wikipedia.org/wiki/Golden_Gate_(Gdańsk)
https://simple.wikipedia.org/wiki/City_Clerk_of

https://simple.wikipedia.org/wiki/Crested_penguin
https://simple.wikipedia.org/wiki/Sofía_Vergara
https://simple.wikipedia.org/wiki/Pokémon
https://simple.wikipedia.org/wiki/IFK_Norrköping
https://simple.wikipedia.org/wiki/Provisional_government
https://simple.wikipedia.org/wiki/Phosphorus_trichloride
https://simple.wikipedia.org/wiki/Timiş
https://simple.wikipedia.org/wiki/Querétaro
https://simple.wikipedia.org/wiki/Encyclopædia_Britannica,_Inc.
https://simple.wikipedia.org/wiki/Belém
https://simple.wikipedia.org/wiki/Yasujirō_Ozu
https://simple.wikipedia.org/wiki/Marie-Antoine_Carême
https://simple.wikipedia.org/wiki/Mark_of_Cornwall
https://simple.wikipedia.org/wiki/Sulzberg,_Oberallgäu
https://simple.wikipedia.org/wiki/Qazigund
https://simple.wikipedia.org/wiki/Nagtglas's_African_dormouse
https://simple.wikipedia.org/wiki/Mario_Benjamín_Menéndez
https://simple.wikipedia.org/wiki/Pokémon_Ruby_and_Sapphire
https://simple.wikipedia.org/wiki/Zimbabwe-Rhodesia
https://simple.wikipedia.o

https://simple.wikipedia.org/wiki/Boötes
https://simple.wikipedia.org/wiki/Condé_Nast_Building
https://simple.wikipedia.org/wiki/Castile_and_León
https://simple.wikipedia.org/wiki/1969–70_NHL_season
https://simple.wikipedia.org/wiki/Jon_Batiste
https://simple.wikipedia.org/wiki/Agustinia
https://simple.wikipedia.org/wiki/Paris_Opéra
https://simple.wikipedia.org/wiki/Beauvau_family
https://simple.wikipedia.org/wiki/Paris–Le_Bourget_Airport
https://simple.wikipedia.org/wiki/Kyūshū
https://simple.wikipedia.org/wiki/Saskatchewan_River
https://simple.wikipedia.org/wiki/Mezquita_de_Córdoba
https://simple.wikipedia.org/wiki/The_Seven_Samurai
https://simple.wikipedia.org/wiki/Timişoara
https://simple.wikipedia.org/wiki/Br'er_Rabbit
https://simple.wikipedia.org/wiki/Pokémon_Ruby_and_Sapphire
https://simple.wikipedia.org/wiki/Stiff_Upper_Lip_(album)
https://simple.wikipedia.org/wiki/When_You're_Gone
https://simple.wikipedia.org/wiki/African_Red_Toad
https://simple.wikipedia.org/wiki/Saint-Just-d

https://simple.wikipedia.org/wiki/Playford
https://simple.wikipedia.org/wiki/Cox's_Bazar_District
https://simple.wikipedia.org/wiki/Eva_Perón
https://simple.wikipedia.org/wiki/Equilibrium_market_price
https://simple.wikipedia.org/wiki/Hasan_Abdal
https://simple.wikipedia.org/wiki/Blood_pheasant
https://simple.wikipedia.org/wiki/Obsessive–compulsive_disorder
https://simple.wikipedia.org/wiki/Newton's_law_of_universal_gravitation
https://simple.wikipedia.org/wiki/Tiffany
https://simple.wikipedia.org/wiki/Peja_Stojaković
https://simple.wikipedia.org/wiki/Aleksey_Alekseyevich_Brusilov
https://simple.wikipedia.org/wiki/Ashern
https://simple.wikipedia.org/wiki/Egg_white
https://simple.wikipedia.org/wiki/\
https://simple.wikipedia.org/wiki/Occam's_razor
https://simple.wikipedia.org/wiki/AT\u0026T_Stadium
https://simple.wikipedia.org/wiki/Chiriku_Hachiman-gū
https://simple.wikipedia.org/wiki/Entertainment_Tonight_Canada
https://simple.wikipedia.org/wiki/Here_I_Am_(song)
https://simple.wikipedi

https://simple.wikipedia.org/wiki/Pasiphaë_(moon)
https://simple.wikipedia.org/wiki/Kashima-jingū
https://simple.wikipedia.org/wiki/Aardwolf
https://simple.wikipedia.org/wiki/Charleville-Mézières
https://simple.wikipedia.org/wiki/Sebastián_Aguilera_de_Heredia
https://simple.wikipedia.org/wiki/UTC+10
https://simple.wikipedia.org/wiki/William_Forsythe
https://simple.wikipedia.org/wiki/Châtillon,_Jura
https://simple.wikipedia.org/wiki/Ultraviolet–visible_absorption
https://simple.wikipedia.org/wiki/DNA_damage_theory_of_ageing
https://simple.wikipedia.org/wiki/Georges_Vézina
https://simple.wikipedia.org/wiki/The_Grant
https://simple.wikipedia.org/wiki/Walk_This_Way
https://simple.wikipedia.org/wiki/Saikaidō_Seamounts
https://simple.wikipedia.org/wiki/Paul_Cézanne
https://simple.wikipedia.org/wiki/Dioxin
https://simple.wikipedia.org/wiki/Heckler_\u0026_Koch_MP_series
https://simple.wikipedia.org/wiki/Intimacy
https://simple.wikipedia.org/wiki/Jorge_Batlle_Ibáñez
https://simple.wikipedia.org

https://simple.wikipedia.org/wiki/Breakout_(album)
https://simple.wikipedia.org/wiki/Chari-Baguirmi
https://simple.wikipedia.org/wiki/2011–12_Fußball-Bundesliga
https://simple.wikipedia.org/wiki/Ōnojō
https://simple.wikipedia.org/wiki/Co-factor
https://simple.wikipedia.org/wiki/Galli
https://simple.wikipedia.org/wiki/Operación_Puerto_doping_case
https://simple.wikipedia.org/wiki/Patrick_M'Boma
https://simple.wikipedia.org/wiki/Thallium_halide
https://simple.wikipedia.org/wiki/Jentink's_dormouse
https://simple.wikipedia.org/wiki/Místico
https://simple.wikipedia.org/wiki/Rock_en_español
https://simple.wikipedia.org/wiki/Medellín
https://simple.wikipedia.org/wiki/Beyti_(meat_dish)
https://simple.wikipedia.org/wiki/Royal_Colleges_of_Surgeons
https://simple.wikipedia.org/wiki/Bose–Einstein_condensate
https://simple.wikipedia.org/wiki/Khloé_Kardashian
https://simple.wikipedia.org/wiki/35_(number)
https://simple.wikipedia.org/wiki/Anna_Maria_Luisa_de'_Medici
https://simple.wikipedia.org/wiki/

https://simple.wikipedia.org/wiki/Mihai_Răzvan_Ungureanu
https://simple.wikipedia.org/wiki/Let's_Play
https://simple.wikipedia.org/wiki/Emerson,_Lake_\u0026_Palmer
https://simple.wikipedia.org/wiki/Cosimo_III_de'_Medici,_Grand_Duke_of_Tuscany
https://simple.wikipedia.org/wiki/Room_Service
https://simple.wikipedia.org/wiki/Crash_Holly
https://simple.wikipedia.org/wiki/Tom_Clancy's_Rainbow_Six:_Vegas_2
https://simple.wikipedia.org/wiki/Saint_Catherine's_Monastery
https://simple.wikipedia.org/wiki/2014–15_Bundesliga_(women)
https://simple.wikipedia.org/wiki/Nazirite
https://simple.wikipedia.org/wiki/Opéra-Comique
https://simple.wikipedia.org/wiki/Lowell_P._Weicker,_Jr.
https://simple.wikipedia.org/wiki/DIY
https://simple.wikipedia.org/wiki/Scholz's_Star
https://simple.wikipedia.org/wiki/Réaumur_scale
https://simple.wikipedia.org/wiki/Severan_dynasty
https://simple.wikipedia.org/wiki/André_the_Giant
https://simple.wikipedia.org/wiki/Gríðr
https://simple.wikipedia.org/wiki/Quantum_theory
ht

https://simple.wikipedia.org/wiki/Drop_goal
https://simple.wikipedia.org/wiki/Domino's_Pizza
https://simple.wikipedia.org/wiki/Groupement_des_industries_françaises_aéronautiques_et_spatiales
https://simple.wikipedia.org/wiki/Hansō_Sōshitsu
https://simple.wikipedia.org/wiki/2012_London_mayoral_election
https://simple.wikipedia.org/wiki/Nēnē
https://simple.wikipedia.org/wiki/Rimouski_Océanic
https://simple.wikipedia.org/wiki/Aristide_Cavaillé-Coll
https://simple.wikipedia.org/wiki/Münchausen_syndrome
https://simple.wikipedia.org/wiki/ʻIolani_Palace
https://simple.wikipedia.org/wiki/Barcelona_Metro_line_6
https://simple.wikipedia.org/wiki/Kazimierz_Świątek
https://simple.wikipedia.org/wiki/Hertzsprung–Russell_diagram
https://simple.wikipedia.org/wiki/Closed_source
https://simple.wikipedia.org/wiki/Nuss_Procedure
https://simple.wikipedia.org/wiki/Leonese_Language_Day
https://simple.wikipedia.org/wiki/Railway_track
https://simple.wikipedia.org/wiki/Qaumī_Tarāna
https://simple.wikipedia.org/

In [74]:
len(file_storage_without_redirects)

168476

Получается, если считать категории статьями - выходит много, а если нет - мало)
В любом случае, не знаю что еще можно убрать.