In [29]:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [40]:
class Page:
    '''
    Crawls English Wikipedia outward from a start article and records the
    links it discovers as a graph.

    Attributes (filled in place by the methods below):
        start    -- article title the crawl begins from
        nodes    -- article titles discovered so far, in discovery order
        edges    -- sorted [title, title] pairs, one per newly discovered node
        mainTags -- category names collected by findTags
    '''

    BASE_URL = 'https://en.wikipedia.org/wiki/'
    ARTICLE_PREFIX = '/wiki/'
    CATEGORY_PREFIX = '/wiki/Category:'

    def __init__(self, myStart):
        self.start = myStart
        self.nodes = []
        self.edges = []
        self.mainTags = [] # tags of the start article

    def _fetchSoup(self, title):
        '''
        downloads a wikipedia article and parses it with BeautifulSoup
        @param title is a string representing a wikipedia article title,
               in the (already percent-encoded) form used in /wiki/ hrefs
        @return the parsed document
        '''
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(self.BASE_URL + title) as response:
            html = response.read()
        return bs(html, 'lxml')

    def findTags(self, node=None):
        '''
        appends the category names of an article to self.mainTags
        @param node is a string representing a wikipedia article title;
               defaults to the start article when omitted
        '''
        if node is None:
            node = self.start
        soup = self._fetchSoup(node)
        seg = soup.find('div', {'id':'catlinks'})
        if seg is None:
            # No category box on this page: nothing to collect.
            return
        for link in seg.find_all('a', href=True):
            url = link.get('href')
            if url.startswith(self.CATEGORY_PREFIX):
                self.mainTags.append(url[len(self.CATEGORY_PREFIX):])

    def findAll(self, n):
        '''
        crawls outward from the start article, filling self.nodes and self.edges
        @param n is an int bounding the crawl depth: pages are expanded while
               tier + 1 < n, so nodes up to n - 1 links away from the start
               are collected (n <= 1 collects nothing)
        '''
        self.findNext(self.start, 0, n)

    def findNext(self, center, tier, n):
        '''
        depth-first step of the crawl: records every not-yet-seen article
        linked from center and recurses into it
        @param center is a string representing a wikipedia article title
        @param tier is an int, the depth of center relative to the start
        @param n is an int, the depth bound passed to findAll
        '''
        if(tier + 1 < n):
            fooNodes = self.findTier(center)
            for s in fooNodes:
                # An edge is stored only for the first page that reaches s,
                # so the recorded edges form a tree rather than the full
                # link graph.
                if( s not in self.nodes):
                    self.nodes.append(s)
                    self.edges.append( sorted([center, s]))
                    self.findNext(s, tier + 1, n)

    def findTier(self, center):
        '''
        finds all nodes 1 branch away from center
        @param center is a string representing a wikipedia article title
        @return list of article titles linked from the paragraphs of center,
                in document order (duplicates are kept)
        '''
        nodeList = []
        soup = self._fetchSoup(center)
        for par in soup.find_all('p'):
            for link in par.find_all('a', href=True):
                url = link.get('href')
                # Keep only article links; a ':' marks a namespaced page
                # (File:, Help:, Category:, ...) or an external URL.
                if not url.startswith(self.ARTICLE_PREFIX) or ':' in url:
                    continue
                # Drop any '#section' fragment so links to different
                # sections of one article map to the same node.
                curSubj = url[len(self.ARTICLE_PREFIX):].split('#', 1)[0]
                if curSubj:
                    nodeList.append(curSubj)
        return nodeList

    def getNodes(self):
        '''@return the list of discovered article titles'''
        return self.nodes
    def getEdges(self):
        '''@return the list of recorded [title, title] edges'''
        return self.edges

In [38]:
# Crawl outward from the 'science' article with a depth bound of 3.
pg = Page('science')
pg.findAll(3)

# Report each collection the same way: banner line, item count, then one
# item per line.
for banner, items in (
    ('nodes------------------------------------------------------------------------------------------', pg.getNodes()),
    ('edges------------------------------------------------------------------------------------------', pg.getEdges()),
):
    print(banner)
    print(len(items))
    for e in items:
        print(e)

nodes------------------------------------------------------------------------------------------
11226
Latin_language
Classical_language
Italic_languages
Indo-European_languages
Latin_alphabet
Old_Italic_script
Greek_alphabet
Phoenician_alphabet
Latium
Italian_Peninsula
Roman_Republic
Roman_Empire
Vulgar_Latin
Romance_languages
Italian_language
Portuguese_language
Spanish_language
French_language
Romanian_language
List_of_Latin_words_with_English_derivatives
List_of_English_words_of_Italian_origin
List_of_English_words_of_French_origin
English_language
Ancient_Greek
Theology
List_of_Latin_and_Greek_words_commonly_used_in_systematic_names
List_of_medical_roots,_suffixes_and_prefixes
Old_Latin
Standard_language
Classical_Latin
Epigraphy
Plautus
Terence
Late_Latin
Medieval_Latin
The_Renaissance
Renaissance_Latin
New_Latin
Vernacular
Ecclesiastical_Latin
Holy_See
Roman_Rite
Catholic_Church
Holy_orders_(Catholic_Church)
Sacred_language
Fusional_language
Grammatical_gender
Grammatical_case
Gr

Strapwork
Peter_Lu
Paul_Steinhardt
Quasicrystalline
Penrose_tiling
Zellige
Moroccan_architecture
Muqarnas
Ibn_Mu%CA%BF%C4%81dh_al-Jayy%C4%81n%C4%AB
Law_of_sines
Right_triangle
Alhazen
Paraboloid
Ibn_al-Haytham
History_of_scientific_method
Physician
Rhazes
Abd_al-Rahman_al-Sufi
Book_of_Fixed_Stars
Andromeda_constellation
List_of_nearest_galaxies
Nasir_al-Din_al-Tusi
Tusi-couple
Equant
Ibn_al-Shatir
Copernican_heliocentrism
Emission_theory
Al-Biruni
Earth_radius
Cardiovascular_system
Commentary_on_Anatomy_in_Avicenna%27s_Canon
Pulmonary_artery
Pulmonary_vein
Marcello_Malpighi
Prussian_State_Library
Pulmonary_circulation
Michael_Servetus
Nervous_system
Nerve
Motor_neuron
Sensory_neuron
Cranial_nerves
Spinal_nerves
Optic_nerve
Hypoglossal_nerve
Cervical_nerves
Thoracic_nerves
Lumbar_nerves
Sacral_nerves
Theory_of_evolution
Conway_Zirkle
Al-Jahiz
Kitab_al-Hayawan
Ibn_Khaldun
Muqaddimah
Ban%C5%AB_M%C5%ABs%C4%81
Book_of_Ingenious_Devices
Automaton
Flute
Program_(machine)
Steam_power
Leprosy
D

Kutrigurs
Decline_of_Greco-Roman_polytheism
Hellenistic_philosophy
Romanos_the_Melodist
Divine_Liturgy
Isidore_of_Miletus
Anthemius_of_Tralles
Sophia_(wisdom)
Justin_II
Lombards
Tiberius_II_Constantine
Avars_(Carpathians)
Sirmium
Khosrau_II
Maurice%27s_Balkan_campaigns
Mesopotamia_(Roman_province)
Sassanids
Jerusalem
True_Cross
Ctesiphon
Acheiropoieta
Sergius_I_of_Constantinople
Siege_of_Constantinople_(626)
Theodore_(brother_of_Heraclius)
Shahin_Vahmanzadegan
Battle_of_Nineveh_(627)
Armenia
Battle_of_Yarmouk
Muslim_conquest_of_the_Levant
Greek_fire
Umayyad_Caliphate
Bulgars
Khazars
Constantine_IV
Asparukh_of_Bulgaria
First_Bulgarian_Empire
Justinian_II
Macedonia_(region)
Bulgarians
Tervel_of_Bulgaria
Leo_III_the_Isaurian
Constantine_V
Revolt_of_Thomas_the_Slav
Emirate_of_Crete
Petronas_the_Patrician
Battle_of_Lalakaon
Umar_al-Aqta
Emir
Malatya
Krum
Omurtag_of_Bulgaria
Treaty_of_815
Leo_V_the_Armenian
Iconoclasm
Iconodule
Irene_of_Athens
Second_Council_of_Nicaea
Theophanes_the_Confesso

Natural_number
Differential_calculus
Electron_neutrino
Differential_equation
Mathematical_function
Poisson_statistics
Integrating_factor
Calcium-40
Argon-40
Avogadro%27s_constant
Mole_(unit)
Time_constant
Mean_lifetime
Arithmetic_mean
1_E19_s_and_more
Chemical_bonds
Atomic_orbital
Natural_nuclear_fission_reactor
Silicon-32
Manganese-54
Radium-226
Radon-222
Solar_flare
Highly_charged_ion
Storage_ring
Modulation
GSI_anomaly
GSI_Helmholtz_Centre_for_Heavy_Ion_Research
Darmstadt
Eigenstates
Nuclear_force
Macroscopic
Coulomb%27s_law
Quantum_fluctuation
Quantum_tunneling
Hydrogen-7
Tellurium-128
Half_life
Entropy
Quantum_states
Avalanche
Ground_state
Activation_energy
Big_Bang_theory
Tritium
Nucleosynthesis
Supernova
Radiogenic_nuclide
Stable_isotope
Extinct_radionuclide
Stable_nuclide
Radioisotopic_labeling
Random
Hardware_random-number_generator
Le%C3%B3_Szil%C3%A1rd
Solar_system
Nebula
Accretion_(astrophysics)
Radiogenic
Rocks
Atmosphere
Crust_(geology)
Mantle_(geology)
Earth%27s_internal

Mercury-Redstone_3
Orientation_(geometry)
Retro-rocket
NASA_Distinguished_Service_Medal
President_of_the_United_States
Jerome_Wiesner
Bay_of_Pigs_invasion
Wikisource
We_choose_to_go_to_the_Moon
Rice_University
Lyndon_B._Johnson_Space_Center
Gus_Grissom
Mercury-Redstone_4
John_Glenn
Mercury-Atlas_6
Ticker-tape_parade
New_York_City
Charles_Lindbergh
Aurora_7
Mercury-Atlas_8
Faith_7
Vostok_2
Vostok_3
Vostok_4
Space_rendezvous
Vostok_5
Vostok_6
Valentina_Tereshkova
Soviet_air_show
United_Nations_General_Assembly
Nikita_Khrushchev
Assassination_of_John_F._Kennedy
Kennedy_Space_Center
Extra-vehicular_activity
S.P._Korolev_Rocket_and_Space_Corporation_Energia
Extravehicular_activity
Voskhod_1
Apollo_Command_Module
Shirt-sleeve_environment
Leonid_Brezhnev
Voskhod_2
Pavel_Belyayev
Alexey_Leonov
Perm
Vladimir_Chelomey
OKB-52
Soyuz_7K-L1
N1_(rocket)
Soyuz_7K-LOK
LK_(spacecraft)
Apollo_spacecraft
Saturn_V
Translunar_injection
United_Nations_Committee_on_the_Peaceful_Uses_of_Outer_Space
Outer_Space

16th_arrondissement_of_Paris
Civil_servant
Decree
President_of_France
Albert_Lebrun
CNRS_Gold_medal
Institut_national_de_physique_nucl%C3%A9aire_et_de_physique_des_particules
Ad_hoc
Alain_Fuchs
Max_Planck_Society
Kaiser_Wilhelm_Society
Basic_research
%E2%82%AC
Otto_Hahn
Walther_Bothe
Fritz_Haber
Times_Higher_Education_Supplement
AT%26T_Corporation
Argonne_National_Laboratory
Compete.com
Eingetragener_Verein
Peter_Gruss
Schloss_Ringberg
Kreuth
Bavaria
Duke_in_Bavaria
Florida_Atlantic_University
Interdisciplinarity
Transdisciplinarity
Bethesda_Statement_on_Open_Access_Publishing
Berlin_Declaration_on_Open_Access_to_Knowledge_in_the_Sciences_and_Humanities
Self-archiving
Open_access_journals
Max_Planck_Digital_Library
Open_access_repository
Der_Spiegel
The_Left_(Germany)
Geographic_coordinate_system
Deutsche_Forschungsgemeinschaft
Prize
Bonn
Gottfried_Wilhelm_Leibniz_Prize
History_of_Germany_(1945%E2%80%931990)
National_Natural_Science_Foundation_of_China
Sino-German_Center_for_Research_P

Matthias_Gross
S._Lochlann_Jain
Society_for_Social_Studies_of_Science
European_Inter-University_Association_on_Society,_Science_and_Technology
Society_for_the_History_of_Technology
History_of_Science_Society
Philosophy_of_Science_Association
American_Anthropological_Association
American_Political_Science_Association
American_Sociological_Association
Social_Studies_of_Science
Science,_Technology_%26_Human_Values
Technology_and_Culture
Deliberative_Democracy
Representative_democracy
Aristotle%E2%80%99s_writings
Net_neutrality
Modernization_theory
Lobbying
Dial-up
Federal_Communications_Commission
Water_privatization
Veil_of_ignorance
Blue_skies_research
Stem_cell
Return_on_investment
Sputnik
Air_Force_Office_of_Scientific_Research
British_Petroleum
Donald_Braben
Mark_Walport
Wellcome_Trust
Imperial_College,_London
Research_council
Harold_Kroto
Times_Higher_Education
Pounds_sterling
Organisation_for_Economic_Co-operation_and_Development
Paradigm
Scientific_debate
Race_and_intelligence
Bea

Photoreceptor_protein
Photopsin
Rod_cell
Adaptation_(eye)
Wavelengths
Nocturnal
Deep_sea
Heterochromatin
Euchromatin
Outer_nuclear_layer
Image_intensification
Thermal_imaging
Starlight
Low_light_level_television
Photocathode
Night_vision_goggles
Near_infrared
Infrared_sensing_in_snakes
Thermographic_camera
Thermal_radiation
Night_vision_device
Thermal_imaging_device
Military_forces
Cirrus_Aircraft
Cessna
Binocular_vision
Exit_pupil
Pupil
Atropine
Automotive_night_vision
Search_and_rescue
Mountain_rescue
Search_and_rescue_dog
Urban_search_and_rescue
Combat_search_and_rescue
Air-sea_rescue
International_Search_and_Rescue_Advisory_Group
UNCLOS
Vergulde_Draeck
Sikorsky_Aircraft
Breeze-Eastern
Penfield_Reef
Korean_Air_Lines_Flight_007
Sakhalin_Island
Air_France_Flight_447
Black_boxes
Malaysia_Airlines_Flight_370
Australian_Transport_Safety_Bureau
Fugro
Alzheimer%27s_disease
Autism
Dementia
Earthquake
Tornado
Hurricane
Flying_boat
Floatplane
Amphibious_helicopter
Geneva_Convention_on_the_Hig

['Maya_(illusion)', 'Physical_universe']
['Physical_universe', 'Physicalism']
['Monism', 'Physical_universe']
['Metaphysical_naturalism', 'Physical_universe']
['Social_sciences', 'science']
['Discipline_(academia)', 'Social_sciences']
['Social_sciences', 'Society']
['Social_relation', 'Social_sciences']
['Individuals', 'Social_sciences']
['Economics', 'Social_sciences']
['Political_science', 'Social_sciences']
['Human_geography', 'Social_sciences']
['Psychology', 'Social_sciences']
['Social_sciences', 'Sociology']
['Anthropology', 'Social_sciences']
['Archaeology', 'Social_sciences']
['Jurisprudence', 'Social_sciences']
['Linguistics', 'Social_sciences']
['Outline_of_social_science', 'Social_sciences']
['Positivism', 'Social_sciences']
['Modern_science', 'Social_sciences']
['Antipositivism', 'Social_sciences']
['Falsifiable', 'Social_sciences']
['Eclecticism', 'Social_sciences']
['Methodology', 'Social_sciences']
['Quantitative_research', 'Social_sciences']
['Social_research', 'Social_

['Modern_era', 'Queen_Victoria']
['Modern_era', 'Reform_Act_1832']
['Modern_era', 'Pax_Britannica']
['Modern_era', 'Splendid_isolation']
['Argentina', 'Modern_era']
['Informal_empire', 'Modern_era']
['Anglo-Zulu_War', 'Modern_era']
['Modern_era', 'Zulu_Empire']
['Modern_era', 'Steamboat']
['Modern_era', 'Telegraphy']
['All_Red_Line', 'Modern_era']
['Colonial_Fiji', 'Modern_era']
['Bourbon_Restoration', 'Modern_era']
['July_Monarchy', 'Modern_era']
['Modern_era', 'Second_French_Empire']
['French_Third_Republic', 'Modern_era']
['Modern_era', 'Slavery']
['Haitian_Revolution', 'Modern_era']
['Barbary_pirates', 'Modern_era']
['Modern_era', 'Slavery_Abolition_Act']
['Modern_era', 'Slave_trade']
['Emancipation_reform_of_1861_in_Russia', 'Modern_era']
['Emancipation_Proclamation', 'Modern_era']
['Lei_%C3%81urea', 'Modern_era']
['Modern_era', 'Scramble_for_Africa']
['Berlin_West_Africa_Conference', 'Modern_era']
['Africa', 'Modern_era']
['Colony', 'Modern_era']
['Berlin_Conference', 'Modern_era

['Byzantine_empire', 'House_of_Hohenstaufen']
['Byzantine_empire', 'Philip_of_Swabia']
['Alexios_IV_Angelos', 'Byzantine_empire']
['Alexios_IV', 'Byzantine_empire']
['Byzantine_empire', 'Prostitute']
['Baldwin_I_of_Constantinople', 'Byzantine_empire']
['Byzantine_empire', 'Latin_Empire']
['Byzantine_empire', 'Thomas_Morosini']
['Byzantine_empire', 'Empire_of_Nicaea']
['Byzantine_empire', 'Empire_of_Trebizond']
['Byzantine_empire', 'Despotate_of_Epirus']
['Alexios_I_of_Trebizond', 'Byzantine_empire']
['Byzantine_empire', 'Sultanate_of_Rum']
['Battle_of_K%C3%B6se_Da%C4%9F', 'Byzantine_empire']
['Anatolian_beyliks', 'Byzantine_empire']
['Byzantine_empire', 'Ghazi_(warrior)']
['Byzantine_empire', 'Osman_I']
['Byzantine_empire', 'Laskaris']
['Byzantine_empire', 'Recapture_of_Constantinople']
['Byzantine_empire', 'Michael_VIII_Palaiologos']
['Andronikos_II_Palaiologos', 'Byzantine_empire']
['Andronikos_III_Palaiologos', 'Byzantine_empire']
['Byzantine_empire', 'Catalan_Company']
['Byzantine_

['Einstein', 'Violin_sonata']
['Chamber_music', 'Einstein']
['Einstein', 'K%C3%B6chel_catalogue']
['Alfred_Einstein', 'Einstein']
['Einstein', 'Zoellner_Quartet']
['Einstein', 'Juilliard_Quartet']
['Einstein', 'Why_Socialism%3F']
['Einstein', 'World_government']
['Einstein', 'Pantheism']
['Einstein', 'Spinozism']
['Einstein', 'Personal_God']
['Agnosticism', 'Einstein']
['Afterlife', 'Einstein']
['Einstein', 'Internal_bleeding']
['Abdominal_aortic_aneurysm', 'Einstein']
['Einstein', 'Rudolph_Nissen']
['Einstein', 'Princeton_Hospital']
['Einstein', 'Thomas_Stoltz_Harvey']
['Albert_Einstein%27s_brain', 'Einstein']
['Einstein', 'Human_intelligence']
['Cremation', 'Einstein']
['Einstein', 'Robert_Oppenheimer']
['Bose%E2%80%93Einstein_statistics', 'Einstein']
['Einstein', 'Einstein_refrigerator']
['Einstein', 'History_of_physics']
['Capillary_attraction', 'Einstein']
['Einstein', 'Principle_of_relativity']
['Einstein', 'Hermann_Minkowski']
['Einstein', 'Principle_of_equivalence']
['Einstein'

['Field-programmable_gate_array', 'Integrated_circuits']
['Integrated_circuits', 'Power_network_design_(IC)']
['Integrated_circuits', 'Operational_amplifier']
['Active_filter', 'Integrated_circuits']
['Demodulation', 'Integrated_circuits']
['Frequency_mixer', 'Integrated_circuits']
['Analog-to-digital_converter', 'Integrated_circuits']
['Digital-to-analog_converter', 'Integrated_circuits']
['802.11', 'Integrated_circuits']
['Integrated_circuits', 'Wi-Fi']
['Atheros', 'Integrated_circuits']
['Integrated_circuits', 'Semiconductor']
['Integrated_circuits', 'Periodic_table']
['Integrated_circuits', 'Solid-state_(electronics)']
['Copper(I)_oxide', 'Integrated_circuits']
['Integrated_circuits', 'Monocrystalline_silicon']
['Integrated_circuits', 'Substrate_(printing)']
['Gallium_arsenide', 'Integrated_circuits']
['Integrated_circuits', 'Light-emitting_diode']
['Integrated_circuits', 'Solar_cell']
['Crystalline_structure', 'Integrated_circuits']
['Integrated_circuits', 'Wafer_(electronics)']
[

['Philip_Francis_Nowlan', 'Science_fiction']
['Buck_Rogers', 'Science_fiction']
['Brick_Bradford', 'Science_fiction']
['Flash_Gordon', 'Science_fiction']
['John_W._Campbell', 'Science_fiction']
['Astounding_Science_Fiction', 'Science_fiction']
['Futurians', 'Science_fiction']
['Donald_A._Wollheim', 'Science_fiction']
['Frederik_Pohl', 'Science_fiction']
['James_Blish', 'Science_fiction']
['Judith_Merril', 'Science_fiction']
['E.E._Smith', 'Science_fiction']
['Arthur_C._Clarke', 'Science_fiction']
['A._E._van_Vogt', 'Science_fiction']
['Ray_Bradbury', 'Science_fiction']
['Science_fiction', 'Stanis%C5%82aw_Lem']
['Golden_Age_of_science_fiction', 'Science_fiction']
['Galaxy_(magazine)', 'Science_fiction']
['Beat_generation', 'Science_fiction']
['Science_fiction', 'William_S._Burroughs']
['Frank_Herbert', 'Science_fiction']
['Samuel_R._Delany', 'Science_fiction']
['Roger_Zelazny', 'Science_fiction']
['Harlan_Ellison', 'Science_fiction']
['New_Wave_(science_fiction)', 'Science_fiction']
['L

['Chris_Mooney_(journalist)', 'Harvard_Medical_School']
['Chris_Mooney_(journalist)', 'Goddard_Space_Flight_Center']
['Science_outreach', 'science']
['Science_outreach', 'Umbrella_term']
['Science_museum', 'Science_outreach']
['Public_awareness_of_science', 'Science_outreach']
['Science_outreach', 'Space_Science_Institute']
['NASA_Education_and_Public_Outreach_Group_at_Sonoma_State_University', 'Science_outreach']
['Science_outreach', 'Sonoma_State_University']
['Science_festival', 'Science_outreach']
['Royal_Institution_Christmas_Lectures', 'Science_outreach']
['Astronomy_club', 'Science_outreach']
['Science_outreach', 'Scientific_demonstration']
['Caf%C3%A9_Scientifique', 'Science_outreach']
['Career_fair', 'Science_outreach']
['Robogals', 'Science_outreach']
['Perimeter_Institute', 'Science_outreach']
['Canadian_Space_Agency', 'Science_outreach']
['Science_fair', 'Science_outreach']
['Public_science', 'Science_outreach']
['K%E2%80%9312', 'Science_outreach']
['Phenotype', 'Science_ou