In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'https://en.wikipedia.org/wiki/Occam%27s_razor'

In [3]:
my_attrs = {'format': 'json'}

In [5]:
resp = requests.get(url, params = my_attrs)

In [6]:
resp.status_code

200

In [7]:
html = resp.content

In [8]:
soup = BeautifulSoup(html, 'html.parser')

In [9]:
definition = soup.select_one('p')

In [10]:
definition.text

'Occam\'s razor (also Ockham\'s razor or Ocham\'s razor; Latin: lex parsimoniae "law of parsimony") is the problem-solving principle that, when presented with competing hypothetical answers to a problem, one should select the answer that makes the fewest assumptions. The idea is attributed to William of Ockham (c. 1287–1347), who was an English Franciscan friar, scholastic philosopher, and theologian.'

In [11]:
bookurl = 'https://en.wikipedia.org/wiki/Book:Machine_Learning_%E2%80%93_The_Complete_Guide'

In [13]:
resp = requests.get(bookurl, params = my_attrs)

In [14]:
html = resp.content

In [15]:
soup = BeautifulSoup(html, 'html.parser')

In [16]:
if resp.status_code == 200:
    sections = soup.select('dl')

In [17]:
sections

[<dl>
 <dt>Introduction and Main Principles</dt>
 <dd><a href="/wiki/Machine_learning" title="Machine learning">Machine learning</a></dd>
 <dd><a href="/wiki/Data_analysis" title="Data analysis">Data analysis</a></dd>
 <dd><a href="/wiki/Occam%27s_razor" title="Occam's razor">Occam's razor</a></dd>
 <dd><a href="/wiki/Curse_of_dimensionality" title="Curse of dimensionality">Curse of dimensionality</a></dd>
 <dd><a href="/wiki/No_free_lunch_theorem" title="No free lunch theorem">No free lunch theorem</a></dd>
 <dd><a href="/wiki/Accuracy_paradox" title="Accuracy paradox">Accuracy paradox</a></dd>
 <dd><a href="/wiki/Overfitting" title="Overfitting">Overfitting</a></dd>
 <dd><a class="mw-redirect" href="/wiki/Regularization_(machine_learning)" title="Regularization (machine learning)">Regularization (machine learning)</a></dd>
 <dd><a href="/wiki/Inductive_bias" title="Inductive bias">Inductive bias</a></dd>
 <dd><a href="/wiki/Data_dredging" title="Data dredging">Data dredging</a></dd>


In [18]:
import pandas as pd

In [23]:
first = links[0]

In [27]:
f_link = first.select_one('a')

In [28]:
f_link.attrs

{'href': '/wiki/Machine_learning', 'title': 'Machine learning'}

In [19]:
def parse_sec(sec):
    parsed_dds = []
    dt = sec.select_one('dt')
    if dt is not None:
        sec_title = dt.text
        dds = sec.select('dd')
        if dds is not None:
            parsed_dds = [parse_dd(dd, sec_title) for dd in dds]
    if len(parsed_dds) > 0:
        return parsed_dds

def parse_dd(dd, sec_title):
    link = dd.select_one('a')
    if link is not None:
        attrs = link.attrs
        link_dict = {
            'title': attrs['title'],
            'href': attrs['href'],
            'section': sec_title
        }
        return link_dict

In [20]:
parsed_links = []
for sec in sections:
    parsed_content = parse_sec(sec)
    if parsed_content is not None:
        parsed_links.extend(parsed_content)

In [21]:
len(parsed_links)

561

In [22]:
parsed_links

[{'href': '/wiki/Machine_learning',
  'section': 'Introduction and Main Principles',
  'title': 'Machine learning'},
 {'href': '/wiki/Data_analysis',
  'section': 'Introduction and Main Principles',
  'title': 'Data analysis'},
 {'href': '/wiki/Occam%27s_razor',
  'section': 'Introduction and Main Principles',
  'title': "Occam's razor"},
 {'href': '/wiki/Curse_of_dimensionality',
  'section': 'Introduction and Main Principles',
  'title': 'Curse of dimensionality'},
 {'href': '/wiki/No_free_lunch_theorem',
  'section': 'Introduction and Main Principles',
  'title': 'No free lunch theorem'},
 {'href': '/wiki/Accuracy_paradox',
  'section': 'Introduction and Main Principles',
  'title': 'Accuracy paradox'},
 {'href': '/wiki/Overfitting',
  'section': 'Introduction and Main Principles',
  'title': 'Overfitting'},
 {'href': '/wiki/Regularization_(machine_learning)',
  'section': 'Introduction and Main Principles',
  'title': 'Regularization (machine learning)'},
 {'href': '/wiki/Inductive

In [23]:
baseurl = 'https://en.wikipedia.org'

In [24]:
def add_definition(link_dict):
    if link_dict is not None:
        url = baseurl + link_dict['href']
        print(url)
        response = requests.get(url, params = my_attrs)
        if response.status_code == 200:
            html = response.content
            soup = BeautifulSoup(html, 'html.parser')
            div = soup.select_one('div.mw-parser-output')
#             print(div)
            p_definition = div.select_one('p')
#             print(p_definition)
            definition = p_definition.text
            link_dict['definition'] = definition
            print("Got definition for {}.".format(link_dict['title']))
#             print(definition)
    return link_dict

In [25]:
for link_dict in parsed_links:
    link_dict = add_definition(link_dict)

https://en.wikipedia.org/wiki/Machine_learning
Got definition for Machine learning.
https://en.wikipedia.org/wiki/Data_analysis
Got definition for Data analysis.
https://en.wikipedia.org/wiki/Occam%27s_razor
Got definition for Occam's razor.
https://en.wikipedia.org/wiki/Curse_of_dimensionality
Got definition for Curse of dimensionality.
https://en.wikipedia.org/wiki/No_free_lunch_theorem
Got definition for No free lunch theorem.
https://en.wikipedia.org/wiki/Accuracy_paradox
Got definition for Accuracy paradox.
https://en.wikipedia.org/wiki/Overfitting
Got definition for Overfitting.
https://en.wikipedia.org/wiki/Regularization_(machine_learning)
Got definition for Regularization (machine learning).
https://en.wikipedia.org/wiki/Inductive_bias
Got definition for Inductive bias.
https://en.wikipedia.org/wiki/Data_dredging
Got definition for Data dredging.
https://en.wikipedia.org/wiki/Ugly_duckling_theorem
Got definition for Ugly duckling theorem.
https://en.wikipedia.org/wiki/Uncertai

AttributeError: 'NoneType' object has no attribute 'text'

In [26]:
parsed_links

[{'definition': 'Machine learning is a field of computer science that often uses statistical techniques to give computers the ability to "learn" (i.e., progressively improve performance on a specific task) with data, without being explicitly programmed.[1]',
  'href': '/wiki/Machine_learning',
  'section': 'Introduction and Main Principles',
  'title': 'Machine learning'},
 {'definition': 'Numerical analysis\xa0· Simulation',
  'href': '/wiki/Data_analysis',
  'section': 'Introduction and Main Principles',
  'title': 'Data analysis'},
 {'definition': 'Occam\'s razor (also Ockham\'s razor or Ocham\'s razor; Latin: lex parsimoniae "law of parsimony") is the problem-solving principle that, when presented with competing hypothetical answers to a problem, one should select the answer that makes the fewest assumptions. The idea is attributed to William of Ockham (c. 1287–1347), who was an English Franciscan friar, scholastic philosopher, and theologian.',
  'href': '/wiki/Occam%27s_razor',
 

In [30]:
len(parsed_links)

560

In [29]:
parsed_links = [d for d in parsed_links if d is not None]

In [31]:
ml_def_df = pd.DataFrame(parsed_links)

In [32]:
ml_def_df.to_pickle()

Unnamed: 0,definition,href,section,title
0,Machine learning is a field of computer scienc...,/wiki/Machine_learning,Introduction and Main Principles,Machine learning
1,Numerical analysis · Simulation,/wiki/Data_analysis,Introduction and Main Principles,Data analysis
2,Occam's razor (also Ockham's razor or Ocham's ...,/wiki/Occam%27s_razor,Introduction and Main Principles,Occam's razor
3,The curse of dimensionality refers to various ...,/wiki/Curse_of_dimensionality,Introduction and Main Principles,Curse of dimensionality
4,"In mathematical folklore, the ""no free lunch"" ...",/wiki/No_free_lunch_theorem,Introduction and Main Principles,No free lunch theorem
5,The accuracy paradox for predictive analytics ...,/wiki/Accuracy_paradox,Introduction and Main Principles,Accuracy paradox
6,"In statistics, overfitting is ""the production ...",/wiki/Overfitting,Introduction and Main Principles,Overfitting
7,"In mathematics, statistics, and computer scien...",/wiki/Regularization_(machine_learning),Introduction and Main Principles,Regularization (machine learning)
8,The inductive bias (also known as learning bia...,/wiki/Inductive_bias,Introduction and Main Principles,Inductive bias
9,"Data dredging (also data fishing, data snoopin...",/wiki/Data_dredging,Introduction and Main Principles,Data dredging


Test to see if pickling worked

In [33]:
df = pd.read_pickle('../data/180530_def.pkl')

In [34]:
df

Unnamed: 0,definition,href,section,title
0,Machine learning is a field of computer scienc...,/wiki/Machine_learning,Introduction and Main Principles,Machine learning
1,Numerical analysis · Simulation,/wiki/Data_analysis,Introduction and Main Principles,Data analysis
2,Occam's razor (also Ockham's razor or Ocham's ...,/wiki/Occam%27s_razor,Introduction and Main Principles,Occam's razor
3,The curse of dimensionality refers to various ...,/wiki/Curse_of_dimensionality,Introduction and Main Principles,Curse of dimensionality
4,"In mathematical folklore, the ""no free lunch"" ...",/wiki/No_free_lunch_theorem,Introduction and Main Principles,No free lunch theorem
5,The accuracy paradox for predictive analytics ...,/wiki/Accuracy_paradox,Introduction and Main Principles,Accuracy paradox
6,"In statistics, overfitting is ""the production ...",/wiki/Overfitting,Introduction and Main Principles,Overfitting
7,"In mathematics, statistics, and computer scien...",/wiki/Regularization_(machine_learning),Introduction and Main Principles,Regularization (machine learning)
8,The inductive bias (also known as learning bia...,/wiki/Inductive_bias,Introduction and Main Principles,Inductive bias
9,"Data dredging (also data fishing, data snoopin...",/wiki/Data_dredging,Introduction and Main Principles,Data dredging


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 4 columns):
definition    558 non-null object
href          560 non-null object
section       560 non-null object
title         560 non-null object
dtypes: object(4)
memory usage: 17.6+ KB


In [38]:
df[df['definition'].isnull()]

Unnamed: 0,definition,href,section,title
294,,/wiki/Comparison_of_general_and_generalized_li...,Regression analysis,Comparison of general and generalized linear m...
402,,/w/index.php?title=Counterpropagation_network&...,Neural Networks,Counterpropagation network (page does not exist)


In [41]:
type(html) == bytes

True

In [43]:
type(soup) == BeautifulSoup

True