In [2]:
import requests

In [16]:
# Entry page of the ssa.gov baby-names site. It is fetched below and also serves
# as the base against which relative URLs found in the page are resolved.
baseUrl = 'https://www.ssa.gov/OACT/babynames/index.html'

In [3]:
# One shared session for every request in this notebook (a requests session
# persists cookies and reuses connections across calls).
s = requests.session()

In [5]:
r = s.get(baseUrl)
# Fail right here on an HTTP error status; otherwise an error page would be
# parsed below and the failure would surface later as a confusing missing-form error.
r.raise_for_status()

In [7]:
from bs4 import BeautifulSoup
# Parse the fetched page with the html5lib parser (needs the html5lib package installed).
soup = BeautifulSoup(r.text, 'html5lib')

In [28]:
# Grab the first <form> element on the page and display it.
form = soup.find('form')
form

<form action="/cgi-bin/popularnames.cgi" method="post" name="popnames" onsubmit="return submitIt();">
                <p>
                  <input id="year" maxlength="4" name="year" pattern="\d{4}" required="" size="5" style="width:100px" title="Birth Year: Must be 4 numbers" type="text" value="2016"/>
                  <label for="year" style="display:inline;">  Birth Year</label><br/>
                </p>
                <p>
                  <select id="rank" name="top" size="1" style="width:100px">
                    <option value="20">Top 20</option>
                    <option value="50">Top 50</option>
                    <option value="100">Top 100</option>
                    <option value="500">Top 500</option>
                    <option value="1000">Top 1000</option>
                  </select>
                  <label for="rank" style="display:inline;">  Popularity</label><br/>
                </p>
                <fieldset>
                  <legend>Name rankings may in

The form has an `onsubmit` JavaScript handler (`submitIt()`), so before going further I want to check whether it does anything important that we would need to reproduce when submitting the form ourselves.

In [10]:
# Collect every <script> tag so the file defining the form's submit handler can be located.
scripts = soup.findAll('script')
scripts

[<script src="/framework/js/ssa.internet.head.js"></script>,
 <script src="chkinput.js"></script>,
 <script src="/framework/js/ssa.internet.body.js"></script>]

"chkinput.js" sounds promising...

In [38]:
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.parse`; it only appeared to work because another library had already imported it.
import urllib.parse

# scripts[1] is the page-relative "chkinput.js"; resolve it against the page URL.
urllib.parse.urljoin(baseUrl, scripts[1]['src'])

'https://www.ssa.gov/OACT/babynames/chkinput.js'

In [39]:
# Build the script URL explicitly instead of using `_`: `_` is whatever the most
# recently executed cell displayed, so it silently points at something else if
# any other cell ran in between.
print(s.get(urllib.parse.urljoin(baseUrl, scripts[1]['src'])).text)

// submitIt used by form for top N names and tests for valid year
function submitIt() {
 if(IsNum(document.popnames.year.value)) {
   alert("Use numeric characters only!")
   document.popnames.year.focus()
   document.popnames.year.select()
   return false
 } 
}

// submitBaby used by form for a given baby name and
// test for valid name & valid number of years
function submitBaby() {
 if(IsNum(document.babyname.nyrs.value)) {
   alert("Use numeric characters only!")
   document.babyname.nyrs.focus()
   document.babyname.nyrs.select()
   return false
 }
 
 if(IsName(document.babyname.name.value)) {
   alert("Use alphabetic characters only!")
   document.babyname.name.focus()
   document.babyname.name.select()
   return false
 }  
}

function IsNum(u) {
// var num is set as non-digit
   var num = /\D/;
   return num.test(u);
}

function IsName(u) {
// var alpha is set as non-alphanumeric
   var alpha = /\W/;
   return alpha.test(u);
}



(Note to self: figure out how to get the JavaScript output above syntax-highlighted.)

OK, `submitIt` only validates the year field on the client side, so we can ignore it. Back to the form:

In [29]:
# Show the form again to read off its action URL and field names.
form

<form action="/cgi-bin/popularnames.cgi" method="post" name="popnames" onsubmit="return submitIt();">
                <p>
                  <input id="year" maxlength="4" name="year" pattern="\d{4}" required="" size="5" style="width:100px" title="Birth Year: Must be 4 numbers" type="text" value="2016"/>
                  <label for="year" style="display:inline;">  Birth Year</label><br/>
                </p>
                <p>
                  <select id="rank" name="top" size="1" style="width:100px">
                    <option value="20">Top 20</option>
                    <option value="50">Top 50</option>
                    <option value="100">Top 100</option>
                    <option value="500">Top 500</option>
                    <option value="1000">Top 1000</option>
                  </select>
                  <label for="rank" style="display:inline;">  Popularity</label><br/>
                </p>
                <fieldset>
                  <legend>Name rankings may in

In [36]:
# The form's `action` is a site-relative path; resolve it against the page URL
# to get the absolute endpoint to POST to.
action = urllib.parse.urljoin(baseUrl, form['action'])
print(action)

https://www.ssa.gov/cgi-bin/popularnames.cgi


In [46]:
# List the form's <input> elements; their `name` attributes are the keys the POST needs.
form.findAll('input')

[<input id="year" maxlength="4" name="year" pattern="\d{4}" required="" size="5" style="width:100px" title="Birth Year: Must be 4 numbers" type="text" value="2016"/>,
 <input id="percent" name="number" type="radio" value="p"/>,
 <input id="number" name="number" type="radio" value="n"/>,
 <input type="submit" value="  Go  "/>,
 <input type="reset" value="Reset"/>]

In [47]:
# List the form's <select> elements and the option values they accept.
form.findAll('select')

[<select id="rank" name="top" size="1" style="width:100px">
                     <option value="20">Top 20</option>
                     <option value="50">Top 50</option>
                     <option value="100">Top 100</option>
                     <option value="500">Top 500</option>
                     <option value="1000">Top 1000</option>
                   </select>]

In [49]:
# Keys mirror the form fields: `year` is the text input, `number` is the radio
# group ('n' is the value of the radio whose id is "number"), and `top` is the
# rank <select> (1000 is its largest option).
postData = {'year': 1880, 'number': 'n', 'top': 1000}
result = s.post(action, data = postData)
# Stop on an HTTP error status rather than parsing an error page as if it were results.
result.raise_for_status()
page = BeautifulSoup(result.text, 'html5lib')

In [50]:
# Every <table> element in the result page.
tables = page.findAll('table')

In [51]:
# How many tables came back?
len(tables)

4

In [54]:
# Pair each table's index with the length of its markup, to see which tables
# carry the bulk of the page.
list(enumerate(len(str(table)) for table in tables))

[(0, 357), (1, 95105), (2, 94303), (3, 499)]

In [56]:
# Tables 1 and 2 are both roughly 95k characters of markup; table 2 is the one
# whose first row holds the Rank / name / count headers.
# NOTE(review): presumably table 1 is an outer layout table wrapping this one — confirm.
names = tables[2]

In [59]:
# The first <tr> of the names table is the header row.
firstRow = names.find('tr')
firstRow

<tr align="center" valign="bottom">
  <th bgcolor="#eeeeee" scope="col" width="12%">Rank</th>
  <th bgcolor="#99ccff" scope="col" width="22%">Male name</th>
<th bgcolor="#99ccff" scope="col" width="22%">Number of<br/> males</th>
<th bgcolor="pink" scope="col" width="22%">Female name</th>
<th bgcolor="pink" scope="col" width="22%">Number of<br/> females</th></tr>

In [61]:
# Column labels are the text of the header cells (the <br/> inside a header
# contributes no text of its own).
columns = [header.get_text() for header in firstRow.findAll('th')]
columns

['Rank', 'Male name', 'Number of males', 'Female name', 'Number of females']

In [62]:
# Every <tr> that follows the header row as a sibling. Note that this also picks
# up the table's trailing footnote row, not just the name rows.
rows = firstRow.findNextSiblings('tr')
rows[:5]

[<tr align="right">
  <td>1</td> <td>John</td><td>9,655</td>
  <td>Mary</td>
 <td>7,065</td>
 </tr>, <tr align="right">
  <td>2</td> <td>William</td><td>9,532</td>
  <td>Anna</td>
 <td>2,604</td>
 </tr>, <tr align="right">
  <td>3</td> <td>James</td><td>5,927</td>
  <td>Emma</td>
 <td>2,003</td>
 </tr>, <tr align="right">
  <td>4</td> <td>Charles</td><td>5,348</td>
  <td>Elizabeth</td>
 <td>1,939</td>
 </tr>, <tr align="right">
  <td>5</td> <td>George</td><td>5,126</td>
  <td>Minnie</td>
 <td>1,746</td>
 </tr>]

In [63]:
def getValues(tr):
    """Return the text of each <td> cell in a table row, with commas removed.

    Args:
        tr: a row element exposing ``findAll('td')``; each cell must expose ``.text``.

    Returns:
        A list of strings, one per cell, in document order. Commas are stripped
        from every cell (so ``'9,655'`` becomes ``'9655'``); the values are left
        as strings and are not converted to numbers.
    """
    cleaned = []
    for cell in tr.findAll('td'):
        cleaned.append(cell.text.replace(',', ''))
    return cleaned

In [64]:
# Sanity check on the first data row.
getValues(rows[0])

['1', 'John', '9655', 'Mary', '7065']

In [65]:
# One dict per row, keyed by column label. zip() stops at the shorter sequence,
# so a row with fewer cells than there are columns (such as the trailing
# footnote row) yields a partial record.
data = [
    dict(zip(columns, getValues(tr)))
    for tr in rows
]

In [66]:
# First record: the rank-1 names.
data[0]

{'Female name': 'Mary',
 'Male name': 'John',
 'Number of females': '7065',
 'Number of males': '9655',
 'Rank': '1'}

In [67]:
# The last entry is not a name record: it is the table's footnote row, which
# has a single cell and therefore only a 'Rank' key.
data[-1]

{'Rank': 'Note: Rank 1 is the most popular\nrank 2 is the next most popular and so forth. All names are from Social Security card applications\n              for births that occurred in the United States.\n'}

In [68]:
# The last real name row (rank 1000): here the female name is empty with a count of 0.
data[-2]

{'Female name': '',
 'Male name': 'Layton',
 'Number of females': '0',
 'Number of males': '5',
 'Rank': '1000'}