# Web Scraping HuberTiming website
    This is an overview of how to scrape a web page. It's pretty simple; we'll basically:
    Load the 'document' (html)
    Parse it (well, not really, you must inspect the webpage to do that)
    Extract the data
    and transform it into a dataset

Video that helped me: https://www.youtube.com/watch?v=88oMlkWSGz0&list=WL&index=20&t=485s

In [1]:
# Imports needed for

# Web scraping
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup
import re

# DataFrame creation
import pandas as pd

In [2]:
# Download the results page.
#
# The object returned by urlopen is a one-shot stream tied to an open
# connection. Reading it completely inside a `with` block closes the
# connection right away and leaves `html` holding the raw bytes of the page,
# so the cells that use `html` can be re-run without reading from a stream
# that has already been consumed.

url = 'https://www.hubertiming.com/results/2018MLK'
with uReq(url) as response:
    html = response.read()

In [3]:
# Hand the downloaded html to a bs4 object (built with the lxml parser) that
# can make sense of it, so we can search through it

soup = BeautifulSoup(markup=html, features='lxml')

In [7]:
# Here is the beginning of the html, nicely indented by prettify()
# (try printing it without prettify to compare)

print(soup.prettify()[:502])

<!DOCTYPE html>
<html>
 <head>
  <meta content="Race results for the 2018 MLK Dream Run!" property="og:title"/>
  <meta content="Results of the 2018 MLK Dream Run at Soul District in Portland, OR" property="og:description"/>
  <meta content="https://www.hubertiming.com/results/2018MLKDreamRun.jpg" property="og:image"/>
  <meta content="https://www.hubertiming.com/results/2018MLK" property="og:url"/>
  <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
  <title>


#### Now we can use bs4 attributes and methods to get what we want from the website


In [11]:
# The <title> tag of the page, reached as an attribute of the soup
soup.title

<title>Race results for the 2018 MLK Dream Run!</title>

In [12]:
# .text gives the content of the <title> tag, without the tag itself
soup.title.text

'Race results for the 2018 MLK Dream Run!'

#### We can also assign it to a variable and use the same attributes and methods. It's a way to store the data and create a dataset later

In [14]:
# Keep the <title> tag in a variable...
title = soup.title
# ...and read its text from the variable, exactly as we did on the soup
title.text

'Race results for the 2018 MLK Dream Run!'

In [20]:
# The .find_all() method returns every tag whose name matches the argument
# we pass (here: all the <a> tags)

soup.find_all('a')

[<a href="https://www.mlkdreamrun.com/" target="_blank">MLK Dream Run</a>,
 <a href="mailto:timing@hubertiming.com">timing@hubertiming.com</a>,
 <a href="https://www.hubertiming.com/">Huber Timing Home</a>,
 <a class="btn btn-primary btn-lg" href="/results/2018MLK10K" role="button" style="margin: 0px 0px 5px 5px"><i aria-hidden="true" class="fa fa-user"></i> 10K</a>,
 <a class="btn btn-primary btn-lg" href="/results/2018MLK15K" role="button" style="margin: 0px 0px 5px 5px"><i aria-hidden="true" class="fa fa-user"></i> 15K</a>,
 <a class="btn btn-primary btn-lg" href="/results/summary/2018MLK" role="button" style="margin: 0px 0px 5px 5px"><i class="fa fa-stream"></i> Summary</a>,
 <a id="individual" name="individual"></a>,
 <a data-url="/results/2018MLK" href="#tabs-1" id="rootTab" style="font-size: 18px">5K Results</a>,
 <a href="https://www.hubertiming.com/"><img height="65" src="https://www.hubertiming.com//sites/all/themes/hubertiming/images/clockWithFinishSign_small.png" width="50"

#### If you want to get only the tags that actually contain a link (an `href` attribute), you can pass another argument specifying that

In [18]:
# href=True keeps only the <a> tags that carry an href attribute
for anchor in soup.find_all('a', href=True):
    print(anchor)

<a href="https://www.mlkdreamrun.com/" target="_blank">MLK Dream Run</a>
<a href="mailto:timing@hubertiming.com">timing@hubertiming.com</a>
<a href="https://www.hubertiming.com/">Huber Timing Home</a>
<a class="btn btn-primary btn-lg" href="/results/2018MLK10K" role="button" style="margin: 0px 0px 5px 5px"><i aria-hidden="true" class="fa fa-user"></i> 10K</a>
<a class="btn btn-primary btn-lg" href="/results/2018MLK15K" role="button" style="margin: 0px 0px 5px 5px"><i aria-hidden="true" class="fa fa-user"></i> 15K</a>
<a class="btn btn-primary btn-lg" href="/results/summary/2018MLK" role="button" style="margin: 0px 0px 5px 5px"><i class="fa fa-stream"></i> Summary</a>
<a data-url="/results/2018MLK" href="#tabs-1" id="rootTab" style="font-size: 18px">5K Results</a>
<a href="https://www.hubertiming.com/"><img height="65" src="https://www.hubertiming.com//sites/all/themes/hubertiming/images/clockWithFinishSign_small.png" width="50"/>Huber Timing</a>
<a href="https://facebook.com/hubertimin

In [19]:
# Cleaner version (easier to understand): store the search result in a
# variable first, then loop over it

links = soup.find_all('a', href=True)

for anchor in links:
    print(anchor)

<a href="https://www.mlkdreamrun.com/" target="_blank">MLK Dream Run</a>
<a href="mailto:timing@hubertiming.com">timing@hubertiming.com</a>
<a href="https://www.hubertiming.com/">Huber Timing Home</a>
<a class="btn btn-primary btn-lg" href="/results/2018MLK10K" role="button" style="margin: 0px 0px 5px 5px"><i aria-hidden="true" class="fa fa-user"></i> 10K</a>
<a class="btn btn-primary btn-lg" href="/results/2018MLK15K" role="button" style="margin: 0px 0px 5px 5px"><i aria-hidden="true" class="fa fa-user"></i> 15K</a>
<a class="btn btn-primary btn-lg" href="/results/summary/2018MLK" role="button" style="margin: 0px 0px 5px 5px"><i class="fa fa-stream"></i> Summary</a>
<a data-url="/results/2018MLK" href="#tabs-1" id="rootTab" style="font-size: 18px">5K Results</a>
<a href="https://www.hubertiming.com/"><img height="65" src="https://www.hubertiming.com//sites/all/themes/hubertiming/images/clockWithFinishSign_small.png" width="50"/>Huber Timing</a>
<a href="https://facebook.com/hubertimin

#### It is important that you have some knowledge about HTML tags, because knowing about them will make your search easier

In [29]:
# Every <tr> (table row) tag on the page; show the first five
all_rows = soup.find_all('tr')
all_rows[:5]

[<tr colspan="2">
 <b>5K:</b>
 </tr>,
 <tr>
 <td>Finishers:</td>
 <td>191</td>
 </tr>,
 <tr>
 <td>Male:</td>
 <td>78</td>
 </tr>,
 <tr>
 <td>Female:</td>
 <td>113</td>
 </tr>,
 <tr class="header">
 <th>Place</th>
 <th>Bib</th>
 <th>Name</th>
 <th>Gender</th>
 <th>Age</th>
 <th>City</th>
 <th>State</th>
 <th>Chip Time</th>
 <th>Chip Pace</th>
 <th>Gender Place</th>
 <th>Age Group</th>
 <th>Age Group Place</th>
 <th>Time to Start</th>
 <th>Gun Time</th>
 </tr>]

In [31]:
# The 'td' (cell) tags live inside the 'tr' (row) tags, so we can iterate over
# the rows and get the values in each of them

for row in all_rows:
    row_list = row.find_all('td')

# We didn't append the values to a list, so `row_list` only holds the cells of
# the last row here.
# Also, we should use a different syntax to get the actual values (without
# tags). You'll see it in the cell below.
print(row_list)

[<td>191</td>, <td>1216</td>, <td>

                    ZULMA OCHOA

                </td>, <td>F</td>, <td>40</td>, <td>GRESHAM</td>, <td>OR</td>, <td>1:43:27</td>, <td>33:22</td>, <td>

                    113 of 113

                </td>, <td>F 40-54</td>, <td>

                    37 of 37

                </td>, <td>0:00</td>, <td>1:43:27</td>]


In [32]:
# .text gives the value of each cell without the <td> tags around it
for td in row_list:
    print(td.text)

191
1216


                    ZULMA OCHOA

                
F
40
GRESHAM
OR
1:43:27
33:22


                    113 of 113

                
F 40-54


                    37 of 37

                
0:00
1:43:27


#### Now, to actually get all the records, this is one way to do it:

In [55]:
# Every table row (<tr>) on the page
all_rows = soup.find_all('tr')

# One inner list per row, holding the text of each of its <td> cells
data = []

for row in all_rows:
    row_list = row.find_all('td')                  # cells of this row
    data.append([cell.text for cell in row_list])  # keep only the values


title = data[0:2]   # first two rows, kept aside before trimming
data = data[1:]     # getting rid of index 0 (a row without any <td> cell)
print(data[-2:])    # showing the last two rows

[['190', '2087', '\r\n\r\n                    LEESHA POSEY\r\n\r\n                ', 'F', '43', 'PORTLAND', 'OR', '1:33:53', '30:17', '\r\n\r\n                    112 of 113\r\n\r\n                ', 'F 40-54', '\r\n\r\n                    36 of 37\r\n\r\n                ', '0:00', '1:33:53'], ['191', '1216', '\r\n\r\n                    ZULMA OCHOA\r\n\r\n                ', 'F', '40', 'GRESHAM', 'OR', '1:43:27', '33:22', '\r\n\r\n                    113 of 113\r\n\r\n                ', 'F 40-54', '\r\n\r\n                    37 of 37\r\n\r\n                ', '0:00', '1:43:27']]


In [56]:
title # the two rows saved before trimming: the empty 0th row and the 'Finishers:' row

[[], ['Finishers:', '191']]

In [59]:
# Last scraped row (the slice keeps it wrapped in a list)
data[-1:]

[['191',
  '1216',
  '\r\n\r\n                    ZULMA OCHOA\r\n\r\n                ',
  'F',
  '40',
  'GRESHAM',
  'OR',
  '1:43:27',
  '33:22',
  '\r\n\r\n                    113 of 113\r\n\r\n                ',
  'F 40-54',
  '\r\n\r\n                    37 of 37\r\n\r\n                ',
  '0:00',
  '1:43:27']]

#### Creating a DataFrame
    Notice that the first rows are just information about the whole dataset.
    If you look at the website, they're not even rows. That information is
    located at the top of the page, but because of the html structure used to
    create the page, we receive it like this:
    
( Website: https://www.hubertiming.com/results/2018MLK )

In [79]:
# Build a DataFrame straight from the list of rows and look at the first six
df = pd.DataFrame(data=data)
df.head(n=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Finishers:,191.0,,,,,,,,,,,,
1,Male:,78.0,,,,,,,,,,,,
2,Female:,113.0,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,1,1191.0,\r\n\r\n MAX RANDOLPH\r\n\r...,M,29.0,WASHINGTON,DC,16:48,5:25,\r\n\r\n 1 of 78\r\n\r\n ...,M 21-39,\r\n\r\n 1 of 33\r\n\r\n ...,0:08,16:56
5,2,1080.0,\r\n\r\n NEED NAME KAISER R...,M,25.0,PORTLAND,OR,17:31,5:39,\r\n\r\n 2 of 78\r\n\r\n ...,M 21-39,\r\n\r\n 2 of 33\r\n\r\n ...,0:09,17:40


Solving that is pretty simple:

In [80]:
# Get rid of the leading rows that are not race results: the summary rows
# ('Finishers:', 'Male:', 'Female:') and the empty row that comes from the
# header <tr>, which has <th> cells instead of <td> cells.
# (Index 0 was already dropped with data[1:] in a previous cell.)
#
# A positional slice such as `data = data[4:]` would cut four more rows every
# time this cell is re-run. Keeping only the widest rows gives the same result
# on the first run and changes nothing on later runs.
# Assumes the result rows are the widest rows in `data`.
n_columns = max((len(row) for row in data), default=0)
data = [row for row in data if len(row) == n_columns]

# The result
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,1191,\r\n\r\n MAX RANDOLPH\r\n\r...,M,29,WASHINGTON,DC,16:48,5:25,\r\n\r\n 1 of 78\r\n\r\n ...,M 21-39,\r\n\r\n 1 of 33\r\n\r\n ...,0:08,16:56
1,2,1080,\r\n\r\n NEED NAME KAISER R...,M,25,PORTLAND,OR,17:31,5:39,\r\n\r\n 2 of 78\r\n\r\n ...,M 21-39,\r\n\r\n 2 of 33\r\n\r\n ...,0:09,17:40
2,3,1275,\r\n\r\n DAN FRANEK\r\n\r\n...,M,52,PORTLAND,OR,18:15,5:53,\r\n\r\n 3 of 78\r\n\r\n ...,M 40-54,\r\n\r\n 1 of 27\r\n\r\n ...,0:07,18:22
3,4,1223,\r\n\r\n PAUL TAYLOR\r\n\r\...,M,54,PORTLAND,OR,18:31,5:58,\r\n\r\n 4 of 78\r\n\r\n ...,M 40-54,\r\n\r\n 2 of 27\r\n\r\n ...,0:07,18:38
4,5,1245,\r\n\r\n THEO KINMAN\r\n\r\...,M,22,,,19:31,6:17,\r\n\r\n 5 of 78\r\n\r\n ...,M 21-39,\r\n\r\n 3 of 33\r\n\r\n ...,0:09,19:40


In [82]:
# Extra (non-result) rows may also show up at the end of the dataset,
# so check the last rows to confirm whether there are any:
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
186,187,1254,\r\n\r\n CYNTHIA HARRIS\r\n...,F,64,PORTLAND,OR,1:07:51,21:53,\r\n\r\n 110 of 113\r\n\r\n...,F 55+,\r\n\r\n 14 of 14\r\n\r\n ...,1:19,1:09:10
187,188,1085,\r\n\r\n EBONY LAWRENCE\r\n...,F,30,PORTLAND,OR,1:08:12,22:00,\r\n\r\n 111 of 113\r\n\r\n...,F 21-39,\r\n\r\n 53 of 53\r\n\r\n ...,0:58,1:09:10
188,189,1170,\r\n\r\n ANTHONY WILLIAMS\r...,M,39,PORTLAND,OR,1:09:11,22:19,\r\n\r\n 78 of 78\r\n\r\n ...,M 21-39,\r\n\r\n 33 of 33\r\n\r\n ...,0:08,1:09:19
189,190,2087,\r\n\r\n LEESHA POSEY\r\n\r...,F,43,PORTLAND,OR,1:33:53,30:17,\r\n\r\n 112 of 113\r\n\r\n...,F 40-54,\r\n\r\n 36 of 37\r\n\r\n ...,0:00,1:33:53
190,191,1216,\r\n\r\n ZULMA OCHOA\r\n\r\...,F,40,GRESHAM,OR,1:43:27,33:22,\r\n\r\n 113 of 113\r\n\r\n...,F 40-54,\r\n\r\n 37 of 37\r\n\r\n ...,0:00,1:43:27


#### The last thing we should do is get the headers right
        We could add them manually (not a good option, especially if the data has lots of headers)
        or...

In [84]:
# Fetching headers: every <th> (table header cell) tag on the page
column_headers = soup.find_all('th')
column_headers

[<th>Place</th>,
 <th>Bib</th>,
 <th>Name</th>,
 <th>Gender</th>,
 <th>Age</th>,
 <th>City</th>,
 <th>State</th>,
 <th>Chip Time</th>,
 <th>Chip Pace</th>,
 <th>Gender Place</th>,
 <th>Age Group</th>,
 <th>Age Group Place</th>,
 <th>Time to Start</th>,
 <th>Gun Time</th>]

In [85]:
# Keep only the text of each <th> tag
header_list = [header.text for header in column_headers]

header_list

['Place',
 'Bib',
 'Name',
 'Gender',
 'Age',
 'City',
 'State',
 'Chip Time',
 'Chip Pace',
 'Gender Place',
 'Age Group',
 'Age Group Place',
 'Time to Start',
 'Gun Time']

In [87]:
# And now, using pandas, we just need to assign the list to `df.columns`:

df.columns = header_list

# \(*O*)/
df.head()

Unnamed: 0,Place,Bib,Name,Gender,Age,City,State,Chip Time,Chip Pace,Gender Place,Age Group,Age Group Place,Time to Start,Gun Time
0,1,1191,\r\n\r\n MAX RANDOLPH\r\n\r...,M,29,WASHINGTON,DC,16:48,5:25,\r\n\r\n 1 of 78\r\n\r\n ...,M 21-39,\r\n\r\n 1 of 33\r\n\r\n ...,0:08,16:56
1,2,1080,\r\n\r\n NEED NAME KAISER R...,M,25,PORTLAND,OR,17:31,5:39,\r\n\r\n 2 of 78\r\n\r\n ...,M 21-39,\r\n\r\n 2 of 33\r\n\r\n ...,0:09,17:40
2,3,1275,\r\n\r\n DAN FRANEK\r\n\r\n...,M,52,PORTLAND,OR,18:15,5:53,\r\n\r\n 3 of 78\r\n\r\n ...,M 40-54,\r\n\r\n 1 of 27\r\n\r\n ...,0:07,18:22
3,4,1223,\r\n\r\n PAUL TAYLOR\r\n\r\...,M,54,PORTLAND,OR,18:31,5:58,\r\n\r\n 4 of 78\r\n\r\n ...,M 40-54,\r\n\r\n 2 of 27\r\n\r\n ...,0:07,18:38
4,5,1245,\r\n\r\n THEO KINMAN\r\n\r\...,M,22,,,19:31,6:17,\r\n\r\n 5 of 78\r\n\r\n ...,M 21-39,\r\n\r\n 3 of 33\r\n\r\n ...,0:09,19:40


In [88]:
# Finally, create a csv file with all the data we need.
# index=False leaves out pandas' automatic row index, which would otherwise be
# written as an extra, unnamed first column (the 'Place' column already
# numbers the rows).

df.to_csv('race timings.csv', index=False)