In [None]:
library(XML)
ubase = "http://www.cherryblossom.org/"

#### From text
menURLs = 
  c("cb99m.htm", 
    "cb003m.htm", 
    "results/2001/oof_m.html",
    "results/2002/oofm.htm", 
    "results/2003/CB03-M.HTM",
    "results/2004/men.htm", 
    "results/2005/CB05-M.htm", 
    "results/2006/men.htm", 
    "results/2007/men.htm", 
    "results/2008/men.htm", 
    "results/2009/09cucb-M.htm",
    "results/2010/2010cucb10m-m.htm", 
    "results/2011/2011cucb10m-m.htm",
    "results/2012/2012cucb10m-m.htm")
####

#### Text URLS
urls = paste(ubase, menURLs, sep="")
urls[1:4]

# 1999: http://www.cherryblossom.org/cb99m.htm
# 2000: http://www.cherryblossom.org/cb003m.htm
# 2001: http://www.cherryblossom.org/results/2001/oof_m.html

#### Textbook Function
extractResTable =
  #
  # Retrieve data from web site, 
  # find the preformatted text,
  # and write lines or return as a character vector.
  #
  function(url = "http://www.cherryblossom.org/results/2009/09cucb-F.htm",
           year = 1999, sex = "male", file = NULL)
  {
    #added encoding for windows users who get an "A" symbol
    doc = htmlParse(url)    
    #doc = htmlParse(url, encoding="UTF-8")
    
    if (year == 2000) {
      # Get preformatted text from 4th font element
      # The top file is ill formed so the <pre> search doesn't work.
      ff = getNodeSet(doc, "//font")
      txt = xmlValue(ff[[4]])
      els = strsplit(txt, "\r\n")[[1]]
    }
    else if (year == 2009 & sex == "male") {
      # Get preformatted text from <div class="Section1"> element
      # Each line of results is in a <pre> element
      div1 = getNodeSet(doc, "//div[@class='Section1']")
      pres = getNodeSet(div1[[1]], "//pre")
      els = sapply(pres, xmlValue)
    }
    else {
      # Get preformatted text from <pre> elements
      pres = getNodeSet(doc, "//pre")
      txt = xmlValue(pres[[1]])
      els = strsplit(txt, "\r\n")[[1]]
#      els2 = strsplit(txt, "\n")[[1]]
    } 
    
    if (is.null(file)) return(els)
    # Write the lines as a text file.
    writeLines(els, con = file)
  }

# Skip over the first pass
#### Individual Input Components for Testing: 1999
url <- 'http://www.cherryblossom.org/results/1999/cb99m.html'
year <- 1999
sex <- "male"
file <- NULL
####

# Skip over the first pass
#### Individual Input Components for Testing: 2000
url <- 'http://www.cherryblossom.org/results/2000/Cb003m.htm'
year <- 2000
sex <- "male"
file <- NULL
####

#### Textbook example with (1) URL
df1 <- extractResTable(url = "http://www.cherryblossom.org/results/2000/Cb003m.htm", year = 2000, sex = "male", file = NULL)
df2 <- extractResTable(url = "http://www.cherryblossom.org/results/1999/cb99m.html", year = 1999, sex = "male", file = NULL)
#df3 <- extractResTableV2(url = "http://www.cherryblossom.org/results/1999/cb99m.html", year = 1999, sex = "male", file = NULL)

#### Textbook extraction of Male tables (results in an error)
years = 1999:2012
menTables = mapply(extractResTable, url = urls, year = years)
#names(menTables) = years # can't run b/c menTables hasn't been created
#sapply(menTables, length) # can't run b/c menTables hasn't been created

menTables <- list()
for(i in 1:length(years)){
  menTables[[i]] <- try(extractResTable(url=urls[i], year=years[i]))
}

# Let's go check out the first two URLs
urls[1] # [1] "http://www.cherryblossom.org/cb99m.htm"
urls[2] # [1] "http://www.cherryblossom.org/cb003m.htm"

#### Revised URLS
menURLsV2 = 
  c("results/1999/cb99m.html", #"cb99m.htm"
    "results/2000/Cb003m.htm", #"cb003m.htm"
    "results/2001/oof_m.html", #"results/2001/oof_m.html"
    "results/2002/oofm.htm", #"results/2002/oofm.htm"
    "results/2003/CB03-M.HTM", #"results/2003/CB03-M.HTM"
    "results/2004/men.htm", #"results/2004/men.htm"
    "results/2005/CB05-M.htm", #"results/2005/CB05-M.htm"
    "results/2006/men.htm", #"results/2006/men.htm"
    "results/2007/men.htm", #"results/2007/men.htm"
    "results/2008/men.htm", #"results/2008/men.htm"
    "results/2009/09cucb-M.htm", #"results/2009/09cucb-M.htm"
    "results/2010/2010cucb10m-m.htm", #"results/2010/2010cucb10m-m.htm"
    "results/2011/2011cucb10m-m.htm", #"results/2011/2011cucb10m-m.htm"
    "results/2012/2012cucb10m-m.htm" #"results/2012/2012cucb10m-m.htm"
  )
####

#### Revised URLS
urlsV2 = paste(ubase, menURLsV2, sep="")
urlsV2[1:4]

#### Modified textbook extraction of Male tables (results in 1999 having (1) record)
menTables = mapply(extractResTable, url = urlsV2, year = years)
names(menTables) = years
sapply(menTables, length)

#### Code to compare and contrast the format of two different years
substr(menTables$'1999', start = 1, stop = 100)
substr(menTables$'2000', start = 1, stop = 100)
menTables$'2000'[1:10]

#### Revised Function
extractResTableV2 =
  #
  # Retrieve data from web site, 
  # find the preformatted text,
  # and write lines or return as a character vector.
  #
  function(url = "http://www.cherryblossom.org/results/2009/09cucb-F.htm",
           year = 1999, sex = "male", file = NULL)
  {
    #added encoding for windows users who get an "A" symbol
    doc = htmlParse(url, encoding="UTF-8")
    
    if (year == 2000) {
      # Get preformatted text from 4th font element
      # The top file is ill formed so the <pre> search doesn't work.
      ff = getNodeSet(doc, "//font")
      txt = xmlValue(ff[[4]])
      els = strsplit(txt, "\r\n")[[1]]
    }
    else if (year == 2009 & sex == "male") {
      # Get preformatted text from <div class="Section1"> element
      # Each line of results is in a <pre> element
      div1 = getNodeSet(doc, "//div[@class='Section1']")
      pres = getNodeSet(div1[[1]], "//pre")
      els = sapply(pres, xmlValue)
    }
    else if (year == 1999 & sex == "male") { # have to add this else if statement
      # Get preformatted text from <pre> elements
      pres = getNodeSet(doc, "//pre")
      txt = xmlValue(pres[[1]])
      els = strsplit(txt, "\n")[[1]]   
    } 
    else {
      # Get preformatted text from <pre> elements
      pres = getNodeSet(doc, "//pre")
      txt = xmlValue(pres[[1]])
      els = strsplit(txt, "\r\n")[[1]]   
    } 
    
    if (is.null(file)) return(els)
    # Write the lines as a text file.
    writeLines(els, con = file)
  }

#### Corrected function to pull down Male tables with consistent format
menTablesV2 = mapply(extractResTableV2, url = urlsV2, year = years)
names(menTablesV2) = years
sapply(menTablesV2, length)

#### Confirmation that the 1999 and other years have consistent formatting
menTablesV2$'1999'[1:10]
menTablesV2[[2]][1:10]

#### Save the outputs
save(menTablesV2, file = "CBMenTextTables.rda")

#### Now we need to investigate the differences between the male and female result pages
# 2000
df_male_2000 <- extractResTableV2(url = "http://www.cherryblossom.org/results/2000/Cb003m.htm", year = 2000, sex = "male", file = NULL)
df_female_2000 <- extractResTableV2(url = "http://www.cherryblossom.org/results/2000/Cb003f.htm", year = 2000, sex = "female", file = NULL)

df_female_2000[1:10]
df_male_2000[1:10]

# 2006
df_male_2006 <- extractResTableV2(url = "http://www.cherryblossom.org/results/2006/men.htm", year = 2006, sex = "male", file = NULL)
df_female_2006 <- extractResTableV2(url = "http://www.cherryblossom.org/results/2006/women.htm", year = 2006, sex = "female", file = NULL)

df_female_2006[1:10]
df_male_2006[1:10]

######################################################################
# Miscellaneous Code

menTables <- list()
for(i in 1:length(years)){
  menTables[[i]] <- try(extractResTable(url=urlsV2[i], year=years[i]))
}

# Breaking down the extractResTableV2 for 1999 - Men
url <- urlsV2[1]
doc = htmlParse(url, encoding="UTF-8")
pres = getNodeSet(doc, "//pre")
txt = xmlValue(pres[[1]])
els = strsplit(txt, "\r\n")[[1]]
els = strsplit(txt, "\n")[[1]]

# Breaking down the extractResTableV2 for 2009 - Men
url <- urlsV2[11]
doc = htmlParse(url, encoding="UTF-8")
div1 = getNodeSet(doc, "//div[@class='Section1']")
pres = getNodeSet(div1[[1]], "//pre")
els = sapply(pres, xmlValue)



In [3]:
ubase = "http://www.cherryblossom.org/"
menURLs = ["cb99m.htm", 
    "cb003m.htm", 
    "results/2001/oof_m.html",
    "results/2002/oofm.htm", 
    "results/2003/CB03-M.HTM",
    "results/2004/men.htm", 
    "results/2005/CB05-M.htm", 
    "results/2006/men.htm", 
    "results/2007/men.htm", 
    "results/2008/men.htm", 
    "results/2009/09cucb-M.htm",
    "results/2010/2010cucb10m-m.htm", 
    "results/2011/2011cucb10m-m.htm",
    "results/2012/2012cucb10m-m.htm"]
m_urls = [ubase + s for s in menURLs]
m_urls[1:4]

['http://www.cherryblossom.org/cb003m.htm',
 'http://www.cherryblossom.org/results/2001/oof_m.html',
 'http://www.cherryblossom.org/results/2002/oofm.htm']

In [11]:
ubase = "http://www.cherryblossom.org/"
womenURLs = ["results/1999/cb99f.html",
    "results/2000/Cb003f.htm",
    "results/2001/oof_f.html",
    "results/2002/ooff.htm",
    "results/2003/CB03-F.HTM",
    "results/2004/women.htm",
    "results/2005/CB05-F.htm",
    "results/2006/women.htm",
    "results/2007/women.htm",
    "results/2008/women.htm",
    "results/2009/09cucb-F.htm",
    "results/2010/2010cucb10m-f.htm",
    "results/2011/2011cucb10m-f.htm",
    "results/2012/2012cucb10m-f.htm"
]
f_urls = [ubase + s for s in womenURLs]
f_urls[13]

'http://www.cherryblossom.org/results/2012/2012cucb10m-f.htm'

In [2]:
import urllib.request
from bs4 import BeautifulSoup

URL = 'http://www.cherryblossom.org/results/1999/cb99f.html'

response = urllib.request.urlopen(URL)
html = response.read().decode('utf-8')
soup = BeautifulSoup(html, "lxml")
table = soup.find("pre").find(text=True)
raw_rows = [x.strip() for x in table.split('\n')]
table_cols = raw_rows[1].split()
table_cols[1] = table_cols[1] + table_cols[2]
table_cols.remove("/TOT")
table_cols

['PLACE', 'DIV/TOT', 'NAME', 'AG', 'HOMETOWN', 'TIME', 'PACE']

In [3]:
print(''.join(table_cols))

PLACEDIV/TOTNAMEAGHOMETOWNTIMEPACE


In [4]:
table_cols

['PLACE', 'DIV/TOT', 'NAME', 'AG', 'HOMETOWN', 'TIME', 'PACE']

In [5]:
table_cols[1] = table_cols[1] + table_cols[2]
table_cols.remove("/TOT")

ValueError: list.remove(x): x not in list

In [6]:
import pandas as pd
import re

table = soup.find("pre").find(text=True)
raw_rows = [x.strip() for x in table.split('\n')]
table_cols = raw_rows[1].split()
table_cols[1] = table_cols[1] + table_cols[2]
table_cols.remove("/TOT")
parsed_rows = []
for row in raw_rows[3:len(raw_rows[3:])+1]:
    parsed_rows.append(re.split(r'\s{2,}|(?<=[0-9])\s', row))

In [7]:
for row in parsed_rows:
    if len(row)<7:
        print(row)

['3', 'Lidiya Grigoryeva', 'Russia', '53:40', '5:22']
['8', 'Gladys Asiba', 'Kenya', '54:50', '5:29']
['17', 'Connie Buckwalter', 'Lancaster PA', '59:36', '5:58']
['66', '55/1683', 'Deirdre Mccarthygalla 29', 'Arlington VA', '1:09:06', '6:55']
['163', '123/1683', 'Patricia Rhea', '36', 'University Park MD 1:16:24', '7:39']
['368', '283/1683', 'Christine Livingstone 27', 'Alexandria VA', '1:23:06', '8:19']
['409', '20/185', 'Magdalena Chica-Garzo 47', 'Gaithersburg MD', '1:23:48', '8:23']
['410', '316/1683', 'Ann Kim', '25', 'Charlottesville VA 1:23:49', '8:23']
['475', '363/1683', 'Elizabeth Davidsen Bo 36', 'Chevy Chase MD', '1:25:05', '8:31']
['476', '364/1683', 'Elizabeth Lower-Basch 27', 'Alexandria VA', '1:25:06', '8:31']
['496', '381/1683', 'Dana Riesner', '30', 'Montgomery Vill MD 1:25:26', '8:33']
['514', '62/306', 'Kim Freeze', '41', 'Fairfax Station VA 1:25:50', '8:35']
['543', '418/1683', 'Lisa Radziwanowicz', '39', 'Fairfax Station VA 1:26:27', '8:39']
['565', '437/1683', '

In [8]:
for row in raw_rows[3:10]:
    print(re.split(r'\s{2,}|(?<=[0-9])\s', row))
    #parsed_rows.append(re.split(r'\s{2,}|(?<=[0-9])\s', row))

['1', '1/1683', 'Jane Omoro', '26', 'Kenya', '53:37', '5:22']
['2', '2/1683', 'Jane Ngotho', '29', 'Kenya', '53:38', '5:22']
['3', 'Lidiya Grigoryeva', 'Russia', '53:40', '5:22']
['4', '3/1683', 'Eunice Sagero', '20', 'Kenya', '53:55', '5:24']
['5', '4/1683', 'Alla Zhilyayeva', '29', 'Russia', '54:08', '5:25']
['6', '5/1683', 'Teresa Wanjiku', '24', 'Kenya', '54:10', '5:25']
['7', '6/1683', 'Elana Viazova', '38', 'Ukraine', '54:29', '5:27']


In [9]:
# ISSUE: Data entries are misplaced in columns due to missing values
df = pd.DataFrame(parsed_rows, columns=table_cols)
df

Unnamed: 0,PLACE,DIV/TOT,NAME,AG,HOMETOWN,TIME,PACE
0,1,1/1683,Jane Omoro,26,Kenya,53:37,5:22
1,2,2/1683,Jane Ngotho,29,Kenya,53:38,5:22
2,3,Lidiya Grigoryeva,Russia,53:40,5:22,,
3,4,3/1683,Eunice Sagero,20,Kenya,53:55,5:24
4,5,4/1683,Alla Zhilyayeva,29,Russia,54:08,5:25
...,...,...,...,...,...,...,...
2350,2351,185/185,Dianette Stokes,46,Chesapeake VA,2:17:32,13:46
2351,2352,115/115,Jeanette Lampron,50,Woodbine MD,2:19:58,14:00
2352,2353,1680/1683,Tina Werking,29,Bethesda MD,2:21:40,14:10
2353,2354,1681/1683,Maria Walsh,30,New York NY,2:23:46,14:23


In [13]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape2002(URL):
    response = urllib.request.urlopen(URL)
    html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("pre").find(text=True)
    raw_rows = [x.strip() for x in table.split('\n')]
    table_cols = raw_rows[1].split()
    parsed_rows = []
    for row in raw_rows[3:len(raw_rows[3:])+1]:
        parsed_rows.append(re.split(r'\s{2,}|(?<=[0-9])\s', row))
    df = pd.DataFrame(parsed_rows, columns = table_cols)
    return df

In [14]:
f2002 = scrape2002(f_urls[3])

In [15]:
f2002

Unnamed: 0,Place,Num,Name,Ag,Hometown,Net,Gun
0,1,6005,Luminita TALPOS,29,Rom,52:50,52:50
1,2,6003,Teyba ERKASSO,20,Eth,52:53,52:55
2,3,6007,Sylvia MOSQUEDA,35,Usa,53:14,53:17
3,4,6022,Teresa WANJIKU,27,Ken,53:36,53:36
4,5,6020,Marla RUNYAN,33,Usa,53:37,53:37
...,...,...,...,...,...,...,...
3329,3330,10208,Ann HICKEY SHANKROFF,31,Falls Church VA,2:20:40,2:26:55
3330,3331,11140,Nanette VARIAS,48,Fairfax VA,2:25:49,2:29:46
3331,3332,7045,Joyce KIRKSEY,41,Alexandria VA,2:30:08,2:30:08
3332,3333,8016,Gail SUMMERS,39,Lafayette IN,2:38:58,2:38:58


In [31]:
f2011 = scrape2002(f_urls[12])
f2011

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 825609: invalid continuation byte

In [55]:
def scrape_2012(URL):
    response = urllib.request.urlopen(URL)
    html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("pre").find(text=True)
    raw_rows = [x.strip() for x in table.split('\n')]
    table_cols = raw_rows[6].split()
    table_cols[1] = table_cols[1] + table_cols[2]
    table_cols.remove("/Tot")
    table_cols.remove("S")
    parsed_rows = []
    for row in raw_rows[8:len(raw_rows[8:])+1]:
        parsed_rows.append(re.split(r'\s{2,}|(?<=[0-9])\s', row))
    df = pd.DataFrame(parsed_rows, columns = table_cols)
    return df

f2012 = scrape_2012(f_urls[13])
f2012

Unnamed: 0,Place,Div/Tot,Num,Name,Ag,Hometown,5,Mile,Time,Pace
0,1,1/2781,2,Jelliah Tinega,26,Kenya,26:48,54:02,5:25,!
1,2,2/2781,24,Malika Mejdoub,29,Ethiopia,27:09,54:24,5:27,!
2,3,1/2228,22,Yihunlish Delelecha,30,Ethiopia,27:09,54:33,5:28,!
3,4,3/2781,14,Agnieszka Ciolek,26,Poland,27:30,54:36,5:28,!
4,5,4/2781,4,Claire Hallissey,29,Arlington VA,27:30,54:37,5:28,!
...,...,...,...,...,...,...,...,...,...,...
9718,9720,2781/2781,18243,Marianne Stonefield,27,Alexandria VA,1:13:14,2:31:10,15:07,
9719,9721,2226/2228,19039,Helena Walker,33,Silver Spring MD,1:13:14,2:31:11,15:08,
9720,9722,1363/1366,15180,Liz Boateng,39,Columbia MD,1:14:32,2:31:21,15:09,
9721,9723,1364/1366,15184,Jennifer Cookson,38,Katy TX,1:14:33,2:31:22,15:09,


In [90]:
def scrape_2011(URL):
    response = urllib.request.urlopen(URL)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("pre").find(text=True)
    raw_rows = [x.strip() for x in table.split('\n')]
    table_cols = raw_rows[6].split()
    table_cols[1] = table_cols[1] + table_cols[2]
    table_cols[8] = table_cols[7] + table_cols[8]
    table_cols[10] = table_cols[10] + table_cols[11]
    table_cols.remove("/Tot")
    table_cols.remove("5")
    table_cols.remove("Tim")
    parsed_rows = []
    for row in raw_rows[8:len(raw_rows[8:])+1]:
        parsed_rows.append(re.split(r'\s{2,}|(?<=[0-9])\s', row))
    df = pd.DataFrame(parsed_rows, columns = table_cols)
    return df

f2011 = scrape_2011(f_urls[12])
f2011

Unnamed: 0,Place,Div/Tot,Num,Name,Ag,Hometown,5Mile,Time,NetTim,Pace,S
0,1,1/2706,14,Julliah Tinega,25,Kenya,54:02,54:02,5:25,!,
1,2,1/937,16,Risper Gesabwa,22,Kenya,27:17,54:03,54:03,5:25,!
2,3,1/1866,48,Tgist Tufa,30,Ethiopia,27:17,54:13,54:13,5:26,!
3,4,2/1866,44,Alemtsehay Misganaw,30,Ethiopia,27:17,55:17,55:17,5:32,!
4,5,2/2706,24,Claire Hallissey,28,United Kingdom,28:01,56:17,56:17,5:38,!
...,...,...,...,...,...,...,...,...,...,...,...
9019,9020,919/920,4067,Teresa Ziffer,41,Clarksburg MD,1:10:01,2:33:00,2:23:34,14:22,
9020,9021,920/920,14278,Heidi Vogelsang,43,Montgomery Villag MD 1:08:47,2:35:35,2:23:45,14:23,,
9021,9022,1264/1265,18623,Michelle Bulger,37,Baltimore MD,1:07:17,2:41:10,2:23:53,14:24,
9022,9023,2703/2706,18531,Larita Williams,28,Washington DC,1:08:27,2:41:18,2:24:23,14:27,


In [80]:
def scrape_2010(URL):
    response = urllib.request.urlopen(URL)
    html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("pre").find(text=True)
    raw_rows = [x.strip() for x in table.split('\n')]
    table_cols = raw_rows[6].split()
    table_cols[7] = table_cols[7] + table_cols[8]
    table_cols[9] = table_cols[9] + table_cols[10]
    table_cols[11] = table_cols[11] + table_cols[12]
    table_cols.remove("/Tot")
    table_cols.remove("Mile")
    table_cols.remove("Tim")
    table_cols.remove("Tim")
    parsed_rows = []
    for row in raw_rows[8:len(raw_rows[8:])+1]:
        parsed_rows.append(re.split(r'\s{2,}|(?<=[0-9])\s', row))
    df = pd.DataFrame(parsed_rows, columns = table_cols)
    return df

f2010 = scrape_2010(f_urls[11])
f2011

Unnamed: 0,Place,Div,Num,Name,Ag,Hometown,5Mile,GunTim,NetTim,Pace,S
0,1,1/971,2,Lineth Chepkurui,23,Kenya,25:38,51:51,51:51#,5:12,!
1,2,2/971,28,Julliah Tinega,24,Kenya,25:41,52:40,52:39#,5:16,!
2,3,3/971,6,Belainesh Zemedkun,22,Ethiopia,26:06,53:22,53:22#,5:21,!
3,4,4/971,30,Misker Demessie,23,Ethiopia,27:11,54:37,54:37#,5:28,!
4,5,1/1810,18,Kelly Jaske,33,Portland OR,27:10,54:40,54:40#,5:28,!
...,...,...,...,...,...,...,...,...,...,...,...
8844,8845,560/561,16532,Martha Graf,46,Columbia MD,1:21:46,2:27:18,2:27:18,14:44,
8845,8846,1176/1177,14300,Veronica Chambers,39,Princeton NJ,1:10:19,2:43:05,2:28:20,14:50,
8846,8847,845/845,11222,Mary Frances Cloe,41,Arlington VA,1:14:30,2:42:11,2:28:43,14:53,
8847,8848,2803/2803,8684,Julie Frank,25,Osceola Mills PA,1:13:46,2:36:56,2:29:37,14:58,


In [104]:
def scrape_2009(URL):
    response = urllib.request.urlopen(URL)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("pre").find(text=True)
    raw_rows = [x.strip() for x in table.split('\n')]
    table_cols = raw_rows[6].split()
    table_cols[1] = table_cols[1] + table_cols[2]
    table_cols[8] = table_cols[7] + table_cols[8]
    table_cols[9] = table_cols[9] + table_cols[10]
    table_cols.remove("/Tot")
    table_cols.remove("Gun")
    table_cols.remove("Tim")
    parsed_rows = []
    for row in raw_rows[8:len(raw_rows[8:])+1]:
        parsed_rows.append(re.split(r'\s{2,}|(?<=[0-9])\s', row))
    df = pd.DataFrame(parsed_rows, columns = table_cols)
    return df

f2009 = scrape_2009(f_urls[10])
f2009

Unnamed: 0,Place,Div/Tot,Num,Name,Ag,Hometown,GunTim,NetTim,Pace,S
0,1,1/953,2,Lineth Chepkurui,21,Kenya,53:32,53:32#,5:22,!
1,2,2/953,22,Belianesh Zemed Gebre,21,Ethiopia,53:55,53:55#,5:24,!
2,3,3/953,34,Teyba Naser,22,Ethiopia,53:58,53:58#,5:24,!
3,4,1/71,6,Abebu Gelan,19,Ethiopia,54:26,54:26#,5:27,!
4,5,1/1130,4,Catherine Ndereba,36,Kenya,54:27,54:27#,5:27,!
...,...,...,...,...,...,...,...,...,...,...
8314,8315,2706/2706,17906,Gita Tohan,25,Bethesda MD,2:36:07,2:25:03,14:31,
8315,8316,1675/1678,18633,Gloria Sim,32,Fairfax VA,2:34:48,2:25:33,14:34,
8316,8317,1676/1678,9253,Sarah Manwell,33,Silver Spring MD,2:33:03,2:26:53,14:42,
8317,8318,159/159,18552,Mary Anders,56,Washington DC,2:26:54,2:26:54,14:42,


In [137]:
def scrape_2007(URL):
    response = urllib.request.urlopen(URL)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("pre").find(text=True)
    raw_rows = [x.strip() for x in table.split('\n')]
    table_cols = raw_rows[5].split()
    table_cols[1] = table_cols[1] + table_cols[2]
    table_cols.remove("/Tot")
    parsed_rows = []
    for row in raw_rows[7:len(raw_rows[7:])+1]:
        parsed_rows.append(re.split(r'\s{2,}|(?<=[0-9])\s', row))
    df = pd.DataFrame(parsed_rows, columns = table_cols)
    return df

In [139]:
f2007 = scrape_2007(f_urls[8])
f2007

Unnamed: 0,Place,Div/Tot,Num,Name,Ag,Hometown,Time,Pace,S,Split
0,1,1/728,4,Teyba Erkesso,24,Ethiopia,51:44#,5:11,!,32:06
1,2,2/728,16,Tatyana Petrova,23,Russia Germant.,52:58#,5:18,!,32:47
2,3,1/1083,8,Kathy Butler,33,United Kingdom,53:26#,5:21,!,33:02
3,4,3/728,28,Magdalene Makunzi,24,Kenya,53:45#,5:23,!,33:02
4,5,2/1083,12,Lidia Simon,33,Romania,53:52#,5:24,!,33:20
...,...,...,...,...,...,...,...,...,...,...
5682,5683,381/382,10345,Rose Bennetch,47,Virginia Beach VA,#,1:35:31,,
5683,5684,382/382,14141,Mary Alvarez,47,Little Ferry NJ,#,1:35:36,,
5684,5685,518/519,15858,Georgine Maisch,42,Massapequa NY,#,1:36:21,,
5685,5686,519/519,16162,Janet Nagy,41,Alexandria VA,#,1:36:21,,


In [120]:
for row in parsed_rows:
    if len(row) > 12:
        print(row)

['646', '128/1258', '2564', 'Marla Hallacy', '33', '# 122', 'DC', '40:25', '8:05', '50:33', '8:09', '1:22:23', '8:15']
['1012', '350/1994', '6099', 'Eleanor Rathbone', '29', '#501', 'DC', '45:34', '9:07', '55:39', '8:58', '1:25:39', '8:34']
['1173', '400/1994', '16427', 'Emily Naden', '26', '#409', 'DC', '43:57', '8:48', '54:40', '8:48', '1:26:45', '8:41']
['1297', '439/1994', '7861', 'Amanda Branting', '28', 'Apt. T25', 'VA', '44:43', '8:57', '55:18', '8:54', '1:27:35', '8:46']
['1323', '449/1994', '7612', 'Marissa Fitzpatrick', '27', 'Apt. 1003', 'VA', '45:34', '9:07', '55:44', '8:59', '1:27:45', '8:47']
['1692', '569/1994', '10883', 'Jackie Adair', '26', 'Apt. 1003', 'VA', '45:34', '9:07', '56:24', '9:05', '1:30:09', '9:01']
['2246', '424/1258', '19083', 'Grace L Chen', '32', '#518', 'DC', '49:03', '9:49', '1:00:17', '9:42', '1:33:35', '9:22']
['2277', '759/1994', '7125', 'Meghan Espinoza', '27', 'Alexandria Va 2', 'VA', '44:59', '9:00', '56:47', '9:09', '1:33:47', '9:23']
['2324', 

In [129]:
f2007 = scrape_2009(f_urls[8])
f2007

IndexError: list index out of range

In [133]:
raw_rows

['Credit Union Cherry Blossom 10 Mile Road Race',
 'Washington, DC',
 'Sunday, April 1, 2007',
 'Official Female Results With 10km Split',
 '',
 'Place Div  /Tot   Num    Name                   Ag Hometown           Time     Pace  S Split',
 '1     1/728        4 Teyba Erkesso          24 Ethiopia             51:44#  5:11 !   32:06',
 '2     2/728       16 Tatyana Petrova        23 Russia Germant.      52:58#  5:18 !   32:47',
 '3     1/1083       8 Kathy Butler           33 United Kingdom       53:26#  5:21 !   33:02',
 '4     3/728       28 Magdalene Makunzi      24 Kenya                53:45#  5:23 !   33:02',
 '5     2/1083      12 Lidia Simon            33 Romania              53:52#  5:24 !   33:20',
 '6     3/1083      40 Galina Alexandrova     31 Russia               54:07#  5:25 !   33:20',
 '7     1/835       10 Dorota Gruca           36 Poland NJ            54:27#  5:27 !   33:35',
 '8     4/728       30 Florence Jepkosgei     23 Kenya                54:56#  5:30 !   33:22',