In [None]:
library(XML)
# Root of the race web site; every results path below is relative to it.
ubase <- "http://www.cherryblossom.org/"

#### From text
# Men's results pages, one per year, exactly as they are listed in the text.
menURLs <- c(
  "cb99m.htm",
  "cb003m.htm",
  "results/2001/oof_m.html",
  "results/2002/oofm.htm",
  "results/2003/CB03-M.HTM",
  "results/2004/men.htm",
  "results/2005/CB05-M.htm",
  "results/2006/men.htm",
  "results/2007/men.htm",
  "results/2008/men.htm",
  "results/2009/09cucb-M.htm",
  "results/2010/2010cucb10m-m.htm",
  "results/2011/2011cucb10m-m.htm",
  "results/2012/2012cucb10m-m.htm"
)
####

#### Text URLS
# Prefix each relative path with the site root to obtain the full address.
urls <- paste0(ubase, menURLs)
urls[1:4]

# 1999: http://www.cherryblossom.org/cb99m.htm
# 2000: http://www.cherryblossom.org/cb003m.htm
# 2001: http://www.cherryblossom.org/results/2001/oof_m.html

#### Textbook Function
extractResTable <- function(url = "http://www.cherryblossom.org/results/2009/09cucb-F.htm",
                            year = 1999, sex = "male", file = NULL) {
  # Textbook scraper: download one results page, pull out its preformatted
  # text, and hand the lines back (or write them to `file`).
  #
  # Arguments:
  #   url  - address of the results page to parse
  #   year - race year; selects which page layout to expect
  #   sex  - "male" or otherwise; only consulted for the 2009 layout
  #   file - NULL to return the lines, or a connection/path for writeLines()
  #
  # Lines are separated on "\r\n" only; the notebook later shows that the
  # 1999 page comes back as a single record with this version.

  # Parsed without an explicit encoding; the variant with encoding = "UTF-8"
  # (for Windows users who see a stray "A" symbol) is what extractResTableV2 uses.
  page <- htmlParse(url)

  resultLines <- if (year == 2000) {
    # The top of the 2000 file is ill formed, so the <pre> search doesn't
    # work; the text is taken from the 4th <font> element instead.
    fontNodes <- getNodeSet(page, "//font")
    strsplit(xmlValue(fontNodes[[4]]), "\r\n")[[1]]
  } else if (year == 2009 & sex == "male") {
    # 2009 men: inside <div class="Section1">, each line of results sits in
    # its own <pre> element.
    section <- getNodeSet(page, "//div[@class='Section1']")
    sapply(getNodeSet(section[[1]], "//pre"), xmlValue)
  } else {
    # Every other page: the first <pre> element holds the whole table.
    preNodes <- getNodeSet(page, "//pre")
    strsplit(xmlValue(preNodes[[1]]), "\r\n")[[1]]
  }

  # No file requested: return the character vector.
  if (is.null(file)) return(resultLines)
  # Otherwise write the lines as a text file.
  writeLines(resultLines, con = file)
}

# Skip over the first pass
#### Individual Input Components for Testing: 1999
# Globals named after extractResTable's arguments, so the function body can
# be stepped through line by line at the console.
url <- 'http://www.cherryblossom.org/results/1999/cb99m.html'
year <- 1999
sex <- "male"
file <- NULL
####

# Skip over the first pass
#### Individual Input Components for Testing: 2000
# NOTE: these overwrite the 1999 values just above; run only the set needed.
url <- 'http://www.cherryblossom.org/results/2000/Cb003m.htm'
year <- 2000
sex <- "male"
file <- NULL
####

#### Textbook example with (1) URL
# With file = NULL the function returns a character vector of lines
# (despite the df prefix these are not data frames).
df1 <- extractResTable(url = "http://www.cherryblossom.org/results/2000/Cb003m.htm", year = 2000, sex = "male", file = NULL)
df2 <- extractResTable(url = "http://www.cherryblossom.org/results/1999/cb99m.html", year = 1999, sex = "male", file = NULL)
#df3 <- extractResTableV2(url = "http://www.cherryblossom.org/results/1999/cb99m.html", year = 1999, sex = "male", file = NULL)

#### Textbook extraction of Male tables (results in an error)
years = 1999:2012
# The call is expected to fail with the textbook URLs. try() reports the
# error without aborting the rest of the cell; menTables is only assigned
# here if the call unexpectedly succeeds.
try(menTables <- mapply(extractResTable, url = urls, year = years))
#names(menTables) = years # can't run b/c menTables hasn't been created
#sapply(menTables, length) # can't run b/c menTables hasn't been created

# Same extraction one page at a time, so a failing year is recorded as a
# try-error entry instead of stopping the whole run.
menTables <- list()
for (i in seq_along(years)) {
  menTables[[i]] <- try(extractResTable(url = urls[i], year = years[i]))
}

# Let's go check out the first two URLs
urls[1] # [1] "http://www.cherryblossom.org/cb99m.htm"
urls[2] # [1] "http://www.cherryblossom.org/cb003m.htm"

#### Revised URLS
# Same fourteen pages as menURLs; only the 1999 and 2000 entries differ from
# the paths given in the text.
menURLsV2 <- c(
  "results/1999/cb99m.html",         # text: "cb99m.htm"
  "results/2000/Cb003m.htm",         # text: "cb003m.htm"
  "results/2001/oof_m.html",         # same as text from here on
  "results/2002/oofm.htm",
  "results/2003/CB03-M.HTM",
  "results/2004/men.htm",
  "results/2005/CB05-M.htm",
  "results/2006/men.htm",
  "results/2007/men.htm",
  "results/2008/men.htm",
  "results/2009/09cucb-M.htm",
  "results/2010/2010cucb10m-m.htm",
  "results/2011/2011cucb10m-m.htm",
  "results/2012/2012cucb10m-m.htm"
)
####

#### Revised URLS
# Full addresses: site root followed by each revised relative path.
urlsV2 <- paste0(ubase, menURLsV2)
urlsV2[1:4]

#### Modified textbook extraction of Male tables (results in 1999 having (1) record)
# Textbook function, revised URLs: one element of menTables per race year.
menTables <- mapply(extractResTable, url = urlsV2, year = years)
names(menTables) <- years
# Number of lines recovered for each year.
sapply(menTables, length)

#### Code to compare and contrast the format of two different years
# First 100 characters of each element for 1999 and 2000, then the first ten
# lines of 2000 for reference.
substr(menTables[["1999"]], start = 1, stop = 100)
substr(menTables[["2000"]], start = 1, stop = 100)
menTables[["2000"]][1:10]

#### Revised Function
extractResTableV2 <- function(url = "http://www.cherryblossom.org/results/2009/09cucb-F.htm",
                              year = 1999, sex = "male", file = NULL) {
  # Retrieve a results page from the web site, find the preformatted text,
  # and either return it as a character vector or write it to a file.
  #
  # Arguments:
  #   url  - address of the results page to parse
  #   year - race year; selects which page layout to expect
  #   sex  - only consulted for the 2009 layout ("male" has its own structure)
  #   file - NULL to return the lines; otherwise passed to writeLines(con = )
  #
  # Returns:
  #   A character vector with one element per line when `file` is NULL;
  #   otherwise the lines are written and the result of writeLines() is returned.
  #
  # Errors:
  #   stop()s with a descriptive message when the expected element is absent.

  # Split on CRLF or bare LF. The 1999 page uses "\n" while other years use
  # "\r\n"; accepting both removes the need for a 1999/"male" special case
  # and keeps any 1999 page from collapsing into a single record.
  splitLines <- function(txt) strsplit(txt, "\r?\n")[[1]]

  # Return nodes[[i]], or fail with a readable message instead of
  # "subscript out of bounds" when the page lacks the expected element.
  pickNode <- function(nodes, i, what) {
    if (length(nodes) < i) {
      stop("extractResTableV2: ", what, " not found in ", url, call. = FALSE)
    }
    nodes[[i]]
  }

  # Explicit encoding added for Windows users who otherwise get an "A" symbol.
  doc <- htmlParse(url, encoding = "UTF-8")

  if (year == 2000) {
    # The top of the 2000 file is ill formed so the <pre> search doesn't
    # work; take the text from the 4th <font> element instead.
    ff <- getNodeSet(doc, "//font")
    els <- splitLines(xmlValue(pickNode(ff, 4, "4th <font> element")))
  } else if (year == 2009 && sex == "male") {
    # 2009 men: inside <div class="Section1">, each line of results is in
    # its own <pre> element, so no splitting is needed.
    div1 <- getNodeSet(doc, "//div[@class='Section1']")
    # NOTE(review): "//pre" is an absolute XPath, so this presumably matches
    # every <pre> in the document rather than only those under div1 - confirm.
    pres <- getNodeSet(pickNode(div1, 1, "<div class='Section1'>"), "//pre")
    els <- sapply(pres, xmlValue)
  } else {
    # All remaining pages: the first <pre> element holds the whole table.
    pres <- getNodeSet(doc, "//pre")
    els <- splitLines(xmlValue(pickNode(pres, 1, "<pre> element")))
  }

  if (is.null(file)) return(els)
  # Write the lines as a text file.
  writeLines(els, con = file)
}

#### Corrected function to pull down Male tables with consistent format
# Revised function, revised URLs: one element of menTablesV2 per race year.
menTablesV2 <- mapply(extractResTableV2, url = urlsV2, year = years)
names(menTablesV2) <- years
# Number of lines recovered for each year.
sapply(menTablesV2, length)

#### Confirmation that the 1999 and other years have consistent formatting
# First ten lines of 1999 (by name) and of the second element (2000, by position).
menTablesV2[["1999"]][1:10]
menTablesV2[[2]][1:10]

#### Save the outputs
save(menTablesV2, file = "CBMenTextTables.rda")

#### Now we need to investigate the differences between the male and female result pages
# Each call returns the page as a character vector of lines (file = NULL).
# 2000
df_male_2000 <- extractResTableV2(
  url = "http://www.cherryblossom.org/results/2000/Cb003m.htm",
  year = 2000, sex = "male", file = NULL
)
df_female_2000 <- extractResTableV2(
  url = "http://www.cherryblossom.org/results/2000/Cb003f.htm",
  year = 2000, sex = "female", file = NULL
)

df_female_2000[1:10]
df_male_2000[1:10]

# 2006
df_male_2006 <- extractResTableV2(
  url = "http://www.cherryblossom.org/results/2006/men.htm",
  year = 2006, sex = "male", file = NULL
)
df_female_2006 <- extractResTableV2(
  url = "http://www.cherryblossom.org/results/2006/women.htm",
  year = 2006, sex = "female", file = NULL
)

df_female_2006[1:10]
df_male_2006[1:10]

######################################################################
# Miscellaneous Code

# Textbook function over the revised URLs, one page at a time; try() stores
# a try-error for a failing year instead of stopping the loop.
menTables <- list()
for (i in seq_along(years)) {
  menTables[[i]] <- try(extractResTable(url = urlsV2[i], year = years[i]))
}

# Breaking down the extractResTableV2 for 1999 - Men
url <- urlsV2[1]
doc = htmlParse(url, encoding="UTF-8")
pres = getNodeSet(doc, "//pre")
txt = xmlValue(pres[[1]])
# Two splits run back to back on purpose, to inspect each at the console:
# the textbook "\r\n" split first, then the "\n" split the revised function
# uses for 1999. The second assignment is the value `els` keeps.
els = strsplit(txt, "\r\n")[[1]]
els = strsplit(txt, "\n")[[1]]

# Breaking down the extractResTableV2 for 2009 - Men
url <- urlsV2[11]
doc = htmlParse(url, encoding="UTF-8")
div1 = getNodeSet(doc, "//div[@class='Section1']")
# NOTE(review): "//pre" is an absolute XPath, so this presumably matches every
# <pre> in the document rather than only those under div1 - confirm.
pres = getNodeSet(div1[[1]], "//pre")
# One <pre> per result line, so each node's text is one element of els.
els = sapply(pres, xmlValue)



In [3]:
# Root of the race site; every entry in menURLs is a path relative to it.
ubase = "http://www.cherryblossom.org/"
menURLs = [
    "cb99m.htm",
    "cb003m.htm",
    "results/2001/oof_m.html",
    "results/2002/oofm.htm",
    "results/2003/CB03-M.HTM",
    "results/2004/men.htm",
    "results/2005/CB05-M.htm",
    "results/2006/men.htm",
    "results/2007/men.htm",
    "results/2008/men.htm",
    "results/2009/09cucb-M.htm",
    "results/2010/2010cucb10m-m.htm",
    "results/2011/2011cucb10m-m.htm",
    "results/2012/2012cucb10m-m.htm",
]
# Absolute address of each men's results page.
m_urls = [ubase + page for page in menURLs]
# Preview. Python slices are zero-based and end-exclusive, so 1:4 shows the
# second through fourth URLs (the first one is not displayed).
m_urls[1:4]

['http://www.cherryblossom.org/cb003m.htm',
 'http://www.cherryblossom.org/results/2001/oof_m.html',
 'http://www.cherryblossom.org/results/2002/oofm.htm']

In [1]:
# Root of the race site; every entry in womenURLs is a path relative to it.
ubase = "http://www.cherryblossom.org/"
womenURLs = [
    "results/1999/cb99f.html",
    "results/2000/Cb003f.htm",
    "results/2001/oof_f.html",
    "results/2002/ooff.htm",
    "results/2003/CB03-F.HTM",
    "results/2004/women.htm",
    "results/2005/CB05-F.htm",
    "results/2006/women.htm",
    "results/2007/women.htm",
    "results/2008/women.htm",
    "results/2009/09cucb-F.htm",
    "results/2010/2010cucb10m-f.htm",
    "results/2011/2011cucb10m-f.htm",
    "results/2012/2012cucb10m-f.htm",
]
# Absolute address of each women's results page.
f_urls = [ubase + page for page in womenURLs]
# Preview. Python slices are zero-based and end-exclusive, so 1:4 shows the
# second through fourth URLs (the first one is not displayed).
f_urls[1:4]

['http://www.cherryblossom.org/results/2000/Cb003f.htm',
 'http://www.cherryblossom.org/results/2001/oof_f.html',
 'http://www.cherryblossom.org/results/2002/ooff.htm']

In [144]:
import urllib.request
from bs4 import BeautifulSoup

# Results page used to prototype the header parsing.
URL = 'http://www.cherryblossom.org/results/1999/cb99f.html'

# The context manager closes the HTTP response even if read/decode raises.
with urllib.request.urlopen(URL) as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, "lxml")
# First text node inside the first <pre> element: the preformatted results.
table = soup.find("pre").find(text=True)
raw_rows = [x.strip() for x in table.split('\n')]
# raw_rows[1] is taken as the header line; split it on whitespace.
table_cols = raw_rows[1].split()
# The whitespace split breaks the header into separate "DIV" and "/TOT"
# tokens; glue them back into one column name and drop the leftover token.
table_cols[1] = table_cols[1] + table_cols[2]
table_cols.remove("/TOT")
table_cols

['PLACE', 'DIV', '/TOT', 'NAME', 'AG', 'HOMETOWN', 'TIME', 'PACE']

In [128]:
# Print the header names back to back, with no separator between them.
print(*table_cols, sep='')

PLACE DIV /TOT  NAME                  AG HOMETOWN           TIME    PACE


In [146]:
# Display the current header list.
# NOTE(review): presumably meant to be viewed after the DIV + /TOT merge;
# the execution counts are out of order, so confirm the intended cell order.
table_cols

['PLACE', 'DIV/TOT', 'NAME', 'AG', 'HOMETOWN', 'TIME', 'PACE']

In [145]:
# Merge the separate "DIV" and "/TOT" header tokens into one column name.
# Guarded so that re-running the cell after the merge has already happened
# is a no-op: unguarded, the second run would append "NAME" to the merged
# name and then raise ValueError because "/TOT" is no longer in the list.
if "/TOT" in table_cols:
    table_cols[1] = table_cols[1] + table_cols[2]
    table_cols.remove("/TOT")

In [202]:
import pandas as pd
import re

# Pull the preformatted results text out of the already-parsed page again.
table = soup.find("pre").find(text=True)
raw_rows = [line.strip() for line in table.split('\n')]
# Header names come from line 1; rejoin the split "DIV" and "/TOT" tokens.
table_cols = raw_rows[1].split()
table_cols[1] += table_cols[2]
table_cols.remove("/TOT")
# Data rows: skip the first three lines and leave off the last two.
# A field boundary is a run of 2+ whitespace characters, or a single
# whitespace character that directly follows a digit.
parsed_rows = [
    re.split(r'\s{2,}|(?<=[0-9])\s', row)
    for row in raw_rows[3:-2]
]

In [203]:
# List every parsed row that ended up with fewer than seven fields.
for row in parsed_rows:
    if len(row) >= 7:
        continue
    print(row)

['3', 'Lidiya Grigoryeva', 'Russia', '53:40', '5:22']
['8', 'Gladys Asiba', 'Kenya', '54:50', '5:29']
['17', 'Connie Buckwalter', 'Lancaster PA', '59:36', '5:58']
['66', '55/1683', 'Deirdre Mccarthygalla 29', 'Arlington VA', '1:09:06', '6:55']
['163', '123/1683', 'Patricia Rhea', '36', 'University Park MD 1:16:24', '7:39']
['368', '283/1683', 'Christine Livingstone 27', 'Alexandria VA', '1:23:06', '8:19']
['409', '20/185', 'Magdalena Chica-Garzo 47', 'Gaithersburg MD', '1:23:48', '8:23']
['410', '316/1683', 'Ann Kim', '25', 'Charlottesville VA 1:23:49', '8:23']
['475', '363/1683', 'Elizabeth Davidsen Bo 36', 'Chevy Chase MD', '1:25:05', '8:31']
['476', '364/1683', 'Elizabeth Lower-Basch 27', 'Alexandria VA', '1:25:06', '8:31']
['496', '381/1683', 'Dana Riesner', '30', 'Montgomery Vill MD 1:25:26', '8:33']
['514', '62/306', 'Kim Freeze', '41', 'Fairfax Station VA 1:25:50', '8:35']
['543', '418/1683', 'Lisa Radziwanowicz', '39', 'Fairfax Station VA 1:26:27', '8:39']
['565', '437/1683', '

In [204]:
# Show how the splitting regex tokenises raw rows 3 through 9, without
# storing the result anywhere.
for line in raw_rows[3:10]:
    print(re.split(r'\s{2,}|(?<=[0-9])\s', line))

['1', '1/1683', 'Jane Omoro', '26', 'Kenya', '53:37', '5:22']
['2', '2/1683', 'Jane Ngotho', '29', 'Kenya', '53:38', '5:22']
['3', 'Lidiya Grigoryeva', 'Russia', '53:40', '5:22']
['4', '3/1683', 'Eunice Sagero', '20', 'Kenya', '53:55', '5:24']
['5', '4/1683', 'Alla Zhilyayeva', '29', 'Russia', '54:08', '5:25']
['6', '5/1683', 'Teresa Wanjiku', '24', 'Kenya', '54:10', '5:25']
['7', '6/1683', 'Elana Viazova', '38', 'Ukraine', '54:29', '5:27']


In [205]:
# ISSUE: Data entries are misplaced in columns due to missing values
# Rows with a missing field have fewer tokens, so their remaining values land
# in the wrong columns and the trailing columns are left empty.
# Build the frame from the parsed rows, labelled with the merged header names.
df = pd.DataFrame(parsed_rows, columns=table_cols)
df

Unnamed: 0,PLACE,DIV/TOT,NAME,AG,HOMETOWN,TIME,PACE
0,1,1/1683,Jane Omoro,26,Kenya,53:37,5:22
1,2,2/1683,Jane Ngotho,29,Kenya,53:38,5:22
2,3,Lidiya Grigoryeva,Russia,53:40,5:22,,
3,4,3/1683,Eunice Sagero,20,Kenya,53:55,5:24
4,5,4/1683,Alla Zhilyayeva,29,Russia,54:08,5:25
...,...,...,...,...,...,...,...
2350,2351,185/185,Dianette Stokes,46,Chesapeake VA,2:17:32,13:46
2351,2352,115/115,Jeanette Lampron,50,Woodbine MD,2:19:58,14:00
2352,2353,1680/1683,Tina Werking,29,Bethesda MD,2:21:40,14:10
2353,2354,1681/1683,Maria Walsh,30,New York NY,2:23:46,14:23


In [158]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape2002(URL):
    """Download a results page and parse its preformatted table.

    The page's first ``<pre>`` element is read as text. Line 1 of that text
    supplies the column names (split on whitespace), and every line from
    index 3 up to, but not including, the last two is split into fields.

    Parameters
    ----------
    URL : str
        Address of the results page to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, with columns named after the header tokens.

    Notes
    -----
    A field boundary is a run of two or more whitespace characters, or a
    single whitespace character that directly follows a digit. A row with an
    empty field therefore yields fewer tokens than there are columns.
    """
    # The context manager closes the HTTP response even if read/decode raises.
    with urllib.request.urlopen(URL) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, "lxml")
    # First text node inside the first <pre> element.
    table = soup.find("pre").find(text=True)
    raw_rows = [x.strip() for x in table.split('\n')]
    table_cols = raw_rows[1].split()
    # Same rows as the former raw_rows[3:len(raw_rows[3:])+1]: skip the first
    # three lines and leave off the last two.
    parsed_rows = [
        re.split(r'\s{2,}|(?<=[0-9])\s', row)
        for row in raw_rows[3:-2]
    ]
    df = pd.DataFrame(parsed_rows, columns=table_cols)
    return df

In [159]:
# f_urls[3] is the fourth women's page, "results/2002/ooff.htm".
f2002 = scrape2002(f_urls[3])

In [160]:
# Display the frame returned by scrape2002 for the 2002 women's page.
f2002

Unnamed: 0,Place,Num,Name,Ag,Hometown,Net,Gun
0,1,6005,Luminita TALPOS,29,Rom,52:50,52:50
1,2,6003,Teyba ERKASSO,20,Eth,52:53,52:55
2,3,6007,Sylvia MOSQUEDA,35,Usa,53:14,53:17
3,4,6022,Teresa WANJIKU,27,Ken,53:36,53:36
4,5,6020,Marla RUNYAN,33,Usa,53:37,53:37
...,...,...,...,...,...,...,...
3329,3330,10208,Ann HICKEY SHANKROFF,31,Falls Church VA,2:20:40,2:26:55
3330,3331,11140,Nanette VARIAS,48,Fairfax VA,2:25:49,2:29:46
3331,3332,7045,Joyce KIRKSEY,41,Alexandria VA,2:30:08,2:30:08
3332,3333,8016,Gail SUMMERS,39,Lafayette IN,2:38:58,2:38:58
