Tab Scraper
---

Author: Peter Zhang

Scraping tool for Tabroom.

### Setup

#### Imports

In [1]:
# imports
import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os.path
from os import path
import sys
from string import ascii_lowercase

#### Settings

- OVERWRITE determines whether or not to update existing files.
- PAGES_URL is a list of Wiki pages
- OUTPATH is where files are stored

In [2]:
# settings
OVERWRITE = True

In [3]:
# get tournament URLS
#TOURNAMENT_CSV = 'tools/tourn_info.csv'
TOURNAMENT_CSV = 'tools/edebate_tourns.csv'

In [48]:
# outpath
OUTPATH = "tab_data/"
ENTRIES_FILE = "edebate_entries.csv"
INFO_FILE = "edebate_info.csv"

In [44]:
# equivalent names
VLD_NAMES = [name.strip() for name in open('tools/VLD_names.txt', 'r')]
JVLD_NAMES = [name.strip() for name in open('tools/JVLD_names.txt', 'r')]
NLD_NAMES = [name.strip() for name in open('tools/NLD_names.txt', 'r')]
VPF_NAMES = [name.strip() for name in open('tools/VPF_names.txt', 'r')]
JVPF_NAMES = [name.strip() for name in open('tools/JVPF_names.txt', 'r')]
NPF_NAMES = [name.strip() for name in open('tools/NPF_names.txt', 'r')]
VCX_NAMES = [name.strip() for name in open('tools/VCX_names.txt', 'r')]
JVCX_NAMES = [name.strip() for name in open('tools/JVCX_names.txt', 'r')]
NCX_NAMES = [name.strip() for name in open('tools/NCX_names.txt', 'r')]

In [45]:
# set events to scrape
TARGET_EVENTS = ["VLD","JVLD", "NLD", "VPF", "JVPF", "NPF", "VCX", "JVCX", "NCX"]
def getType(raw_name):
    if raw_name in VLD_NAMES:
        return "VLD"
    if raw_name in JVLD_NAMES:
        return "JVLD"
    if raw_name in NLD_NAMES:
        return "NLD"
    if raw_name in VPF_NAMES:
        return "VPF"
    if raw_name in JVPF_NAMES:
        return "JVPF"
    if raw_name in NPF_NAMES:
        return "NPF"
    if raw_name in VCX_NAMES:
        return "VCX"
    if raw_name in JVCX_NAMES:
        return "JVCX"
    if raw_name in NCX_NAMES:
        return "NCX"
    return "None"

### Entry Scrapers

#### Events

Take a tournament ID and get links to events.

In [39]:
def getEvents(tourn_id):
    url = "https://www.tabroom.com/index/tourn/fields.mhtml?tourn_id=" + tourn_id
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    links = [link for link in soup.find_all('a') if "event_id" in link.get('href')]
    return [(link.contents[0].strip(), link.get('href')) for link in links]

In [23]:
getURLs("16856")

[('Congressional Debate',
  '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141000'),
 ('JV LD', '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141001'),
 ('Novice LD', '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141003'),
 ('Novice Public Forum',
  '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141004'),
 ('Varsity LD', '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141005'),
 ('Varsity Public Forum',
  '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141006')]

#### Entries

Take the URL to an event's page and return event entry info.

In [11]:
# extract table from a page
def getEntries(url, eventType, tournName):
    
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table").find("tbody")
    
    entries = []
    for row in table.find_all("tr"):
        
        entries.append([tournName, eventType] + [field.text.strip() for field in row.find_all("td")])
    
    return entries



In [13]:
getEntries("https://www.tabroom.com/index/tourn/fields.mhtml?tourn_id=16856&event_id=141005", "JVLD", "AppleValley")

[['AppleValley',
  'JVLD',
  'Acton-Boxborough Regional High Scho',
  'MA/US',
  'Bellerina Hu',
  'Acton-Boxborough BH'],
 ['AppleValley',
  'JVLD',
  'Apple Valley High School',
  'MN/US',
  'John Schwartz',
  'Apple Valley JS'],
 ['AppleValley',
  'JVLD',
  'Apple Valley High School',
  'MN/US',
  'Nora Bolsoni',
  'Apple Valley NB'],
 ['AppleValley',
  'JVLD',
  'Appleton North',
  'WI/US',
  'Mihir Uberoi',
  'Appleton North MU'],
 ['AppleValley',
  'JVLD',
  'BASIS Independent Silicon Valley In',
  'CA/US',
  'Shreyas Kapavarapu',
  'BASIS Independent Silicon Valley Independent SK'],
 ['AppleValley',
  'JVLD',
  'Bergen County Academies',
  'NJ/US',
  'Andrew Kim',
  'Bergen County Academies AK'],
 ['AppleValley',
  'JVLD',
  'Bettendorf High School',
  'IA/US',
  'Noah Rantilla',
  'Bettendorf NR'],
 ['AppleValley',
  'JVLD',
  'Brentwood School',
  'CA/US',
  'Sophie Rubin',
  'Brentwood SR'],
 ['AppleValley',
  'JVLD',
  'Byram Hills High School',
  'NY/US',
  'Eleanor Wangens

#### Info

Get a tournament ID and get the tournament info.

In [14]:
def getInfo(tourn_id):

    url = "https://www.tabroom.com/index/tourn/index.mhtml?tourn_id=" + tourn_id

    # load page
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")

    # find header
    header = soup.select('h5')[0].text.strip()
    
    # get sub-header
    year = header.split('—')[0].strip()
    location = header.split('—')[1].strip()
    if ',' in location:
        city = location.split(',')[0].strip()
        state = location.split(',')[1].strip()
    
    else:
        city = "None"
        state = location

    # get info box
    info = soup.find_all('span', {'class' : 'smaller half'})[0].text
    date = ' '.join(info.split())

    return [date, year, city, state]

In [15]:
getInfo("16856")

['11/6 to 11/8', '2020', 'NSDA Campus', 'MN/US']

#### Execution

Loop through tournaments.

In [34]:
VPF_NAMES

['PF',
 'PFD',
 'VPF',
 'Varsity PF',
 'Pubilc Forum',
 'Public Forum Debate',
 'Varsity Public Forum',
 'Varsity Public Forum Debate',
 'Varsity PF Debate',
 'Open PF',
 'Open Public Forum',
 'Championship Public Forum Debate',
 'Open - Public Forum Debate']

In [49]:
# read tourn list
with open(TOURNAMENT_CSV, 'r') as tourn_file,  open(OUTPATH + ENTRIES_FILE, 'w') as out_file:
    
    tourn_reader = csv.DictReader(tourn_file)
    tournWriter = csv.writer(out_file,
                                    lineterminator = "\n")
    
    tournWriter.writerow(["Tournament", "Event", "School", "State", "Name", "Code", "Status"])

    for tourn in tourn_reader:
        tourn_name = tourn["Name"]
        tourn_id = tourn["URL"]
        
        print("Checking " + tourn_name)
            
        events = getEvents(tourn_id)

        for event in events:

            eventType = getType(event[0])
            print(event[0], " -> ", eventType)
                
            if eventType in TARGET_EVENTS:
                    
                eventURL = "https://www.tabroom.com/" + event[1]

                tournWriter.writerows(getEntries(eventURL, eventType, tourn_name))
                
                print("Scraped", eventType, "for", tourn_name)

Checking dowling20
Congress  ->  None
JV Policy Debate  ->  JVCX
Scraped JVCX for dowling20
Lincoln Douglas  ->  VLD
Scraped VLD for dowling20
Novice LD  ->  NLD
Scraped NLD for dowling20
Novice Policy  ->  NCX
Scraped NCX for dowling20
Policy Debate  ->  VCX
Scraped VCX for dowling20
Public Forum  ->  VPF
Scraped VPF for dowling20
Checking dowling19
Congress  ->  None
JV Policy Debate  ->  JVCX
Scraped JVCX for dowling19
Lincoln Douglas  ->  VLD
Scraped VLD for dowling19
Novice Policy  ->  NCX
Scraped NCX for dowling19
Policy Debate  ->  VCX
Scraped VCX for dowling19
Public Forum  ->  VPF
Scraped VPF for dowling19
Checking strake20
Varsity LD  ->  VLD
Scraped VLD for strake20
Checking strake19
LD Round Robin  ->  None
PF Round Robin  ->  None
Varsity LD  ->  VLD
Scraped VLD for strake19
Checking applevalley20
Congressional Debate  ->  None
JV LD  ->  JVLD
Scraped JVLD for applevalley20
Novice LD  ->  NLD
Scraped NLD for applevalley20
Novice Public Forum  ->  NPF
Scraped NPF for applev

Scraped NPF for isidore20
Public Forum Debate Varsity  ->  VPF
Scraped VPF for isidore20
World Schools Debate  ->  None
Checking isidore19
Congressional Debate  ->  None
Declamation Speaking  ->  None
Dramatic Interpretation  ->  None
Duo Interpretation  ->  None
Extemporaneous Speaking  ->  None
Humorous Interpretation  ->  None
Impromptu Speaking  ->  None
Informative Speaking  ->  None
Novice Lincoln Douglas Debate  ->  NLD
Scraped NLD for isidore19
Novice Policy Debate  ->  NCX
Scraped NCX for isidore19
Novice Public Forum Debate  ->  NPF
Scraped NPF for isidore19
Oral Interpretation  ->  None
Original Oratory  ->  None
Program of Oral Interpretation  ->  None
Varsity Lincoln Douglas Debate  ->  VLD
Scraped VLD for isidore19
Varsity Policy Debate  ->  VCX
Scraped VCX for isidore19
Varsity Public Forum Debate  ->  VPF
Scraped VPF for isidore19
World Schools Debate  ->  None
Checking jackhowe19
Novice Congress  ->  None
Novice Dramatic Interpretation  ->  None
Novice Duo Interpretati

Scraped JVPF for seattle20
Novice Congress  ->  None
Novice Dramatic  ->  None
Novice Extemp  ->  None
Novice Humorous Interp  ->  None
Novice Impromptu  ->  None
Novice Informative  ->  None
Novice Lincoln Douglas  ->  NLD
Scraped NLD for seattle20
Novice Original Oratory  ->  None
Novice Public Forum  ->  NPF
Scraped NPF for seattle20
Open Congress  ->  None
Open Dramatic Interp  ->  None
Open Extemp  ->  None
Open Humorous  ->  None
Open Impromptu  ->  None
Open Informative  ->  None
Open Lincoln Douglas  ->  VLD
Scraped VLD for seattle20
Open Original Oratory  ->  None
Open Policy  ->  VCX
Scraped VCX for seattle20
Open Program Oral Interp  ->  None
Open Public Forum  ->  VPF
Scraped VPF for seattle20
Oral Interp (Not POI)  ->  None
Original Performance  ->  None
Checking seattle19
Dramatic Interpretation  ->  None
Duo Interpretation  ->  None
Editorial Commentary  ->  None
Junior Lincoln Douglas  ->  JVLD
Scraped JVLD for seattle19
Novice Congress  ->  None
Novice Extemp  ->  None

In [51]:
# read tourn list
with open(TOURNAMENT_CSV, 'r') as tourn_file,   open(OUTPATH + INFO_FILE, 'w') as out_file:
    
    tourn_reader = csv.DictReader(tourn_file)
    tournWriter = csv.writer(out_file,
                                    lineterminator = "\n")
    
    tournWriter.writerow(["Tourn Name", "Dates", "Year", "Location", "State"])

    for tourn in tourn_reader:
        tourn_name = tourn["Name"]
        tourn_id = tourn["URL"]
        
        print("Checking " + tourn_name)
        
        tournWriter.writerow([tourn_name] + getInfo(tourn_id))
        

Checking dowling20
Checking dowling19
Checking strake20
Checking strake19
Checking applevalley20
Checking applevalley19
Checking bronx20
Checking bronx19
Checking glenbrooks19
Checking glenbrooks20
Checking greenhill19
Checking greenhill20
Checking valley19
Checking valley20
Checking grapevine20
Checking grapevine19
Checking loyola19
Checking loyola20
Checking meadows20
Checking meadows19
Checking presentation19
Checking presentation20
Checking yale19
Checking yale20
Checking alta20
Checking alta19
Checking holycross19
Checking holycross20
Checking isidore20
Checking isidore19
Checking jackhowe19
Checking jackhowe20
Checking princeton19
Checking princeton20
Checking scarsdale20
Checking scarsdale19
Checking UT19
Checking UT20
Checking cypress19
Checking cypress20
Checking duke20
Checking duke19
Checking heritage19
Checking heritage20
Checking ridge20
Checking ridge19
Checking seattle20
Checking seattle19
Checking uk19
Checking uk20
Checking blake19
Checking blake20
Checking collegeprep

### Judge Scraper

#### Collect Links

In [13]:
paradigm_links = []

# iterate through all judges by first name
for c in ascii_lowercase:
    
    print(c)
    
    url = "https://www.tabroom.com/index/paradigm.mhtml?search_first={char}&search_last=".format(char = c)
    
    # load page
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    
    # get all links
    links = [link.get("href") for link in soup.find_all("a")]
    print(len(links))
    
    # append all paradigm links
    paradigm_links += [link for link in links if "judge_person" in link]

a
3229
b
1217
c
1624
d
1386
e
1088
f
266
g
670
h
647
i
307
j
3011
k
1471
l
953
m
2278
n
993
o
188
p
747
q
83
r
1526
s
2507
t
1045
u
66
v
468
w
362
x
76
y
219
z
274


In [16]:
# save judgelinks
with open(OUTPATH + "judgeLinks.txt", 'w') as outFile:
    for link in set(paradigm_links):
        outFile.write(link + "\n")

#### Collect Records

In [11]:
paradigm_links = list(set([name.strip() for name in open(OUTPATH + "judgeLinks.txt", 'r')]))

In [12]:
def getRecords(url):
    
    judgeID = url.split("=")[1]
    
    # load page
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    name = " ".join(soup.find("span", {"class": "twothirds"}).text.strip().split()[:-1])
    table = soup.find("table")
    
    records = []
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        tourn = cols[0].text.strip()
        date = cols[1].text.split()[1]
        event = cols[2].text.strip()
        roundNum= cols[3].span.text
        roundName = cols[3].a.text
        aff = cols[4].text.strip()
        neg = cols[5].text.strip()
        decision = cols[6].text.strip()
        panel = cols[7].text.strip()
        records.append({"Judge" : name,
                        "Judge ID" : judgeID,
                       "Tournament": tourn,
                       "Date" : date,
                       "Event" : event,
                       "Round Number" : roundNum,
                       "Round Name" : roundName,
                       "Aff" : aff,
                       "Neg" : neg,
                       "Decision" : decision,
                       "Panel" : panel})
        
    return records

In [24]:

with open(OUTPATH + "records2.csv", 'w') as outFile:
    
    outWriter  = csv.DictWriter(outFile,
                                fieldnames = ["Judge",
                                              "Judge ID",
                                            "Tournament",
                                            "Date",
                                            "Event",
                                            "Round Number",
                                            "Round Name",
                                            "Aff",
                                            "Neg", 
                                            "Decision",
                                            "Panel"],
                                quotechar='"', 
                                quoting=csv.QUOTE_NONNUMERIC,
                                lineterminator = "\n")
    
    outWriter.writeheader()
    
    count = 0
    for link in paradigm_links:
        count += 1
        url = "https://www.tabroom.com/index/" + link
        
        try:
        
            records = getRecords(url)
            if (count % 100 == 0):
                print(count)
            outWriter.writerows(records)
        except KeyboardInterrupt:
            sys.exit(0)
        except:
            print("Broke for " + url)


NameError: name 'paradigm_links' is not defined

In [10]:
len(paradigm_links[23000:])

TypeError: 'set' object is not subscriptable