Tab Scraper
---

Author: Peter Zhang

Scraping tool for Tabroom.

### Setup

#### Imports

In [14]:
# imports
import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os.path
from os import path
import sys
from string import ascii_lowercase

#### Settings

- OVERWRITE determines whether or not to update existing files.
- PAGES_URL is a list of Wiki pages
- OUTPATH is where files are stored

In [2]:
# settings
OVERWRITE = True

In [3]:
# get tournament URLS
TOURNAMENT_CSV = 'tools/tourn_info.csv'

In [4]:
# outpath
OUTPATH = "tab_data/"

In [5]:
# equivalent names
LD_NAMES = [name.strip() for name in open('tools/ld_eventnames.txt', 'r')]
PF_NAMES = [name.strip() for name in open('tools/pf_eventnames.txt', 'r')]
CX_NAMES = [name.strip() for name in open('tools/cx_eventnames.txt', 'r')]

In [6]:
# set events to scrape
TARGET_EVENTS = ["LD", "PF", "CX"]
def getType(raw_name):
    if raw_name in LD_NAMES:
        return "LD"
    if raw_name in PF_NAMES:
        return "PF"
    if raw_name in CX_NAMES:
        return "CX"
    return "None"

### Entry Scrapers

#### Events

Take a tournament ID and get links to events.

In [24]:
def getEvents(tourn_id):
    url = "https://www.tabroom.com/index/tourn/fields.mhtml?tourn_id=" + tourn_id
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    links = [link for link in soup.find_all('a') if "event_id" in link.get('href')]
    return [(link.contents[0].strip(), link.get('href')) for link in links]

In [23]:
getURLs("16856")

[('Congressional Debate',
  '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141000'),
 ('JV LD', '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141001'),
 ('Novice LD', '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141003'),
 ('Novice Public Forum',
  '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141004'),
 ('Varsity LD', '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141005'),
 ('Varsity Public Forum',
  '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141006')]

#### Entries

Take the URL to an event's page and return event entry info.

In [25]:
# extract table from a page
def getEntries(url, eventType, tournName):
    
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table").find("tbody")
    
    entries = []
    for row in table.find_all("tr"):
        
        entries.append([tournName, eventType] + [field.text.strip() for field in row.find_all("td")])
    
    return entries



In [44]:
getEntries("https://www.tabroom.com/index/tourn/fields.mhtml?tourn_id=16856&event_id=141005", "LD")

[['Acton-Boxborough Regional High Scho',
  'MA/US',
  'Bellerina Hu',
  'Acton-Boxborough BH',
  'LD'],
 ['Apple Valley High School',
  'MN/US',
  'John Schwartz',
  'Apple Valley JS',
  'LD'],
 ['Apple Valley High School',
  'MN/US',
  'Nora Bolsoni',
  'Apple Valley NB',
  'LD'],
 ['Appleton North', 'WI/US', 'Mihir Uberoi', 'Appleton North MU', 'LD'],
 ['BASIS Independent Silicon Valley In',
  'CA/US',
  'Shreyas Kapavarapu',
  'BASIS Independent Silicon Valley Independent SK',
  'LD'],
 ['Bergen County Academies',
  'NJ/US',
  'Andrew Kim',
  'Bergen County Academies AK',
  'LD'],
 ['Bettendorf High School', 'IA/US', 'Noah Rantilla', 'Bettendorf NR', 'LD'],
 ['Brentwood School', 'CA/US', 'Sophie Rubin', 'Brentwood SR', 'LD'],
 ['Byram Hills High School',
  'NY/US',
  'Eleanor Wangensteen',
  'Byram Hills EW',
  'LD'],
 ['Byram Hills High School',
  'NY/US',
  'Magdalena Whelley',
  'Byram Hills MW',
  'LD'],
 ['Byram Hills High School', 'NY/US', 'Sam Hadiono', 'Byram Hills SH', 'LD'

#### Info

Get a tournament ID and get the tournament info.

In [26]:
def getInfo(tourn_id):

    url = "https://www.tabroom.com/index/tourn/index.mhtml?tourn_id=" + tourn_id

    # load page
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")

    # find header
    header = soup.select('h5')[0].text.strip()
    
    # get sub-header
    year = header.split('—')[0].strip()
    location = header.split('—')[1].strip()
    if ',' in location:
        city = location.split(',')[0].strip()
        state = location.split(',')[1].strip()
    
    else:
        city = "None"
        state = location

    # get info box
    info = soup.find_all('span', {'class' : 'smaller half'})[0].text
    date = ' '.join(info.split())

    return [date, year, city, state]

In [39]:
getInfo("16856")

['11/6 to 11/8', '2020', 'NSDA Campus', 'MN/US']

#### Execution

Loop through tournaments.

In [27]:
# read tourn list
with open(TOURNAMENT_CSV, 'r') as tourn_file,  open(OUTPATH + "tab_data.csv", 'w') as out_file:
    
    tourn_reader = csv.DictReader(tourn_file)
    tournWriter = csv.writer(out_file,
                                    lineterminator = "\n")
    
    tournWriter.writerow(["Tournament", "Event", "School", "State", "Name", "Code", "Status"])

    for tourn in tourn_reader:
        tourn_name = tourn["Name"]
        tourn_id = tourn["URL"]
        
        print("Checking " + tourn_name)
            
        events = getEvents(tourn_id)

        for event in events:

            eventType = getType(event[0])
                
            if eventType in TARGET_EVENTS:
                    
                eventURL = "https://www.tabroom.com/" + event[1]

                tournWriter.writerows(getEntries(eventURL, eventType, tourn_name))
                    
                print("Scraped", eventType, "for", tourn_name)

Checking alta17
Scraped LD for alta17
Scraped PF for alta17
Scraped CX for alta17
Checking alta18
Scraped LD for alta18
Scraped PF for alta18
Scraped CX for alta18
Checking applevalley17
Scraped LD for applevalley17
Scraped PF for applevalley17
Checking applevalley18
Scraped LD for applevalley18
Scraped PF for applevalley18
Checking asu18
Scraped LD for asu18
Scraped CX for asu18
Scraped PF for asu18
Checking asu19
Scraped LD for asu19
Scraped CX for asu19
Scraped PF for asu19
Checking badgerland17
Scraped LD for badgerland17
Scraped CX for badgerland17
Checking badgerland18
Scraped LD for badgerland18
Scraped CX for badgerland18
Checking bethelpark18
Checking bethelpark19
Checking blake17
Scraped LD for blake17
Scraped CX for blake17
Checking blake18
Scraped LD for blake18
Scraped CX for blake18
Checking bronx17
Scraped LD for bronx17
Scraped CX for bronx17
Checking bronx18
Scraped LD for bronx18
Scraped CX for bronx18
Checking cal18
Scraped LD for cal18
Scraped CX for cal18
Scraped P

### Judge Scraper

#### Collect Links

In [10]:
paradigm_links = []

# iterate through all judges by first name
for c in ascii_lowercase:
    
    url = "https://www.tabroom.com/index/paradigm.mhtml?search_first={char}&search_last=".format(char = c)
    
    # load page
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    
    # get all links
    links = [link.get("href") for link in soup.find_all("a")]
    
    # append all paradigm links
    paradigm_links += [link for link in links if "judge_person" in link]

In [30]:
# save judgelinks
with open(OUTPATH + "judgeLinks.txt", 'w') as outFile:
    for link in paradigm_links:
        outFile.write(link + "\n")
    

#### Collect Records

In [10]:
paradigm_links = [name.strip() for name in open(OUTPATH + "judgeLinks.txt", 'r')]

In [12]:
def getRecords(url):
    # load page
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    name = " ".join(soup.find("span", {"class": "twothirds"}).text.strip().split()[:-1])
    table = soup.find("table")
    
    records = []
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        tourn = cols[0].text.strip()
        date = cols[1].text.split()[1]
        event = cols[2].text.strip()
        roundNum= cols[3].span.text
        roundName = cols[3].a.text
        aff = cols[4].text.strip()
        neg = cols[5].text.strip()
        decision = cols[6].text.strip()
        panel = cols[7].text.strip()
        records.append({"Judge" : name,
                       "Tournament": tourn,
                       "Date" : date,
                       "Event" : event,
                       "Round Number" : roundNum,
                       "Round Name" : roundName,
                       "Aff" : aff,
                       "Neg" : neg,
                       "Decision" : decision,
                       "Panel" : panel})
        
    return records

In [None]:

with open(OUTPATH + "records_1.csv", 'w') as outFile:
    
    outWriter  = csv.DictWriter(outFile,
                                fieldnames = ["Judge",
                                            "Tournament",
                                            "Date",
                                            "Event",
                                            "Round Number",
                                            "Round Name",
                                            "Aff",
                                            "Neg", 
                                            "Decision",
                                            "Panel"],
                                quotechar='"', 
                                quoting=csv.QUOTE_NONNUMERIC,
                                lineterminator = "\n")
    
    outWriter.writeheader()
    
    count = 0
    for link in paradigm_links[:10000]:
        count += 1
        url = "https://www.tabroom.com/index/" + link
        
        try:
        
            records = getRecords(url)
            if (count % 100 == 0):
                print(count)
            outWriter.writerows(records)
        except KeyboardInterrupt:
            sys.exit(0)
        except:
            print("Broke for " + url)


Broke for https://www.tabroom.com/index/paradigm.mhtml?judge_person_id=6197
100
200
300
400
500
600
700
800
900
1000
1100


In [79]:
len(paradigm_links[:10000])

10000