Tab Scraper
---

Author: Peter Zhang

Scraping tool for Tabroom.

### Setup

#### Imports

In [1]:

# imports
import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os.path
from os import path
import sys
from string import ascii_lowercase

#### Settings

- OVERWRITE determines whether or not to update existing files.
- PAGES_URL is a list of Wiki pages
- OUTPATH is where files are stored

In [2]:
# settings
OVERWRITE = True

In [13]:
# get tournament URLS
#TOURNAMENT_CSV = 'tools/tourn_info.csv'
TOURNAMENT_CSV = 'tools/edebate_tourns.csv'

In [19]:
# outpath
OUTPATH = "tab_data/"
ENTRIES_FILE = "edebate_entries.csv"
INFO_FILE = "edebate_info.csv"

In [23]:
# equivalent names
LD_NAMES = [name.strip() for name in open('tools/ld_eventnames.txt', 'r')]
PF_NAMES = [name.strip() for name in open('tools/pf_eventnames.txt', 'r')]
CX_NAMES = [name.strip() for name in open('tools/cx_eventnames.txt', 'r')]

In [17]:
# set events to scrape
TARGET_EVENTS = ["LD"]
def getType(raw_name):
    if raw_name in LD_NAMES:
        return "LD"
    if raw_name in PF_NAMES:
        return "PF"
    if raw_name in CX_NAMES:
        return "CX"
    return "None"

### Entry Scrapers

#### Events

Take a tournament ID and get links to events.

In [7]:
def getEvents(tourn_id):
    url = "https://www.tabroom.com/index/tourn/fields.mhtml?tourn_id=" + tourn_id
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    links = [link for link in soup.find_all('a') if "event_id" in link.get('href')]
    return [(link.contents[0].strip(), link.get('href')) for link in links]

In [23]:
getURLs("16856")

[('Congressional Debate',
  '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141000'),
 ('JV LD', '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141001'),
 ('Novice LD', '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141003'),
 ('Novice Public Forum',
  '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141004'),
 ('Varsity LD', '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141005'),
 ('Varsity Public Forum',
  '/index/tourn/fields.mhtml?tourn_id=16856&event_id=141006')]

#### Entries

Take the URL to an event's page and return event entry info.

In [8]:
# extract table from a page
def getEntries(url, eventType, tournName):
    
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table").find("tbody")
    
    entries = []
    for row in table.find_all("tr"):
        
        entries.append([tournName, eventType] + [field.text.strip() for field in row.find_all("td")])
    
    return entries



In [44]:
getEntries("https://www.tabroom.com/index/tourn/fields.mhtml?tourn_id=16856&event_id=141005", "LD")

[['Acton-Boxborough Regional High Scho',
  'MA/US',
  'Bellerina Hu',
  'Acton-Boxborough BH',
  'LD'],
 ['Apple Valley High School',
  'MN/US',
  'John Schwartz',
  'Apple Valley JS',
  'LD'],
 ['Apple Valley High School',
  'MN/US',
  'Nora Bolsoni',
  'Apple Valley NB',
  'LD'],
 ['Appleton North', 'WI/US', 'Mihir Uberoi', 'Appleton North MU', 'LD'],
 ['BASIS Independent Silicon Valley In',
  'CA/US',
  'Shreyas Kapavarapu',
  'BASIS Independent Silicon Valley Independent SK',
  'LD'],
 ['Bergen County Academies',
  'NJ/US',
  'Andrew Kim',
  'Bergen County Academies AK',
  'LD'],
 ['Bettendorf High School', 'IA/US', 'Noah Rantilla', 'Bettendorf NR', 'LD'],
 ['Brentwood School', 'CA/US', 'Sophie Rubin', 'Brentwood SR', 'LD'],
 ['Byram Hills High School',
  'NY/US',
  'Eleanor Wangensteen',
  'Byram Hills EW',
  'LD'],
 ['Byram Hills High School',
  'NY/US',
  'Magdalena Whelley',
  'Byram Hills MW',
  'LD'],
 ['Byram Hills High School', 'NY/US', 'Sam Hadiono', 'Byram Hills SH', 'LD'

#### Info

Get a tournament ID and get the tournament info.

In [9]:
def getInfo(tourn_id):

    url = "https://www.tabroom.com/index/tourn/index.mhtml?tourn_id=" + tourn_id

    # load page
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")

    # find header
    header = soup.select('h5')[0].text.strip()
    
    # get sub-header
    year = header.split('—')[0].strip()
    location = header.split('—')[1].strip()
    if ',' in location:
        city = location.split(',')[0].strip()
        state = location.split(',')[1].strip()
    
    else:
        city = "None"
        state = location

    # get info box
    info = soup.find_all('span', {'class' : 'smaller half'})[0].text
    date = ' '.join(info.split())

    return [date, year, city, state]

In [39]:
getInfo("16856")

['11/6 to 11/8', '2020', 'NSDA Campus', 'MN/US']

#### Execution

Loop through tournaments.

In [26]:
# read tourn list
with open(TOURNAMENT_CSV, 'r') as tourn_file,  open(OUTPATH + ENTRIES_FILE, 'w') as out_file:
    
    tourn_reader = csv.DictReader(tourn_file)
    tournWriter = csv.writer(out_file,
                                    lineterminator = "\n")
    
    tournWriter.writerow(["Tournament", "Event", "School", "State", "Name", "Code", "Status"])

    for tourn in tourn_reader:
        tourn_name = tourn["Name"]
        tourn_id = tourn["URL"]
        
        print("Checking " + tourn_name)
            
        events = getEvents(tourn_id)

        for event in events:

            eventType = getType(event[0])
                
            if eventType in TARGET_EVENTS:
                    
                eventURL = "https://www.tabroom.com/" + event[1]

                tournWriter.writerows(getEntries(eventURL, eventType, tourn_name))
                    
                print("Scraped", eventType, "for", tourn_name)

Checking dowling20
Scraped LD for dowling20
Checking dowling19
Scraped LD for dowling19
Checking strake20
Scraped LD for strake20
Checking strake19
Scraped LD for strake19
Checking applevalley20
Scraped LD for applevalley20
Checking applevalley19
Scraped LD for applevalley19
Checking bronx20
Scraped LD for bronx20
Checking bronx19
Scraped LD for bronx19
Checking glenbrooks19
Scraped LD for glenbrooks19
Checking glenbrooks20
Scraped LD for glenbrooks20
Checking greenhill19
Scraped LD for greenhill19
Checking greenhill20
Scraped LD for greenhill20
Checking valley19
Scraped LD for valley19
Checking valley20
Scraped LD for valley20
Checking grapevine20
Scraped LD for grapevine20
Checking grapevine19
Scraped LD for grapevine19
Checking loyola19
Scraped LD for loyola19
Checking loyola20
Scraped LD for loyola20
Checking meadows20
Scraped LD for meadows20
Checking meadows19
Scraped LD for meadows19
Checking presentation19
Scraped LD for presentation19
Checking presentation20
Scraped LD for pre

In [25]:
# read tourn list
with open(TOURNAMENT_CSV, 'r') as tourn_file,   open(OUTPATH + INFO_FILE, 'w') as out_file:
    
    tourn_reader = csv.DictReader(tourn_file)
    tournWriter = csv.writer(out_file,
                                    lineterminator = "\n")
    
    tournWriter.writerow(["Tourn Name", "Dates", "Year", "Location", "State"])

    for tourn in tourn_reader:
        tourn_name = tourn["Name"]
        tourn_id = tourn["URL"]
        
        print("Checking " + tourn_name)
        
        tournWriter.writerow([tourn_name] + getInfo(tourn_id))
        

Checking dowling20
Checking dowling19
Checking strake20
Checking strake19
Checking applevalley20
Checking applevalley19
Checking bronx20
Checking bronx19
Checking glenbrooks19
Checking glenbrooks20
Checking greenhill19
Checking greenhill20
Checking valley19
Checking valley20
Checking grapevine20
Checking grapevine19
Checking loyola19
Checking loyola20
Checking meadows20
Checking meadows19
Checking presentation19
Checking presentation20
Checking yale19
Checking yale20
Checking alta20
Checking alta19
Checking holycross19
Checking holycross20
Checking isidore20
Checking isidore19
Checking jackhowe19
Checking jackhowe20
Checking princeton19
Checking princeton20
Checking scarsdale20
Checking scarsdale19
Checking UT19
Checking UT20


### Judge Scraper

#### Collect Links

In [13]:
paradigm_links = []

# iterate through all judges by first name
for c in ascii_lowercase:
    
    print(c)
    
    url = "https://www.tabroom.com/index/paradigm.mhtml?search_first={char}&search_last=".format(char = c)
    
    # load page
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    
    # get all links
    links = [link.get("href") for link in soup.find_all("a")]
    print(len(links))
    
    # append all paradigm links
    paradigm_links += [link for link in links if "judge_person" in link]

a
3229
b
1217
c
1624
d
1386
e
1088
f
266
g
670
h
647
i
307
j
3011
k
1471
l
953
m
2278
n
993
o
188
p
747
q
83
r
1526
s
2507
t
1045
u
66
v
468
w
362
x
76
y
219
z
274


In [16]:
# save judgelinks
with open(OUTPATH + "judgeLinks.txt", 'w') as outFile:
    for link in set(paradigm_links):
        outFile.write(link + "\n")

#### Collect Records

In [11]:
paradigm_links = list(set([name.strip() for name in open(OUTPATH + "judgeLinks.txt", 'r')]))

In [12]:
def getRecords(url):
    
    judgeID = url.split("=")[1]
    
    # load page
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    name = " ".join(soup.find("span", {"class": "twothirds"}).text.strip().split()[:-1])
    table = soup.find("table")
    
    records = []
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        tourn = cols[0].text.strip()
        date = cols[1].text.split()[1]
        event = cols[2].text.strip()
        roundNum= cols[3].span.text
        roundName = cols[3].a.text
        aff = cols[4].text.strip()
        neg = cols[5].text.strip()
        decision = cols[6].text.strip()
        panel = cols[7].text.strip()
        records.append({"Judge" : name,
                        "Judge ID" : judgeID,
                       "Tournament": tourn,
                       "Date" : date,
                       "Event" : event,
                       "Round Number" : roundNum,
                       "Round Name" : roundName,
                       "Aff" : aff,
                       "Neg" : neg,
                       "Decision" : decision,
                       "Panel" : panel})
        
    return records

In [24]:

with open(OUTPATH + "records2.csv", 'w') as outFile:
    
    outWriter  = csv.DictWriter(outFile,
                                fieldnames = ["Judge",
                                              "Judge ID",
                                            "Tournament",
                                            "Date",
                                            "Event",
                                            "Round Number",
                                            "Round Name",
                                            "Aff",
                                            "Neg", 
                                            "Decision",
                                            "Panel"],
                                quotechar='"', 
                                quoting=csv.QUOTE_NONNUMERIC,
                                lineterminator = "\n")
    
    outWriter.writeheader()
    
    count = 0
    for link in paradigm_links:
        count += 1
        url = "https://www.tabroom.com/index/" + link
        
        try:
        
            records = getRecords(url)
            if (count % 100 == 0):
                print(count)
            outWriter.writerows(records)
        except KeyboardInterrupt:
            sys.exit(0)
        except:
            print("Broke for " + url)


NameError: name 'paradigm_links' is not defined

In [10]:
len(paradigm_links[23000:])

TypeError: 'set' object is not subscriptable