Wiki Scraper
---

Author: Peter Zhang

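Scraping tool for the debate disclosure wikis: for every wiki listed in `tools/wiki_pages.csv`, it collects the schools, their teams, and each team's disclosed rounds, round reports, and cites, and writes them to CSV files.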

### Setup

#### Imports

In [1]:
# imports
import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os.path
from os import path
import re

#### Settings

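- OVERWRITE determines whether or not to update existing files.
- WIKIS_URL is the path to a CSV file listing the Wiki pages (one name and URL per row); WIKIS holds the rows read from it
- OUTPATH is the directory where output files are stored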

In [2]:
# settings
# When False, the execution loop skips a wiki if it finds an existing schools
# CSV for it under OUTPATH; set to True to scrape every wiki again.
OVERWRITE = False

In [3]:
# get page URLs
# WIKIS_URL is the path to a CSV file; the execution loop reads column 0 of each
# row as the wiki name and column 1 as the wiki's base URL.
WIKIS_URL = "tools/wiki_pages.csv"

# Read every row up front inside a context manager so the file handle is closed
# immediately. newline='' is what the csv module requires for files handed to a
# reader, and empty rows (blank lines) are dropped because they carry no wiki.
with open(WIKIS_URL, 'r', newline='') as wikiFile:
    WIKIS = [row for row in csv.reader(wikiFile) if row]

In [4]:
# outpath
# Prefix for every generated CSV. File names are appended by plain string
# concatenation, so the trailing slash is required.
OUTPATH = "wiki_data/"

### Scrapers

#### Wiki Page

Takes the URL of a wiki page and returns a list of all the school names and their respective URLs.

In [8]:
# for a given archive year, return all schools
def getSchools(url):
    """Return the schools linked from a wiki index page.

    Parameters
    ----------
    url : str
        Address of the wiki index page.

    Returns
    -------
    list
        One ``[link text, href]`` pair per anchor whose text contains both
        "(" and ")" (school links are labelled "Name (STATE)"). The href is
        returned exactly as it appears in the page. An empty list is returned
        when the page cannot be opened.
    """

    # try opening URL; any failure (bad URL, network error, HTTP error) is
    # reported and treated as "no schools" so the caller can carry on
    try:
        html = urlopen(url).read()
    except Exception:
        print("Wiki URL broke")
        return []

    # soupify
    soup = BeautifulSoup(html, "html.parser")

    # keep only the anchors that look like school links and return their
    # text together with their target
    return [[link.text, link.get('href')]
            for link in soup.find_all('a')
            if "(" in link.text and ")" in link.text]

In [6]:
# example run of getSchools against a live wiki index page (fetches the URL)
getSchools("https://hspolicy19.debatecoaches.org/")

[['Advanced Technologies Academy (NV)', '/Advanced%20Technologies%20Academy/'],
 ['Airline (AL)', '/Airline/'],
 ['Aledo (TX)', '/Aledo/'],
 ['Alief Taylor (TX)', '/Alief%20Taylor/'],
 ['Alpharetta (GA)', '/Alpharetta/'],
 ['Altamont (AL)', '/Altamont/'],
 ['American Heritage (FL)', '/American%20Heritage/'],
 ['Anderson (TX)', '/Anderson/'],
 ['Andover (KS)', '/Andover/'],
 ['Andover Central (KS)', '/Andover%20Central/'],
 ['Andrews (TX)', '/Andrews/'],
 ['Archbishop Mitty (CA)', '/Archbishop%20Mitty/'],
 ['Ashland (OR)', '/Ashland/'],
 ['Asian Debate League (AK)', '/Asian%20Debate%20League/'],
 ['Athens (TX)', '/Athens/'],
 ['Aubrey (TX)', '/Aubrey/'],
 ['BASIS Chandler (AZ)', '/BASIS%20Chandler/'],
 ['BASIS Peoria (AZ)', '/BASIS%20Peoria/'],
 ['BASIS Shavano (TX)', '/BASIS%20Shavano/'],
 ['Bakersfield (CA)', '/Bakersfield/'],
 ['Baltimore City College (MD)', '/Baltimore%20City%20College/'],
 ['Barstow (MO)', '/Barstow/'],
 ['Baton Rouge Magnet (LA)', '/Baton%20Rouge%20Magnet/'],
 ['B

#### School Page

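Takes the URL of a school page and returns the number of teams, the list of team names, a mapping from each team name to the URLs found on its row (its Aff and Neg pages), and an error message (empty on success).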

In [5]:
# return aff and neg URLs for all the debaters of a given school

def getTeams(url):
    """Scrape a school page for its teams and the links on each team's row.

    Parameters
    ----------
    url : str
        Address of the school page.

    Returns
    -------
    tuple
        ``(numDebaters, teamNames, pageURLs, error)``:

        - numDebaters: number of teams found
        - teamNames: text of the first cell of each team row, in page order
        - pageURLs: dict mapping a team name to the list of hrefs of every
          link in that team's row
        - error: "" on success, otherwise a short reason; in that case the
          other three values are ``0, [], {}``
    """

    # try opening URL
    try:
        html = urlopen(url).read()
    except Exception:
        return (0, [], {}, "School URL broke")

    # soupify
    soup = BeautifulSoup(html, "html.parser")

    # the team list is the first table on the page
    table = soup.find("table")
    if table is None:
        return (0, [], {}, "School page not set up")

    # track team names and map to URLs
    teamNames = []
    pageURLs = {}

    # for each table row, excluding the header
    for row in table.find_all('tr')[1:]:

        # rows without a data cell (e.g. an extra header row) hold no team
        cell = row.find('td')
        if cell is None:
            continue

        # the first column's text is the team name; get_text() also works when
        # the cell wraps the name in nested markup, where .string would be None
        team = cell.get_text()
        teamNames.append(team)

        # collect the URLs to the aff and neg page
        pageURLs[team] = [link.get("href") for link in row.find_all('a')]

    # covers both a header-only table and a table with no rows at all
    if not teamNames:
        return (0, [], {}, "No debaters listed")

    return (len(teamNames), teamNames, pageURLs, "")


In [8]:
# example run of getTeams against a live school page (fetches the URL)
getTeams("https://hspolicy19.debatecoaches.org/Advanced%20Technologies%20Academy")

(3,
 ['Advanced Technologies Academy Jacqueline Balanovsky - Raymond Behnke',
  'Advanced Technologies Academy Sam Self - Jonah Gentleman',
  'Advanced Technologies Academy Ryan Fritchel - Hannah Lewis'],
 {'Advanced Technologies Academy Jacqueline Balanovsky - Raymond Behnke': ['/Advanced%20Technologies%20Academy/Balanovsky-Behnke%20Aff',
   '/Advanced%20Technologies%20Academy/Balanovsky-Behnke%20Neg'],
  'Advanced Technologies Academy Sam Self - Jonah Gentleman': ['/Advanced%20Technologies%20Academy/Self-Gentleman%20Aff',
   '/Advanced%20Technologies%20Academy/Self-Gentleman%20Neg'],
  'Advanced Technologies Academy Ryan Fritchel - Hannah Lewis': ['/Advanced%20Technologies%20Academy/Fritchel-Lewis%20Aff',
   '/Advanced%20Technologies%20Academy/Fritchel-Lewis%20Neg']},
 '')

#### Team Page

In [6]:
def getDisclosure(url):
    """Scrape the rounds, round reports and cites from one team page.

    The page is expected to contain at least three tables, read in this
    order: rounds, round reports, cites. The first row of each table is
    treated as a header and skipped.

    Parameters
    ----------
    url : str
        Address of the team's Aff or Neg page.

    Returns
    -------
    dict
        Keys "Team URL", "Rounds", "Round Reports", "Cites" and "Errors".
        "Rounds" is a list of ``[tournament, round, opponent, judge, docURL]``
        rows (docURL is "" when the row has no link in its seventh column).
        "Errors" is "" on success, "Did not open" when the page could not be
        fetched or decoded, and "Not configured properly" when fewer than
        three tables are present; in both error cases the lists are empty.
    """

    # try opening URL; utf-8-sig drops a leading byte-order mark if present
    try:
        html = urlopen(url).read().decode('utf-8-sig')
    except Exception:
        return _emptyDisclosure(url, "Did not open")

    # soupify
    soup = BeautifulSoup(html, "html.parser")

    tables = soup.find_all("table")

    if len(tables) < 3:
        return _emptyDisclosure(url, "Not configured properly")

    rnds, rndrprts, cts = tables[0], tables[1], tables[2]

    # collect all rounds
    rounds = []

    for row in rnds.find_all("tr")[1:]:

        cols = row.find_all("td")

        # columns 0-3 and 6 are read below; a shorter row is skipped rather
        # than letting one malformed row discard the whole page
        if len(cols) < 7:
            continue

        # get pieces
        trn = cols[0].text
        rnd = cols[1].text
        opp = cols[2].text
        jdg = cols[3].text

        # the seventh column may hold a link to the round's document
        osrc = cols[6].find("a")
        docURL = osrc.get("href") if osrc is not None else ""

        rounds.append([trn, rnd, opp, jdg, docURL])

    # collect all round reports: the second paragraph of the third column;
    # rows that lack either are skipped
    roundReports = []

    for row in rndrprts.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) < 3:
            continue
        paragraphs = cells[2].find_all("p")
        if len(paragraphs) < 2:
            continue
        roundReports.append(paragraphs[1].text)

    # collect all cites: the text of the first <span> in the first column;
    # rows without one are skipped instead of raising
    cites = []

    for row in cts.find_all('tr')[1:]:
        cells = row.find_all("td")
        span = cells[0].find("span") if cells else None
        if span is not None:
            cites.append(span.text)

    return {"Team URL": url,
            "Rounds": rounds,
            "Round Reports": roundReports,
            "Cites": cites,
            "Errors": ""}


def _emptyDisclosure(url, error):
    """Build the result for a team page that produced no data, tagged with `error`."""
    return {"Team URL": url,
            "Rounds": [],
            "Round Reports": [],
            "Cites": [],
            "Errors": error}
    


In [29]:
# example run of getDisclosure against a live team page (fetches the URL)
getDisclosure("https://hsld17.debatecoaches.org/Acton-Boxborough/Liu%20Aff")

{'Team URL': 'https://hsld17.debatecoaches.org/Acton-Boxborough/Liu%20Aff',
 'Rounds': [['-', 'Finals', '-', '-', ''],
  ['Harvard', '2', 'Dan Shahab', 'David Moon', ''],
  ['Lexington Winter Invitational', '2', 'Scarsdale AW', 'David Moon', ''],
  ['Newark Invitational', '4', 'Stuyvesant JL', 'Amit Kukreja', ''],
  ['Practice Round', '1', 'Michelle I drop T Li', 'the coolest judge', ''],
  ['Practice Round', 'Semis', '-', '-', '']],
 'Round Reports': [],
 'Cites': ['Contact',
  'JF - Brown Bear AC',
  'JF - Brown Bear AC v2',
  'JF - Brown Bear AC v3',
  'JF - Deleuzian Narrativity 1AC'],
 'Errors': ''}

#### Execution

Loop through wikis, find all schools, visit all entry pages, scrape all positions.

In [None]:
# Scrape every wiki in WIKIS into two CSV files under OUTPATH:
#   <wikiName>schools_wiki.csv - one row per school
#   <wikiName>teams_wiki.csv   - one row per team and side (Aff / Neg)
for wiki in WIKIS:

    # each row must supply a wiki name and its base URL
    if len(wiki) < 2:
        print("Skipping malformed wiki row: " + str(wiki))
        continue

    # unpack wiki
    wikiName = wiki[0]
    wikiURL = wiki[1]

    # build each output path once so the existence check below and the
    # writers always refer to the same files
    schoolPath = OUTPATH + wikiName + "schools_wiki.csv"
    teamPath = OUTPATH + wikiName + "teams_wiki.csv"

    # if it exists, don't overwrite
    if (not OVERWRITE) and path.exists(schoolPath):
        continue

    # open output files; newline="" is what the csv module asks for on files
    # handed to a writer, so the writer alone controls line endings
    with open(schoolPath, 'w', encoding="utf-8", newline="") as SCHOOLFILE, \
         open(teamPath, 'w', encoding="utf-8", newline="") as TEAMFILE:

        # instantiate writer
        schoolWriter = csv.DictWriter(SCHOOLFILE,
                                      fieldnames=["School Name",
                                                  "School URL",
                                                  "School Teams",
                                                  "Errors"],
                                      quotechar='"',
                                      quoting=csv.QUOTE_NONNUMERIC,
                                      lineterminator="\n")

        # write header
        schoolWriter.writeheader()

        # instantiate writer
        teamWriter = csv.DictWriter(TEAMFILE,
                                    fieldnames=["Team Name",
                                                "Side",
                                                "Team URL",
                                                "Rounds",
                                                "Round Reports",
                                                "Cites",
                                                "Errors"],
                                    lineterminator="\n")

        # write header
        teamWriter.writeheader()

        print("Scraping " + wikiName)

        # get school URLs
        schools = getSchools(wikiURL)

        print("There are " + str(len(schools)) + " schools")

        for school in schools:

            schoolName, schoolHref = school[0], school[1]

            # an anchor without an href cannot be followed; record it and move on
            if not schoolHref:
                print("Skipping " + schoolName + " (no URL)")
                schoolWriter.writerow({"School Name": schoolName,
                                       "School URL": "",
                                       "School Teams": [],
                                       "Errors": "School link has no URL"})
                continue

            # hrefs are site-relative, so prefix them with the wiki's base URL
            schoolURL = wikiURL + schoolHref

            print("Checking " + schoolName)

            # get teams from the school
            numTeams, teamNames, teamURLs, errors = getTeams(schoolURL)

            print("Found " + str(numTeams) + " teams")

            # write school information
            schoolWriter.writerow({"School Name": schoolName,
                                   "School URL": schoolURL,
                                   "School Teams": teamNames,
                                   "Errors": errors})

            for team in teamNames:

                # a failure on one team must not abort the whole scrape
                try:

                    # str() keeps the progress message safe when a team name
                    # is not a plain string
                    print("Scraping " + str(team))

                    # get team URL: first link is the aff page, second the neg
                    teamURL = teamURLs[team]
                    affURL = wikiURL + teamURL[0]
                    negURL = wikiURL + teamURL[1]

                    # aff scraping
                    affInfo = getDisclosure(affURL)
                    affInfo["Team Name"] = team
                    affInfo["Side"] = "Aff"

                    # write aff info
                    teamWriter.writerow(affInfo)

                    # neg scraping
                    negInfo = getDisclosure(negURL)
                    negInfo["Team Name"] = team
                    negInfo["Side"] = "Neg"

                    # write neg info
                    teamWriter.writerow(negInfo)

                # Exception (not a bare except) so Ctrl-C / kernel interrupt
                # still stops the run; the cause is printed for diagnosis
                except Exception as err:

                    print(str(team) + " failed: " + repr(err))
                    
                

Scraping LD17
There are 495 schools
Checking Aberdeen Central (SD)
Found 0 teams
Checking Academy of Higher Learning (CA)
Found 0 teams
Checking Acton-Boxborough (MA)
Found 3 teams
Scraping Acton-Boxborough - Matthew Liu
Scraping Acton-Boxborough - Jeffrey Huang
Scraping Acton-Boxborough - Jerry Wu
Checking Advanced Technologies Academy (NV)
Found 0 teams
Checking Albany (CA)
Found 0 teams
Checking Albuquerque Academy (NM)
Found 1 teams
Scraping Albuquerque Academy - Harrison Bay
Checking Alief Taylor (TX)
Found 2 teams
Scraping Alief Taylor - Steven Ha
Scraping Alief Taylor - Jayden Kannedy
Checking Allen Homeschool (TX)
Found 1 teams
Scraping Allen Homeschool - Kaitlyn Johnson
Checking Altamont (AL)
Found 3 teams
Scraping Altamont - Isabella Maldia
Scraping Altamont - Wei Shiow Fong
Scraping Altamont - Aly Pabani
Checking American Heritage Boca Delray (FL)
Found 3 teams
Scraping American Heritage Boca Delray - Abhilash Datti
Scraping American Heritage Boca Delray - Eswar Mohan
Scrapi

Found 1 teams
Scraping Carnegie Vanguard - Arnav Burudgunte
Checking Carpe Diem (NJ)
Found 1 teams
Scraping Carpe Diem - Rithvik Seela
Checking Cary Academy (NC)
Found 1 teams
Scraping Cary Academy - Will Aarons
Checking Catalina Foothills (AZ)
Found 2 teams
Scraping Catalina Foothills - Joshua Cohen
Scraping Catalina Foothills - Sylvia Zarnescu
Checking Cedar Park (TX)
Found 0 teams
Checking Cedar Ridge (TX)
Found 0 teams
Checking Centennial (ID)
Found 2 teams
Scraping Centennial - Avalyn Hine
Scraping Centennial - Harrison Hall
Checking Centennial (TX)
Found 1 teams
Scraping Centennial TX - Arlinda Chen
Checking Center For Talented Youth (MD)
Found 0 teams
Checking Cerritos (CA)
Found 0 teams
Checking Chaminade (CA)
Found 6 teams
Scraping Chaminade - Ronak Ahuja
Scraping Chaminade - Joey Thornhill
Scraping Chaminade -  Azi Hormozdiari
Scraping Chaminade - Austin Li
Scraping Chaminade -  Josh Kirshner
Scraping Chaminade - Jatin Batta
Checking Chandler (AZ)
Found 0 teams
Checking Chand

Scraping Edina - Prasoon Sinha
Scraping Edina - Kathleen Scoggin
Scraping Edina - Sandra Chen
Scraping Edina - Rahil Modi
Scraping Edina - Anand Mittal
Scraping Edina - Jonah Rosenthal
Scraping Edina - Stella OlkenHunt
Scraping Edina - Matthew Ruppert
Scraping Edina - Ananth Veluvali
Checking Edmond North (OK)
Found 0 teams
Checking Edmond Santa Fe (OK)
Found 1 teams
Scraping Edmond Santa Fe - Christine Nguyen
Checking Eisenhower (KS)
Found 0 teams
Checking El Cerrito (CA)
Found 0 teams
Checking Elite of Irvine (CA)
Found 2 teams
Scraping Elite of Irvine - Allen Pau
Scraping Elite of Irvine - Caitlin Lee
Checking Elkins (TX)
Found 1 teams
Scraping Elkins - Kedar Pandya
Checking Enloe (NC)
Found 1 teams
Scraping Enloe - Tej Gedela
Checking ESD (TX)
Found 2 teams
Scraping ESD - Zan Haq
Scraping ESD - Jiaying Fu
Checking Evanston (IL)
Found 11 teams
Scraping Evanston - Henry Eberhart
Scraping Evanston - Elliot Davis
Scraping Evanston - Zachary Schwartz
Scraping Evanston - Joshua Ahn
Scrap

Checking Holy Cross (LA)
Found 1 teams
Scraping Holy Cross - Blake Ziegler
Checking Homewood Flossmoor (IL)
Found 0 teams
Checking Hopkins (MN)
Found 0 teams
Checking Houston Homeschool (TX)
Found 0 teams
Checking Hunter College (NY)
Found 9 teams
Scraping Hunter College - Michelle Li
Scraping Hunter College - Michael Ning
Scraping Hunter College - MarieRose Sheinerman
Scraping Hunter College - Nate Kruger
Scraping Hunter College - Nicole Gladstein
Scraping Hunter College - Amy Dolan
Scraping Hunter College - Scott Klein
Scraping Hunter College - Grace Tian
Scraping Hunter College - Tammuz Frankel
Checking Hutchinson (KS)
Found 0 teams
Checking Immaculate Heart (CA)
Found 4 teams
Scraping Immaculate Heart - Madeleine ConradMogin
Scraping Immaculate Heart - Lena Mizrahi
Scraping Immaculate Heart - Danielle Dosch
Scraping Immaculate Heart - Mia Speier
Checking Independent (All)
Found 2 teams
Scraping Independent - Ryan Hemnarine
Scraping Independent - Nevin Gera
Checking Interlake (WA)
F

Scraping Loyola - Luis Arbelaez
Scraping Loyola - Joseph Namkung
Scraping Loyola - Dante Bajarias
Scraping Loyola - Ryan Beckman
Scraping Loyola - Holden Fraser
Scraping Loyola - Connor Lindquist
Scraping Loyola - John Choi
Scraping Loyola - Andrew Overing
Scraping Loyola - Michael Castro
Scraping Loyola - James Duchesneau
Scraping Loyola - Jack Koenig
Scraping Loyola - Lucas Hunter
Scraping Loyola - John Soza
Scraping Loyola - Patrick Oh
Scraping Loyola - Alessandro Behney
Checking Loyola Blakefield (MA)
Found 0 teams
Checking Lynbrook (CA)
Found 13 teams
Scraping Lynbrook - Chris Wang
Scraping Lynbrook - Nikhil Ajjarapu
Scraping Lynbrook - Yichen Zhu
Scraping Lynbrook - Anya Poplavska
Scraping Lynbrook - Susan Zhou
Scraping Lynbrook - Selina Li
Scraping Lynbrook - Hsinyen Huang
Scraping Lynbrook - Divya Nelakonda
Scraping Lynbrook - Meera Balaji
Scraping Lynbrook - Cindy Xu
Scraping Lynbrook - Lakshay Maharana
Scraping Lynbrook - Nisha Fernandes
Scraping Lynbrook - Sloka Suresh
Check

Found 2 teams
Scraping Notre Dame - Arushi Bansal
Scraping Notre Dame - Dipashreya Sur
Checking Nueva (CA)
Found 1 teams
Scraping Nueva - Adam Keller
Checking Oak Hall (FL)
Found 1 teams
Scraping Oak Hall - Kumail Zaidi
Checking Oakwood (CA)
Found 7 teams
Scraping Oakwood - Maximillian Wolf Valdes
Scraping Oakwood - Arlo Weiner
Scraping Oakwood - Logan Bauman
Scraping Oakwood - Wyatt Alpert
Scraping Oakwood - Sarah Mostow
Scraping Oakwood - Nikki Bayat
Scraping Oakwood - August Orser
Checking Oakwood (OH)
Found 1 teams
Scraping Oakwood OH - Madelynn Einhorn
Checking Okoboji (IA)
Found 0 teams
Checking Olathe West (KS)
Found 0 teams
Checking Oxbridge (FL)
Found 0 teams
Checking Oxford (CA)
Found 0 teams
Checking Oxford (MS)
Found 1 teams
Scraping Oxford MS - Bennett Brown
Checking Pacific Hills (CA)
Found 1 teams
Scraping Pacific Hills - Alexander Shaikh
Checking Pacific Ridge (CA)
Found 0 teams
Checking Palm Beach Gardens (FL)
Found 0 teams
Checking Palo Alto Independent (CA)
Found 2 t