Wiki Scraper
---

Author: Peter Zhang

Scraping tool for the Wiki.

### Setup

#### Imports

In [1]:
# imports
import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os.path
from os import path
import re

#### Settings

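- OVERWRITE determines whether or not to update existing files.
- TARGET_WIKIS lists the names of the wikis to scrape.
- WIKIS_URL is the path to a CSV file listing the Wiki pages (one row per wiki: name, URL).
- OUTPATH is where files are stored.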

In [12]:
# settings
# OVERWRITE: when False, the scraping loop skips any wiki for which it finds
# an existing schools CSV under OUTPATH
OVERWRITE = False
# TARGET_WIKIS: names of the wikis to scrape; rows of WIKIS whose name is not
# listed here are skipped by the scraping loop
TARGET_WIKIS = ["LD19"]

In [3]:
# get page URLs
# WIKIS_URL points at a CSV file; the scraping loop reads column 0 of each
# row as the wiki name and column 1 as the wiki URL
WIKIS_URL = "tools/wiki_pages.csv"

# read every row up front inside a context manager so the file handle is
# closed as soon as the list is built; newline='' is what the csv module
# expects of files handed to csv.reader
with open(WIKIS_URL, 'r', newline='') as wikisFile:
    WIKIS = list(csv.reader(wikisFile))

In [4]:
# outpath
# prefix the scraping loop prepends (by plain string concatenation, so keep
# the trailing slash) to every output CSV filename
OUTPATH = "wiki_data/"

### Scrapers

#### Wiki Page

Takes the URL of a wiki page and returns a list of all the school names and their respective URLs.

In [5]:
# for a given archive year, return all schools
def getSchools(url):
    """Return the school links found on a wiki page.

    A link is treated as a school link when its text contains both "(" and
    ")", e.g. "Airline (AL)".

    Parameters
    ----------
    url : str
        URL of the wiki page to scan.

    Returns
    -------
    list of [str, str]
        One ``[link text, href]`` pair per school link, in page order. The
        href is returned exactly as it appears in the page. An empty list is
        returned (and "Wiki URL broke" is printed) if the page cannot be
        fetched.
    """

    # try opening URL; the context manager closes the HTTP response
    try:
        with urlopen(url) as response:
            html = response.read()
    except Exception:
        print("Wiki URL broke")
        return []

    # soupify
    soup = BeautifulSoup(html, "html.parser")

    # keep only the links whose text looks like "Name (ST)" and pair each
    # one's text with its href
    schools = [[link.text, link.get('href')]
               for link in soup.find_all('a')
               if "(" in link.text and ")" in link.text]

    return schools

In [6]:
# example: list the school links found on the hspolicy19 wiki front page
getSchools("https://hspolicy19.debatecoaches.org/")

[['Advanced Technologies Academy (NV)', '/Advanced%20Technologies%20Academy/'],
 ['Airline (AL)', '/Airline/'],
 ['Aledo (TX)', '/Aledo/'],
 ['Alief Taylor (TX)', '/Alief%20Taylor/'],
 ['Alpharetta (GA)', '/Alpharetta/'],
 ['Altamont (AL)', '/Altamont/'],
 ['American Heritage (FL)', '/American%20Heritage/'],
 ['Anderson (TX)', '/Anderson/'],
 ['Andover (KS)', '/Andover/'],
 ['Andover Central (KS)', '/Andover%20Central/'],
 ['Andrews (TX)', '/Andrews/'],
 ['Archbishop Mitty (CA)', '/Archbishop%20Mitty/'],
 ['Ashland (OR)', '/Ashland/'],
 ['Asian Debate League (AK)', '/Asian%20Debate%20League/'],
 ['Athens (TX)', '/Athens/'],
 ['Aubrey (TX)', '/Aubrey/'],
 ['BASIS Chandler (AZ)', '/BASIS%20Chandler/'],
 ['BASIS Peoria (AZ)', '/BASIS%20Peoria/'],
 ['BASIS Shavano (TX)', '/BASIS%20Shavano/'],
 ['Bakersfield (CA)', '/Bakersfield/'],
 ['Baltimore City College (MD)', '/Baltimore%20City%20College/'],
 ['Barstow (MO)', '/Barstow/'],
 ['Baton Rouge Magnet (LA)', '/Baton%20Rouge%20Magnet/'],
 ['B

#### School Page

Takes the URL of a school page and returns the number of teams, the list of team names, a mapping from each team name to the URLs of its Aff and Neg pages, and an error message (empty on success).

In [7]:
# return aff and neg URLs for all the debaters of a given school

def getTeams(url):
    """Return the teams listed on a school page and the links in each team's row.

    The first table on the page is read; its first row is treated as a header
    and every following row as one team.

    Parameters
    ----------
    url : str
        URL of the school page.

    Returns
    -------
    tuple
        ``(numDebaters, teamNames, pageURLs, error)`` where ``teamNames`` is
        the list of team names in table order, ``pageURLs`` maps each team
        name to the list of hrefs found in its row (the caller uses the first
        two as the Aff and Neg pages), ``numDebaters`` is ``len(teamNames)``
        and ``error`` is "" on success. On failure the result is
        ``(0, [], {}, message)`` with message "School URL broke",
        "School page not set up" or "No debaters listed".
    """

    # try opening URL; the context manager closes the HTTP response
    try:
        with urlopen(url) as response:
            html = response.read()
    except Exception:
        return (0, [], {}, "School URL broke")

    # soupify
    soup = BeautifulSoup(html, "html.parser")

    # look for a table (only the first one on the page is used)
    table = soup.find("table")
    if table is None:
        return (0, [], {}, "School page not set up")

    # track team names and map to URLs
    # NOTE(review): pageURLs is keyed by team name, so two rows with the same
    # name share one entry (the later row wins) - confirm whether that matters
    teamNames = []
    pageURLs = {}

    # for each table row, excluding the header
    for row in table.find_all('tr')[1:]:

        # rows without a data cell cannot name a team; skip them instead of
        # raising AttributeError and aborting the caller's whole scrape
        cell = row.find('td')
        if cell is None:
            continue

        # the first column's string is the team name; .string is None when
        # the cell holds more than one child, so fall back to its full text
        team = cell.string
        if team is None:
            team = cell.get_text()
        # plain str so the name does not keep the parsed page alive
        team = str(team)
        teamNames.append(team)

        # collect the URLs to the aff and neg page
        pageURLs[team] = [link.get("href") for link in row.find_all('a')]

    numDebaters = len(teamNames)

    if numDebaters == 0:
        return (0, [], {}, "No debaters listed")

    return (numDebaters, teamNames, pageURLs, "")


In [8]:
# example: list the teams of one school on the hspolicy19 wiki
getTeams("https://hspolicy19.debatecoaches.org/Advanced%20Technologies%20Academy")

(3,
 ['Advanced Technologies Academy Jacqueline Balanovsky - Raymond Behnke',
  'Advanced Technologies Academy Sam Self - Jonah Gentleman',
  'Advanced Technologies Academy Ryan Fritchel - Hannah Lewis'],
 {'Advanced Technologies Academy Jacqueline Balanovsky - Raymond Behnke': ['/Advanced%20Technologies%20Academy/Balanovsky-Behnke%20Aff',
   '/Advanced%20Technologies%20Academy/Balanovsky-Behnke%20Neg'],
  'Advanced Technologies Academy Sam Self - Jonah Gentleman': ['/Advanced%20Technologies%20Academy/Self-Gentleman%20Aff',
   '/Advanced%20Technologies%20Academy/Self-Gentleman%20Neg'],
  'Advanced Technologies Academy Ryan Fritchel - Hannah Lewis': ['/Advanced%20Technologies%20Academy/Fritchel-Lewis%20Aff',
   '/Advanced%20Technologies%20Academy/Fritchel-Lewis%20Neg']},
 '')

#### Team Page

In [9]:
def getDisclosure(url):
    """Scrape the rounds, round reports and cites from one team page.

    The page must contain at least three tables; the first three are read, in
    order, as the rounds table, the round-reports table and the cites table.
    The first row of each table is treated as a header and skipped.

    Parameters
    ----------
    url : str
        URL of a team's Aff or Neg page.

    Returns
    -------
    dict
        Keys "Team URL", "Rounds", "Round Reports", "Cites" and "Errors".
        "Rounds" is a list of ``[tournament, round, opponent, judge, docURL]``
        lists (docURL is "" when the row has no link), "Round Reports" and
        "Cites" are lists of strings. "Errors" is "" on success,
        "Did not open" when the page cannot be fetched or decoded, and
        "Not configured properly" when fewer than three tables are found; in
        both failure cases the three lists are empty.
    """

    def failed(message):
        # same shape as a successful result so the caller can write it as-is
        return {"Team URL": url,
                "Rounds": [],
                "Round Reports": [],
                "Cites": [],
                "Errors": message}

    # try opening URL; utf-8-sig also strips a leading byte-order mark, and
    # the context manager closes the HTTP response
    try:
        with urlopen(url) as response:
            html = response.read().decode('utf-8-sig')
    except Exception:
        return failed("Did not open")

    # soupify
    soup = BeautifulSoup(html, "html.parser")

    tables = soup.find_all("table")

    if len(tables) < 3:
        return failed("Not configured properly")

    rnds, rndrprts, cts = tables[0], tables[1], tables[2]

    # collect all rounds
    rounds = []

    for row in rnds.find_all("tr")[1:]:

        cols = row.find_all("td")

        # the document link is read from the 7th cell; a shorter row is
        # skipped rather than raising IndexError and losing the whole page
        if len(cols) < 7:
            continue

        # get pieces
        trn, rnd, opp, jdg = (col.text for col in cols[:4])
        osrc = cols[6].find("a")
        docURL = osrc.get("href") if osrc else ""

        rounds.append([trn, rnd, opp, jdg, docURL])

    # collect all round reports: the second <p> of the third cell, for the
    # rows that have one
    roundReports = []

    for row in rndrprts.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) < 3:
            continue
        paragraphs = cells[2].find_all("p")
        if len(paragraphs) < 2:
            continue
        roundReports.append(paragraphs[1].text)

    # collect all cites: text of the first <span> in each row's first cell;
    # rows lacking either are skipped
    cites = []

    for row in cts.find_all('tr')[1:]:
        cell = row.find("td")
        span = cell.find("span") if cell is not None else None
        if span is not None:
            cites.append(span.text)

    return {"Team URL": url,
            "Rounds": rounds,
            "Round Reports": roundReports,
            "Cites": cites,
            "Errors": ""}
    


In [10]:
# example: scrape a single team's Aff page on the hsld17 wiki
getDisclosure("https://hsld17.debatecoaches.org/Acton-Boxborough/Liu%20Aff")

{'Team URL': 'https://hsld17.debatecoaches.org/Acton-Boxborough/Liu%20Aff',
 'Rounds': [['-', 'Finals', '-', '-', ''],
  ['Harvard', '2', 'Dan Shahab', 'David Moon', ''],
  ['Lexington Winter Invitational', '2', 'Scarsdale AW', 'David Moon', ''],
  ['Newark Invitational', '4', 'Stuyvesant JL', 'Amit Kukreja', ''],
  ['Practice Round', '1', 'Michelle I drop T Li', 'the coolest judge', ''],
  ['Practice Round', 'Semis', '-', '-', '']],
 'Round Reports': [],
 'Cites': ['Contact',
  'JF - Brown Bear AC',
  'JF - Brown Bear AC v2',
  'JF - Brown Bear AC v3',
  'JF - Deleuzian Narrativity 1AC'],
 'Errors': ''}

#### Execution

Loop through wikis, find all schools, visit all entry pages, scrape all positions.

In [13]:
# For every targeted wiki: write one CSV of schools and one CSV of team
# pages (an Aff row and a Neg row per team), printing progress as it goes.
for wiki in WIKIS:

    # skip blank or incomplete rows of the wiki list
    if len(wiki) < 2:
        continue

    # unpack wiki
    wikiName, wikiURL = wiki[0], wiki[1]

    # if not in target wikis, don't write
    if wikiName not in TARGET_WIKIS:
        continue

    # build each output path once so the existence check below looks at the
    # very file that gets written (the check used to test a "_schools_wiki.csv"
    # name that was never created, so OVERWRITE = False never skipped anything)
    # NOTE(review): the written filenames are kept unchanged; confirm whether
    # "<wiki>_schools_wiki.csv" was the intended naming
    schoolPath = OUTPATH + wikiName + "schools_wiki.csv"
    teamPath = OUTPATH + wikiName + "teams_wiki.csv"

    # if it exists, don't overwrite
    if (not OVERWRITE) and path.exists(schoolPath):
        continue

    # open output files; newline="" lets the csv writers control line endings
    with open(schoolPath, 'w', encoding="utf-8", newline="") as SCHOOLFILE, \
            open(teamPath, 'w', encoding="utf-8", newline="") as TEAMFILE:

        # instantiate writer
        schoolWriter = csv.DictWriter(SCHOOLFILE,
                                      fieldnames=["School Name",
                                                  "School URL",
                                                  "School Teams",
                                                  "Errors"],
                                      quotechar='"',
                                      quoting=csv.QUOTE_NONNUMERIC,
                                      lineterminator="\n")

        # write header
        schoolWriter.writeheader()

        # instantiate writer
        teamWriter = csv.DictWriter(TEAMFILE,
                                    fieldnames=["Team Name",
                                                "Side",
                                                "Team URL",
                                                "Rounds",
                                                "Round Reports",
                                                "Cites",
                                                "Errors"],
                                    lineterminator="\n")

        # write header
        teamWriter.writeheader()

        print("Scraping " + wikiName)

        # get school URLs
        schools = getSchools(wikiURL)

        print("There are " + str(len(schools)) + " schools")

        for school in schools:

            # get school name and URL; urljoin resolves the page-relative
            # href against the wiki URL without doubling the slash
            schoolName = school[0]
            schoolURL = urllib.parse.urljoin(wikiURL, school[1])

            print("Checking " + schoolName)

            # get teams from the school
            numTeams, teamNames, teamURLs, errors = getTeams(schoolURL)

            print("Found " + str(numTeams) + " teams")

            # write school information
            schoolWriter.writerow({"School Name": schoolName,
                                   "School URL": schoolURL,
                                   "School Teams": teamNames,
                                   "Errors": errors})

            for team in teamNames:

                try:

                    # str() keeps the progress line from raising when a team
                    # name is missing
                    print("Scraping " + str(team))

                    # get team URLs: the first two links of the team's row
                    teamURL = teamURLs[team]
                    affURL = urllib.parse.urljoin(wikiURL, teamURL[0])
                    negURL = urllib.parse.urljoin(wikiURL, teamURL[1])

                    # scrape and write the aff page, then the neg page
                    for side, sideURL in (("Aff", affURL), ("Neg", negURL)):
                        info = getDisclosure(sideURL)
                        info["Team Name"] = team
                        info["Side"] = side
                        teamWriter.writerow(info)

                except Exception as error:

                    # best effort: report the team and the reason, then move
                    # on; Ctrl-C is no longer swallowed
                    print(str(team) + " failed.")
                    print("  " + repr(error))
                    
                

Scraping LD19
There are 463 schools
Checking Acton Boxborough (MA)
Found 8 teams
Scraping Acton Boxborough - Matthew Liu
Scraping Acton Boxborough - Olivia Hu
Scraping Acton Boxborough - Sophie Zhang
Scraping Acton Boxborough - Chris Xu
Scraping Acton Boxborough - Iris Shu
Scraping Acton Boxborough - Samuel Cogen
Scraping Acton Boxborough - Amanda Wu
Scraping Acton Boxborough - bellerina hu
Checking Advanced Technologies Academy (NV)
Found 4 teams
Scraping Advanced Technologies Academy - Jonah Gentleman
Scraping Advanced Technologies Academy - Aleem Ahmed
Scraping Advanced Technologies Academy - Samuel Self
Scraping Advanced Technologies Academy - Jack Marshall
Checking Airline (LA)
Found 4 teams
Scraping Airline - Aidan Price
Scraping Airline - Nicholas Cooksey
Scraping Airline - Kai Macias
Scraping Airline - Jadyn Nourse
Checking Alameda Independent (CA)
Found 2 teams
Scraping Alameda Independent - Grant Aung
Scraping Alameda Independent - Grant Aung
Checking Albuquerque Academy (NM)

Scraping Brentwood School - Linus Epstein
Scraping Brentwood School - Dylan Liu
Scraping Brentwood School - Paola Santos
Scraping Brentwood School - Sophie Rubin
Scraping Brentwood School - John Chung
Scraping Brentwood School - Jessica Korobkin
Scraping Brentwood School - Sam Drake
Scraping Brentwood School - Hannah Taheri
Scraping Brentwood School - Alex Lowe
Scraping Brentwood School - Rahul Yates
Scraping Brentwood School - Samantha Ho
Scraping Brentwood School - Mika McCaffrey
Checking Bridgewater Raritan (NJ)
Found 2 teams
Scraping Bridgewater Raritan - Amulya Natchukuri
Scraping Bridgewater Raritan - ananya natchukuri
Checking Bronx Science (NY)
Found 16 teams
Scraping Bronx Science - Annie Wang
Scraping Bronx Science - William Freedman
Scraping Bronx Science - Nicolas Kim
Scraping Bronx Science - Rory An
Scraping Bronx Science - Wang Tiffany
Scraping Bronx Science - Meena Shikes
Scraping Bronx Science - Wang Howard
Scraping Bronx Science - Emily Shang
Scraping Bronx Science - H

Scraping Desert Vista - Baaz Jhaj
Scraping Desert Vista - Isabella KeeslerEvans
Checking Dougherty Valley (CA)
Found 20 teams
Scraping Dougherty Valley - Krish Kapoor
Scraping Dougherty Valley - Arjun Garg
Scraping Dougherty Valley - Savit Bhat
Scraping Dougherty Valley - Anurag Rao
Scraping Dougherty Valley - Kabir Dubey
Scraping Dougherty Valley - Mishka Narasimhan
Scraping Dougherty Valley - Shray Patel
Scraping Dougherty Valley - Kavin Kumaravel
Scraping Dougherty Valley - Aayush PateI
Scraping Dougherty Valley - Saketh Kotapati
Scraping Dougherty Valley - Anish Maram
Scraping Dougherty Valley - Arya Goel
Scraping Dougherty Valley - Justin Lee
Scraping Dougherty Valley - Dhruv Channa
Scraping Dougherty Valley - Sohum Tiwary
Scraping Dougherty Valley - Neda Bahrani
Scraping Dougherty Valley - Tarpan Mishra
Scraping Dougherty Valley - Aditya Madaraju
Scraping Dougherty Valley - Pranav Chandra
Scraping Dougherty Valley - Sandhya Nayar
Checking Dripping Springs (TX)
Found 1 teams
Scrap

Scraping Gridiron Chopper Boarding School - Massa Goon
Scraping Gridiron Chopper Boarding School - Ahuja Jean
Scraping Gridiron Chopper Boarding School - Ethan Zaidi
Scraping Gridiron Chopper Boarding School - Matthew Blackhe
Checking Gucci Wang (LA)
Found 1 teams
Scraping Gucci Wang - Cade Savoy
Checking Guyer (AL)
Found 1 teams
Scraping Guyer - Ciarra McClinton
Checking HB Plant (FL)
Found 1 teams
Scraping HB Plant - Taman Kanchanapalli
Checking Half Hollow Hills (NY)
Found 2 teams
Scraping Half Hollow Hills - Victoria Tong
Scraping Half Hollow Hills - AJ Nambiar
Checking Hamilton (AZ)
Found 7 teams
Scraping Hamilton - Ria Manathkar
Scraping Hamilton - Claire Mullings
Scraping Hamilton - Dev Singhania
Scraping Hamilton - Nolan Burke
Scraping Hamilton - Michael Randolph
Scraping Hamilton - Daniel Shih
Scraping Hamilton - Nivea Krishnan
Checking Harker (CA)
Found 40 teams
Scraping Harker - Akshay Manglik
Scraping Harker - Aditi Vinod
Scraping Harker - Anshul Reddy
Scraping Harker - Rah

Scraping James Madison - kim gonzalez
Scraping James Madison - gio calzada
Checking Jenks (OK)
Found 3 teams
Scraping Jenks - Noah Coffman
Scraping Jenks - Maya Chandwaney
Scraping Jenks - Taylor Rafferty
Checking John F Kennedy (IA)
Found 1 teams
Scraping John F Kennedy - Andrew Shea
Checking John Richard Fitzgerald (CA)
Found 1 teams
Scraping John Richard Fitzgerald - Alyssa Sawyer
Checking KAPS (TX)
Found 1 teams
Scraping KAPS - Ben Thomas
Checking KIS (MD)
Found 3 teams
Scraping KIS - Millie Lee
Scraping KIS - Kelly Leee
Scraping KIS - Raphael Koo
Checking Katy Taylor (TX)
Found 1 teams
Scraping Katy Taylor - Ann Popovici
Checking Kellenberg Memorial (NY)
Found 1 teams
Scraping Kellenberg Memorial - Veronica Tadross
Checking Keller (TX)
Found 1 teams
Scraping Keller - Hannah Broussard
Checking Kennedy (CA)
Found 1 teams
Scraping Kennedy - Keshav Rastogi
Checking Khan Academy (TX)
Found 2 teams
Scraping Khan Academy - Aate Kalang
Scraping Khan Academy - Nimun Ghan
Checking Khan Lab 

Scraping Loyola - Ian Brown
Scraping Loyola - Lucas Hunter
Checking Loyola Blakefield (MD)
Found 1 teams
Scraping Loyola Blakefield - Thomas McNulty
Checking Lynbrook (CA)
Found 15 teams
Scraping Lynbrook - Shreeram Modi
Scraping Lynbrook - Yash Mishra
Scraping Lynbrook - Krishna Ajjarapu
Scraping Lynbrook - Ayush Mishraa
Scraping Lynbrook - Siddharth Chattoraj
Scraping Lynbrook - Sid Kannan
Scraping Lynbrook - Arnav Dixit
Scraping Lynbrook - Keshav Dandu
Scraping Lynbrook - Arnav Jain
Scraping Lynbrook - Zach Uriarte
Scraping Lynbrook - Soohyuk Yoon
Scraping Lynbrook - Khushi Nigam
Scraping Lynbrook - Allison Hsu
Scraping Lynbrook - Reina Pradhan
Scraping Lynbrook - Audrey Iwashita
Checking MERS (DE)
Found 0 teams
Checking Magnet (CA)
Found 2 teams
Scraping Magnet - Krishna Khawani
Scraping Magnet - Rutvij Holay
Checking Maize (KS)
Found 1 teams
Scraping Maize - Avery Dover
Checking Marcus (TX)
Found 2 teams
Scraping Marcus - Joey Rogers
Scraping Marcus - Jack Graham
Checking Marlboro

Scraping New Trier - Izaak van Til
Scraping New Trier - Cara Siebert
Checking Newark Science (NJ)
Found 6 teams
Scraping Newark Science - Temitope Ogundare
Scraping Newark Science - Victoria Ajayi
Scraping Newark Science - Simone Braithwaite
Scraping Newark Science - Jasmin Koonjan
Scraping Newark Science - Devin Kyser
Scraping Newark Science - Christal St Clair
Checking Newport (WA)
Found 1 teams
Scraping Newport - Ethan Luo
Checking Newsome (FL)
Found 1 teams
Scraping Newsome - Dylan Burke
Checking Norman (OK)
Found 1 teams
Scraping Norman - Callahan Stroud
Checking North Allegheny (PA)
Found 5 teams
Scraping North Allegheny - Elijah Duckworth
Scraping North Allegheny - Benjamin Lannis
Scraping North Allegheny - Akshana Dassanaike
Scraping North Allegheny - Shejuti Wahed
Scraping North Allegheny - Rajat Reddy
Checking North Central (WA)
Found 1 teams
Scraping North Central - Colton Schons
Checking North Hollywood (CA)
Found 1 teams
Scraping North Hollywood - Lydia Qin
Checking North 

Scraping Sage Hill - Grace Ma
Scraping Sage Hill - Emily Xu
Checking Saint Paul (CA)
Found 1 teams
Scraping Saint Paul - Jeremy Lee
Checking Sam Barlow (OR)
Found 1 teams
Scraping Sam Barlow - Eli Leadham
Checking San Marino (CA)
Found 3 teams
Scraping San Marino - Mark MacDermott
Scraping San Marino - Jacob Chon
Scraping San Marino - Edmond Wen
Checking Santa Monica (CA)
Found 1 teams
Scraping Santa Monica - Rex Evans
Checking Saratoga (CA)
Found 5 teams
Scraping Saratoga - Arnav Garg
Scraping Saratoga - Ujjwal Krishnamurthi
Scraping Saratoga - Howard Huang
Scraping Saratoga - All Novices
Scraping Saratoga - Aeshon Balasubramanian
Checking Savoy Independent (LA)
Found 1 teams
Scraping Savoy Independent - Cade Savoy
Checking Scarsdale (NY)
Found 21 teams
Scraping Scarsdale - Zachary Siegel
Scraping Scarsdale - Curtis Chang
Scraping Scarsdale - Felicty Huang
Scraping Scarsdale - Wolf Cukier
Scraping Scarsdale - Aanya Schoetz
Scraping Scarsdale - Karen Lee
Scraping Scarsdale - Vivian Guo

Scraping Syosset - Alan Huang
Checking Tempe Preparatory Academy (AZ)
Found 2 teams
Scraping Tempe Preparatory Academy - Connor Clark
Scraping Tempe Preparatory Academy - Cole Montei
Checking Teurlings Catholic (LA)
Found 2 teams
Scraping Teurlings Catholic - Cade Savoy
Scraping Teurlings Catholic - Emile Olivier
Checking The Bishops School (CA)
Found 0 teams
Checking The Hill School (PA)
Found 1 teams
Scraping The Hill School - Nethmin Liyanage
Checking The Memorial Novices (TX)
Found 2 teams
Scraping The Memorial Novices - Sebastian Cho
Scraping The Memorial Novices - Daniel Xu
Checking The Village School (TX)
Found 2 teams
Scraping The Village School - Jay Natarajan
Scraping The Village School - Regina Blenda
Checking Thomas Jefferson (VA)
Found 9 teams
Scraping Thomas Jefferson - Avyuk Dixit
Scraping Thomas Jefferson - Teja Buddhavarapu
Scraping Thomas Jefferson - Tarushii Goel
Scraping Thomas Jefferson - Muhurto Rahman
Scraping Thomas Jefferson - Vikram Bala
Scraping Thomas Jeffer

Scraping Winston Churchill - Mireya Rodriguez
Scraping Winston Churchill - Anyone NOT Listed Above is A Novice
Scraping Winston Churchill - Om Joshi
Checking Woodgrove (VA)
Found 1 teams
Scraping Woodgrove - Anthony Cusat
Checking Woodlands (TX)
Found 2 teams
Scraping Woodlands - Allison Aldridge
Scraping Woodlands - Lulu Whitson
Checking Woodlands College Park (TX)
Found 1 teams
Scraping Woodlands College Park - Rawson Duplantis
Checking Woodrow Wilson (AL)
Found 0 teams
Checking Wylie (TX)
Found 3 teams
Scraping Wylie - Deborah Banketa
Scraping Wylie - Justin Meyer
Scraping Wylie - All Novices
Checking Yang Gang (TX)
Found 1 teams
Scraping Yang Gang - Jayant Namdhari
Checking YerbaBuena (CA)
Found 1 teams
Scraping YerbaBuena - Jude Gadingan
