In [2]:
import requests
from bs4 import BeautifulSoup
import re

In [3]:
class Game(object):
    
    def __init__(self, url):
        """Class to hold and parse data from a single Jeopardy game."""
        self._url = url
        self._game_id = url.split("=")[1]
        self._page = requests.get(url)
        self._parsed_page = BeautifulSoup(self._page.content, "lxml")
        
    def get_questions(self, round_name):
        """Method to return all of the questions and answers for a given round.
        
        Args:
          round_name: (string) one of jeopardy, double_jeopardy, final_jeopardy.
        
        Returns:
          list of (question, answer) tuples, unformatted.
        """
        assert round_name in ["jeopardy", "double_jeopardy", "final_jeopardy"]
        
        table = self._parsed_page.find_all(id=round_name + "_round")
        assert(len(table) == 1)
        
        questions = []
        clues = table[0].find_all(class_="clue")
        if round_name == "final_jeopardy":
            assert(len(clues) == 1)
        else:
            assert(len(clues) == 30)
            
        for clue in clues:
            question = clue.find(class_="clue_text")
            answer = str(clue).split("correct_response")[1].split("/")[0].split(";")[-2][:-3]  # I'm not happy about this.
            questions.append((question.text, answer))
            
        return questions

In [4]:
class Season(object):
    
    def __init__(self, url):
        """Class to parse data for a season of Jeopardy."""
        self._url = url
        self._page = requests.get(url)
        self._parsed_page = BeautifulSoup(self._page.content, "lxml")
        
    def get_links(self):
        elements = self._parsed_page.find_all("a")
        all_links = [e.get("href") for e in elements]
        links = [l for l in all_links if "game_id" in l]
        return links

In [10]:
class Jeopardy(object):
    
    def __init__(self):
        self._url = "http://www.j-archive.com/listseasons.php"
        self._page = requests.get(self._url)
        self._parsed_page = BeautifulSoup(self._page.content, "lxml")
        self._season_links = []
        self._questions = []
        
    def get_seasons(self):
        elements = self._parsed_page.find_all("a")
        all_links = [e.get("href") for e in elements]
        links = [l for l in all_links if "showseason" in l]
        formatted_links = ["http://www.j-archive.com/{}".format(l) for l in links]
        self._season_links = formatted_links
        
    def _save(self):
        out = "\n".join(["{}\n{}\n".format(q, a) for q, a in self._questions])
        with open("qs.txt", "wb") as f:
            f.write(out)
        
    def get_questions(self):
        assert len(self._season_links) != 0
        print("Parsing {} seasons...".format(len(self._season_links)))
        
        for i, link in enumerate(self._season_links):
            print("Parsing {}th season...".format(i))
            s = Season(link)
            links = s.get_links()
            for link in s.get_links():
                self._save()
                g = Game(link)
                try:
                    self._questions.extend(g.get_questions("jeopardy"))
                except:
                    print("jeopardy", link)
                try:
                    self._questions.extend(g.get_questions("double_jeopardy"))
                except:
                    print("double_jeopardy", link)
                try:
                    self._questions.extend(g.get_questions("final_jeopardy"))
                except:
                    print("final_jeopardy", link)
                del g
    
j = Jeopardy()
print("Initialized Jeopardy object.")
j.get_seasons()
print("Finished getting seasons.")
qs = j.get_questions()


Initialized Jeopardy object.
Finished getting seasons.
Parsing 41 seasons...
Parsing 0th season...
Parsing 1th season...
Parsing 2th season...


TypeError: a bytes-like object is required, not 'str'