In [1]:
from bs4 import BeautifulSoup
import re
import requests
import string

In [56]:
# Page fetched by Jeopardy.__init__; its "showseason" links are followed to reach each season.
JEOP_URL = "http://www.j-archive.com/listseasons.php"
# Path of the CSV file that Jeopardy._save writes clue rows to.
Q_PATH = "qs.csv"
# Header line written to Q_PATH before the first batch of clues.
COLUMNS = "QUESTION,ANSWER,SEASON,DOLLARS\n"

In [68]:
class Clue(object):
    """Hold, clean, and serialize the data scraped from a single Jeopardy clue.

    The question, answer and dollar value start out as empty strings and are
    filled in by `parse()`.
    """

    # Translation table that deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self, ssn, raw_html):
        """Store the raw clue markup together with its season.

        Args:
          ssn: season identifier; written out verbatim by `to_string()`.
          raw_html: parsed HTML node for the clue cell. It must support
            `find(class_=...)` and `str()`.
        """
        self._raw_html = raw_html
        self._ssn = ssn
        self._q = ""
        self._dollars = ""
        self._a = ""

    def parse(self):
        """Extract the question, answer and dollar value from the raw html.

        If the question or the answer cannot be located, the raw markup is
        printed and the method returns without reading the dollar value.
        """
        try:
            self._q = self._raw_html.find(class_="clue_text").text
            # The answer is taken from the cell's markup: the text that
            # follows "correct_response", up to the next "<table", capped at
            # 30 characters.
            self._a = str(self._raw_html).split("correct_response")[1].split("<table")[0]
            self._a = self._a[:30]
        except (AttributeError, IndexError):
            # AttributeError: find() returned None (no clue_text element).
            # IndexError: the "correct_response" marker is absent.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            print("\n\nFAILED TO PARSE:")
            print(self._raw_html)
            return
        # A regular clue carries "clue_value"; otherwise fall back to
        # "clue_value_daily_double". If neither exists the value stays empty
        # instead of raising.
        for css_class in ("clue_value", "clue_value_daily_double"):
            node = self._raw_html.find(class_=css_class)
            if node is not None:
                self._dollars = node.text
                break

    def _clean_text(self, s):
        """Remove newlines and punctuation from a string and lowercase it.

        Args:
          s: (string) text to clean.

        Returns:
          The cleaned string.
        """
        s = s.replace("\n", "")
        # str.replace(string.punctuation, "") would only match the entire
        # punctuation string as one substring; translate() deletes each
        # punctuation character individually (this includes "'" and ",").
        s = s.translate(self._PUNCTUATION_TABLE)
        return s.lower()

    def to_string(self):
        """Create the comma-separated line for this clue.

        Returns:
          "question,answer,season,dollars" with question and answer cleaned.
        """
        q = self._clean_text(self._q)
        a = self._clean_text(self._a)
        # A comma inside the dollar value would add an extra CSV column.
        dollars = self._dollars.replace(",", "")
        return "{},{},{},{}".format(q, a, self._ssn, dollars)

In [69]:
class Game(object):
    """Fetches one Jeopardy game page and exposes its clues per round."""

    def __init__(self, url, ssn):
        """Download and parse the game page.

        Args:
          url: (string) address of the game page; the text after the first
            "=" is kept as the game id.
          ssn: season identifier handed on to every Clue.
        """
        self._url, self._ssn = url, ssn
        self._game_id = url.split("=")[1]
        self._page = requests.get(url)
        self._parsed_page = BeautifulSoup(self._page.content, "lxml")

    def get_clues(self, round_name):
        """Collect the parsed clues of one round.

        Args:
          round_name: (string) one of jeopardy, double_jeopardy, final_jeopardy.

        Returns:
          list of Clue objects, each already parsed.
        """
        valid_rounds = ("jeopardy", "double_jeopardy", "final_jeopardy")
        assert round_name in valid_rounds

        # Exactly one element with id "<round_name>_round" must be present.
        matches = self._parsed_page.find_all(id=round_name + "_round")
        assert len(matches) == 1
        round_table = matches[0]

        # The final round must hold a single clue, the other rounds 30.
        cells = round_table.find_all(class_="clue")
        expected_count = 1 if round_name == "final_jeopardy" else 30
        assert len(cells) == expected_count

        parsed = []
        for cell in cells:
            clue = Clue(self._ssn, cell)
            clue.parse()
            parsed.append(clue)
        return parsed

In [70]:
class Season(object):
    """Class to parse data for a single Jeopardy season."""

    # Prefix applied to links that do not already start with "http".
    _BASE_URL = "http://www.j-archive.com/"

    def __init__(self, url):
        """Download and parse the season page.

        Args:
          url: (string) address of the season page; the text after the first
            "=" is exposed as `ssn`.
        """
        self._url = url
        self.ssn = url.split("=")[1]
        self._page = requests.get(url)
        self._parsed_page = BeautifulSoup(self._page.content, "lxml")

    def get_links(self):
        """Returns the links to all of the games on the season page.

        Returns:
          list of strings, one per anchor whose href contains "game_id".
          Links that do not start with "http" are prefixed with the site
          root so they can be requested directly.
        """
        anchors = self._parsed_page.find_all("a")
        links = []
        for anchor in anchors:
            href = anchor.get("href")
            # Anchors without an href attribute yield None; skip them rather
            # than failing on the substring test.
            if not href or "game_id" not in href:
                continue
            if not href.startswith("http"):
                href = "{}{}".format(self._BASE_URL, href)
            links.append(href)
        return links

In [71]:
class Jeopardy(object):
    """Class to scrape all Jeopardy data.

    Typical use: construct, call `get_season_links()`, then `get_clues()`.
    """

    def __init__(self):
        """Download and parse the page at JEOP_URL."""
        self._url = JEOP_URL
        self._page = requests.get(self._url)
        self._parsed_page = BeautifulSoup(self._page.content, "lxml")
        self._season_links = []
        # Number of clues written so far; 0 also means "header not written".
        self._last_saved = 0

    def get_season_links(self):
        """Grabs the links to all of the seasons on the main page.

        Stores one absolute URL per distinct season in `self._season_links`,
        in page order.
        """
        anchors = self._parsed_page.find_all("a")
        seen = set()
        formatted_links = []
        for anchor in anchors:
            link = anchor.get("href")
            # Anchors without an href attribute yield None; skip them rather
            # than failing on the substring test.
            if not link or "showseason" not in link:
                continue
            parts = link.split("=")
            if len(parts) < 2:
                # No "=" in the link, so there is no season id to read.
                continue
            ssn = parts[1]
            if ssn in seen:
                continue
            if link.startswith("http"):
                formatted_links.append(link)
            else:
                formatted_links.append("http://www.j-archive.com/{}".format(link))
            seen.add(ssn)
        self._season_links = formatted_links

    def _save(self, clues):
        """Dumps existing chunk of clue data to a CSV.

        The first call (re)creates Q_PATH and writes the header; later calls
        append. Every clue is written on its own line.

        Args:
          clues: list of objects with a `to_string()` method.
        """
        first_write = self._last_saved == 0
        clue_strings = [c.to_string() for c in clues]
        total = self._last_saved + len(clue_strings)
        print("Saving {} clues, for a total of {} clues saved.".format(len(clue_strings), total))
        with open(Q_PATH, "w" if first_write else "a", encoding="utf-8") as f:
            if first_write:
                f.write(COLUMNS)
            # Terminate every row, including the last one, so that the next
            # appended chunk starts on a fresh line.
            f.writelines(line + "\n" for line in clue_strings)
        # Only count the clues once they have actually been written.
        self._last_saved = total

    def get_clues(self):
        """Grabs data about each clue, utilizing the Season, Game, and Clue objects.

        Each game's clues are saved as soon as the game has been parsed.
        """
        assert len(self._season_links) != 0, "call get_season_links() first"

        for season_link in self._season_links:
            season = Season(season_link)
            game_links = season.get_links()
            ssn = season.ssn

            print("Parsing games for season {}.".format(ssn))
            for game_link in game_links:
                game = Game(game_link, ssn)
                # Only the first round is scraped here.
                self._save(game.get_clues("jeopardy"))

In [73]:
# Run the full scrape: fetch the season index, collect the season links,
# then parse every game and write its clues to Q_PATH as it goes.
j = Jeopardy()
j.get_season_links()
j.get_clues()

Parsing games for season 36.


FAILED TO PARSE:
<td class="clue">
</td>
Saving 30 clues, for a total of 30 clues saved.


FAILED TO PARSE:
<td class="clue">
</td>


FAILED TO PARSE:
<td class="clue">
</td>
Saving 30 clues, for a total of 60 clues saved.
Saving 30 clues, for a total of 90 clues saved.
Saving 30 clues, for a total of 120 clues saved.
Saving 30 clues, for a total of 150 clues saved.
Saving 30 clues, for a total of 180 clues saved.
Saving 30 clues, for a total of 210 clues saved.
Saving 30 clues, for a total of 240 clues saved.


PermissionError: [Errno 13] Permission denied: 'qs.csv'