This notebook contains Jonathan's original CSV-writing code, updated so that it:
- uses a csv.writer() object to better handle quotes and special characters
- includes the new functions/fields for additional puzzle metadata
- strips the clue whitespace in advance
- goes backwards (scrapes the newest puzzles first)
- adds some basic error handling

In [11]:
## Import base packages we'll use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#from seaborn import set_style
#set_style("whitegrid")

## this imports BeautifulSoup
from bs4 import BeautifulSoup

from urllib.request import urlopen

from datetime import timedelta, date

import re

In [52]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

def daterange(start_date, end_date):
    """
    Generate every date in [start_date, end_date), newest first.

    Adapted from
    https://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
    and reversed so that iteration runs backwards in time: the first value
    yielded is the day before end_date and the last one is start_date itself.
    Nothing is yielded when end_date is not after start_date.
    """
    span_in_days = int((end_date - start_date).days)
    for offset in reversed(range(span_in_days)):
        yield start_date + timedelta(days=offset)

def get_clues_from_column(column):
    """
    Converts list of divs to lines, clues, and answers

    Twice as many <div>s as hints/answers:
    each line in column is
    <div>Number</div>
    <div>
    Clue : <a href="asdf">Answer</a>
    </div>

    Parameters
    ----------
    column : sequence of elements exposing ``.text``; every second element
        must also expose ``.a.text`` (the answer link).

    Returns
    -------
    (lines, clues, answers) : three parallel lists. ``lines`` holds ints,
    ``clues`` holds the clue text with commas replaced by semicolons and
    surrounding whitespace left intact, ``answers`` holds the link text.

    Raises
    ------
    ValueError if a number <div> is not an integer, AttributeError if a
    clue <div> has no <a> element. A trailing unpaired <div> is ignored.
    """
    lines = []
    clues = []
    answers = []
    for i in range(len(column) // 2):
        number_div = column[2 * i]
        clue_div = column[2 * i + 1]
        lines.append(int(number_div.text))
        # Commas are swapped for semicolons so the clue stays a single field
        # in the hand-built CSV line produced by write_clues_to_csv.
        clue_text = clue_div.text.replace(",", ";")
        # The div text reads "Clue : Answer". Cut at the LAST colon only, so a
        # clue that itself contains a colon is not truncated at its first one.
        # (Assumes the answer text never contains a colon.)
        clues.append(clue_text.rsplit(":", 1)[0])
        answers.append(clue_div.a.text)
    return lines, clues, answers

def get_clues(numclue):
    """
    Turn the page's two <div class="numclue"> containers into clue data.

    numclue[0] is treated as the across container and numclue[1] as the
    down container. The <div> descendants of each are passed to
    get_clues_from_column.

    returns (across, down); each one is a (lines, clues, answers) tuple,
    so either can be expanded with *
    """
    across_section = numclue[0].find_all('div')
    down_section = numclue[1].find_all('div')

    across = tuple(get_clues_from_column(across_section))
    down = tuple(get_clues_from_column(down_section))
    return across, down

def get_stats(page=None):
    '''returns a dict containing rows, columns, words, blocks, and missing letters

    page: parsed document to read from (anything with a BeautifulSoup-style
    find_all). When omitted, the notebook-level global `soup` set by the
    scraping loop is used, which is the original behaviour.

    Every value is a string. A field is left as '' when its span is absent
    or its label is not found in the span text, and all fields are '' when
    the page has no <div id="CPHContent_StatsData"> at all.
    '''
    if page is None:
        page = soup
    stat_block = {'rows' : '','columns' : '','words' : '','blocks' : '','missing' : ''}

    containers = page.find_all('div',{'id':'CPHContent_StatsData'})
    if not containers:
        return stat_block
    stats = containers[0].find_all('span')

    # (field, index of the <span> holding it, pattern, capture group to keep)
    # Raw strings keep \d from being read as an invalid string escape.
    fields = (
        ('rows',    0, r'(?<=Rows: )\d+',    0),
        ('columns', 0, r'(?<=Columns: )\d+', 0),
        ('words',   1, r'(?<=Words: )\d+',   0),
        ('blocks',  1, r'(?<=Blocks: )\d+',  0),
        ('missing', 4, r'({)(.+)(})',        2),  # text between the braces
    )
    for key, span_index, pattern, group in fields:
        try:
            stat_block[key] = re.search(pattern, stats[span_index].get_text()).group(group)
        except (IndexError, AttributeError):
            # IndexError: the span is not there; AttributeError: re.search
            # found nothing and returned None. Either way keep the '' default.
            continue
    return stat_block

def get_authors(page=None):
    '''returns a dict containing puzzle author and editor

    page: parsed document to read from (anything with a BeautifulSoup-style
    find_all). When omitted, the notebook-level global `soup` set by the
    scraping loop is used, which is the original behaviour.

    The children of <div id="CPHContent_AEGrid"> are scanned for a node whose
    text is exactly 'Author:' or 'Editor:'; the value is the text of the node
    immediately after it. A value that cannot be found stays ''. Both values
    are '' when the grid itself is missing from the page.
    '''
    if page is None:
        page = soup
    scraped_info = {'author' : '', 'editor': ''}

    grids = page.find_all('div', {'id' : 'CPHContent_AEGrid'})
    if not grids:
        return scraped_info

    # Materialise the children once instead of rebuilding the list per match.
    children = list(grids[0])
    for index, child in enumerate(children):
        try:
            label = child.get_text()
            if label == 'Author:':
                scraped_info['author'] = children[index+1].get_text()
            elif label == 'Editor:':
                scraped_info['editor'] = children[index+1].get_text()
        except (AttributeError, IndexError):
            # AttributeError: a child without get_text(); IndexError: the
            # label is the last child, so there is no value node after it.
            continue
    return scraped_info

def puz_info(page=None):
    '''get the puzzle title and clue

    page: parsed document to read from (anything with a BeautifulSoup-style
    find_all). When omitted, the notebook-level global `soup` set by the
    scraping loop is used, which is the original behaviour.

    Returns {'title': ..., 'clue': ...} using the text of the first
    <h1 id="PuzTitle"> and the first <h2 class="keyclue">. The two lookups
    are independent: whichever one is missing becomes 'could not retrieve'
    without discarding the other.
    '''
    if page is None:
        page = soup

    def first_text(tag, attrs):
        # Text of the first matching element, or the placeholder when absent.
        try:
            return page.find_all(tag, attrs)[0].get_text()
        except (IndexError, AttributeError):
            return 'could not retrieve'

    title = first_text('h1', {'id' : 'PuzTitle'})
    clue = first_text('h2', {'class' : 'keyclue'})
    return {'title' : title, 'clue' : clue}

def print_column(lines, clues, answers):
    """Print one "number - clue : answer" line per entry of the three parallel lists."""
    template = "%d - %s : %s"
    for entry in zip(lines, clues, answers):
        print(template % entry)
        
def write_clues_to_csv(date, clues, fname, direction):
    """
    Write one CSV line per clue to an already-open text file.

    date      : the puzzle date (needs .year, .month, .day and .strftime)
    clues     : (lines, clues, answers) tuple of parallel lists
    fname     : writable text file object (despite the name, not a path)
    direction : label written as-is into the Direction column

    Columns, in order: year, month, day, weekday, direction, line, clue,
    answer, puzzle title, puzzle clue, rows, columns, words, blocks, missing
    letters. Author and editor are not part of this legacy format.

    Fields go through csv.writer, so values containing commas or quotes
    (weekday puzzle titles always contain commas) are quoted rather than
    spilling into neighbouring columns. Lines still end in a bare newline.
    Nothing is written, and the page is not inspected, when there are no clues.
    """
    import csv  # local import: the notebook only imports csv in a later cell

    entries = list(zip(*clues))
    if not entries:
        return

    # Page-level metadata is the same for every clue, so look it up once.
    info = puz_info()
    stats = get_stats()
    writer = csv.writer(fname, lineterminator="\n")
    for l, c, a in entries:
        writer.writerow([
            date.year, date.month, date.day, date.strftime('%A'),
            direction, l, c.strip(), a.strip(),
            info['title'], info['clue'],
            stats['rows'], stats['columns'],
            stats['words'], stats['blocks'], stats['missing'],
        ])

In [53]:
def make_csv_rows(date, clues, direction):
    '''updated write_clues_to_csv to instead make the clues as a list of lists, for csv_writer

    date      : the puzzle date (needs .year, .month, .day and .strftime)
    clues     : (lines, clues, answers) tuple of parallel lists
    direction : label stored as-is in the Direction column

    Returns one list per clue with, in order: year, month, day, weekday,
    direction, line, clue (stripped), answer (stripped), author, editor,
    puzzle title, puzzle clue, rows, columns, words, blocks, missing letters.
    Returns [] without inspecting the page when there are no clues.
    '''
    entries = list(zip(*clues))
    if not entries:
        return []

    # Author, title and grid statistics describe the whole puzzle, so they are
    # looked up once here rather than re-scraped for every field of every clue.
    authors = get_authors()
    info = puz_info()
    stats = get_stats()

    date_fields = [date.year, date.month, date.day, date.strftime('%A'), direction]
    puzzle_fields = [
        authors['author'], authors['editor'],
        info['title'], info['clue'],
        stats['rows'], stats['columns'],
        stats['words'], stats['blocks'], stats['missing'],
    ]
    return [
        date_fields + [l, c.strip(), a.strip()] + puzzle_fields
        for l, c, a in entries
    ]


Running the cell below will start the scraping process!

In [60]:
import csv
from pathlib import Path

header = "Year,Month,Day,Weekday,Direction,Line,Hint,Answer,Author,Editor,PuzTitle,PuzClue,NumRows,NumCols,NumWords,NumBlocks,MissingLetters".split(',')

output_path = Path("nick-xword.csv")
# The file is opened in append mode so earlier scrapes are kept. Only write the
# header when the file is new or empty, otherwise every re-run would add a
# second header row in the middle of the data.
needs_header = not output_path.exists() or output_path.stat().st_size == 0

with open(output_path, 'a', newline='', encoding='utf-8') as f:
    start_date = date(1995, 1, 1)
    end_date = date(1995, 1, 7) # go up to 2021,5,1
    csv_writer = csv.writer(f)
    if needs_header:
        csv_writer.writerow(header)
    for single_date in daterange(start_date, end_date):
        try:
            # Un-padded M/D/YYYY built by hand: the strftime flags for dropping
            # leading zeros differ between Windows (%#m) and mac/linux (%-m).
            date_for_url = f"{single_date.month}/{single_date.day}/{single_date.year}"
            print(date_for_url)
            with urlopen("https://www.xwordinfo.com/Crossword?date="+date_for_url) as html:
                # `soup` must stay a notebook-level global: get_stats(),
                # get_authors() and puz_info() read it when called with no argument.
                soup = BeautifulSoup(html,"html.parser")
            numclue = soup.find_all('div', {'class': 'numclue'})
            across, down = get_clues(numclue)
            csv_writer.writerows(make_csv_rows(single_date, across, "Across"))
            csv_writer.writerows(make_csv_rows(single_date, down, "Down"))
        except Exception as e:
            # Append, one line per failure. Opening with "w+" truncated the log
            # each time, so only the most recent error ever survived.
            with open("nick-errors.txt", "a", encoding="utf-8") as log:
                log.write(f"Error getting puzzle for {single_date}: {e}\n")

1/6/1995
1/5/1995
1/4/1995
1/3/1995
1/2/1995
1/1/1995


Test the output file with pandas:

In [67]:
# Load the scraped file back in and eyeball a random handful of rows.
df = pd.read_csv("nick-xword.csv")

df.sample(n=10)

Unnamed: 0,Year,Month,Day,Weekday,Direction,Line,Hint,Answer,Author,Editor,PuzTitle,PuzClue,NumRows,NumCols,NumWords,NumBlocks,MissingLetters
447,1995,1,1,Sunday,Across,121,Where heros are made,DELI,Henry Hook,Will Shortz,A New Beginning,"Adolphe ___, musical instrument inventor",21,22,151,76,
430,1995,1,1,Sunday,Across,90,Greek consonants,PSIS,Henry Hook,Will Shortz,A New Beginning,"Adolphe ___, musical instrument inventor",21,22,151,76,
281,1995,1,3,Tuesday,Down,23,"""Far out""",RAD,Rich Norris,Will Shortz,"New York Times, Tuesday, January 3, 1995","Jodie Foster's directorial debut, 1991",15,15,78,38,JQZ
221,1995,1,4,Wednesday,Down,57,Buster Brown's dog,TIGE,Stephanie Spadaccini,Will Shortz,"New York Times, Wednesday, January 4, 1995","""Wake Me Up Before You Go-Go"" group",15,15,78,38,FQXZ
290,1995,1,3,Tuesday,Down,40,Atty.'s degree,LLB,Rich Norris,Will Shortz,"New York Times, Tuesday, January 3, 1995","Jodie Foster's directorial debut, 1991",15,15,78,38,JQZ
137,1995,1,5,Thursday,Down,39,Respond angrily,LASHOUT,Harvey Estes,Will Shortz,"New York Times, Thursday, January 5, 1995",Period starting about 1000 B.C.,15,15,72,33,BFJVY
253,1995,1,3,Tuesday,Across,51,Monkeyshine,ANTIC,Rich Norris,Will Shortz,"New York Times, Tuesday, January 3, 1995","Jodie Foster's directorial debut, 1991",15,15,78,38,JQZ
300,1995,1,3,Tuesday,Down,57,Cast out,EGEST,Rich Norris,Will Shortz,"New York Times, Tuesday, January 3, 1995","Jodie Foster's directorial debut, 1991",15,15,78,38,JQZ
0,1995,1,6,Friday,Across,1,Karate blow,CHOP,Fran & Lou Sabin,Will Shortz,"New York Times, Friday, January 6, 1995","""Werewolf of London"" star, 1935",15,15,78,38,FJQZ
330,1995,1,2,Monday,Across,52,Lair,DEN,Sidney L. Robbins,Will Shortz,"New York Times, Monday, January 2, 1995",Shape of St. Anthony's cross,15,15,78,42,JQXYZ
