In [11]:
## Import base packages we'll use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#from seaborn import set_style
#set_style("whitegrid")

## this import BeautifulSoup
from bs4 import BeautifulSoup

from urllib.request import urlopen

from datetime import timedelta, date

In [12]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

def daterange(start_date, end_date):
    """
    https://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
    
    returns generator to iterate over days in date range

    modified here to go backwards
    """
    for n in range(int((end_date - start_date).days))[::-1]:
        yield start_date + timedelta(n)

def get_clues_from_column(column):
    """
    Converts list of divs to lines, clues, and answers
    
    Twice as many <div>s as hints/answers:
    each line in column is 
    <div>Number</div>
    <div>
    Clue : <a href="asdf">Answer</a>
    </div>
    """
    lines = []
    clues = []
    answers = []
    for i in range(int(len(column)/2)):
        lines.append(int(column[2*i].text))
        clues.append(column[2*i+1].text.replace(",", ";").split(":")[0]) # This splits on commas too
        answers.append(column[2*i+1].a.text)
    return lines, clues, answers

def get_clues(numclue):
    """
    There are 2 <div class="numclue">, 
    one for across hints/answers and one for down
    
    returns across and down as tuples
    can be expanded with *
    """
    across_divs = numclue[0].find_all('div')
    down_divs = numclue[1].find_all('div')
    
    a_lines, a_clues, a_answers = get_clues_from_column(across_divs)
    d_lines, d_clues, d_answers = get_clues_from_column(down_divs)
    
    across = (a_lines, a_clues, a_answers)
    down = (d_lines, d_clues, d_answers)
    
    return across, down

def print_column(lines, clues, answers):
    for l,c,a in zip(lines, clues, answers):
        print("%d - %s : %s" % (l,c,a))
        
def write_clues_to_csv(date, clues, fname, direction):
    for l,c,a in zip(*clues):
        fname.write(f"{date.year},{date.month},{date.day},{date.strftime('%A')},{direction},{l},{c},{a}\n")

In [13]:
with open("nick-xword.csv", 'w+', encoding='utf-8') as csv_file:
    csv_file.write("Year,Month,Day,Weekday,Direction,Line,Hint,Answer\n")

    start_date = date(1995, 1, 1)
    end_date = date(2021, 5, 1) # go up to 2021,5,1

    for single_date in daterange(start_date, end_date):
        try:
            day_of_week = single_date.strftime('%A')
            date_for_url = single_date.strftime("%#m/%#d/%Y") # replace # with - on mac/linux
            print(date_for_url)
            html = urlopen("https://www.xwordinfo.com/Crossword?date="+date_for_url)
            soup = BeautifulSoup(html,"html.parser")
            numclue = soup.find_all('div', {'class': 'numclue'})
            across, down = get_clues(numclue)
            #print("across")
            #print_column(*across)
            #print("Down")
            #print_column(*down)
            write_clues_to_csv(single_date, across, csv_file, "Across")
            write_clues_to_csv(single_date, down, csv_file, "Down")
        except:
            print(f"Could not get puzzle for {single_date}")

4/30/2021
4/29/2021
4/28/2021
4/27/2021
4/26/2021
4/25/2021
4/24/2021
4/23/2021
4/22/2021
4/21/2021
4/20/2021
4/19/2021
4/18/2021
4/17/2021
4/16/2021
4/15/2021
4/14/2021
4/13/2021
4/12/2021
4/11/2021
4/10/2021
4/9/2021
4/8/2021
4/7/2021
4/6/2021
4/5/2021
4/4/2021
4/3/2021
4/2/2021
4/1/2021
3/31/2021
3/30/2021
3/29/2021
3/28/2021
3/27/2021
3/26/2021
3/25/2021
3/24/2021
3/23/2021
3/22/2021
3/21/2021
3/20/2021
3/19/2021
3/18/2021
3/17/2021
3/16/2021
3/15/2021
3/14/2021
3/13/2021
3/12/2021
3/11/2021
3/10/2021
3/9/2021
3/8/2021
3/7/2021
3/6/2021
3/5/2021
3/4/2021
3/3/2021
3/2/2021
3/1/2021
2/28/2021
2/27/2021
2/26/2021
2/25/2021
2/24/2021
2/23/2021
2/22/2021
2/21/2021
2/20/2021
2/19/2021
2/18/2021
2/17/2021
2/16/2021
2/15/2021
2/14/2021
2/13/2021
2/12/2021
2/11/2021
2/10/2021
2/9/2021
2/8/2021
2/7/2021
2/6/2021
2/5/2021
2/4/2021
2/3/2021
2/2/2021
2/1/2021
1/31/2021
1/30/2021
1/29/2021
1/28/2021
1/27/2021
1/26/2021
1/25/2021
1/24/2021
1/23/2021
1/22/2021
1/21/2021
1/20/2021
1/19/2021
1/18/20

# Notes

- 6/7/2000 is a uniclue puzzle, not included for now

In [13]:
with open("xword2.csv", 'w+', encoding='utf-8') as csv_file:
    #csv_file = open("xword2.csv", "w+")
    csv_file.write("Year,Month,Day,Weekday,Direction,Line,Hint,Answer\n")

    single_date = date(1994,6,19)

    html = urlopen("https://www.xwordinfo.com/Crossword?date=6/19/1994")
    soup = BeautifulSoup(html,"html.parser")
    numclue = soup.find_all('div', {'class': 'numclue'})
    across, down = get_clues(numclue)
    print("across")
    print_column(*across)
    print("Down")
    print_column(*down)
    write_clues_to_csv(single_date, across, csv_file, "Across")
    write_clues_to_csv(single_date, down, csv_file, "Down")

#csv_file.close()

across
1 - Rodeo rope  : LASSO
6 - City north of Des Moines  : AMES
10 - Sch. supporters  : PTAS
14 - Esaus wife  : ADAH
18 - Travel section advertiser  : USAIR
19 - ___ wire  : LIVE
20 - Elektra baritone  : OREST
21 - Army mascot  : MULE
22 - ROBS  : CARLREINER
24 - BRIDGETS  : PETERFONDA
26 - Neighbor of Scot.  : ENG
27 - JOHNS  : TEXRITTER
29 - Escapee  : ELUDER
30 - Anarchist Goldman  : EMMA
33 - Gladly; in olden times  : FAIN
34 - Night rumblers  : SNORERS
35 - Asia-Africa link  : SINAI
37 - Org. once headed by Lewis Strauss  : AEC
40 - Half of D  : CCL
41 - Conceit  : EGO
42 - Exaggerators suffix  : EST
43 - Yucca fiber  : ISTLE
44 - NANCYS  : FRANKSINATRA
48 - Theyre checked at checkpoints  : VISAS
49 - Record collections  : FILES
50 - Adam was his father  : SETH
51 - Yellow-breasted bird  : CHAT
55 - Pale  : ASHY
56 - Cockatoo cousin  : MACAW
57 - Approved model : STD
58 - Contrary girl  : MARY
59 - Seventh Muslim month  : RAJAB
61 - Gershwins ___ It a Pity?  : ISNT

In [12]:
csv_file.close()

["Smuggler's nemesis, maybe ", ' SHOREPATROL']

In [17]:
df = pd.read_csv("xword.csv")

df.tail()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 11: invalid continuation byte