updated 5/19/2021

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm
import nltk
from collections import defaultdict

In [None]:
# Load the clue-level data. Answer and Hint go through `str` converters so that
# literal answers such as "NULL" stay strings instead of being parsed as missing.
string_columns = {'Answer': str, 'Hint': str}
df = pd.read_csv("nick-xword-big.csv", converters=string_columns)

# Number of distinct puzzle titles per author, most prolific first
puzzles_per_author = df.groupby('Author')['PuzTitle'].nunique()
puzzles_per_author.sort_values(ascending=False).reset_index()

In [None]:
# Preview the first rows to sanity-check the loaded columns
df.head()

In [None]:
# Mean NumBlocks per weekday, largest first
mean_blocks = df.groupby('Weekday')['NumBlocks'].mean()
mean_blocks.sort_values(ascending=False).reset_index()

In [None]:
# BlockDensity: blocks as a percentage of all grid squares (rows x cols)
grid_squares = df['NumRows'] * df['NumCols']
df['BlockDensity'] = (df['NumBlocks'] / grid_squares) * 100

In [None]:
# Add a column holding the character count of each answer.
# Answer was read through a `str` converter, so every value is a string.
df['AnswerLength'] = df['Answer'].str.len()

In [None]:
# The ten rows with the longest answers, showing only the columns of interest
display_columns = ['Weekday', 'Hint', 'Answer', 'AnswerLength']
by_length = df.sort_values(by='AnswerLength', ascending=False)
by_length[display_columns].head(10)

## block density

In [None]:
# Per-weekday mean BlockDensity, plus the list of every individual BlockDensity
# value for that weekday (the lists feed the box plot later).
# The string 'mean' is used instead of np.mean: passing NumPy callables to
# GroupBy.agg is deprecated in recent pandas, and the resulting column is still
# named 'mean' either way.
data = df.groupby(['Weekday'])['BlockDensity'].agg(['mean', list]).reset_index()
data

In [None]:
# Order weekdays from highest to lowest mean BlockDensity
data = data.sort_values(by='mean', ascending=False)

In [None]:
# Box plot of BlockDensity, one box per weekday, in the current row order of `data`
fig, ax = plt.subplots(figsize=(10, 8))
bplot = ax.boxplot(data['list'], patch_artist=True, medianprops=dict(color='black'))
ax.set_xticklabels(data['Weekday'].tolist(), fontsize=14)
ax.set_ylabel('pct of puzzle as blocks', fontsize=16)
ax.set_title('BlockDensity by weekday', fontsize=20)
ax.grid(True, axis='y')

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('Pastel1')

# Give each box its own colour; zip stops once the boxes run out
for patch, color in zip(bplot['boxes'], cmap.colors):
    patch.set_facecolor(color)
# To save the figure, uncomment: plt.savefig(fname="density")
plt.show()

## puzzle density over time

In [None]:
# Build a single datetime column from Year/Month/Day
# (intended to double as a unique puzzle identifier)
date_parts = df[['Year', 'Month', 'Day']].astype(str)
df['Date'] = pd.to_datetime(
    date_parts['Year'] + '-' + date_parts['Month'] + '-' + date_parts['Day']
)

In [None]:
# Mean BlockDensity for every (Weekday, Year) pair, as a flat table
time_data = (
    df.groupby(['Weekday', 'Year'])['BlockDensity']
    .mean()
    .reset_index()
)
time_data

In [None]:
# One line per weekday: mean BlockDensity against year
fig, ax = plt.subplots(figsize=(10, 8))

for day in df['Weekday'].unique():
    day_rows = time_data[time_data['Weekday'] == day]
    ax.plot(day_rows['Year'], day_rows['BlockDensity'], label=day)

ax.set_ylabel('pct of puzzle as blocks', fontsize=14)
ax.set_title('Average weekday puzzle density by year', fontsize=16)
ax.legend()

plt.show()

In [None]:
# Show rows whose answer has fewer than two characters
is_short = df['Answer'].str.len() < 2
df[is_short]

## average answer length

In [None]:
# Per-weekday mean AnswerLength plus the list of every individual length
# (the lists feed the box plot), ordered from shortest to longest mean.
# The string 'mean' is used instead of np.mean: passing NumPy callables to
# GroupBy.agg is deprecated in recent pandas; the column is named 'mean' either way.
# reset_index() and sort_values() already return new frames, so no .copy() is needed.
len_data = df.groupby('Weekday')['AnswerLength'].agg(['mean', list]).reset_index()
len_data = len_data.sort_values(by='mean', ascending=True)
len_data

In [None]:
# Box plot of answer lengths per weekday; outliers hidden, means marked
fig1, ax1 = plt.subplots(figsize=(10, 8))

weekday_labels = len_data['Weekday'].tolist()
ax1.boxplot(len_data['list'], showfliers=False, showmeans=True)
ax1.set_xticklabels(weekday_labels, fontsize=14)
ax1.set_title('Answer length by weekday', fontsize=20)
ax1.set_ylabel('length of answer', fontsize=16)
ax1.grid(True, axis='y')

plt.show()

## playing with homophones, work in progress

In [None]:
# CMU Pronouncing Dictionary in two forms:
#   prondict    - dict used below to look up the pronunciations of a word
#   pronentries - iterable of (word, pronunciation) pairs
# NOTE(review): presumably requires the 'cmudict' corpus to be downloaded via nltk first - confirm
prondict = nltk.corpus.cmudict.dict()
pronentries = nltk.corpus.cmudict.entries()

In [None]:
# Reverse index: pronunciation (as a hashable tuple) -> every word pronounced that way
homophonedict = defaultdict(list)

for entry_word, phonemes in pronentries:
    homophonedict[tuple(phonemes)].append(entry_word)

In [None]:
def get_homophones(word):
    """Return a list of the other words sharing any pronunciation with *word*.

    Looks *word* up in ``prondict``; for each pronunciation found, collects the
    words listed under it in ``homophonedict``. *word* itself is excluded and
    duplicates are removed. Returns an empty list when *word* is not in
    ``prondict``.
    """
    pronunciations = prondict.get(word)
    if pronunciations is None:
        return []

    matches = set()
    for phonemes in pronunciations:
        # homophonedict is keyed by tuples, so convert before the lookup
        matches.update(homophonedict.get(tuple(phonemes)))

    # a word is not its own homophone
    matches.discard(word)
    return list(matches)

In [None]:
# Quick manual check of get_homophones on a single word
get_homophones('bite')

In [None]:
# Sample clue for trying out get_clue_homophones
test = "The man walked to the store and bought a pie?"


def get_clue_homophones(clue):
    """Return the total homophone count over all tokens of *clue*.

    The clue is split with ``nltk.word_tokenize``; each token is lower-cased
    and passed to ``get_homophones``, and the lengths of the returned lists
    are added up.
    """
    return sum(
        len(get_homophones(token.lower()))
        for token in nltk.word_tokenize(clue)
    )

In [None]:
# Try the clue-level count on the sample sentence
get_clue_homophones(test)

In [None]:
# Single word followed by punctuation
# NOTE(review): presumably checks that the trailing '?' does not change the count - confirm
get_clue_homophones('bite?')

In [None]:
# Total homophone count for each hint
df['HintHomophones'] = df['Hint'].apply(get_clue_homophones)

In [None]:
# Number of tokens in each hint, as produced by nltk.word_tokenize
df['HintLength'] = [len(nltk.word_tokenize(hint)) for hint in df['Hint']]

In [None]:
# Sum of hint homophone counts per weekday
df.groupby('Weekday')['HintHomophones'].sum()

In [None]:
# Debugging aid: print the index label and text of every hint that
# get_clue_homophones fails on.
# Iterating with .items() yields the real index labels, so this works even if
# df does not have a default 0..n-1 index.
for idx, hint in df['Hint'].items():
    try:
        get_clue_homophones(hint)
    except Exception:
        # `except Exception` rather than a bare `except`, so Ctrl-C
        # (KeyboardInterrupt) can still stop this long loop
        print(idx, hint)
In [None]:
# Inspect the single row with index label 235264
# NOTE(review): presumably a row flagged by the failure scan - confirm
df.loc[235264]