# Preprocessing sandbox

To help improve our preprocessing script in `helper.py`, use this space to adjust things. This is also helpful for just visualizing what happens at each point in the preprocessing pipeline. 

Adapted from Miggy and Rahma's work

In [11]:
import urllib
import re
import helper

# Run this to re-import helper.py after editing it, so the changes take effect
# without having to restart the kernel
import importlib

importlib.reload(helper)

<module 'helper' from '/Users/glchau/Desktop/Caltech/CS155/loan-sharks/poems/helper.py'>

In [2]:
# Load in syllable dictionary and save outputs into a dictionary.
# Each non-blank line is split on whitespace: the first field is the word and
# the last field is stored as its value (kept as a string, not converted).

with open("data/Syllable_dictionary.txt") as f:
    syllable_dict_ = f.readlines()

syllable_dict = {}

for line in syllable_dict_:
    # str.split() with no arguments already ignores leading/trailing whitespace
    fields = line.split()
    if not fields:
        # Blank line (e.g. trailing newline at the end of the file): there is
        # nothing to parse and indexing the empty list would raise IndexError
        continue
    word = fields[0]
    num = fields[-1]
    syllable_dict[word] = num

# Create a set of words from the syllable dictionary to match with
# words from the sonnet parsing. This is mainly for checking that
# our parsing is consistent with the full set of words in syllable_dictionary
syllable_words = set(syllable_dict.keys())

In [3]:
# We were asked to import directly from GitHub; that attempt is kept below for
# reference, but for now the sonnets are read from the local copy instead.
# with urllib.request.urlopen("https://github.com/charlesincharge/Caltech-CS155-2022/tree/main/miniprojects/miniproject3/data/shakespeare.txt") as f:
#     shakespeare = f.readlines()
with open("data/shakespeare.txt") as sonnet_file:
    # One list entry per line of the file, line endings included
    shakespeare = list(sonnet_file)


In [4]:
# Get start line of each sonnet: the index of every line that, once stripped
# of surrounding whitespace, begins with one or more digits (the sonnet number).
# The pattern is a raw string so that `\d` reaches the regex engine untouched;
# in a plain string literal it is an invalid escape sequence, which newer
# Python versions warn about.
p = re.compile(r'\d+')
start_indexes = []
for i, line in enumerate(shakespeare):
    if p.match(line.strip()):
        start_indexes.append(i)

In [5]:
# Parse each sonnet into one word sequence per line.

# Translation table that deletes the punctuation we do not want to keep.
# Apostrophes and hyphens are deliberately left alone.
punctuation_remover = str.maketrans('', '', ',:";.()!?')

all_words = set()
all_sequences = []
for start_ind in start_indexes:
    # The sonnet body is taken to be the 14 lines following the number line
    # (the number line itself, at start_ind, is skipped)
    first_line = start_ind + 1
    for raw_line in shakespeare[first_line:first_line + 14]:

        # Trim surrounding whitespace, drop punctuation, lowercase,
        # then break the line into words
        clean_words = raw_line.strip().translate(punctuation_remover).lower().split()

        ## Still undecided on whether to skip empty lines.
        ## If we skip them, the resulting number of lines is not evenly divisible by 14.
        # if len(clean_words) == 0 :
        #     # If the line is empty as in sonnet 126, continue from this loop early
        #     continue

        # Build the sequence for this line
        sequence = []
        for token in clean_words:
            # Words sometimes carry extra apostrophes at the front and/or end
            # (e.g. when Shakespeare is quoting something like 'I hate'), which
            # keeps them from matching the syllable dictionary. Only when the
            # word is not found as-is do we strip those apostrophes.
            word = token if token in syllable_words else token.strip("'")
            all_words.add(word)
            sequence.append(word)

        # A new-line "word" marks the end of the line.
        sequence.append('\n')
        # TODO: maybe we can have special end-of-line tokens to signify the line #,
        # or whether it is part of quatrain or couplet, or is the volta.

        all_sequences.append(sequence)

# XOR (i.e. all words that are in exactly one of the two sets)
# Should be the empty set
print("Words that are not in intersection with syllable dictionary:", syllable_words ^ all_words)

Words that are not in intersection with syllable dictionary: set()


In [6]:
# Word embeddings (assign every word an integer number)
# NOTE: all_words is a set, so the id each word receives follows the set's
# iteration order, which may differ between kernel sessions (string hash
# randomization). Iterate over sorted(all_words) if stable ids are needed.
word_dict = {}
for i, word in enumerate(all_words):
    word_dict[word] = i
# The end-of-line token gets the next free id. len(word_dict) is used instead
# of i+1 so this does not depend on the loop variable, which is undefined (or
# left over from an earlier cell) when all_words is empty.
word_dict['\n'] = len(word_dict)

In [7]:
# Convert every word sequence in all_sequences into its integer representation
# by looking each item up in word_dict
all_sonnet_int = [
    [word_dict[token] for token in sequence]
    for sequence in all_sequences
]

In [9]:
# Peek at the first ten integer-encoded sequences
all_sonnet_int[:10]

[[2843, 2133, 0, 7, 2400, 2369, 3205],
 [124, 355, 1837, 1853, 1621, 2109, 477, 3205],
 [239, 1219, 1757, 1329, 991, 1740, 114, 2918, 3205],
 [430, 777, 703, 1621, 1971, 430, 1882, 3205],
 [239, 2313, 1372, 2042, 1886, 129, 440, 2378, 3205],
 [2137, 1665, 674, 1697, 1780, 2275, 1679, 3205],
 [1139, 1111, 2449, 2660, 2751, 2254, 3205],
 [1665, 1048, 1665, 202, 2042, 1665, 95, 1048, 149, 984, 3205],
 [2313, 124, 2945, 2390, 1757, 130, 1996, 3081, 3205],
 [3038, 677, 2661, 2042, 1757, 750, 865, 3205]]

## Current Implementation

In [12]:
# Run the preprocessing through helper.py, passing the sonnet file and the
# syllable dictionary. The four returned values are unpacked into (and so
# overwrite) the same names used by the sandbox cells.
# NOTE(review): presumably these have the same structure as the sandbox versions — confirm in helper.py
all_words, all_sequences, word_dict, all_sonnet_int = helper.getAllWordsAndSequences("data/shakespeare.txt", "data/Syllable_dictionary.txt")

In [17]:
# Rebuild the first sequence as text: join every item except the last with
# spaces, then append a newline.
# NOTE(review): presumably the last item is the '\n' end-of-line token — confirm against helper.py
' '.join(all_sequences[0][:-1]) + "\n"

'from fairest creatures we desire increase\n'