## Setup

In [1]:
# Imports

import hfst_dev as hfst
import graphviz
import random

import itertools
import random
from nltk.parse.generate import generate
from nltk.probability import FreqDist
from nltk import CFG
from nltk import grammar

In [2]:
# Stream English 

istream = hfst.HfstInputStream('English')
try:
    # Stop with a readable message if the stream is not usable.
    assert istream.is_good(), "HFST input stream 'English' is not readable"
    English = istream.read()
finally:
    # Close the stream even when the check or the read above fails.
    istream.close()

In [53]:
# Display the Python type of the object that istream.read() returned.
type(English)

libhfst_dev.HfstTransducer

In [3]:
# Functions to sample input and output

def sample_input(x,n=8,cycles=3):
    """Return `n` distinct strings drawn at random from the input side of `x`.

    x      -- object providing copy(), input_project(), minimize() and
              extract_paths(max_cycles=...); all work is done on x.copy().
    n      -- how many strings to draw.
    cycles -- passed to extract_paths() as max_cycles.

    Raises ValueError (from random.sample) if fewer than `n` distinct strings
    are extracted.
    """
    x2 = x.copy()
    x2.input_project()
    x2.minimize()
    # Honour the caller's cycle bound rather than a fixed value.
    paths = x2.extract_paths(max_cycles=cycles)
    # random.sample() needs a sequence (sets are rejected on Python 3.11+);
    # sorting also makes the draw repeatable under random.seed().
    return random.sample(sorted(paths.keys()), n)
def sample_output(x,n=8,cycles=3):
    """Return `n` distinct strings drawn at random from the output side of `x`.

    x      -- object providing copy(), output_project(), minimize() and
              extract_paths(max_cycles=...); all work is done on x.copy().
    n      -- how many strings to draw.
    cycles -- passed to extract_paths() as max_cycles.

    Raises ValueError (from random.sample) if fewer than `n` distinct strings
    are extracted.
    """
    x2 = x.copy()
    x2.output_project()
    x2.minimize()
    # Honour the caller's cycle bound rather than a fixed value.
    paths = x2.extract_paths(max_cycles=cycles)
    # random.sample() needs a sequence (sets are rejected on Python 3.11+);
    # sorting also makes the draw repeatable under random.seed().
    return random.sample(sorted(paths.keys()), n)

In [4]:
# Set up definitions from phoneclass.fst 

defs = {'English' : English}


def _define(name, expression):
    """Compile `expression` against the definitions made so far and register it.

    The compiled object is stored in `defs` under `name` and also returned, so
    the caller can bind it to a notebook variable of the same name.
    """
    compiled = hfst.regex(expression, definitions=defs)
    defs[name] = compiled
    return compiled


# One class per vowel quality, covering its three stress-marked variants (0, 1, 2).
VowAA = _define('VowAA', 'AA0 | AA1 | AA2')
VowAE = _define('VowAE', 'AE0 | AE1 | AE2')
VowAH = _define('VowAH', 'AH0 | AH1 | AH2')
VowAO = _define('VowAO', 'AO0 | AO1 | AO2')
VowAW = _define('VowAW', 'AW0 | AW1 | AW2')
VowAY = _define('VowAY', 'AY0 | AY1 | AY2')
VowEH = _define('VowEH', 'EH0 | EH1 | EH2')
VowER = _define('VowER', 'ER0 | ER1 | ER2')
VowEY = _define('VowEY', 'EY0 | EY1 | EY2')
VowIH = _define('VowIH', 'IH0 | IH1 | IH2')
VowIY = _define('VowIY', 'IY0 | IY1 | IY2')
VowOW = _define('VowOW', 'OW0 | OW1 | OW2')
VowOY = _define('VowOY', 'OY0 | OY1 | OY2')
VowUH = _define('VowUH', 'UH0 | UH1 | UH2')
VowUW = _define('VowUW', 'UW0 | UW1 | UW2')

# One class per stress mark, across all vowel qualities.
Vow0 = _define('Vow0', 'AH0| IH0| ER0| IY0| OW0| AA0| EH0| UW0| AE0| AO0| AY0| EY0| AW0| UH0| OY0')
Vow1 = _define('Vow1', 'EH1| AE1| AA1| IH1| IY1| EY1| OW1| AO1| AY1| AH1| UW1| ER1| AW1| UH1| OY1')
Vow2 = _define('Vow2', 'EH2| EY2| AE2| AY2| AA2| IH2| OW2| IY2| AO2| UW2| AH2| AW2| ER2| UH2| OY2')

# Any vowel, whatever its stress mark.
Vow = _define('Vow', 'Vow0 | Vow1 | Vow2')

Nas = _define('Nas', 'N | M | NG')

# The full phone inventory; consonants are defined below as everything in it
# that is not a vowel.
Phone = _define('Phone', 'AH0| N| S| L| T| R| K| D| IH0| M| Z| ER0| IY0| B| EH1| P| AE1| AA1| IH1| F| G| V| IY1| NG| HH| EY1| W| SH| OW1| OW0| AO1| AY1| AH1| UW1| JH| Y| CH| AA0| ER1| EH2| EY2| AE2| AY2| AA2| EH0| IH2| TH| AW1| OW2| UW0| IY2| AO2| AE0| UH1| AO0| AY0| UW2| AH2| EY0| OY1| AW2| DH| ZH| ER2| UH2| AW0| UH0| OY2| OY0')

Cons = _define('Cons', '[Phone - Vow]')

## Generating Stress Classes 

We only created the classes we could use (alternating unstressed/stressed patterns): for example, s0s0 wouldn't be helpful to us, so we didn't define it.

### One Syllable Words

#### Stressed

In [5]:
# Entries of English whose lower side is consonants, one Vow1 vowel, consonants;
# the upper side of those entries is registered as class "s1".
expr = '[English .o. [[ Cons* Vow1 Cons* ].l]].u'
defs["s1"] = n = hfst.regex(expr, definitions=defs)

#### Unstressed

In [6]:
# Same shape as "s1", but the single vowel must come from Vow0.
expr = '[English .o. [[ Cons* Vow0 Cons* ].l]].u'
defs["s0"] = n = hfst.regex(expr, definitions=defs)

### Two Syllable Words

#### Main stress first

In [7]:
# Two-vowel entries whose first vowel is Vow1; the second vowel is either
# Vow0 (class "s1s0") or Vow2 (class "s1s2").
expr = '[English .o. [[Cons* Vow1 Cons* Vow0 Cons*]].l].u'
defs["s1s0"] = n = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[Cons* Vow1 Cons* Vow2 Cons*]].l].u'
defs["s1s2"] = m = hfst.regex(expr, definitions=defs)

#### Main stress second 

In [8]:
# Two-vowel entries whose second vowel is Vow1; the first vowel is either
# Vow0 (class "s0s1") or Vow2 (class "s2s1").
expr = '[English .o. [[Cons* Vow0 Cons* Vow1 Cons*]].l].u'
defs["s0s1"] = n = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[Cons* Vow2 Cons* Vow1 Cons*]].l].u'
defs["s2s1"] = m = hfst.regex(expr, definitions=defs)

### Three Syllable Words

#### Stressed, unstressed, stressed

In [9]:
# Three-vowel classes whose first vowel is Vow1 or Vow2. Each class name spells
# out the stress digit of every vowel in order (e.g. "s1s0s2" = Vow1 Vow0 Vow2).
expr = '[English .o. [[ Cons* Vow1 Cons* Vow0 Cons* Vow1 Cons*]].l].u'
defs["s1s0s1"] = m = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow1 Cons* Vow0 Cons* Vow2 Cons*]].l].u'
defs["s1s0s2"] = n = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow1 Cons* Vow2 Cons* Vow1 Cons*]].l].u'
defs["s1s2s1"] = o = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow2 Cons* Vow0 Cons* Vow2 Cons*]].l].u'
defs["s2s0s2"] = p = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow2 Cons* Vow0 Cons* Vow1 Cons*]].l].u'
defs["s2s0s1"] = q = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow1 Cons* Vow2 Cons* Vow2 Cons*]].l].u'
defs["s1s2s2"] = r = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow2 Cons* Vow2 Cons* Vow1 Cons*]].l].u'
defs["s2s2s1"] = s = hfst.regex(expr, definitions=defs)

#### Unstressed, stressed, unstressed

In [10]:
# Three-vowel classes whose middle vowel is Vow1 or Vow2 and whose first vowel
# is Vow0 or Vow2. Each class name spells out the stress digit of every vowel.
expr = '[English .o. [[ Cons* Vow0 Cons* Vow1 Cons* Vow0 Cons*]].l].u'
defs["s0s1s0"] = n = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow0 Cons* Vow1 Cons* Vow2 Cons*]].l].u'
defs["s0s1s2"] = m = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow0 Cons* Vow2 Cons* Vow0 Cons*]].l].u'
defs["s0s2s0"] = o = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow2 Cons* Vow1 Cons* Vow0 Cons*]].l].u'
defs["s2s1s0"] = p = hfst.regex(expr, definitions=defs)

expr = '[English .o. [[ Cons* Vow2 Cons* Vow1 Cons* Vow2 Cons*]].l].u'
defs["s2s1s2"] = q = hfst.regex(expr, definitions=defs)

## Generating Rhyme Classes

In [20]:
# One rhyme class per vowel quality: entries of English whose lower side ends
# in that vowel (any stress mark) followed only by consonants. The class is
# registered in defs as "rhyme" + vowel name.
#
# Every VowXX class gets a rhyme class, including OY.
for vowel in ('AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'ER',
              'EY', 'IH', 'IY', 'OW', 'OY', 'UH', 'UW'):
    expr = '[English .o. [[ Phone* Vow' + vowel + ' Cons*]].l].u'
    m = hfst.regex(expr, definitions=defs)
    defs["rhyme" + vowel] = m

## Generating Iambic Pentameter

In [11]:
def sample(wordClasses : list, defs) -> tuple:
    """Pick a word class by frequency, then draw one word from that class.

    wordClasses -- non-empty LIST of [word class, frequency] lists; each word
                   class must be usable as a regex with `defs`. Frequencies
                   should add up to 1.
    defs        -- definitions passed to hfst.regex().

    Returns (word class, sample).

    Raises ValueError when `wordClasses` is empty.
    """
    if not wordClasses:
        raise ValueError("wordClasses must contain at least one [class, frequency] entry")

    r = random.random()
    # Floating-point normalisation can leave the frequencies summing to slightly
    # less than 1; in that case the walk below never goes negative, so default
    # to the last class instead of returning None.
    chosen = wordClasses[-1][0]
    for wordClass in wordClasses:
        r -= wordClass[1]
        if r < 0:
            chosen = wordClass[0]
            break
    return (chosen, sample_input(hfst.regex(chosen, definitions=defs), n=1)[0])

In [12]:
def classesToList(lst, wordClasses : dict):
    """Pair each class in `lst` with its frequency, rescaled to sum to 1.

    lst         -- word-class names, each a key of `wordClasses`.
    wordClasses -- DICTIONARY mapping word classes to frequencies.

    Returns a list of [word class, frequency] lists in the order of `lst`.
    """
    # Accumulate left to right so the total matches plain repeated addition.
    total = 0
    for name in lst:
        total += wordClasses[name]
    return [[name, wordClasses[name] / total] for name in lst]

In [13]:
def generate_iambs(wordClasses : dict, defs, numSyllables=10):
    """Build one line of alternating weak/strong syllables, word by word.

    wordClasses  -- DICTIONARY mapping word classes (defined in `defs`) to
                    frequencies. The frequencies of the classes allowed at each
                    position are rescaled by classesToList before sampling.
    defs         -- definitions passed on to sample().
    numSyllables -- total number of syllables in the line (default 10).

    Class names list the stress of each syllable: s1 (primary),
    s2 (secondary), s0 (unstressed). Even positions (0, 2, ...) are weak,
    odd positions are strong.

    Returns the list of sampled words.
    """
    # Candidate lists. Their order matters because sample() walks the list
    # cumulatively, so it is kept exactly as written.
    lineStart = ["s0", "s0s1", "s2s1", "s0s1s0", "s0s1s2", "s0s2s0", "s2s1s0", "s2s1s2"]
    weakStart = ["s0", "s0s1", "s0s1s0", "s0s1s2", "s0s2s0"]
    weakStartAfterPrimary = ["s2s1", "s2s1s0", "s2s1s2"]
    strongStart = ["s1", "s1s0", "s1s2", "s1s0s1", "s1s0s2", "s1s2s1", "s1s2s2"]
    strongStartAfterUnstressed = ["s2s0s2", "s2s0s1", "s2s2s1"]

    syllables = []
    words_out = []
    index = 0
    while index < numSyllables:
        if index == 0:
            candidates = lineStart
        elif index % 2 == 0:
            # Weak position: s2-initial words are only allowed after a primary stress.
            candidates = list(weakStart)
            if syllables[index - 1] == "s1":
                candidates.extend(weakStartAfterPrimary)
        else:
            # Strong position: s2-initial words are only allowed after an unstressed syllable.
            candidates = list(strongStart)
            if syllables[index - 1] == "s0":
                candidates.extend(strongStartAfterUnstressed)

        # Keep only words that still fit in the line. One "s" per syllable in a
        # class name, so count("s") is the word's syllable count.
        remaining = numSyllables - index
        preLst = [wc for wc in candidates if wc.count("s") <= remaining]

        lst = classesToList(preLst, wordClasses)
        wordClass, word = sample(lst, defs)
        for syl in wordClass.split("s"):
            if syl != "":
                syllables.append("s" + syl)
                index += 1
        words_out.append(word)

    return words_out

In [14]:
# Every stress class gets the same frequency, 1/18 (there are 18 classes).
wordClasses = dict.fromkeys(
    [
        "s0", "s1",
        "s0s1", "s1s0", "s2s1", "s1s2",
        "s0s1s0", "s0s1s2", "s0s2s0", "s2s1s0", "s2s1s2",
        "s1s0s1", "s1s0s2", "s1s2s1", "s1s2s2",
        "s2s0s2", "s2s0s1", "s2s2s1",
    ],
    1/18,
)

In [15]:
# Generate one line from the class frequencies in wordClasses; the returned
# list of words is displayed as the cell output.
generate_iambs(wordClasses, defs)

['ac|credit', 'denouement', 'ac|credit', 'co|os']

... put line through parser => grammatically sound even if not semantically 

Define a limited set of terminals and a grammar that checks GRAMMATICAL VALIDITY without semantic evaluation; can create a CFG that does this 


Could have each line be its own sentence, or could have a way to check line breaks (The big dogs// were on logs); have a line break before


Viable approach: creating a grammar that generates a sonnet nums=? meter=?

In [313]:

# keep this to show how we used hfst to generate rhymes and find words 

def sample_input(x,n=1,cycles=3):
    """Return `n` distinct strings drawn at random from the input side of `x`.

    This definition replaces any earlier `sample_input` in the notebook; here
    the default is a single string (n=1).

    x      -- object providing copy(), input_project(), minimize() and
              extract_paths(max_cycles=...); all work is done on x.copy().
    n      -- how many strings to draw.
    cycles -- passed to extract_paths() as max_cycles.

    Raises ValueError (from random.sample) if fewer than `n` distinct strings
    are extracted.
    """
    x2 = x.copy()
    x2.input_project()
    x2.minimize()
    # Honour the caller's cycle bound rather than a fixed value.
    paths = x2.extract_paths(max_cycles=cycles)
    # random.sample() needs a sequence (sets are rejected on Python 3.11+);
    # sorting also makes the draw repeatable under random.seed().
    return random.sample(sorted(paths.keys()), n)

# Compose the literal string "pronto" with English and keep the lower side of
# the result; sampling one string from it shows what English maps "pronto" to.
expr = '[{pronto} .o. English].l'
m = hfst.regex(expr, definitions=defs)
sample_input(m)

['PRAA1NTOW0']

In [217]:
def sample_input(x,n=10,cycles=3):
    """Return `n` distinct strings drawn at random from the input side of `x`.

    This definition replaces any earlier `sample_input` in the notebook; here
    the default is ten strings (n=10).

    x      -- object providing copy(), input_project(), minimize() and
              extract_paths(max_cycles=...); all work is done on x.copy().
    n      -- how many strings to draw.
    cycles -- passed to extract_paths() as max_cycles.

    Raises ValueError (from random.sample) if fewer than `n` distinct strings
    are extracted.
    """
    x2 = x.copy()
    x2.input_project()
    x2.minimize()
    # Honour the caller's cycle bound rather than a fixed value.
    paths = x2.extract_paths(max_cycles=cycles)
    # random.sample() needs a sequence (sets are rejected on Python 3.11+);
    # sorting also makes the draw repeatable under random.seed().
    return random.sample(sorted(paths.keys()), n)
    
# Entries of English whose lower side ends in a VowUW vowel followed only by
# consonants; the upper side is sampled and displayed as the cell output.
expr = '[English .o. [[ Phone* VowUW Cons*]].l].u'
m = hfst.regex(expr, definitions=defs)
sample_input(m)

['t|he|ro|ux',
 'la|wsuit',
 'lerew',
 'revie|wed',
 'bal|lo|u',
 'rilwanu',
 'verisimilitude',
 'marco|u',
 "tro|op's",
 'yazo|o']

In [248]:
# Word lists, one per vowel name.
# Presumably each list holds words whose final vowel is the one named by the
# variable (compare the rhyme classes) -- TODO confirm.
AA = [
    "snowfall", "sakura", "enlarge", "jumpstart", "unharmed",
    "remark", "wasp", "resolved", "jock", "charm",
]
AE = [
    "bask", "replant", "chasse", "hunchback", "woodland",
    "thrash", "catch", "ads", "fad", "mailbag",
]
AH = [
    "oration", "straightened", "walnut", "abhorrent", "credence",
    "megaton", "puma", "stuffs", "junction", "patients",
]
AO = [
    "troughs", "frauds", "meatballs", "imports", "lords",
    "hoar", "malformed", "billboard", "shorn", "thorns",
]
AW = [
    "bloodhound", "crowns", "blackout", "reroute", "loud",
    "hometown", "scowl", "countdown", "rouse", "mount",
]
AY = [
    "devise", "privatize", "bribe", "modernize", "coincide",
    "chimes", "deprived", "reunite", "apprise", "knifelike",
]
EH = [
    "doorsteps", "aspects", "flare", "sleepwear",
    "pens", "pastel", "bullpen", "pipette",
]
ER = [
    "rewired", "spindler", "harvesters", "thunders", "lowered",
    "gander", "prisoners", "trimmer", "scholar", "modern",
]
EY = [
    "prepaid", "gateways", "blockades", "replace", "cliched",
    "acclimate", "drain", "birthdays", "upscale", "sedate",
]
# NOTE(review): these entries look like EY rhymes (compare the EY list above)
# rather than IH rhymes -- confirm against the rhymeIH class.
IH = [
    "hallways", "parades", "dislocate", "hurricane", "escape",
    "downplay", "shortchange", "lace", "days",
]
IY = [
    "coyote", "squeaky", "delete", "cheek",
    "cream", "blackberry", "publicly", "blatantly",
]
OW = [
    "yolks", "chrome", "intone", "pronto", "sorrow",
    "disowned", "potatoes", "mole", "notes",
]
OY = [
    "decoy", "convoy", "noise", "annoy", "purloin",
    "steroid", "datapoint", "boy", "tabloids", "soy",
]
UH = [
    "wolves", "underwood", "scrapbooks", "cooked", "schedules",
    "cookbooks", "endure", "understood", "rook", "woods",
]
UW = [
    "balloons", "typhoons", "duped", "croon", "loon",
    "resume", "ingenue", "remove", "lawsuit", "troops",
]
