In [3]:
import re


In [78]:


# Splits a single word into syllables.
def syllabify_word(text):
    """Return `text` with '#' inserted at the syllable boundaries.

    Runs the two phases in order: the initial splits first, then the final
    splits on their result.
    """
    after_initial_phase = _perform_initial_splits(text)
    return _perform_final_splits(after_initial_phase)

# Performs the first (easy and unambiguous) phase of syllabification.
def _perform_initial_splits(text):
    """Apply the first-phase splitters to `text`, one after the other.

    Order of application: multiple consonants, double consonants, dieresis,
    then hiatus. Each step receives the output of the previous one.
    """
    ordered_steps = (
        _split_multiple_cons,
        _split_double_cons,
        _split_dieresis,
        _split_hiatus,
    )
    partial = text
    for apply_step in ordered_steps:
        partial = apply_step(partial)
    return partial

# Performs the second (difficult and heuristic) phase of syllabification.
def _perform_final_splits(text):
    """Run the second phase: consonant/vowel boundaries, then vowel/vowel ones.

    Steps, in order:
      1. consonant+vowels followed by consonants+vowels (e.g. ca-ne);
      2. vowels followed by consonants+vowels (e.g. ae-reo);
      3. vowel next to vowel (e.g. a-iuola), after `_clump_diphthongs` has
         glued the vowel groups that must stay together with '§';
      4. removal of the '§' glue markers.
    """
    # The trailing class of the second group also accepts 'h'
    # (added for words such as 'richeggio').
    cons_vowel_pairs = r"""(?i)([bcdfglmnpqrstvz][ÄäÁÀàáAaËëÉÈèéEeÏïÍÌíìIiÖöÓÒóòOoÜüÚÙúùUu]+)([bcdfglmnpqrstvz]+[ÄäÁÀàáAaËëÉÈèéEeÏïÍÌíìIiÖöÓÒóòOoÜüÚÙúùUuHh]+)"""
    vowel_then_pair = r"""(?i)([ÄäÁÀàáAaËëÉÈèéEeÏïÍÌíìIiÖöÓÒóòOoÜüÚÙúùUu]+)([bcdfglmnpqrstvz]+[ÄäÁÀàáAaËëÉÈèéEeÏïÍÌíìIiÖöÓÒóòOoÜüÚÙúùUu]+)"""
    # Zero-width position between two adjacent vowels (plain, accented or dieresis).
    between_vowels = r"""(?i)(?<=[ÄäÁÀàáAaËëÉÈèéEeÏïÍÌíìIiÖöÓÒóòOoÜüÚÙúùUu])(?=[ÄäÁÀàáAaËëÉÈèéEeÏïÍÌíìIiÖöÓÒóòOoÜüÚÙúùUu])"""

    # Deterministic part: put '#' between the two captured groups.
    result = text
    for boundary_pattern in (cons_vowel_pairs, vowel_then_pair):
        result = re.sub(boundary_pattern, r"""\1#\2""", result)

    # Heuristic part: protect diphthongs/triphthongs, split what is left.
    result = re.sub(between_vowels, "#", _clump_diphthongs(result))

    # Drop the glue markers.
    return result.replace("§", "")

# Splits double consonants (eg. al-legro)
def _split_double_cons(text): # ok
    doubles = re.compile(r"""(?i)(([bcdfglmnpqrstvz])(?=\2)|c(?=q))""")
    return "#".join(doubles.sub(r"""\1@""", text).split("@"))

# Splits multiple consonants, except: impure s (sc, sg, etc.), mute followed by liquid (eg. tr), digrams and trigrams.
def _split_multiple_cons(text):
    impures = re.compile(r"""(?i)(s(?=[bcdfghlmnpqrtvz]))""")
    muteliquide = re.compile(r"""(?i)([bcdfgptv](?=[lr]))""")
    digrams = re.compile(r"""(?i)(g(?=l[iì])|g(?=n[aeiouàèéìòóù])|s(?=c[eèéiì])|[cg](?=h[eèéiì])|[cg](?=i[aàoòuù]))""")
    trigrams = re.compile(r"""(?i)(g(?=li[aàoòuù])|s(?=ci[aàoòuù]))""")
    multicons = re.compile(r"""(?i)([bcdfglmnpqrstvz](?=[bcdfglmnpqrstvz]+))""")

    # Preserve non admissibile splits.
    out ="§".join(impures.sub(r"""\1@""", text).split("@"))
    out = "§".join(muteliquide.sub(r"""\1@""", out).split("@"))
    out = "§".join(digrams.sub(r"""\1@""", out).split("@"))
    out = "§".join(trigrams.sub(r"""\1@""", out).split("@"))
    # Split everything else.
    out = "#".join(multicons.sub(r"""\1@""", out).split("@"))

    return "".join(re.split("§", out))

# Splits dieresis.
def _split_dieresis(text):
    dieresis = re.compile(r"""(?i)([ÄäËëÏïÖöÜü](?=[aeiou])|[aeiou](?=[ÄäËëÏïÖöÜü]))""")
    return "#".join(dieresis.sub(r"""\1@""", text).split("@"))

# Splits SURE hiatuses only. Ambiguous ones are heuristically considered diphthongs.
def _split_hiatus(text):
    # ho tolto cose... i - u caso 'più','guida'
    # e aggiunto ^
    # hiatus = re.compile(r"""(?i)([aeoàèòóé](?=[aeoàèòóé])|[rbd]i(?=[aeou])|tri(?=[aeou])|[ìù](?=[aeiou])|[aeiou](?=[ìù]))""")
    
    # ok
#    hiatus = re.compile(r"""(?i)([aeoàèòóé](?=[aeoàèòóé])|^[rbd]i(?=[aeou])|^tri(?=[aeou])|[ì](?=[aeo])|[aeo](?=[ì])|[ù](?=[aeo])|[aeo](?=[ù]))""")


    hiatus = re.compile(r"""(?i)([aeo](?=[aeo])|^[rbd]i(?=[aeou])|^tri(?=[aeou])|[ì](?=[aeo])|[aeo](?=[ì])|[ù](?=[aeo])|[aeo](?=[ù]))""")

    return "#".join(hiatus.sub(r"""\1@""", text).split("@"))

# Prevents splitting of diphthongs and triphthongs.
def _clump_diphthongs(text):
    diphthong = r"""(?i)(i[aeouàèéòóù]|u[aeioàèéìòó]|[aeouàèéòóù]i|[aeàèé]u)"""
    diphthongsep = r"""(\{.)(.\})"""
    # triphthong = r"""(?i)(i[àèé]i|u[àòó]i|iu[òó]|ui[èéà])""" #nostra
    triphthong = r"""(?i)(i[àèéòó]i|u[àèéòó]i|iu[òó]|ui[èéà])""" #nostra

    triphthongsep = r"""(\{.)(.)(.\})"""

    out = re.sub(triphthong, r"""{\1}""", text)
    out = re.sub(triphthongsep, r"""\1§\2§\3""", out)
    out = re.sub(diphthong, r"""{\1}""", out)
    out = re.sub(diphthongsep, r"""\1§\2""", out)
    out = re.sub(r"""[{}]""", "", out)

    return out

In [79]:
# Sample words for exercising syllabify_word; several appear both with and
# without an accent mark (e.g. 'guìda'/'guida', 'noi'/'nòi', 'sua'/'sùa').
words = ['litoràle', 'fèudo', 'diluìto', 'paùra', 'interstallàre', 'tèmpra', 
         'vòlto', 'gnòmo', 'sciàme', 'ingràto', 'esclùso', 'aiuòle', 'acqua', 'lasciàre',
        'accreditàre', 'inglèse', 'gliène', 'guìda', 'guida', 'amìca','piìssimo',
         'piuttòsto', 'dilüìre','diluìre', 'antiàcido','pènsièro','più', 'pìo', 
         'vìa', 'noi', 'nòi', 'lui', 'lùi', 'sua', 'sùa', 'io', 'ìo', 'troia', 'tròia', 'stùdio', 'quì', 'quèi']

In [80]:
# Print every sample word with '#' marking the boundaries found by syllabify_word.
for w in words:
    print(syllabify_word(w))

li#to#rà#le
fèu#do
di#luì#to
pa#ù#ra
in#ter#stal#là#re
tèm#pra
vòl#to
gnò#mo
scià#me
in#grà#to
e#sclù#so
a#iuò#le
ac#qua
la#scià#re
ac#cre#di#tà#re
in#glè#se
gliè#ne
guì#da
gui#da
a#mì#ca
pi#ìs#si#mo
piut#tò#sto
di#lü#ì#re
di#luì#re
an#tià#ci#do
pèn#siè#ro
più
pì#o
vì#a
noi
nòi
lui
lùi
sua
sù#a
io
ì#o
troi#a
tròi#a
stù#dio
quì
quèi


In [83]:
# Words still to check. Only the last uncommented assignment to `w` takes
# effect; the first one below is overwritten before the print.
w = 'quàle' # was the tone model's fault -> fixed

#w = 'parèa'

# Verse noted for checking (kept verbatim): che  m'avea  di  paura  il  cor  compunto

#w = 'quèi'

#w = 'tròia'
#w = 'vagliàre'
#w = 'suòi'
#w = 'mantoàna'
w = 'quèi'

print(syllabify_word(w))

quèi


In [82]:
# One verse as a list of lists: each inner list holds either the syllables of
# one word or a single marker token ('<word_sep>' / '<end_of_verso>').
l = [['nel'], ['<word_sep>'], ['mez', 'zo'], ['<word_sep>'], ['del'], ['<word_sep>'], ['cam', 'min'], ['<word_sep>'], ['di'], ['<word_sep>'], ['no', 'stra'], ['<word_sep>'], ['vi', 'ta'], ['<end_of_verso>']]

In [70]:
# Flatten the nested list `l` into one flat list of tokens, preserving order.
s = []
for e in l:
    s.extend(e)
s

['nel',
 '<word_sep>',
 'mez',
 'zo',
 '<word_sep>',
 'del',
 '<word_sep>',
 'cam',
 'min',
 '<word_sep>',
 'di',
 '<word_sep>',
 'no',
 'stra',
 '<word_sep>',
 'vi',
 'ta',
 '<end_of_verso>']

In [33]:
def is_diphtong(text):
    """Return True when `text` contains a vowel pair matching the diphthong pattern.

    The search is case-insensitive and looks anywhere in `text`.
    """
    vowel_pair = r"""(?i)(i[aeouàèéòóù]|u[aeioàèéìòó]|[aeouàèéòóù]i|[aeàèé]u)"""
    return re.search(vowel_pair, text) is not None

def is_hiatus(text):
    """Return True when `text` contains a spot matching the hiatus pattern.

    The search is case-insensitive; the 'ri'/'bi'/'di'/'tri' alternatives only
    match at the start of `text`.
    """
    certain_hiatus = r"""(?i)([aeoàèòóé](?=[aeoàèòóé])|^[rbd]i(?=[aeou])|^tri(?=[aeou])|[ì](?=[aeo])|[aeo](?=[ì])|[ù](?=[aeou])|[aeou](?=[ù]))"""
    return re.compile(certain_hiatus).search(text) is not None

In [75]:
# Probe: 'io' matches none of the is_hiatus alternatives (recorded result: False).
is_hiatus('io')

False

In [76]:
# Probe: 'ia' matches the "i + vowel" alternative of the diphthong pattern (recorded result: True).
is_diphtong('ia')

True