In [10]:
import pandas as pd
import os
from pprint import pprint

In [11]:

working_dir = os.path.join(os.path.abspath("."))
accents_vocab_file = os.path.join(working_dir, 'accent_vocab.csv')


In [12]:
accents_vocab = pd.read_csv(accents_vocab_file, sep='\t', header=0)

In [13]:
accents_vocab

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode
0,abate,abate,abate,a-bà-te,abàte,ok
1,abbandonato,abbandonato,abbandonato,ab-ban-do-nà-to,abbandonàto,ok
2,abbandono,abbandono,abbandono,ab-ban-dó-no,abbandóno,ok
3,abbarbaglio,abbarbaglio,abbarbaglio,ab-bar-bà-glio,abbarbàglio,ok
4,abbondante,abbondante,abbondante,ab-bon-dàn-te,abbondànte,ok
...,...,...,...,...,...,...
12793,zenone,not_searched,not_founded,ze-no-ne,no_toned_word,no_tone_pyphen_split
12794,èe,not_searched,not_founded,èe,no_toned_word,no_tone_pyphen_split
12795,èli,not_searched,not_founded,è-li,no_toned_word,no_tone_pyphen_split
12796,èn,not_searched,not_founded,èn,no_toned_word,no_tone_pyphen_split


#### Possible mode
* **ok**: t word found in vocabulary, with tone and syllables
* **toned_lemma_ok**: the lemma of the word found in vocabulary, with tone and syllables
* **toned_found**: no lemma found, try to search the word and get what vocabulary return with tone and syllables
* **no_tone_pyphen_split**: nothing found , syllables computed with pyphen module

In [20]:
modes = accents_vocab['mode'].unique().tolist()
pprint(modes)

['ok', 'toned_lemma_ok', 'toned_found', 'no_tone_pyphen_split']


In [31]:
m = 'ok'

for m in modes:
    print('{:4d} words with mode \'{}\''.format(len(accents_vocab[accents_vocab['mode'] == m]), m))

3818 words with mode 'ok'
4721 words with mode 'toned_lemma_ok'
 572 words with mode 'toned_found'
3687 words with mode 'no_tone_pyphen_split'


In [33]:
accents_vocab[accents_vocab['mode'] == 'ok']

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode
0,abate,abate,abate,a-bà-te,abàte,ok
1,abbandonato,abbandonato,abbandonato,ab-ban-do-nà-to,abbandonàto,ok
2,abbandono,abbandono,abbandono,ab-ban-dó-no,abbandóno,ok
3,abbarbaglio,abbarbaglio,abbarbaglio,ab-bar-bà-glio,abbarbàglio,ok
4,abbondante,abbondante,abbondante,ab-bon-dàn-te,abbondànte,ok
...,...,...,...,...,...,...
3813,zita,zita,zita,ẓì-ta,ẓìta,ok
3814,zodiaco,zodiaco,zodiaco,ẓo-dì-a-co,ẓodìaco,ok
3815,zona,zona,zona,ẓò-na,ẓòna,ok
3816,zucca,zucca,zucca,zùc-ca,zùcca,ok


In [34]:
accents_vocab[accents_vocab['mode'] == 'toned_lemma_ok']

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode
3818,abbagli,abbaglio,abbaglio,ab-bà-glio,abbàglio,toned_lemma_ok
3819,abbaglia,abbagliare,abbagliare,ab-ba-glià-re,abbagliàre,toned_lemma_ok
3820,abbagliato,abbagliare,abbagliare,ab-ba-glià-re,abbagliàre,toned_lemma_ok
3821,abbaia,abbaiare,abbaiare,ab-ba-ià-re,abbaiàre,toned_lemma_ok
3822,abbaiando,abbaiare,abbaiare,ab-ba-ià-re,abbaiàre,toned_lemma_ok
...,...,...,...,...,...,...
8534,vostre,vostro,vostro,vò-stro,vòstro,toned_lemma_ok
8535,vostri,vostro,vostro,vò-stro,vòstro,toned_lemma_ok
8536,voti,voto,voto,vó-to,vóto,toned_lemma_ok
8537,vuole,volere,volere,vo-lé-re,volére,toned_lemma_ok


In [35]:
accents_vocab[accents_vocab['mode'] == 'toned_found']

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode
8539,accaffi,accaffi,accaffare,ac-caf-fà-re,accaffàre,toned_found
8540,accarno,accarno,accarnare,ac-car-nà-re,accarnàre,toned_found
8541,acceffa,acceffa,acceffare,ac-cef-fà-re,acceffàre,toned_found
8542,accisma,accisma,accismare,ac-ci-ṣmà-re,acciṣmàre,toned_found
8543,accorse,accorse,accorrere,ac-cór-re-re,accórrere,toned_found
...,...,...,...,...,...,...
9106,zanche,zanche,zanca,zàn-ca,zànca,toned_found
9107,àncora,àncora,ancora,an-có-ra,ancóra,toned_found
9108,èi,èi,ei,éi,éi,toned_found
9109,ènne,ènne,enne,èn-ne,ènne,toned_found


In [36]:
accents_vocab[accents_vocab['mode'] == 'no_tone_pyphen_split']

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode
9111,a,not_searched,not_founded,a,no_toned_word,no_tone_pyphen_split
9112,ab,not_searched,not_founded,ab,no_toned_word,no_tone_pyphen_split
9113,abbella,not_searched,not_founded,ab-bel-la,no_toned_word,no_tone_pyphen_split
9114,abbo,not_searched,not_founded,ab-bo,no_toned_word,no_tone_pyphen_split
9115,abborra,not_searched,not_founded,ab-bor-ra,no_toned_word,no_tone_pyphen_split
...,...,...,...,...,...,...
12793,zenone,not_searched,not_founded,ze-no-ne,no_toned_word,no_tone_pyphen_split
12794,èe,not_searched,not_founded,èe,no_toned_word,no_tone_pyphen_split
12795,èli,not_searched,not_founded,è-li,no_toned_word,no_tone_pyphen_split
12796,èn,not_searched,not_founded,èn,no_toned_word,no_tone_pyphen_split


In [101]:
accents_vocab[accents_vocab['word'] == 'vai']

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode
8327,vai,vaio,vaio,và-io,vàio,toned_lemma_ok


In [102]:
toned_df = accents_vocab[accents_vocab['mode'] == 'ok']
words_list = list(toned_df['toned_word'])

In [103]:
import string

vowels = "ÄäÁÀàáAaËëÉÈèéEeÏïÍÌíìIiÖöÓÒóòOoÜüÚÙúùUu"

to_keep = string.ascii_letters + vowels
to_keep

to_del = set()

for l in ''.join(list(toned_df['toned_word'])):
    if l not in to_keep:
        to_del.add(l)
to_del

{'ġ', 'ɑ', 'ɔ', 'ɛ', 'ʃ', 'ˈ', 'ː', 'ṣ', 'ẓ'}

In [104]:
new_word_list = []

for i, row in toned_df.iterrows():
    w = row['toned_word']
    normal_word = row['word']
    new_word = w
    if 'ɑ' in w:
        new_word = w.replace('ɑ', 'à')  
    if 'ġ' in w:
        new_word = w.replace('ġ', 'g')  
    if 'ɔ' in w:
        print(normal_word, w)
    if 'ɛ' in w:
        print(normal_word, w)
    if 'ʃ' in w:
        print(normal_word, w)
    if 'ˈ' in w:
        print(normal_word, w)
    if 'ː' in w:
        new_word = w.replace('ː', '')  
    if 'ṣ' in w:
        new_word = w.replace('ṣ', 's')  
    if 'ẓ' in w:
        new_word = w.replace('ẓ', 'z')  
           
    new_word_list.append(new_word)

toned_df['new_word'] = new_word_list      


ahi ˈai
cortes kˈɔrtes
cortes kˈɔrtes
deh dɛ
fiche fiʃ
poche pɔʃ
poche pɔʃ
uhi ˈui


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toned_df['new_word'] = new_word_list


In [105]:

for i, row in toned_df.iterrows():
    w = row['word']
    new_word = row['new_word']
    if 'k' in new_word:
        print(w, new_word)

cortes kˈɔrtes
qu ku
quai ke


In [106]:
toned_df[toned_df['word'] == 'quai']

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode,new_word
2686,quai,quai,quai,ke,ke,ok,ke


In [107]:
toned_df.iloc[2686]['new_word'] = 'quài'

In [108]:
toned_df[toned_df['word'] == 'qu']

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode,new_word
2682,qu,qu,qu,ku,ku,ok,ku


In [109]:
toned_df.iloc[2682]['new_word'] = 'qù'

In [112]:
toned_df[toned_df['new_word'] == 'kˈɔrtes']

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode,new_word
802,cortes,cortes,cortes,kˈɔrtes,kˈɔrtes,ok,kˈɔrtes


In [113]:
toned_df.iloc[802]['new_word'] = 'cortès'

In [114]:
toned_df

Unnamed: 0,word,searched_word,found_word,syllables,toned_word,mode,new_word
0,abate,abate,abate,a-bà-te,abàte,ok,abàte
1,abbandonato,abbandonato,abbandonato,ab-ban-do-nà-to,abbandonàto,ok,abbandonàto
2,abbandono,abbandono,abbandono,ab-ban-dó-no,abbandóno,ok,abbandóno
3,abbarbaglio,abbarbaglio,abbarbaglio,ab-bar-bà-glio,abbarbàglio,ok,abbarbàglio
4,abbondante,abbondante,abbondante,ab-bon-dàn-te,abbondànte,ok,abbondànte
...,...,...,...,...,...,...,...
3813,zita,zita,zita,ẓì-ta,ẓìta,ok,zìta
3814,zodiaco,zodiaco,zodiaco,ẓo-dì-a-co,ẓodìaco,ok,zodìaco
3815,zona,zona,zona,ẓò-na,ẓòna,ok,zòna
3816,zucca,zucca,zucca,zùc-ca,zùcca,ok,zùcca


In [128]:
toned_vowels = "ÄäÁÀàáËëÉÈèéÏïÍÌíìÖöÓÒóòÜüÚÙúù"

toned_words = []
not_toned_words = []

for w, new_word in zip(list(toned_df['word']),list(toned_df['new_word'])):
    for c in new_word:
        if c in toned_vowels:
            toned_words.append(new_word)
            not_toned_words.append(w)
            break

In [129]:
print(len(not_toned_words))
print(len(toned_words))

3808
3808


In [133]:
import csv

toned_dataset_file = os.path.join(working_dir, 'toned_dataset_file.csv')

f = open(toned_dataset_file, 'w')

writer = csv.writer(f, delimiter='\t')
row = ['word', 'index']

writer.writerow(row)


for w, toned_w in zip(not_toned_words, toned_words):
    toned_index = 0
    for c in toned_w:
        if c in toned_vowels:
            toned_index = toned_w.index(c) + 1
            break
    row = [w, toned_index]
    print(row)
    writer.writerow(row)

f.close()



['abate', 3]
['abbandonato', 9]
['abbandono', 7]
['abbarbaglio', 7]
['abbondante', 7]
['abete', 3]
['abisso', 3]
['abito', 1]
['accedere', 4]
['acceso', 4]
['accetta', 4]
['accetto', 4]
['accidenti', 6]
['accidia', 4]
['accidioso', 7]
['acciò', 5]
['accline', 5]
['accoglienza', 8]
['accolta', 4]
['accolto', 4]
['accorgimento', 9]
['accorto', 4]
['accosto', 4]
['accusa', 4]
['acerbo', 3]
['aceto', 3]
['acqua', 1]
['acquisto', 5]
['acra', 1]
['acro', 1]
['acume', 3]
['acutamente', 7]
['acuto', 3]
['adamante', 5]
['addio', 4]
['addotto', 4]
['adesso', 3]
['adriano', 5]
['adulto', 3]
['adunque', 3]
['aere', 1]
['affanno', 4]
['affetto', 4]
['affettuoso', 8]
['affezione', 7]
['afflitto', 5]
['affocato', 6]
['affranto', 5]
['affricano', 7]
['agevole', 3]
['aggirata', 6]
['aggiunto', 5]
['agguato', 5]
['agnello', 4]
['agno', 1]
['ago', 1]
['agosto', 3]
['agra', 1]
['agricola', 4]
['agro', 1]
['agrume', 4]
['aguglia', 3]
['aguti', 3]
['aguto', 3]
['aguzzo', 3]
['ahimè', 5]
['ai', 1]
['aia', 1]

['ricolta', 4]
['ricolto', 4]
['riconoscenza', 9]
['riconoscere', 6]
['riconosciuto', 10]
['ricoperta', 6]
['ricordo', 4]
['ridente', 4]
['ridere', 2]
['ridire', 4]
['rifatto', 4]
['rifiuto', 5]
['riga', 2]
['rigagno', 4]
['rigido', 2]
['riguardare', 8]
['riguardo', 5]
['rima', 2]
['rimanente', 6]
['rimbalzo', 5]
['rimbombo', 5]
['rimembranza', 8]
['rimessa', 4]
['rimorso', 4]
['rimosso', 4]
['rimpalmare', 8]
['rimpetto', 5]
['rincalzo', 5]
['rinchiuso', 7]
['rintoppo', 5]
['rio', 2]
['ripa', 2]
['riparo', 4]
['ripieno', 5]
['riposato', 6]
['riposo', 4]
['riposto', 4]
['riprezzo', 5]
['risalire', 6]
['rischio', 2]
['risega', 4]
['risma', 2]
['riso', 2]
['rispetto', 5]
['risplendere', 6]
['rispondere', 5]
['risposta', 5]
['rissa', 2]
['ristoro', 5]
['ristretto', 6]
['ritegno', 4]
['ritenere', 6]
['ritorno', 4]
['ritrarre', 5]
['ritratto', 5]
['ritroso', 5]
['ritto', 2]
['riva', 2]
['rivedere', 6]
['rivera', 4]
['riverso', 4]
['rivestire', 7]
['riviera', 5]
['rivo', 2]
['rivolta', 4]
['r