In [None]:
import pywikibot
import time
import re

In [None]:
# Handle on the French Wiktionary; process_words() builds every Page object
# from this module-level `site`.
site = pywikibot.Site('fr', 'wiktionary')
# Display the selected site as a quick sanity check.
print(site)

In [None]:
# Character (or run of consecutive characters) to be replaced.
# Exactly one `old_symbol` line should be left uncommented.
old_symbol = 'g' # regular Latin letter 'g'
#old_symbol = 'õ' # 'o' with tilde
#old_symbol = 'ẽ' # 'e' with tilde

# Character (or run of consecutive characters) that replaces old_symbol.
# Exactly one `new_symbol` line should be left uncommented.
new_symbol = 'ɡ' # voiced velar stop (U+0261)
#new_symbol = 'ɔ̃'
#new_symbol = 'ɛ̃'

In [None]:
# Titles of the pages to process, one entry per page.
words = [
    'rideau gonflable',
    'rideaux gonflables',
    'rigolard',
]

In [None]:
# Templates in which every positional (untagged) argument is handled as a
# pronunciation by replace_pronunciations_in_model().
MODELES_SIMPLES = [
    'fr-accord-cons', 'fr-accord-en', 'fr-accord-on', 'fr-accord-ot',
    'fr-accord-rég', 'fr-inv', 'fr-rég', 'fr-accord-mixte',
    'fr-accord-mf', 'fr-rég-al',
]

# Templates whose first positional argument is left untouched; only the
# following positional arguments are handled as pronunciations.
# NOTE(review): 'fr-accord-mf' is also listed in MODELES_SIMPLES, and the
# SIMPLES test runs first, so this entry never takes effect -- confirm which
# list it really belongs to.
MODELES_DOUBLES = [
    'fr-accord-al', 'fr-accord-er', 'fr-accord-eur', 'fr-accord-eux',
    'fr-accord-f', 'fr-accord-in', 'fr-accord-mf', 'fr-accord-mf-al',
]

# Templates reported as carrying no pronunciation; they are never rewritten.
MODELES_TO_IGNORE = ['fr-verbe-flexion']

# Every template name the model pass knows how to rewrite.
MODELES = MODELES_SIMPLES + MODELES_DOUBLES

In [None]:
# Symbols accepted by check_phonemes() when it scores a pronunciation.
ALLOWED_PHONEMES = [
    # Vowels (each nasal vowel is a base letter followed by the combining
    # tilde U+0303, i.e. two code points)
    'i','e','ɛ','a','ɑ','ɔ','o','u','y','ø','œ','ə','ɛ̃','ɑ̃','ɔ̃','œ̃',
    # Semi-consonants
    'j','w','ɥ',
    # Consonants
    'p','t','k','b','d','ɡ','f','s','ʃ','v','z','ʒ','l','ʁ','m','n','ɲ','h','ŋ',
    # Additional ones. They should not be there (most of them are user errors),
    # but if not taken into account, they lead to more words being discarded
    '(',')','-',' ','ˈ', '.',
    # Bare combining tilde. It is written as an escape because the previous
    # literal (backslash + space + tilde) was an invalid escape sequence that
    # produced a 3-character string, which can never equal a single character.
    '\u0303',
    # Left commented out in the original (not accepted):
    #'ɡ','ǝ', 'r', 'ã', 'ʏ', 'ẽ', 'õ',
    '‿', # undertie (linking mark)
]

In [None]:
def skip_word(symbol, page, word):
    """Tell whether a page must be left untouched for the given symbol.

    Only the symbol 'ẽ' can trigger a skip, and only when the page text
    contains the substring 'CA' or 'QC' (presumably Canadian / Quebec
    pronunciation markers -- confirm against real pages).

    Args:
        symbol: the symbol about to be replaced.
        page: text of the page (process_words passes ``page.text``).
        word: page title, used only in the printed message.

    Returns:
        True when the page should be skipped, False otherwise.
    """
    # Every symbol other than 'ẽ' is always processed.
    if symbol != 'ẽ':
        return False
    # No regional marker in the page: nothing suggests the symbol is intended.
    if 'CA' not in page and 'QC' not in page:
        return False
    print('skip_word: symbol %s probably good in %s' % (symbol, word))
    return True

In [None]:
def check_phonemes(string, verbose=False):
    """Return the percentage (0-100) of phonemes of `string` that are allowed.

    The string is first cut into phonemes: every combining mark (for example
    the tilde U+0303 of a nasal vowel) is attached to the character it
    follows, so that two-code-point entries of ALLOWED_PHONEMES can actually
    match. A phoneme is accepted when it is listed in ALLOWED_PHONEMES as a
    whole, or when each of its code points is listed individually.

    Args:
        string: the pronunciation to check.
        verbose: when True, print each rejected phoneme and the final score.

    Returns:
        int: percentage of accepted phonemes, truncated towards zero.
        An empty string yields 0 (previously a message string was returned,
        which made the return type inconsistent).
    """
    import unicodedata  # local import keeps this cell self-contained

    if len(string) == 0:
        if verbose:
            print('check_phonemes: received empty string !!!')
        return 0

    # Group each combining mark with the preceding base character.
    phonemes = []
    for char in string:
        if phonemes and unicodedata.combining(char):
            phonemes[-1] += char
        else:
            phonemes.append(char)

    nb_allowed = 0
    for phoneme in phonemes:
        if phoneme in ALLOWED_PHONEMES or all(char in ALLOWED_PHONEMES for char in phoneme):
            nb_allowed += 1
        elif verbose:
            # A phoneme may span several code points; list all of them.
            codes = '+'.join('%d' % ord(char) for char in phoneme)
            print('check_phonemes: in %s, found %s (ascii=%s)' % (string, phoneme, codes))

    pctg = int(nb_allowed / len(phonemes) * 100)
    if verbose:
        print('check_phonemes(%s) : %d%% of good phonemes' % (string, pctg))
    return pctg

In [None]:
def replace_pronunciations_in_ligne_de_forme(text, pronunciations, old_symbol, new_symbol, verbose=False):
    """Replace `old_symbol` inside the pronunciation templates found in `text`.

    Args:
        text: page text to rewrite.
        pronunciations: iterable of 3-item sequences (head, pronunciation,
            foot), as produced by ``re.findall`` with three groups; the three
            parts concatenated are the exact template found in `text`.
        old_symbol: character(s) to replace inside the pronunciation part.
        new_symbol: replacement character(s).
        verbose: when True, print every match that is examined.

    Returns:
        str: the text with every affected template rewritten.
    """
    nb_replacements = 0
    for pron in pronunciations:
        if verbose:
            print('found:', pron)
        head, old_pron, foot = pron[0], pron[1], pron[2]
        old_string = head + old_pron + foot
        # Only the middle part is rewritten; head and foot are kept verbatim.
        new_string = head + old_pron.replace(old_symbol, new_symbol) + foot
        if new_string != old_string:
            print('replacing : %s\nwith      : %s' % (old_string, new_string))
            text = text.replace(old_string, new_string)
            nb_replacements += 1
    # '%' (not ',') so that the counter is actually formatted into the message.
    print('nb_replacements in ldf:%d' % nb_replacements)
    return text

In [None]:
def replace_pronunciations_in_model(text, pronunciations, old_symbol, new_symbol, verbose=False):
    """Replace `old_symbol` in the pronunciation arguments of 'fr-*' templates.

    Each match is a template call split into (head, arguments, foot), e.g.
    ('{{fr-rég|', 'a|pron2=b', '}}'). The arguments are split on '|' and only
    those that carry a pronunciation are rewritten:

    * named arguments whose name is one of the pronunciation names below;
    * for templates in MODELES_SIMPLES, every positional argument;
    * for templates in MODELES_DOUBLES, every positional argument except the
      first one.

    Templates in MODELES_TO_IGNORE or unknown to MODELES are reported and
    skipped; the remaining matches are still processed.

    Args:
        text: page text to rewrite.
        pronunciations: iterable of 3-item sequences (head, arguments, foot).
        old_symbol: character(s) to replace.
        new_symbol: replacement character(s).
        verbose: when True, print detailed traces.

    Returns:
        str: the rewritten text (always a string, never None).
    """
    # Named arguments ("name=") whose value is a pronunciation.
    pron_prefixes = (
        'ps=', 'pp=', 'pms=', 'pmp=', 'pm=',
        'pfs=', 'pfp=', 'pf=', 'préfpron=',
        'pron=', 'préfps=', 'préfpp=',
        'pron2=', 'pron3=', 'pinv=', 'pronradp=',
    )

    nb_replacements = 0
    for pron in pronunciations:
        if verbose:
            print('found:', pron)
        old_string = pron[0] + pron[1] + pron[2]
        if verbose:
            print('old_string:%s ' % old_string)

        # extract modele from string like '{{fr-rég|'
        modele = pron[0][2:-1]
        if verbose:
            print('modele:%s' % modele)
        # `continue` (not `return`): one unsupported template must not discard
        # the work already done on the page nor make the function return None.
        if modele in MODELES_TO_IGNORE:
            print('modele with no pronunciation:%s !!!' % modele)
            continue
        if modele not in MODELES:
            print('!!! unknow modele:%s !!!' % modele)
            continue

        old_pron = pron[1]
        if verbose:
            print('old_pron:%s ' % old_pron)
        subfields = old_pron.split('|')
        if verbose:
            print(subfields)

        new_subfields = []
        first_untag_met = False
        for subfield in subfields:
            prefix = ''
            if verbose:
                print('subfield:%s' % subfield)
            if '=' in subfield:
                # Split on the first '=' only, so a value that itself contains
                # '=' is kept whole instead of being truncated.
                name, _, subfield = subfield.partition('=')
                prefix = name + '='
                if prefix in pron_prefixes:
                    check_phonemes(subfield, verbose)
                    subfield = subfield.replace(old_symbol, new_symbol)
            elif modele in MODELES_SIMPLES:
                check_phonemes(subfield, verbose)
                subfield = subfield.replace(old_symbol, new_symbol)
            elif modele in MODELES_DOUBLES:
                if not first_untag_met:
                    # The first positional argument is left untouched.
                    first_untag_met = True
                else:
                    check_phonemes(subfield, verbose)
                    subfield = subfield.replace(old_symbol, new_symbol)
            new_subfields.append(prefix + subfield)

        new_pron = '|'.join(new_subfields)
        if verbose:
            print('new_pron:', new_pron)

        if new_pron != old_pron:
            new_string = pron[0] + new_pron + pron[2]
            print('replacing: %s\nwith     : %s' % (old_string, new_string))
            text = text.replace(old_string, new_string)
            nb_replacements += 1
        else:
            print('no need to replace: %s' % (old_string))

    # '%' (not ',') so that the counter is actually formatted into the message.
    print('nb_replacements in model:%d' % nb_replacements)
    return text

In [None]:
def process_words(words, old_symbol, new_symbol, commit=False, verbose=False):
    """Replace `old_symbol` by `new_symbol` in the pronunciations of each page.

    For every title in `words` the page is loaded from the module-level
    `site`, its '{{pron|...|fr}}' templates are rewritten first, then its
    '{{fr...|...}}' templates. The page is saved only when `commit` is True.

    Args:
        words: iterable of page titles.
        old_symbol: character(s) to replace.
        new_symbol: replacement character(s).
        commit: False (default) is a dry run: `new_symbol` is wrapped in '*'
            so the changes stand out in the printed output, and nothing is
            saved. True saves each modified page, then waits 10 seconds.
        verbose: when True, print extra traces (also forwarded to helpers).

    Returns:
        None. Progress is reported with print().
    """
    # trick to better identify the new modifications
    if not commit and '*' not in new_symbol:
        new_symbol = '*' + new_symbol + '*'

    # Compiled once, from raw strings (the previous non-raw literals relied on
    # invalid escape sequences such as '\|').
    # Pronunciation within the "ligne de forme": {{pron|<pron>|fr}}
    ldf_regexp = re.compile(r'(\{\{pron\|)([^|}]+)(\|fr\}\})')
    # Second pronunciation, inside templates: {{fr<anything>|<arguments>}}
    model_regexp = re.compile(r'(\{\{fr.*?\|)([^}]+)(\}\})')

    for word in words:

        page = pywikibot.Page(site, word)
        if not page.exists():
            print('page %s does not exists' % word)
            continue
        print('page=%s' % word)
        old_text = page.text
        if verbose:
            print('#########################')

        if skip_word(old_symbol, old_text, word):
            print()
            continue

        # Look for the pronunciation within the "ligne de forme"
        pronunciations = ldf_regexp.findall(old_text)
        new_text = replace_pronunciations_in_ligne_de_forme(old_text, pronunciations,
                                                            old_symbol, new_symbol, verbose)

        # Look for the 2nd pronunciation
        pronunciations = model_regexp.findall(new_text)
        model_text = replace_pronunciations_in_model(new_text, pronunciations,
                                                     old_symbol, new_symbol, verbose)
        # Guard against a helper that gives back None: keep the text obtained
        # so far instead of assigning None to the page.
        if model_text is not None:
            new_text = model_text

        if new_text != old_text:
            # string to summarize the work done in the edit comment
            summary = "prononciation: remplacement de /"+ old_symbol + "/ par /"+ new_symbol +"/."
            print('summary: ',summary)

            page.text = new_text
            if commit:
                if '*' in new_symbol:
                    # Dry-run marker still present: refuse to save and stop.
                    print('* in new_symbol %s, probably an error' % new_symbol)
                    break
                print('committing')
                page.save(summary=summary, botflag=True, quiet=False)
                # sleep 10 seconds before jumping to the next word
                time.sleep(10)
        else:
            print('no modification')

        print('')

In [None]:
# Dry run: with commit=False nothing is saved and the replacements are shown
# wrapped in '*'. Switch commit to True to actually save the pages.
process_words(
    words,
    old_symbol,
    new_symbol,
    commit=False,
    verbose=False,
)