In [1]:
import re

## German:

- country code +49
- area code 2 to 4 digits, e.g. 30 = Berlin, 4539 = some small place close to city 45
- 6-7 digits after the area code (subscriber number)
- space between area code and subs. number
- with country code, zero can be either omitted or be present in brackets

Possible context words:
- Telefon
- Telefonnummer
- Tel.Nr.
- Tel.
- Nummer
- Rufen

In [6]:
# German phone-number pattern, assembled from commented pieces.
#
# Accepted shapes (spaces are literal):
#   +49 30 901820        international prefix, trunk zero omitted
#   +49 (0)30 901820     international prefix, trunk zero in brackets
#   0049 30 901820       "00" style international prefix (bracketed zero allowed too)
#   030 901820 / 0 1522 3433333   national form with the trunk zero
#
# Digit-leading prefixes are guarded with (?<!\d) and the whole number with a
# trailing (?!\d) so that the rule never masks just a slice of a longer digit
# run (which would leave stray digits next to the placeholder).
rule_de = (
    r'(?:(?:\+49|(?<!\d)0049)[ ](?:\(0\))?|(?<!\d)0[ ]?)'  # country code or trunk zero
    r'\d{2,4}[ ]'                                           # area code, 2-4 digits, then a space
    r'\d{6,7}'                                              # subscriber number, 6-7 digits
    r'(?!\d)'                                               # number must end here
)

# Sample inputs covering every supported notation, bare and embedded in text.
test_set_de = ["+49 (0)30 4851923",
            "+49 30 901820", 
            "0 1522 3433333",
            "01522 3433333",
            "+49 (0)30 901820",
            " tf 030 901820 bitte",
            "+49 (0)30 901820",
            "+49 30 901820,",
            "0049 30 901820",
            "030 901820",
            "In German, follow the DIN 5008 format: +49 1522 343333. Each part is separated by a space. Without the area code, it’s 01522 343333."
           ]



In [7]:
def replace_number(line, rule=rule_de):
    """Return ``line`` with every match of ``rule`` replaced by ``<PHONE_NUMBER>``.

    Parameters
    ----------
    line : str
        Text that may contain phone numbers.
    rule : str
        Regular expression describing a phone number; defaults to the
        German rule.

    Returns
    -------
    str
        The text with each matching span substituted by the placeholder.
    """
    pattern = re.compile(rule)
    return pattern.sub("<PHONE_NUMBER>", line)

# Mask every German sample with the default rule; one result per output line.
for sample in test_set_de:
    masked = replace_number(sample)
    print(masked)

<PHONE_NUMBER>
<PHONE_NUMBER>
<PHONE_NUMBER>
<PHONE_NUMBER>
<PHONE_NUMBER>
 tf <PHONE_NUMBER> bitte
<PHONE_NUMBER>
<PHONE_NUMBER>,
<PHONE_NUMBER>
<PHONE_NUMBER>
In German, follow the DIN 5008 format: <PHONE_NUMBER>. Each part is separated by a space. Without the area code, it’s <PHONE_NUMBER>.


## French

- country code +33
- areacode 01-09, 00 rare but possible
- written in groups of two digits rather than groups of three or more


Possible context words
- appeler
- numéro, numero
- appelez
- téléphone
- tél, tel

In [8]:
# French phone-number pattern.
#
# Accepted shapes (spaces are literal):
#   +33 9 23 53 28 54    country code, single leading digit, four digit pairs
#   03 43 34 12 74       national form: 0 + one digit, four digit pairs
#
# The national form is guarded with (?<!\d) and the whole number with a
# trailing (?!\d) so the rule never masks only a slice of a longer digit run.
rule_fr = (
    r'(?:\+33[ ]\d|(?<!\d)0\d)'  # "+33 d" or national "0d"
    r'(?:[ ]\d{2}){4}'           # four space-separated pairs of digits
    r'(?!\d)'                    # number must end here
)

# Sample inputs: one national and one international notation.
test_set_fr = ["03 43 34 12 74",
              "+33 9 23 53 28 54"]

# Mask every French sample with the French rule; one result per output line.
for sample in test_set_fr:
    masked = replace_number(sample, rule=rule_fr)
    print(masked)

<PHONE_NUMBER>
<PHONE_NUMBER>


## Italian

- country code +39
- areacode 1-3 digits with 0 as the first number (NOT OMITTED WITH +39)
- subs. number 5-8 digits
- sometimes 00 in front of the number? <= calling inside the country?

Possible context words
- numero
- chiamare, chiamata, chiama
- telefono
- tel, Tel

In [20]:
# Samples taken from sites that publish official Italian phone numbers,
# like "City hall of Firenze".
test_set_it = ["tel. +39 06 678 4343",     # Some Roman hotel
                "+39 066 1969084",    # Roman limousine rental
                "+39 055 294883",     # Firenze musei
                "telefono (+39) 055-27681",     # City hall of Firenze
                "=>0039 041 5298711",    # Venice tourist office
                "+39-041-719078.",    # Venice boat rental office
                "041-719078",
                "041 5298711",
              ]

# Italian phone-number pattern.
#
# Optional country code (+39, 0039 or "(+39)") followed by a space or hyphen,
# then the area code (0 plus 1-2 digits), a space or hyphen, and the
# subscriber number either as one run of 5-8 digits or split as 3 + 2-5 digits.
#
# Every country-code form accepts either separator (the earlier pattern only
# allowed the hyphen after "+39"), and (?<!\d) / (?!\d) guards stop the rule
# from masking only a slice of a longer digit run.
rule_it = (
    r'(?:(?:\+39|(?<!\d)0039|\(\+39\))[ -])?'  # optional country code + separator
    r'(?<!\d)0\d{1,2}'                          # area code: 0 followed by 1-2 digits
    r'[ -]'                                     # separator
    r'(?:\d{5,8}|\d{3}[ -]\d{2,5})'             # subscriber number, whole or split in two
    r'(?!\d)'                                   # number must end here
)

# Mask every Italian sample with the Italian rule; one result per output line.
for sample in test_set_it:
    masked = replace_number(sample, rule=rule_it)
    print(masked)

tel. <PHONE_NUMBER>
<PHONE_NUMBER>
<PHONE_NUMBER>
telefono <PHONE_NUMBER>
=><PHONE_NUMBER>
<PHONE_NUMBER>.
<PHONE_NUMBER>
<PHONE_NUMBER>


## Spanish

- +34
- 9 numbers with exceptions
- area code 2-3 digits
- 9 for landlines and 6 for mobile, but there are other special prefixes as well
- According to wikipedia, area(3)-3-3 format is used but websites list area(3)-2-2-2 and area(2)-3-4 as well

Possible context words
- teléfono, telefono
- telf, teléf., tel., tlf, tfno
- numero
- llama (soittaa)


In [19]:
# Sample Spanish numbers in the notations seen on official websites.
test_set_es = ["910 342 342",
              "+34 924 349 129",
              "+34 915 29 82 10",      # Madrid City Hall
              "+34 932 85 38 32",      # Barcelona Cathedral Office
              "Tel: 932 853 834",      # Barcelona Placa de Catalunya tourist office
              "+34 93 378 8175", # Barcelona Airport Tourist office in terminal 1
              "0034 93 378 8175"]    #apparently this is also possible

# Spanish phone-number pattern.
#
# Optional country code (+34 or 0034) and a space, then a 2-3 digit area code
# followed by either "ddd ddd(d)" (3-3 / 3-4 grouping) or three pairs "dd dd dd".
#
# The country-code prefix is written once instead of per alternative, and
# (?<!\d) / (?!\d) guards stop the rule from masking only a slice of a longer
# digit run.
rule_es = (
    r'(?:(?:\+34|(?<!\d)0034)[ ])?'              # optional country code + space
    r'(?<!\d)\d{2,3}'                            # area code, 2-3 digits
    r'(?:[ ]\d{3}[ ]\d{3,4}|(?:[ ]\d{2}){3})'    # 3-3(4) grouping or three digit pairs
    r'(?!\d)'                                    # number must end here
)

# Mask every Spanish sample with the Spanish rule; one result per output line.
for sample in test_set_es:
    masked = replace_number(sample, rule=rule_es)
    print(masked)

<PHONE_NUMBER>
<PHONE_NUMBER>
<PHONE_NUMBER>
<PHONE_NUMBER>
Tel: <PHONE_NUMBER>
<PHONE_NUMBER>
<PHONE_NUMBER>
