# Regex and Locale Codes

In [4]:
from pathlib import Path
import re

# List the name of every entry in the example directory.
# iterdir() yields entries in arbitrary order, so sort them to keep the
# listing stable from run to run.
for f in sorted(Path(r"example files").iterdir()):
    print(f.name)

blizzard_games_dir_de-DE.csv
blizzard_games_dir_en-us.csv
blizzard_games_dir_es-LA.csv
blizzard_games_dir_es-MX.csv
blizzard_games_dir_es_ES.csv
blizzard_games_dir_fr-ca.csv
blizzard_games_dir_fr_FR.csv
blizzard_games_dir_it.csv
blizzard_games_dir_ja.csv
blizzard_games_dir_pt_BR.csv


![regex_1.png](screenshots/regex_1.png)
![regex_2.png](screenshots/regex_2.png)
![regex_3.png](screenshots/regex_3.png)
![regex_4.png](screenshots/regex_4.png)
![regex_5.png](screenshots/regex_5.png)
![regex_6.png](screenshots/regex_6.png)
![regex_7.png](screenshots/regex_7.png)
![regex_8.png](screenshots/regex_8.png)
![regex_9.png](screenshots/regex_9.png)
![regex_10.png](screenshots/regex_10.png)

# Pattern 1
The pattern below uses the alternation (pipe) operator `|` to match either a 4-letter locale code such as `de-DE` (the part before the pipe) or a 2-letter locale code such as `it` (the part after the pipe). The `re.X` (`re.VERBOSE`) flag lets the regex be written across multiple lines with comments.

In [13]:
# Pattern 1: alternation between a 4-letter locale code (two letters, a
# separator, two letters) and a bare 2-letter code. Either form must sit
# directly after an underscore or dash and directly before a full stop.
pattern_1 = re.compile(r"""(?<=[_-])              # look behind for underscore or dash
                           [a-z]{2}[_-][a-zA-Z]{2} # match a 4-letter locale code
                           (?=\.)                  # look ahead for full stop
                           |                       # OR
                           (?<=[_-])               # look behind for underscore or dash
                           [a-z]{2}                # match a 2-letter locale code
                           (?=\.)                  # look ahead for full stop
                           """, re.X)

# iterdir() yields entries in arbitrary order; sort for reproducible output.
for f in sorted(Path(r"example files").iterdir()):
    # The pattern is already compiled, so call its own search() method.
    match = pattern_1.search(f.name)
    # Print the file name next to the locale code found in it, if any.
    print(f.name, match.group(0) if match else "no match")

blizzard_games_dir_de-DE.csv de-DE
blizzard_games_dir_en-us.csv en-us
blizzard_games_dir_es-LA.csv es-LA
blizzard_games_dir_es-MX.csv es-MX
blizzard_games_dir_es_ES.csv es_ES
blizzard_games_dir_fr-ca.csv fr-ca
blizzard_games_dir_fr_FR.csv fr_FR
blizzard_games_dir_it.csv it
blizzard_games_dir_ja.csv ja
blizzard_games_dir_pt_BR.csv pt_BR


# Pattern 2
The pattern below is composed a little more cleanly. It treats the first two letters of the locale code, together with the separator that follows them, as an optional group. This allows us to match both a 4-letter locale code and a 2-letter locale code with the same regex, without needing the pipe operator.

In [14]:
# Pattern 2: a single expression in which the leading "two letters plus
# separator" prefix is optional, so it covers both 4-letter and 2-letter
# locale codes.
# A plain greedy `?` is enough for the optional group: after the first two
# letters the next character is either a separator (prefix branch) or the
# full stop (no-prefix branch), so the two branches never compete.
pattern_2 = re.compile(r"""(?<=_)          # positive lookbehind: must directly follow an underscore
                           (?:             # beginning of our optional "first two letters"
                           [a-zA-Z]{2}[-_] # matches two letters before dash or underscore
                           )?              # optional: use the prefix if present, skip it if not
                           [a-zA-Z]{2}     # matches the last two letters of the locale code
                           (?=\.)          # positive lookahead for full stop
                           """, re.X)

# iterdir() yields entries in arbitrary order; sort for reproducible output.
for f in sorted(Path(r"example files").iterdir()):
    # The pattern is already compiled, so call its own search() method.
    match = pattern_2.search(f.name)
    # Print the file name next to the locale code found in it, if any.
    print(f.name, match.group(0) if match else "no match")

blizzard_games_dir_de-DE.csv de-DE
blizzard_games_dir_en-us.csv en-us
blizzard_games_dir_es-LA.csv es-LA
blizzard_games_dir_es-MX.csv es-MX
blizzard_games_dir_es_ES.csv es_ES
blizzard_games_dir_fr-ca.csv fr-ca
blizzard_games_dir_fr_FR.csv fr_FR
blizzard_games_dir_it.csv it
blizzard_games_dir_ja.csv ja
blizzard_games_dir_pt_BR.csv pt_BR
