# 正規表達式 Regular Expression
## 透過一些範例來說明如何使用正規表達式

In [1]:
import re  #載入re模組

In [2]:
def RegexMatchingTest(regex, input_text, flags=0):
    """Search ``input_text`` for ``regex`` and print what was matched.

    Args:
        regex: Regular-expression pattern to compile.
        input_text: Text that is scanned with ``search()``.
        flags: Optional ``re`` flags forwarded to ``re.compile`` (for example
            ``re.IGNORECASE`` for case-insensitive matching). Defaults to 0,
            i.e. no flags.

    Returns:
        None. The outcome is printed: ``Matched: <text>`` on success, followed
        by one ``group(i)`` line for group 0 and for every capturing group when
        the pattern defines any; ``Not matched.`` otherwise.
    """
    # Compile the expression into a pattern object.
    pattern = re.compile(regex, flags)

    # search() reports the first match anywhere in the text. Other functions
    # such as match(), split(), findall() or sub() could be tried here instead.
    result = pattern.search(input_text)

    if result is None:
        print("Not matched.")
        return

    # group() without arguments is the whole matched text.
    print("Matched: %s" % (result.group()))

    # Iterate over pattern.groups (the number of capturing groups) rather than
    # result.lastindex: lastindex is only the index of the last group that
    # matched, so nested groups such as ((a)(b)) or optional groups that did
    # not participate would be silently left out of the listing.
    if pattern.groups:
        # group(0) is the whole match; group(1), group(2), ... are the captures.
        for i in range(pattern.groups + 1):
            print("  group(%d): %s" % (i, result.group(i)))

### 範例1:使用「\w」匹配字母、數字或底線，「\d」匹配數字，「\s」匹配空白字元

In [3]:
# \w matches one word character and \d one digit. The pattern is a raw string
# so the backslashes reach the regex engine untouched instead of being parsed
# as (invalid) Python string escapes.
test_string = "My plate number is XYZ-1234."
regex = r'My plate number is \w\w\w-\d\d\d\d'
RegexMatchingTest(regex, test_string)

Matched: My plate number is XYZ-1234


In [4]:
# \d matches one digit and \s one whitespace character. The pattern is a raw
# string so the backslashes are not parsed as (invalid) Python string escapes.
test_string = "My phone number is 0912-345 678."
regex = r'My phone number is \d\d\d\d-\d\d\d\s\d\d\d'
RegexMatchingTest(regex, test_string)

Matched: My phone number is 0912-345 678


In [5]:
# Use the {n} quantifier to shorten the repeated \d\d\d\d form.
# The pattern is a raw string so \d and \s are not parsed as (invalid) Python
# string escapes.
test_string = "My phone number is 0912-345 678."
regex = r'My phone number is \d{4}-\d{3}\s{1}\d{3}'
RegexMatchingTest(regex, test_string)

Matched: My phone number is 0912-345 678


In [6]:
# An even lazier form: "." stands for any single character.
regex = 'My phone number is .{4}-.{3}.{1}.{3}'
test_string = "My phone number is 0912-345 678."
RegexMatchingTest(regex=regex, input_text=test_string)

Matched: My phone number is 0912-345 678


### 範例2:使用[...]匹配在[ ]裡面所列出的字元

In [7]:
# Without a quantifier, [...] matches exactly one of the listed characters.
regex = 'I love [acdgnost]'
test_string = "I love dogs."
RegexMatchingTest(regex=regex, input_text=test_string)

Matched: I love d


In [8]:
# Same single-character class as before, tried against a different sentence.
regex = 'I love [acdgnost]'
test_string = "I love cats."
RegexMatchingTest(regex=regex, input_text=test_string)

Matched: I love c


In [9]:
# To match more than one character from the class, add a quantifier
# ("+", "*" or "?") after it.
regex = 'I love [acdgnost]+'
test_string = "I love dogs."
RegexMatchingTest(regex=regex, input_text=test_string)

Matched: I love dogs


In [10]:
# Expected to fail: the character right after "I love " is 'p', which is not
# listed in [acdgnost], so the class cannot match even once.
regex = 'I love [acdgnost]+'
test_string = "I love people."
RegexMatchingTest(regex=regex, input_text=test_string)

Not matched.


### 範例3:分組及捕捉

In [11]:
# (a|b) groups the alternatives and captures whichever one matched as group(1).
regex = 'I like (hiking|baseball) sport'
test_string = "I like baseball sport."
RegexMatchingTest(regex=regex, input_text=test_string)

Matched: I like baseball sport
  group(0): I like baseball sport
  group(1): baseball


In [12]:
# Here the first alternative of the group is the one that matches.
regex = 'I like (hiking|basketball) sport'
test_string = "I like hiking sport."
RegexMatchingTest(regex=regex, input_text=test_string)

Matched: I like hiking sport
  group(0): I like hiking sport
  group(1): hiking


### 範例4:使用跳脫符號「\」
當詮釋字元（metacharacter，例如「(」、「)」）要被視為一般字元時，就必須在它前面加上跳脫符號「\」

In [13]:
test_string = "Please call number (02)2882-5252."
# "\(" matches a literal "(" and "\)" a literal ")". The pattern is a raw
# string so these backslashes are not parsed as (invalid) Python string escapes.
regex = r'Please call number \([0-9]{2}\)[0-9]{4}-[0-9]{4}'
RegexMatchingTest(regex, test_string)

Matched: Please call number (02)2882-5252


### 範例5:比對中文字

In [14]:
# Character class covering U+4E00 through U+9FA5, the code point range this
# example uses for Chinese characters.
regex = '[\u4e00-\u9fa5]+'
# A sentence that mixes Chinese and English text.
test_string = "Here are 中文字 and English"
RegexMatchingTest(regex=regex, input_text=test_string)

Matched: 中文字
