### Problem09: Basic Regex
##### Ref. https://docs.python.org/3.9/howto/regex.html

In [1]:
import re

In [64]:
# Sample chat log shared by the regex steps in this notebook.
# Each line has the form "<chat message>(<user ID>/<email>)"; the last entry
# deliberately has an empty user ID. The string starts and ends with a newline
# because of the triple-quoted layout.
text = """
hello world(a8KvK_41/support@company.co.kr)
hello friend(cskp_69/customer@company.co.kr)
go~~~~~~~(mW_6/cs@company.co.kr)
Greeting!(GKvht_93/customer@company.co.kr)
hello!!(4O_37254/provider@company.co.kr)
hell33!!(/provider@company.co.kr)
"""

##### Matching Character
> \d = [0-9]
> \D = [^0-9]
> \s = [ \t\n\r\f\v]
> \S = [^ \t\n\r\f\v]
> \w = [a-zA-Z0-9_]
> \W = [^a-zA-Z0-9_]

##### Step 1: Retrieve ID and email from text

In [65]:
# Match "(<id>/<email>)": a literal "(", one or more word characters (the ID),
# a "/", one or more characters that are neither "(" nor "/" (the email), and
# a closing ")". The pattern is a raw string so that the regex backslashes
# (\(, \w, \)) are not parsed as invalid Python string escapes.
user_extractor = re.compile(r"\([\w]+/[^(/]+\)")

for data in user_extractor.finditer(text):
    # Drop the surrounding parentheses, then split on the only "/" in the match.
    user_id, user_email = data.group()[1:-1].split("/")
    print(f"ID: {user_id}, email: {user_email}")

ID: a8KvK_41, email: support@company.co.kr
ID: cskp_69, email: customer@company.co.kr
ID: mW_6, email: cs@company.co.kr
ID: GKvht_93, email: customer@company.co.kr
ID: 4O_37254, email: provider@company.co.kr


##### Step 2: Retrieve the chat data by excluding some characters

In [66]:
# Match a run of characters that are not "(", ")" or a newline, followed by a
# literal "(" - i.e. the chat message that precedes each "(ID/email)" part.
# The pattern is a raw string so that "\(" is not parsed as an invalid Python
# string escape; the regex engine itself interprets "\n" as a newline.
chat_extractor = re.compile(r"[^()\n]+\(")

for data in chat_extractor.finditer(text):
    # The match ends with the "(" delimiter; strip it to keep only the message.
    chat = data.group()[:-1]
    print(f"chat: {chat}")

chat: hello world
chat: hello friend
chat: go~~~~~~~
chat: Greeting!
chat: hello!!
chat: hell33!!


##### Special Character
> \* : Causes the resulting RE to match 0 or more repetitions => {0, }
> \+ : Causes the resulting RE to match 1 or more repetitions => {1, }
> \? : Causes the resulting RE to match 0 or 1 repetitions => {0, 1}
> \^ : Matches the start of the string
> \$ : Matches the end of the string or just before the newline at the end of the string

##### Step 3: Use `*` instead of `+` to also match the log entries that have no user_id

In [67]:
# Same shape as the "+" version of this pattern, but the ID part uses "*"
# (zero or more word characters), so an entry like "(/email)" with an empty ID
# also matches. The pattern is a raw string so that the regex backslashes
# (\(, \w, \)) are not parsed as invalid Python string escapes.
user_extractor = re.compile(r"\([\w]*/[^(/]+\)")

for data in user_extractor.finditer(text):
    # Drop the surrounding parentheses, then split on the only "/" in the match;
    # user_id is the empty string when the entry has no ID.
    user_id, user_email = data.group()[1:-1].split("/")
    print(f"ID: {user_id}, email: {user_email}")

ID: a8KvK_41, email: support@company.co.kr
ID: cskp_69, email: customer@company.co.kr
ID: mW_6, email: cs@company.co.kr
ID: GKvht_93, email: customer@company.co.kr
ID: 4O_37254, email: provider@company.co.kr
ID: , email: provider@company.co.kr


##### Step 4: Use ^ to get first chat

In [68]:
# "^" anchors the pattern at the start of the string; "[^(]+" then consumes
# everything up to (but not including) the first "(".
first_chat_extractor = re.compile("^[^(]+")

# Search once and reuse the match object for both printed lines.
first_chat_match = first_chat_extractor.search(text)
print("Object :", first_chat_match)
# The match begins at index 0 and therefore includes the newline that `text`
# starts with; strip() removes it.
print("text :", first_chat_match.group().strip())

Object : <re.Match object; span=(0, 12), match='\nhello world'>
text : hello world


##### Step 5: Use $ to get last chat

In [69]:
# "[^\n]+" is a run of non-newline characters and "$" anchors it at the end of
# the string (or just before a trailing newline), so this picks the last line.
last_log_extractor = re.compile("[^\n]+$")

# Search once and reuse the match object for both printed lines.
last_log_match = last_log_extractor.search(text)
print("Object :", last_log_match)
print("text :", last_log_match.group().strip())

Object : <re.Match object; span=(207, 240), match='hell33!!(/provider@company.co.kr)'>
text : hell33!!(/provider@company.co.kr)


##### Step 6: Greedy Vs Non-greedy

In [81]:
name_list = "James/Mark/Alex/Pio"

# Greedy: ".+" takes as many characters as possible, so the match runs up to
# the LAST "/" in the string.
greedy_name_extractor = re.compile(".+/")
# Non-greedy: ".+?" takes as few characters as possible, so the match stops at
# the FIRST "/".
non_greedy_name_extractor = re.compile(".+?/")

# Print the greedy result first, then the non-greedy one.
for extractor in (greedy_name_extractor, non_greedy_name_extractor):
    print(extractor.search(name_list))

<re.Match object; span=(0, 16), match='James/Mark/Alex/'>
<re.Match object; span=(0, 6), match='James/'>


##### Step 7: Retrieve ID and email from text with group name

In [72]:
# Same "(<id>/<email>)" pattern, but the two parts are captured as named groups
# so they can be read directly instead of slicing and splitting the match:
#   user_id    - zero or more word characters (may be empty)
#   user_email - one or more characters that are neither "(" nor "/"
# The pattern is a raw string so that the regex backslashes (\(, \w, \)) are
# not parsed as invalid Python string escapes.
user_extractor_with_group = re.compile(
    r"\((?P<user_id>[\w]*)/(?P<user_email>[^(/]+)\)"
)

for data in user_extractor_with_group.finditer(text):
    user_id = data.group("user_id")
    user_email = data.group("user_email")
    print(f"ID: {user_id}, email: {user_email}")

ID: a8KvK_41, email: support@company.co.kr
ID: cskp_69, email: customer@company.co.kr
ID: mW_6, email: cs@company.co.kr
ID: GKvht_93, email: customer@company.co.kr
ID: 4O_37254, email: provider@company.co.kr
ID: , email: provider@company.co.kr


##### Step 8: Find the chats in which `~` is repeated 3 to 5 times
* ###### Limit the character repetitions

In [82]:
# Sample cheers with 6, 2, 3 and 5 tildes respectively.
cheering_chats = """
go2win~~~~~~!
go~~!
Fight~~~!
gogogo~~~~~!
"""
# One or more word characters, then between 3 and 5 "~" (the {3,5} bound),
# then "!". A cheer with 6 tildes does not match: "~" is not a word character,
# so the tilde run cannot be entered part-way through. The pattern is a raw
# string so that "\w" is not parsed as an invalid Python string escape.
extractor_with_limitation = re.compile(r"[\w]+~{3,5}!")

for data in extractor_with_limitation.finditer(cheering_chats):
    print(data.group())

Fight~~~!
gogogo~~~~~!
