# Regular Expression

| word          | description               | example | match  |
| ------------- |:-------------------------:|:-------:|-------:|
| \d            | any digit                 |         |
| \D            | any non-digit             |         |
| \s            | any whitespace character  |         |
| \S            | any non-whitespace character |      |
| \w            | any alphanumeric character or underscore | |
| \W            | any non-word character (not alphanumeric or underscore) | |
| \A            | start of string           |         |
| \Z            | end of string             |         |
| .             | any one character         | a.c     | aac, abc
| ^             | start of string (or line) | ^abc    | abca, abcdef
| \$            | end of string (or line)   | abc$    | defabc, ababc
| *             | repeat zero or more times | ab*     | a, ab, abb
| +             | repeat one or more times  | ab+     | ab, abb, abbb
| ?             | zero or one time          | ab?     | a, ab
| {m}           | repeat m times            | a{3}    | aaa
| {m,n}         | repeat from m to n        | a{2,4}  | aa, aaa, aaaa
| vertical line | alternation (either the left or the right pattern) | | 
| []            | character set (any one of the listed characters)   | |
| ()            | grouping                  | (abc)+  | abc, abcabc 

In [44]:
import re
import pandas as pd
from typing import Iterable, Sequence, List, Dict

## Methods

In [13]:
def re_composition(content: str, pattern: str) -> dict:
    """
    Apply the four basic ``re`` lookups for ``pattern`` to ``content``.

    The returned dict has one entry per lookup, keyed by the function name:

    - 'match': result of ``re.match`` (looks only at the beginning of the string).
    - 'search': result of ``re.search`` (looks at any position of the string).
    - 'findall': result of ``re.findall`` (all matches, returned as a list).
    - 'finditer': result of ``re.finditer`` (same matches, returned as an
      iterator, so it can be walked only once).
    """
    lookups = ('match', 'search', 'findall', 'finditer')
    # Each key doubles as the name of the ``re`` function that produces its value.
    return {name: getattr(re, name)(pattern, content) for name in lookups}

In [18]:
def re_compositions(contents: list, pattern: str) -> list:
    """
    Run ``re_composition`` on every item of ``contents`` with the same ``pattern``.

    Returns the per-item result dicts as a list, in input order.
    """
    return [re_composition(text, pattern) for text in contents]

In [19]:
def match_or_search_result(match_or_search_result) -> dict:
    """
    Summarize the result of a match/search lookup as a plain dict.

    The dict always carries the keys 'object', 'start', 'end', 'span' and
    'group'. When the argument is falsy (e.g. ``None`` for "no match"),
    every value is ``None``; otherwise the values come from the object
    itself and its ``start()``, ``end()``, ``span()`` and ``group()`` methods.
    """
    fields = ('object', 'start', 'end', 'span', 'group')
    # Inside the function the name refers to the argument, not the function.
    found = match_or_search_result
    if not found:
        return dict.fromkeys(fields)
    values = (found, found.start(), found.end(), found.span(), found.group())
    return dict(zip(fields, values))

## Module arguments

In [4]:
# Sample input string for the regex helpers.
content = 'nchikun@gmail.com'

In [29]:
# Regex pattern applied to the email column in the Output section.
pattern = 'gmail'

In [30]:
# Sample records, one tuple per user, in the column order listed in `columns`.
df_contents = pd.DataFrame(
    data=[
        (
            '001',
            'donkonchi@gmail.com',
            'http://www.other.com/path1/index.php?k1=v1&k2=v2#Ref1',
            'http://www.example.com/video/detail?id=001',
        ),
        (
            '002',
            'nchikun@yahoo.co.jp',
            'http://www.othre.net/path1/index.php?k1=v1&k2=v2#Ref1',
            'http://www.example.com/video#ref',
        ),
        (
            '003',
            'nchikun[at]gmail.com',
            'https://www.other.com/',
            'https://www.example.com./book/detail?id=002',
        ),
    ],
    columns=['user_id', 'email', 'referrer', 'url'],
)
df_contents.head()

Unnamed: 0,user_id,email,referrer,url
0,1,donkonchi@gmail.com,http://www.other.com/path1/index.php?k1=v1&k2=...,http://www.example.com/video/detail?id=001
1,2,nchikun@yahoo.co.jp,http://www.othre.net/path1/index.php?k1=v1&k2=...,http://www.example.com/video#ref
2,3,nchikun[at]gmail.com,https://www.other.com/,https://www.example.com./book/detail?id=002


## Output

In [37]:
# Copy of df_contents with one extra column: the start index of the first
# `pattern` hit in each email (None when the pattern is not found).
df_gmail_start = df_contents.assign(
    gmail_position=[
        match_or_search_result(found['search'])['start']
        for found in re_compositions(df_contents['email'], pattern)
    ]
)
df_gmail_start

Unnamed: 0,user_id,email,referrer,url,gmail_position
0,1,donkonchi@gmail.com,http://www.other.com/path1/index.php?k1=v1&k2=...,http://www.example.com/video/detail?id=001,10.0
1,2,nchikun@yahoo.co.jp,http://www.othre.net/path1/index.php?k1=v1&k2=...,http://www.example.com/video#ref,
2,3,nchikun[at]gmail.com,https://www.other.com/,https://www.example.com./book/detail?id=002,11.0


In [7]:
# match: print every summary field of the 'match' result, one per line
# NOTE(review): `comp` is not defined in the cells shown here — presumably a
# re_composition(...) result; confirm against the full notebook.
mr = match_or_search_result(comp['match'])
print(*(mr[field] for field in ('object', 'start', 'end', 'span', 'group')), sep='\n')

None
None
None
None
None


In [8]:
# search: print every summary field of the 'search' result, one per line
# NOTE(review): `comp` is not defined in the cells shown here — presumably a
# re_composition(...) result; confirm against the full notebook.
sr = match_or_search_result(comp['search'])
print(*(sr[field] for field in ('object', 'start', 'end', 'span', 'group')), sep='\n')

<re.Match object; span=(3, 4), match='i'>
3
4
(3, 4)
i


In [9]:
# findall: display the list stored under the 'findall' key
# NOTE(review): `comp` is not defined in the cells shown here — presumably a
# re_composition(...) result, in which case this is the re.findall output; confirm.
comp['findall']

['i', 'i']

In [10]:
# finditer: walk the stored iterator and print the summary fields of each item
# NOTE(review): `comp` is not defined in the cells shown here — presumably a
# re_composition(...) result; confirm against the full notebook.
for i in comp['finditer']:
    mr = match_or_search_result(i)
    print(*(mr[field] for field in ('object', 'start', 'end', 'span', 'group')), sep='\n')

<re.Match object; span=(3, 4), match='i'>
3
4
(3, 4)
i
<re.Match object; span=(11, 12), match='i'>
11
12
(11, 12)
i
