In [None]:
import re
from re import findall

st1 = '209 Tryon Pl. Paramus 07652 NJ United States of America 201-555-6543'

# finding numbers
print(findall('209', st1))
print(findall('[0-9]', st1))
print(findall('[0-9]{3}', st1))
print(findall('[0-9]{3,}', st1))
print(findall('\\d{3,}', st1))
print('='*40)

# finding strings
print(findall('[a-z]{3}', st1))
print(findall('[a-z]{3,}', st1))
print(findall('[a-z|A-Z]{3,}', st1))
print(findall('\w{3,}', st1))
print('='*40)

# finding specific place in a string
st2 = 'test1abcABC 123mbc 45test'
# 접두어 접미어
print(findall('^test', st2))
print(findall('st$', st2))

# 종료문자 찾기
print(findall('.bc', st2))

# 시작문자 찾기
print(findall('t.', st2))

# finding words(\\w)
st3 = 'test^권구달 abc 대한*민국 USA$123123'

words = findall('\\w{3,}', st3)
print(words)

#문자열 제외 : x+(x가 1개 이상 반복) [^] 제외
print(findall('[^^*$]+', st3))

## 문자열 검사

In [32]:
from re import match

# when pattern matches
social = '891234-1912345'
result = match('[0-9]{6}-[1-4][0-9]{6}', social)
print(result) #address

if result :
    print('social number matches')
else:
    print('wrong social number')

# if pattern does not match
social = '891234-5912345'
result = match('[0-9]{6}-[1-4][0-9]{6}', social)
if result :
    print('social number matches')
else:
    print('wrong social number')

<re.Match object; span=(0, 14), match='891234-1912345'>
social number matches
wrong social number


## 문자환 치환 (Replacing)

In [37]:
from re import sub

st3 = 'test^권구달 abc 대한*민국 USA$123123'

# Deleting symbols
text1 = sub('[\^*$]', '.', st3)
print(text1)

# Deleting numbers
text2 = sub('[0-9]', '', text1)
print(text2)

test.권구달 abc 대한.민국 USA.123123
test.권구달 abc 대한.민국 USA.


## Text Processing

In [39]:
from re import split, match, compile

multi_line = """http://www.naver.com
http://www.daum.net
www.hongkildong.com"""

# 구분자를 이용하여 문자열 분리
web_site = split("\n", multi_line)
print(web_site)

# pattern object
pat = compile("http://")

# collect only right web address using pattern object
sel_site = [site for site in web_site if match(pat, site)]
print(sel_site)

['http://www.naver.com', 'http://www.daum.net', 'www.hongkildong.com']
['http://www.naver.com', 'http://www.daum.net']


## Natural Language Pre-Processing

In [53]:
from re import findall, sub

texts = ['우리나라 대한민국, 우리나라%$ 만세', '비아그&라 500g 정력 최고!', 'Im Korean', 'Insurance $15 for all', 'My name is Terminator']

# 1. changing letters into small cases
texts_re1 = [t.lower() for t in texts]
print('texts_re1: ', texts_re1)

# 2. removing numbers
texts_re2 = [sub("[0-9]", '', text) for text in texts_re1]
print('texts_re2: ', texts_re2)

# 3. removing punctuation marks
texts_re3 = [sub('[,.?!:;]', '', text) for text in texts_re2]
print('texts_re3: ', texts_re3)

# 4. removing symbols
spec_str = '[@#$%^&*()]'
texts_re4 = [sub(spec_str, '', text) for text in texts_re3]
print('texts_re4: ', texts_re4)

# 5. removing enlgish
texts_re5 = [''.join(findall("[^가-힣]", text)) for text in texts_re4]
print('texts_re5: ', texts_re5)

# 6. removing spaces
texts_re6 = [''.join(text.split()) for text in texts_re5]
print('texts_re6: ', texts_re6)


texts_re1:  ['우리나라 대한민국, 우리나라%$ 만세', '비아그&라 500g 정력 최고!', 'im korean', 'insurance $15 for all', 'my name is terminator']
texts_re2:  ['우리나라 대한민국, 우리나라%$ 만세', '비아그&라 g 정력 최고!', 'im korean', 'insurance $ for all', 'my name is terminator']
texts_re3:  ['우리나라 대한민국 우리나라%$ 만세', '비아그&라 g 정력 최고', 'im korean', 'insurance $ for all', 'my name is terminator']
texts_re4:  ['우리나라 대한민국 우리나라 만세', '비아그라 g 정력 최고', 'im korean', 'insurance  for all', 'my name is terminator']
texts_re5:  ['   ', ' g  ', 'im korean', 'insurance  for all', 'my name is terminator']
texts_re6:  ['', 'g', 'imkorean', 'insuranceforall', 'mynameisterminator']


## Pre-processing functions

In [55]:
from re import findall, sub

texts = ['우리나라 대한민국, 우리나라%$ 만세', '비아그&라 500g 정력 최고!', 'Im Korean', 'Insurance $15 for all', 'My name is Terminator']

def clean_text(text):
    text_re = text.lower()
    text_re1 = sub('[0-9]', '', text_re)
    text_re2 = sub('[,.?!;:]', '', text_re1)
    text_re3 = sub('[@#$%^&*()]', '', text_re2)
    text_re4 = sub('[a-z]', '', text_re3)
    text_re5 = ' '.join(text_re4.split())
    return text_re5

text_result = [clean_text(text) for text in texts]
print(text_result)

['우리나라 대한민국 우리나라 만세', '비아그라 정력 최고', '', '', '']
