In [5]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import re

In [9]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"


# 字符串对象方法

- count
- startswith/endswith
- join
- index
- find
- rfind
- replace
- strip/lstrip/rstrip
- split
- lower/upper
- ljust/rjust
...

## string.split()

In [2]:
# Sample comma-separated text; note the stray spaces before the last field.
val = "a,b,  guido"
val.split(sep=",")

['a', 'b', '  guido']

## string.strip()

In [3]:
# strip() with no arguments removes leading and trailing whitespace from each field.
pieces = list(map(str.strip, val.split(',')))
pieces

['a', 'b', 'guido']

## string + string

In [4]:
# Unpack the three fields, then concatenate them with `+`, naming the separator once.
first, second, third = pieces
sep = '::'
first + sep + second + sep + third

'a::b::guido'

## string.join()

In [5]:
'::'.join(pieces)  # note: this join() idiom is used very frequently.

'a::b::guido'

In [6]:
'guido' in val  # membership test: is the substring present in val?

True

## string.index()

In [8]:
val.index(',')  # position of the first occurrence; unlike find(), index() raises an exception when nothing is found.


1

## string.find()

In [9]:
val.find(':')  # find() returns -1 when the substring is not found.


-1

## string.count()

In [10]:
 val.count(',') # 计算某个字符出现次数


2

## string.replace()

In [11]:
# Substitute every ',' first with '::', then with the empty string (i.e. delete it).
for replacement in ('::', ''):
    print(val.replace(',', replacement))

a::b::  guido
ab  guido


## 正则表达式

In [7]:
text = "foo   bar\t baz  \tqux"
# Raw string: `\s` is not a valid Python string escape, so a plain '\s+'
# literal triggers an invalid-escape warning on modern Python versions.
# Splitting on runs of whitespace discards the irregular spaces/tabs and
# yields the fields as a list in one step.
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [15]:
# Compile the pattern once so the resulting object can be reused.
# Raw string: `\s` is not a valid Python string escape, so a plain '\s+'
# literal triggers an invalid-escape warning on modern Python versions.
regex = re.compile(r'\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [16]:
regex.findall(text)  # every substring of text that matches the compiled pattern


['   ', '\t ', '  \t']

In [17]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# The TLD part is `{2,}` rather than `{2,4}`: capping it at four letters makes
# the pattern silently truncate longer TLDs (for example
# "user@example.technology" would only match up to ".tech").
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}'
regex = re.compile(pattern, flags=re.IGNORECASE)  # case-insensitive matching
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [18]:
# search() scans the whole string and returns the first match object.
m = regex.search(text)
print(m)
# Recover the matched substring from the match's (start, end) span.
begin, stop = m.span()
text[begin:stop]

<re.Match object; span=(5, 20), match='dave@google.com'>


'dave@google.com'

In [19]:
print(regex.match(text))  # prints None: match() only matches a pattern occurring at the start of the string.


None


In [20]:
print(regex.sub('REDACTED', text))  # replace every match of the pattern with the given string


Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [21]:
# Parentheses split the pattern into three capture groups: user, domain, TLD.
# The TLD group is `{2,}` rather than `{2,4}` so that TLDs longer than four
# letters are captured whole instead of being truncated.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

# pandas中矢量化的字符串函数

## 函数汇总
- cat
- contains
- count
- endswith/startswith
- findall
- get
- join
- len
- lower/upper
- match
- pad
- center
- repeat
- replace
- slice
- split
- strip/lstrip/rstrip

## Series.str.contains

In [10]:
# Keep the raw mapping under its own name so `data` is only ever the Series
# (previously `data` was bound to a dict and then rebound to a Series).
emails_by_name = {'Dave': 'dave@google.com',
                  'Steve': 'steve@gmail.com',
                  'Rob': 'rob@gmail.com',
                  'Wes': np.nan}
data = Series(emails_by_name)
data
# Element-wise substring test; the missing entry stays NaN in the result.
data.str.contains('gmail')

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

## Series.str.findall

In [25]:
# str.findall returns, for each element, the list of all matches of the
# pattern; since the pattern has capture groups, each match is a tuple.
# Written as a raw string (like the other patterns in this notebook) so `\.`
# needs no double escaping; the TLD group is `{2,}` rather than `{2,4}` so
# that TLDs longer than four letters are not truncated.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,})'
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

## Series.str.match

In [26]:
# str.match yields a boolean per element telling whether the pattern matched.
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches  # differs from the original textbook: the result shows whether each key's value matched.

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [27]:
data.str[:5]  # apply the same slice to every string in the Series

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object