* https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html

In [89]:
import re

import numpy as np
import pandas as pd

class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a *string* containing a Python expression
    (for example ``'df'`` or ``'df["info"].str.get_dummies("|")'``).  The
    expression text is used as the caption and its evaluated value is
    rendered underneath, so several objects can be shown side by side.

    NOTE: inside a notebook this class shadows IPython's built-in
    ``display`` function of the same name.

    SECURITY: the expressions are run through ``eval``.  Only pass strings
    written by the notebook author, never untrusted text.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Expression strings; they are evaluated lazily, at render time.
        self.args = args

    @staticmethod
    def _evaluate(expression):
        """Evaluate ``expression`` against the module-level namespace only."""
        # Passing globals() explicitly keeps method-local names from
        # shadowing the user's variables during evaluation.
        return eval(expression, globals())

    def _repr_html_(self):
        """Return one floating ``<div>`` per expression, joined by newlines.

        Values that expose ``_repr_html_`` are rendered with it; anything
        else (or a renderer returning ``None``) falls back to the escaped
        ``repr`` inside ``<pre>`` instead of raising ``AttributeError``.
        """
        from html import escape

        rendered = []
        for expression in self.args:
            value = self._evaluate(expression)
            to_html = getattr(value, '_repr_html_', None)
            body = to_html() if callable(to_html) else None
            if body is None:
                body = '<pre>{0}</pre>'.format(escape(repr(value), quote=False))
            # Escape the caption so expressions containing < or & stay valid HTML.
            caption = escape(expression, quote=False)
            rendered.append(self.template.format(caption, body))
        return '\n'.join(rendered)

    def __repr__(self):
        """Return ``expression`` + newline + ``repr(value)`` blocks, blank-line separated."""
        return '\n\n'.join(a + '\n' + repr(self._evaluate(a))
                           for a in self.args)

### STRING OPERATIONS

 <img src="img/string_operations.png" style="width:400px;float:left">

In [144]:
data = ['peter', 'Paul', 'MARY', 'gUIDO']

# The same capitalisation three ways: one string, a plain Python list,
# and the vectorised pandas ``.str`` accessor.
print(str.capitalize(data[0]))
print("======")
print(list(map(str.capitalize, data)))
print("======")
print(pd.Series(data).str.capitalize())

Peter
['Peter', 'Paul', 'Mary', 'Guido']
0    Peter
1     Paul
2     Mary
3    Guido
dtype: object


### REGULAR EXPRESSIONS

 <img src="img/string_operations_re.png" style="width:500px;float:left">

In [155]:
# Series of full names used by every regular-expression example below.
data = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])

#### MATCH 
Determine whether each string starts with a match of a regular expression; the result is a boolean Series.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.match.html

In [236]:
# flag the names of the form "<first name> Chapman" or "<first name> Jones"
# (str.match yields booleans, not the captured text)
print(data.str.match(pat=r'([A-Za-z]+) (?:Chapman|Jones)'))

0     True
1    False
2    False
3    False
4     True
5    False
dtype: bool


#### CONTAINS 
Return boolean Series or Index based on whether a given pattern or regex is contained within a string of a Series or Index.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html

In [282]:
# Does each name contain "li"?  First as a literal substring, then as a regex.
# `regex` is a parameter of str.contains, so it must sit inside that call;
# handing it to print() raises TypeError.
print(data.str.contains('li', regex=False))
print(data.str.contains('.+li', regex=True))

0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool
0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool


#### EXTRACT
Extract capture groups in the regex pattern as columns in a DataFrame.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.extract.html

In [213]:
# get first name
# 1) plain `re` on a single string
m = re.match(r'([A-Za-z]+) (?:Chapman|Jones)', 'Graham Chapman')
print(m.group(1))
print("======")
# 2) plain `re` in a Python loop over the Series
for s in data:
    match = re.match(r'([A-Za-z]+) (?:Chapman|Jones)', s)
    if match is not None:
        print(match.group(1))
print("======")
# 3) vectorised: the capture group becomes the result, NaN where nothing matched
data.str.extract(pat=r'([A-Za-z]+) (?:Chapman|Jones)', expand=False)

Graham
Graham
Terry


0    Graham
1       NaN
2       NaN
3       NaN
4     Terry
5       NaN
dtype: object

#### FINDALL
Find all occurrences of pattern or regular expression in the Series/Index.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.findall.html

In [226]:
# get first name
# single string -> list of captured first names
print(re.findall(r'([A-Za-z]+) (?:Chapman|Jones)', 'Graham Chapman'))
print("======")
# the same call applied to every name in a Python-level loop
print([re.findall(r'([A-Za-z]+) (?:Chapman|Jones)', full_name)
       for full_name in data])
print("======")
# vectorised equivalent: one list of captures per row
data.str.findall(pat=r'([A-Za-z]+) (?:Chapman|Jones)')

['Graham']
[['Graham'], [], [], [], ['Terry'], []]


0    [Graham]
1          []
2          []
3          []
4     [Terry]
5          []
dtype: object

#### REPLACE
Replace each occurrence of pattern/regex in the Series/Index.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.replace.html

In [231]:
# regex replacement: swap either surname for 'Salamo'
data.str.replace(pat=r'(Chapman|Jones)', repl='Salamo', regex=True)

0    Graham Salamo
1      John Cleese
2    Terry Gilliam
3        Eric Idle
4     Terry Salamo
5    Michael Palin
dtype: object

In [234]:
# literal (non-regex) replacement of every lowercase 'a'
data.str.replace(pat='a', repl='A', regex=False)

0    GrAhAm ChApmAn
1       John Cleese
2     Terry GilliAm
3         Eric Idle
4       Terry Jones
5     MichAel PAlin
dtype: object

#### COUNT
Count occurrences of pattern in each string of the Series/Index.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.count.html

In [285]:
# number of times the pattern occurs in each name
data.str.count(pat=r'(Chapman|Jones)')

0    1
1    0
2    0
3    0
4    1
5    0
dtype: int64

### MISCELLANEOUS METHODS

 <img src="img/string_operations_mm.png" style="width:500px;float:left">

In [67]:
# Plain-Python slicing of one string ...
print('Graham Chapman'[0:3])
print("======")
# ... and the vectorised equivalent on the `data` Series of names.
# `.str[0:3]` and `.str.slice(0, 3)` are two spellings of the same operation;
# assert it rather than leaving an expression whose result is discarded.
assert data.str[0:3].equals(data.str.slice(0, 3))
data.str.slice(0, 3)

Gra


0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [81]:
# Scalar example: the first whitespace-separated token of one string.
print('Graham Chapman'.split()[0])
print("======")
# Series example on `data`: the last token of every name.
# `.str[-1]` and `.str.get(-1)` are equivalent element accessors; assert it
# rather than leaving an expression whose result is discarded.
assert data.str.split().str[-1].equals(data.str.split().str.get(-1))
data.str.split().str.get(-1)

Graham


0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

### GET_DUMMIES

In [90]:
# Build the frame from `data`, the Series of names defined in the
# REGULAR EXPRESSIONS section; 'info' holds pipe-separated codes per person.
df = pd.DataFrame({'name': data,
                   'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D']})
# Side by side: the frame itself and its 'info' column expanded into one
# 0/1 indicator column per code.
display('df', 'df["info"].str.get_dummies("|")')

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


In [91]:
# summary statistics of the number of characters in each name
df["name"].str.len().describe()

count     6.000000
mean     11.833333
std       1.834848
min       9.000000
25%      11.000000
50%      12.000000
75%      13.000000
max      14.000000
Name: name, dtype: float64

In [92]:
# get the longest name
# np.argmax returns a *position*, so look it up with .iloc; plain [] does a
# label lookup and only agrees with the position on a default 0..n-1 index.
df["name"].iloc[np.argmax(df["name"].str.len())]

'Graham Chapman'

In [97]:
# how many names contain "Terry" or "terry"
df["name"].str.contains(pat='[Tt]erry').sum()

2

In [131]:
name_list = ['John Cleese', 'Eric Idle', 'Terry Jones', 'Michael Palin']
# One boolean column per candidate name, matched case-insensitively.
# `flags` must be passed by keyword: the second positional parameter of
# str.contains is `case`, so a positional re.IGNORECASE is read as a truthy
# `case` value and the match silently stays case-sensitive.
df_test = pd.DataFrame({name: df["name"].str.contains(name, flags=re.IGNORECASE)
                        for name in name_list})
df_test

2