* https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html

In [1]:
import numpy as np
import pandas as pd
import re

class display(object):
    """Display several named expressions together in a notebook.

    Each positional argument is a string holding a Python expression
    (for example ``'df'``). When a representation is requested the
    expression is evaluated with ``eval`` and rendered together with its
    own source text.

    NOTE(review): ``eval`` executes whatever it is given; only pass
    hand-written, trusted expression strings.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Expression strings, kept in the order they should be shown.
        self.args = args

    @staticmethod
    def _label_html(expr):
        """Escape ``&``, ``<`` and ``>`` so the expression text shows literally."""
        from html import escape
        # quote=False: quotes are harmless inside <p> text and stay readable.
        return escape(expr, quote=False)

    @staticmethod
    def _value_html(value):
        """Return the value's HTML repr, or its escaped ``repr`` inside ``<pre>``.

        The fallback covers objects that have no ``_repr_html_`` method and
        renderers that return ``None``.
        """
        from html import escape
        renderer = getattr(value, '_repr_html_', None)
        rendered = renderer() if callable(renderer) else None
        if rendered is None:
            return '<pre>{0}</pre>'.format(escape(repr(value)))
        return rendered

    def _repr_html_(self):
        """Return one HTML fragment per expression, joined by newlines."""
        return '\n'.join(
            self.template.format(self._label_html(a), self._value_html(eval(a)))
            for a in self.args)

    def __repr__(self):
        """Return each expression followed by its ``repr``, blank-line separated."""
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

### STRING OPERATIONS

 <img src="https://github.com/lsalamo/python-training/blob/master/PythonDataScienceHandbook/3.%20Data%20Manipulation%20with%20Pandas/img/string_operations.png?raw=1" style="width:400px;float:left">

In [None]:
data = ['peter', 'Paul', 'MARY', 'gUIDO']

# plain Python on a single string
first_word = data[0]
print(first_word.capitalize())
print("======")
# plain Python on the whole list
print([word.capitalize() for word in data])
print("======")
# pandas: the same operation vectorized through the .str accessor
print(pd.Series(data).str.capitalize())

Peter
['Peter', 'Paul', 'Mary', 'Guido']
0    Peter
1     Paul
2     Mary
3    Guido
dtype: object


### REGULAR EXPRESSIONS

 <img src="https://github.com/lsalamo/python-training/blob/master/PythonDataScienceHandbook/3.%20Data%20Manipulation%20with%20Pandas/img/string_operations_re.png?raw=1" style="width:500px;float:left">

In [2]:
# sample names reused by the regular-expression cells below
data = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])

#### MATCH 
Determine whether each string starts with a match of a regular expression; returns a boolean Series.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.match.html

In [None]:
# flag the names that start with "<letters> Chapman" or "<letters> Jones"
surname_hits = data.str.match(r'([A-Za-z]+) (Chapman|Jones)')
print(surname_hits)

0     True
1    False
2    False
3    False
4     True
5    False
dtype: bool


#### CONTAINS 
Return boolean Series or Index based on whether a given pattern or regex is contained within a string of a Series or Index.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html

In [None]:
# literal substring search: does the text 'li' occur anywhere in the name?
# (printed explicitly, because a cell only displays its last expression)
print(data.str.contains('li', regex=False))
print("======")
# regex search: 'li' preceded by at least one character
data.str.contains('.+li', regex=True)

0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool

#### EXTRACT
Extract capture groups in the regex pattern as columns in a DataFrame.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.extract.html

In [None]:
# get first name
# 1) plain `re` on one string, with the surname captured as a second group
test = re.match(r'([A-Za-z]+) (Chapman|Jones)', 'Graham Chapman')
print(test.groups())
print(test.group(1))
print("======")
# 2) same match, but the surname alternation is non-capturing
test = re.match(r'([A-Za-z]+) (?:Chapman|Jones)', 'Graham Chapman')
print(test.group(1))
print("======")
# 3) plain `re` in a loop over every name; non-matching names are skipped
for s in data:
    match = re.match(r'([A-Za-z]+) (?:Chapman|Jones)', s)
    if match:
        print(match.group(1))
print("======")
# 4) pandas equivalent: one call, NaN where the pattern does not match
first_names = data.str.extract(r'([A-Za-z]+) (?:Chapman|Jones)', expand=False)
print(first_names)

('Graham', 'Chapman')
Graham
Graham
Graham
Terry
0    Graham
1       NaN
2       NaN
3       NaN
4     Terry
5       NaN
dtype: object


#### FINDALL
Find all occurrences of pattern or regular expression in the Series/Index.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.findall.html

In [None]:
# get first name
# plain `re` on a single string
single_result = re.findall(r'([A-Za-z]+) (?:Chapman|Jones)', 'Graham Chapman')
print(single_result)
print("======")
# plain `re` applied to every name in turn
per_name_results = [re.findall(r'([A-Za-z]+) (?:Chapman|Jones)', full_name)
                    for full_name in data]
print(per_name_results)
print("======")
# pandas equivalent: one list of matches per row
data.str.findall(r'([A-Za-z]+) (?:Chapman|Jones)')

['Graham']
[['Graham'], [], [], [], ['Terry'], []]


0    [Graham]
1          []
2          []
3          []
4     [Terry]
5          []
dtype: object

#### REPLACE
Replace each occurrence of pattern/regex in the Series/Index.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.replace.html

In [None]:
# regex replacement: either surname becomes 'Salamo'
data.str.replace(pat='(Chapman|Jones)', repl='Salamo', regex=True)

0    Graham Salamo
1      John Cleese
2    Terry Gilliam
3        Eric Idle
4     Terry Salamo
5    Michael Palin
dtype: object

In [None]:
# literal replacement: every lowercase 'a' becomes 'A'
data.str.replace(pat='a', repl='A', regex=False)

0    GrAhAm ChApmAn
1       John Cleese
2     Terry GilliAm
3         Eric Idle
4       Terry Jones
5     MichAel PAlin
dtype: object

#### COUNT
Count occurrences of pattern in each string of the Series/Index.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.count.html

In [None]:
# number of times either surname occurs in each name
data.str.count(pat='(Chapman|Jones)')

0    1
1    0
2    0
3    0
4    1
5    0
dtype: int64

### MISCELLANEOUS METHODS

 <img src="https://github.com/lsalamo/python-training/blob/master/PythonDataScienceHandbook/3.%20Data%20Manipulation%20with%20Pandas/img/string_operations_mm.png?raw=1" style="width:500px;float:left">

In [None]:
# plain Python slicing on a single string
print('Graham Chapman'[0:3])
print("======")
# vectorized slicing on the `data` Series; `.str[0:3]` and `.str.slice(0, 3)`
# are two spellings of the same operation, so check that they agree and
# display the result once
assert data.str[0:3].equals(data.str.slice(0, 3))
data.str.slice(0, 3)

Gra


0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [None]:
# plain Python: first word of a single string
print('Graham Chapman'.split()[0])
print("======")
# vectorized on the `data` Series: last word of every name; `.str[-1]` and
# `.str.get(-1)` are two spellings of the same lookup, so check that they
# agree and display the result once
assert data.str.split().str[-1].equals(data.str.split().str.get(-1))
data.str.split().str.get(-1)

Graham


0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

### GET_DUMMIES

In [None]:
# `data` is the Series of names defined in the regular-expressions section;
# 'info' holds pipe-separated codes that get_dummies expands into 0/1 columns
df = pd.DataFrame({'name': data,
                   'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D']})
display('df', 'df["info"].str.get_dummies("|")')

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


In [None]:
# summary statistics of the character length of each name
name_lengths = df['name'].str.len()
name_lengths.describe()

count     6.000000
mean     11.833333
std       1.834848
min       9.000000
25%      11.000000
50%      12.000000
75%      13.000000
max      14.000000
Name: name, dtype: float64

In [None]:
# get the longest name
# idxmax returns the index *label* of the first maximum, so the lookup stays
# correct even when the frame's index is not the default 0..n-1 range
name_lengths = df['name'].str.len()
df.loc[name_lengths.idxmax(), 'name']

'Graham Chapman'

In [None]:
# how many names contain "Terry" or "terry"
has_terry = df['name'].str.contains('[Tt]erry')
has_terry.sum()

2

### EXAMPLE > A SIMPLE RECIPE RECOMMENDER

In [None]:
# one boolean column per name: does each row of df['name'] contain it?
# `re` is already imported in the first cell of the notebook.
name_list = ['John Cleese', 'Eric Idle', 'Terry Jones', 'Michael Palin']
# re.IGNORECASE must be passed as `flags=`; given positionally it lands in the
# `case` parameter and the search stays case-sensitive.
df_test = pd.DataFrame({
    name: df['name'].str.contains(name, flags=re.IGNORECASE)
    for name in name_list
})
df_test

Unnamed: 0,John Cleese,Eric Idle,Terry Jones,Michael Palin
0,False,False,False,False
1,True,False,False,False
2,False,False,False,False
3,False,True,False,False
4,False,False,True,False
5,False,False,False,True


In [None]:
# toy recipe table: one row per recipe, ingredients as a comma-separated string
recipe_rows = [
    ['rice', 'salt, pepper, oregano, paprika, cumin'],
    ['fish', 'pepper, oregano, paprika'],
    ['pasta', 'pepper, paprika, cumin'],
    ['meat', 'salt, pepper, paprika, cumin'],
]
df_test = pd.DataFrame(
    data=np.array(recipe_rows),
    index=[0, 1, 2, 3],
    columns=['recipe', 'ingredients'],
)
df_test

Unnamed: 0,recipe,ingredients
0,rice,"salt, pepper, oregano, paprika, cumin"
1,fish,"pepper, oregano, paprika"
2,pasta,"pepper, paprika, cumin"
3,meat,"salt, pepper, paprika, cumin"


In [None]:
# one boolean column per spice: is it mentioned in each recipe's ingredients?
# `re` is already imported in the first cell of the notebook.
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley', 'paprika', 'cumin']
# re.IGNORECASE must be passed as `flags=`; given positionally it lands in the
# `case` parameter and the search stays case-sensitive.
df_test_spice = pd.DataFrame({
    spice: df_test['ingredients'].str.contains(spice, flags=re.IGNORECASE)
    for spice in spice_list
})
df_test_spice

Unnamed: 0,salt,pepper,oregano,sage,parsley,paprika,cumin
0,True,True,True,False,False,True,True
1,False,True,True,False,False,True,False
2,False,True,False,False,False,True,True
3,True,True,False,False,False,True,True


In [None]:
# rows whose salt, paprika and cumin flags are all True
selection = df_test_spice.query('salt & paprika & cumin')
# number of matching rows
selection.shape[0]

2

In [None]:
# recipe names for the rows kept in `selection`
df_test.loc[selection.index, 'recipe']

0    rice
3    meat
Name: recipe, dtype: object

# SERIES TO STRING

In [3]:
# concatenate every name in the Series into one dot-separated string
'.'.join(data.tolist())

'Graham Chapman.John Cleese.Terry Gilliam.Eric Idle.Terry Jones.Michael Palin'