# pandas string handling

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample data for the string-method demos below: one full name per element,
# indexed 0-5 by default.
monte = pd.Series(
    [
        'Graham Chapman',
        'John Cleese',
        'Terry Gilliam',
        'Eric Idle',
        'Terry Jones',
        'Michael Palin',
    ]
)

In [3]:
# Display the Series: six names under the default integer index.
monte

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
dtype: object

In [4]:
# .str applies the string method element-wise: convert every name to lowercase.
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [5]:
# Convert every name to uppercase.
monte.str.upper()

0    GRAHAM CHAPMAN
1       JOHN CLEESE
2     TERRY GILLIAM
3         ERIC IDLE
4       TERRY JONES
5     MICHAEL PALIN
dtype: object

In [6]:
# Uppercase only the first character of each string and lowercase the rest,
# so the surname loses its capital (e.g. 'Graham Chapman' -> 'Graham chapman').
monte.str.capitalize()

0    Graham chapman
1       John cleese
2     Terry gilliam
3         Eric idle
4       Terry jones
5     Michael palin
dtype: object

In [7]:
# Boolean mask: True where the name contains a lowercase 'a'
# (matching is case-sensitive by default).
monte.str.contains('a')

0     True
1    False
2     True
3    False
4    False
5     True
dtype: bool

In [12]:
# With no separator given, split each name on whitespace; every element
# becomes a list of words.
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [13]:
# Split on every 'a'; the separator itself is dropped, and names without an
# 'a' come back as a single-element list.
monte.str.split('a')

0    [Gr, h, m Ch, pm, n]
1           [John Cleese]
2        [Terry Gilli, m]
3             [Eric Idle]
4           [Terry Jones]
5       [Mich, el P, lin]
dtype: object

In [14]:
# Same idea with 'h' as the separator.
monte.str.split('h')

0    [Gra, am C, apman]
1        [Jo, n Cleese]
2       [Terry Gilliam]
3           [Eric Idle]
4         [Terry Jones]
5      [Mic, ael Palin]
dtype: object

In [15]:
monte.str.extract('([A-Zd-s]+)',expand=True)  # the pattern is a regular expression with one capture group

# [A-Zd-s] matches a single character that is an uppercase letter (A-Z) or a lowercase letter from d to s
# + repeats the class one or more times, so each row yields only the FIRST unbroken run of such
# characters (e.g. 'Graham Chapman' -> 'Gr', because 'a' falls outside d-s)
# expand=True returns the result as a DataFrame with one column per capture group

Unnamed: 0,0
0,Gr
1,John
2,Terr
3,Eri
4,Terr
5,Mi


In [16]:
monte.str.extract('([A-Zd-s]+)',expand=False)

# expand=False with a single capture group returns a Series instead of a one-column DataFrame

0      Gr
1    John
2    Terr
3     Eri
4    Terr
5      Mi
dtype: object

In [17]:
# Return a list of regex matches per element. The pattern is anchored (^...$)
# and requires the whole string to start with an uppercase vowel and end with
# a lowercase vowel; only 'Eric Idle' qualifies, the rest give empty lists.
monte.str.findall(r'^[AEIOU].*[aeiou]$')

0             []
1             []
2             []
3    [Eric Idle]
4             []
5             []
dtype: object

In [18]:
# Vectorized slicing through .str: keep the first two characters of each name.
monte.str[0:2]

0    Gr
1    Jo
2    Te
3    Er
4    Te
5    Mi
dtype: object

In [19]:
# First three characters of each name.
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [20]:
# First four characters of each name.
monte.str[0:4]

0    Grah
1    John
2    Terr
3    Eric
4    Terr
5    Mich
dtype: object

In [21]:
# Print the built-in documentation for .str.get_dummies, which splits each
# string on a separator ('|' by default) and returns indicator columns.
help(monte.str.get_dummies)

Help on method get_dummies in module pandas.core.strings:

get_dummies(sep='|') method of pandas.core.strings.StringMethods instance
    Split each string in the Series by sep and return a frame of
    dummy/indicator variables.
    
    Parameters
    ----------
    sep : string, default "|"
        String to split on.
    
    Returns
    -------
    dummies : DataFrame
    
    Examples
    --------
    >>> Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1
    
    >>> Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1
    
    See Also
    --------
    pandas.get_dummies

