In [2]:
import pandas as pd
import numpy as np

# SPLIT

In [3]:
# Sample Series of underscore-delimited strings, including one missing value.
s2 = pd.Series(
    data=[
        'a_b_c',
        'c_d_e',
        np.nan,
        'f_g_h',
    ]
)
s2

0    a_b_c
1    c_d_e
2      NaN
3    f_g_h
dtype: object

In [6]:
# Split every string on '_' into a list of parts; the missing value stays NaN.
s2.str.split(pat='_')

0    [a, b, c]
1    [c, d, e]
2          NaN
3    [f, g, h]
dtype: object

In [11]:
# Same split, but spread the parts over separate columns instead of lists.
s2.str.split(pat='_', expand=True)

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [10]:
# Limit to a single split counted from the left: 'a_b_c' -> 'a', 'b_c'.
s2.str.split(pat='_', n=1, expand=True)

Unnamed: 0,0,1
0,a,b_c
1,c,d_e
2,,
3,f,g_h


In [9]:
# Limit to a single split counted from the right: 'a_b_c' -> 'a_b', 'c'.
s2.str.rsplit(pat='_', n=1, expand=True)

Unnamed: 0,0,1
0,a_b,c
1,c_d,e
2,,
3,f_g,h


# REPLACE 

In [14]:
# Mixed-case sample strings plus an empty string and a missing value.
s3 = pd.Series(
    data=[
        'A', 'B', 'C',
        'Aaba', 'Baca',
        '', np.nan,
        'CABA', 'dog', 'cat',
    ]
)
s3

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6     NaN
7    CABA
8     dog
9     cat
dtype: object

In [15]:
# Case-insensitively replace the first two characters with 'XX' whenever the
# second character is an "a" (e.g. 'Aaba' -> 'XXba', 'CABA' -> 'XXBA').
# '^.a' is a regular expression, so `regex=True` is passed explicitly:
# pandas >= 2.0 treats `pat` as a literal string by default, which would
# leave every value unchanged here.
s3.str.replace(r'^.a', 'XX', case=False, regex=True)

0       A
1       B
2       C
3    XXba
4    XXca
5        
6     NaN
7    XXBA
8     dog
9     XXt
dtype: object

In [16]:
# Currency-like strings used to contrast regex and literal replacement.
dollars = pd.Series(
    data=[
        '12',
        '-$10',
        '$10,000',
    ]
)
dollars

0         12
1       -$10
2    $10,000
dtype: object

In [21]:
# Regex variant: drop the dollar sign that directly follows a minus sign
# ('-$10' -> '-10'). `$` is a regex metacharacter, so it is escaped; the raw
# string avoids Python's invalid "\$" escape warning, and `regex=True` is
# explicit because pandas >= 2.0 no longer interprets `pat` as a regex by default.
dollars.str.replace(r'-\$', '-', regex=True)

0         12
1        -10
2    $10,000
dtype: object

In [20]:
# Literal variant: with regex=False the '$' needs no escaping.
dollars.str.replace(pat='-$', repl='-', regex=False)

0         12
1        -10
2    $10,000
dtype: object

# Concatenation

In [66]:
# Single-character strings with one missing value.
cs = pd.Series(
    data=['a', 'b', 'c', 'd', np.nan, 'g']
)
# Join all elements into one comma-separated string; the missing value is skipped.
cs.str.cat(sep=',')

'a,b,c,d,g'

In [35]:
# No visible separator: elements are joined back to back ('abcdg').
cs.str.cat(sep='')

'abcdg'

In [39]:
# Keep the missing value in the result by substituting the text 'Nan' for it.
cs.str.cat(na_rep='Nan', sep=', ')

'a, b, c, d, Nan, g'

## Concatenating with another list-like, _provided that it matches the length_ of the calling Series

In [98]:
# Series whose index is deliberately out of order, used below to show how
# `str.cat` aligns on index labels rather than on position.
cs1 = pd.Series(['a', 'b', 'c', 'd', 'g'], index=[1, 3, 0, 2, 4])
cs1

1    a
3    b
0    c
2    d
4    g
dtype: object

In [92]:
# Space-prefixed capital letters ' A' .. ' E', wrapped in a Series with the
# default 0..4 index.
prefix = [' ' + letter for letter in 'ABCDE']
pre = pd.Series(data=prefix)
pre

0     A
1     B
2     C
3     D
4     E
dtype: object

In [99]:
# Concatenate element-wise with `pre`, matching rows by index label and
# keeping the caller's index order (label 1 pairs 'a' with ' B').
cs1.str.cat(others=pre, join="left")

1    a B
3    b D
0    c A
2    d C
4    g E
dtype: object

In [101]:
# Same label-aligned concatenation, then ordered by index label.
(
    cs1.str.cat(others=pre, join="left")
    .sort_index(axis=0)
)

0    c A
1    a B
2    d C
3    b D
4    g E
dtype: object

In [107]:
# Build an Index from the Series' underlying values (the labels are dropped).
pd.Index(data=cs1.values)

Index(['a', 'b', 'c', 'd', 'g'], dtype='object')

# Indexing with `.str`

In [109]:
# Sample strings of varying length, with one missing value.
sstr = pd.Series(
    data=[
        'A', 'B', 'C',
        'Aaba', 'Baca',
        np.nan,
        'CABA', 'dog', 'cat',
    ]
)
sstr

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [110]:
# First character of every string; the missing value stays NaN.
sstr.str.get(0)

0      A
1      B
2      C
3      A
4      B
5    NaN
6      C
7      d
8      c
dtype: object

In [115]:
# Up to the first three characters of every string.
sstr.str.slice(stop=3)

0      A
1      B
2      C
3    Aab
4    Bac
5    NaN
6    CAB
7    dog
8    cat
dtype: object

In [117]:
# Everything after the first character; one-character strings become ''.
sstr.str.slice(start=1)

0       
1       
2       
3    aba
4    aca
5    NaN
6    ABA
7     og
8     at
dtype: object

# Extracting substring

In [199]:
# Short letter+digit strings used for the extraction examples.
estr = pd.Series(
    data=[
        'a1',
        'zb34',
        'c53',
    ]
)
estr

0      a1
1    zb34
2     c53
dtype: object

In [207]:
# Capture the first letter that is immediately followed by a digit, and that
# digit, as two columns ('zb34' -> 'b', '3'). The pattern is a raw string so
# that `\d` reaches the regex engine without triggering Python's
# invalid-escape-sequence warning.
estr.str.extract(r'([a-zA-Z])(\d)', expand=False)

Unnamed: 0,0,1
0,a,1
1,b,3
2,c,5


In [201]:
# Named groups become the column names ('letter', 'digit'). A pattern with
# two groups already yields a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed; expand=True states that result type explicitly.
foo = estr.str.extract('(?P<letter>[a-zA-Z])(?P<digit>[0-9])', expand=True)
foo

Unnamed: 0,letter,digit
0,a,1
1,b,3
2,c,5


In [202]:
# Single named group with expand=True: the result is a one-column DataFrame
# whose column is named after the group ('letters').
foo["letter"].str.extract(pat="(?P<letters>[a-zA-Z])", expand=True)

Unnamed: 0,letters
0,a
1,b
2,c


In [203]:
# Single named group with expand=False: the result is a Series named after
# the group ('digits').
foo["digit"].str.extract(pat="(?P<digits>[0-9])", expand=False)

0    1
1    3
2    5
Name: digits, dtype: object

In [195]:
# Inspect the column dtypes of the extracted frame: the saved output reports
# `object` for both columns, i.e. the extracted digits are strings, not numbers.
foo.dtypes

letter    object
digit     object
dtype: object

# Pattern match

In [221]:
# Regex: one digit immediately followed by one lowercase letter.
pattern = r'[0-9][a-z]'
# Sample values to test against the pattern, including one missing value.
pts = pd.Series(
    data=['1', '2', '3a', '3b', np.nan, '03c']
)
pts

0      1
1      2
2     3a
3     3b
4    NaN
5    03c
dtype: object

In [228]:
# True where the pattern occurs anywhere in the string ('03c' matches via
# '3c'); the missing value is reported with the placeholder given in `na`.
pts.str.contains(pat=pattern, na="NO DATA")

0      False
1      False
2       True
3       True
4    NO DATA
5       True
dtype: object

In [216]:
# True only where the pattern matches at the start of the string.
# NOTE(review): the saved output for this cell has five rows, while `pts` as
# defined in this notebook has six values - re-run the cell to refresh it.
pts.str.match(pat=pattern)

0    False
1    False
2     True
3     True
4    False
dtype: bool