In [1]:
import pandas as pd
import numpy as np

In [2]:
# Five mixed-case animal names to try the string methods on.
string_data = pd.Series(
    data=["Ant", "Bull", "Cat", "Dog", "Elephant"],
)

In [3]:
# Show the Series; the values are Python strings, so the dtype is reported as object.
string_data

0         Ant
1        Bull
2         Cat
3         Dog
4    Elephant
dtype: object

View everything as lower case

In [4]:
# Lower-case every element through the .str accessor.
string_data.str.lower()

0         ant
1        bull
2         cat
3         dog
4    elephant
dtype: object

View everything as upper case

In [5]:
# Upper-case every element through the .str accessor.
string_data.str.upper()

0         ANT
1        BULL
2         CAT
3         DOG
4    ELEPHANT
dtype: object

Let's swap case

In [6]:
# Flip the case of every character: upper becomes lower and vice versa.
string_data.str.swapcase()

0         aNT
1        bULL
2         cAT
3         dOG
4    eLEPHANT
dtype: object

#### Transformations are performed on copies of the Series
The original series is unaffected

In [7]:
# Re-display the original: the calls above returned new Series and left it untouched.
string_data

0         Ant
1        Bull
2         Cat
3         Dog
4    Elephant
dtype: object

In [8]:
# Keep the lower-cased result under its own name so later cells can use it.
string_data_lower = string_data.str.lower()

In [9]:
# Display the lower-cased copy.
string_data_lower

0         ant
1        bull
2         cat
3         dog
4    elephant
dtype: object

#### Capitalize first letter

In [10]:
# Starting from all-lower-case input, capitalize() upper-cases the first character of each element.
string_data_lower.str.capitalize()

0         Ant
1        Bull
2         Cat
3         Dog
4    Elephant
dtype: object

Calculate the length of each string

In [11]:
# Number of characters in each element (integer dtype here, since nothing is missing).
string_data.str.len()

0    3
1    4
2    3
3    3
4    8
dtype: int64

#### What if our data has characters like spaces, or missing values?

In [12]:
# Messier sample: inner spaces, padding, punctuation, digits and a missing value.
string_data = pd.Series(
    data=[
        "string one",        # inner space only
        " String-Two ",      # one space of padding on each side
        "  String:three  ",  # two spaces of padding on each side
        np.nan,              # missing value
        "#453",
        "number%57",
    ],
)

#### The output includes the spaces and special characters
The output is right-justified, so trailing white space is visible as a gap before the right edge

In [13]:
# Display the new data; the padding spaces are part of the stored strings.
string_data

0          string one
1         String-Two 
2      String:three  
3                 NaN
4                #453
5           number%57
dtype: object

In [14]:
# Only letters change; spaces, punctuation, digits and the NaN pass through unchanged.
string_data.str.upper()

0          STRING ONE
1         STRING-TWO 
2      STRING:THREE  
3                 NaN
4                #453
5           NUMBER%57
dtype: object

#### The capitalize() method 
This doesn't just convert the first letter to be uppercase - it also ensures the other letters are lowercase. 

If the first character is not a letter (such as the leading spaces in the rows with index 1 and 2), there is nothing to capitalize, so every letter in the string ends up lowercase.

In [15]:
# Upper-case first so that the lower-casing capitalize() applies to the remaining letters is visible.
string_data.str.upper().str.capitalize()

0          String one
1         string-two 
2      string:three  
3                 NaN
4                #453
5           Number%57
dtype: object

In [16]:
# Lengths include the padding spaces; with a NaN present the result is float64 rather than int64.
string_data.str.len()

0    10.0
1    12.0
2    16.0
3     NaN
4     4.0
5     9.0
dtype: float64

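We see that this count includes the white spaces too. What if we want to ignore the leading and trailing spaces? Note that strip() only removes white space at the two ends of a string; spaces inside it (as in "string one") are kept and still counted.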

In [17]:
# strip() removes whitespace from both ends of each string;
# the space inside 'string one' is kept.
stripped_data = string_data.str.strip()

stripped_data

0      string one
1      String-Two
2    String:three
3             NaN
4            #453
5       number%57
dtype: object

In [18]:
# Lengths after stripping: rows 1 and 2 shrink, row 0 keeps its inner space.
stripped_data.str.len()

0    10.0
1    10.0
2    12.0
3     NaN
4     4.0
5     9.0
dtype: float64

Do we have any numeric strings?

In [19]:
# Element-wise isnumeric(); the NaN entry yields NaN rather than False.
string_data.str.isnumeric()

0    False
1    False
2    False
3      NaN
4    False
5    False
dtype: object

In [20]:
# Mixed content: the first element is the integer 1, the others are strings.
data = pd.Series([1, '#2', '3', '4', 'ant'])

data

0      1
1     #2
2      3
3      4
4    ant
dtype: object

#### The isnumeric() method works with strings
Which is why using it on an integer returns NaN

In [21]:
# The non-string element (the integer 1) produces NaN; only '3' and '4' are reported as numeric.
data.str.isnumeric()

0      NaN
1    False
2     True
3     True
4    False
dtype: object

Concatenate strings using specified pattern

In [22]:
# Called without another sequence, cat() joins all elements into a single string separated by sep.
string_data.str.cat(sep=' | ')

'string one |  String-Two  |   String:three   | #453 | number%57'

Notice that the missing NaN value is ignored during concatenation

#### Concatenate with another string series
Series can be concatenated with a list of the same size or another Pandas Series. Use the na_rep argument to specify a replacement string for NaNs

In [23]:
# Element-wise concatenation with a list of equal length; na_rep stands in for the NaN in row 3.
string_data.str.cat(['A','B','C','D','E','F'], 
                    na_rep='_')

0          string oneA
1         String-Two B
2      String:three  C
3                   _D
4                #453E
5           number%57F
dtype: object

#### Series of mismatched lengths

In [24]:
# The list has 5 items but the Series has 6, so str.cat raises ValueError.
# Catch and print it so the mismatch is demonstrated without halting "Run All".
try:
    string_data.str.cat(['A','B','C','D','E'],
                        na_rep='_')
except ValueError as err:
    print("ValueError:", err)

ValueError: All arrays must be same length, except those having an index if `join` is not None

In [25]:
# Six letters as a Series, matching the length of string_data.
alphabet_series = pd.Series(['A','B','C','D','E','F'])

In [26]:
# str.cat accepts a Series as well as a list; na_rep again stands in for the NaN in row 3.
string_data.str.cat(alphabet_series, 
                    na_rep='_')

0          string oneA
1         String-Two B
2      String:three  C
3                   _D
4                #453E
5           number%57F
dtype: object

Does our string contain a specific pattern?

In [27]:
# True where the string contains '#'; the NaN entry stays NaN.
string_data.str.contains('#')

0    False
1    False
2    False
3      NaN
4     True
5    False
dtype: object

#### Search for multiple patterns

In [28]:
# The pattern is a regular expression: '|' separates alternatives,
# so any of '#', '%' or ':thr' counts as a match.
string_data.str.contains('#|%|:thr')

0    False
1    False
2     True
3      NaN
4     True
5     True
dtype: object

In [29]:
# Same search with ':thy', which occurs in none of the strings, so row 2 no longer matches.
string_data.str.contains('#|%|:thy')

0    False
1    False
2    False
3      NaN
4     True
5     True
dtype: object

#### Find the letter 'e' in all strings
* Returns the starting index of the matched substring. 
* -1 if it's not present
* The result of all operations on the NaN value is also NaN

In [30]:
# Index of the first 'e' in each string, or -1 when there is none.
string_data.str.find('e')

0     9.0
1    -1.0
2    12.0
3     NaN
4    -1.0
5     4.0
dtype: float64

In [31]:
# find() also works with a multi-character substring; the result is where the substring starts.
string_data.str.find('ring')

0    2.0
1    3.0
2    4.0
3    NaN
4   -1.0
5   -1.0
dtype: float64

In [32]:
# findall() returns every match per element as a list (an empty list when there is none).
string_data.str.findall('e')

0       [e]
1        []
2    [e, e]
3       NaN
4        []
5       [e]
dtype: object

#### Search for any capital letters

In [33]:
# '[A-Z]+' matches runs of one or more consecutive upper-case ASCII letters.
string_data.str.findall('[A-Z]+')

0        []
1    [S, T]
2       [S]
3       NaN
4        []
5        []
dtype: object

Replace letter e in all strings with *

In [34]:
# 'e' is literal text, not a pattern, so turn regex matching off explicitly
# instead of relying on the default of `regex`, which differs between pandas versions.
string_data.str.replace('e', '*', regex=False)

0          string on*
1         String-Two 
2      String:thr**  
3                 NaN
4                #453
5           numb*r%57
dtype: object

#### Using regular expressions
The replacement can be a literal string or a callable that receives each regex match object and returns the replacement text

In [35]:
# One or more consecutive lower-case ASCII letters.
pattern = r'[a-z]+'

In [36]:
# `pattern` is a regular expression, so request regex matching explicitly:
# since pandas 2.0 the default is regex=False, which would search for the
# literal text '[a-z]+' and replace nothing.
string_data.str.replace(pattern, '###', regex=True)

0         ### ###
1      S###-T### 
2      S###:###  
3             NaN
4            #453
5          ###%57
dtype: object

In [37]:
def replacement(match):
    """Return the full text of a regex match converted to upper case."""
    return match.group(0).upper()

In [38]:
# A callable replacement is only supported for regex matching, and since
# pandas 2.0 the default is regex=False, so request it explicitly.
# `replacement` is called with each match object and returns the new text.
string_data.str.replace(pattern, replacement, regex=True)

0          STRING ONE
1         STRING-TWO 
2      STRING:THREE  
3                 NaN
4                #453
5           NUMBER%57
dtype: object

How many spaces do we have in every string?

In [39]:
# Number of space characters in each string, padding included.
string_data.str.count(' ')

0    1.0
1    2.0
2    4.0
3    NaN
4    0.0
5    0.0
dtype: float64

Let's increase the number of times every string is repeated

In [40]:
# Repeat each string three times back to back; the NaN entry stays NaN.
string_data.str.repeat(3)

0                      string onestring onestring one
1                 String-Two  String-Two  String-Two 
2      String:three    String:three    String:three  
3                                                 NaN
4                                        #453#453#453
5                         number%57number%57number%57
dtype: object

#### Split the contents of each element in the series
We first build a new series by appending '|' to each string and repeating the result three times, then split it with the split() method to see the results

In [41]:
# Append a '|' to every element so there is a delimiter to split on later;
# na_rep='' turns the NaN entry into an empty string before joining.
concat_data = string_data.str.cat(
    ['|'] * len(string_data),
    na_rep='',
)

In [42]:
# Rebuild from string_data instead of reassigning concat_data from itself,
# so re-running this cell always yields exactly three repetitions.
concat_data = (
    string_data.str.cat(['|'] * len(string_data.index), na_rep='')
    .str.repeat(3)
)

In [43]:
# Display the delimited, repeated strings (row 2 is shown truncated with '...').
concat_data

0                    string one|string one|string one|
1               String-Two | String-Two | String-Two |
2      String:three  |  String:three  |  String:thr...
3                                                  |||
4                                      #453|#453|#453|
5                       number%57|number%57|number%57|
dtype: object

In [44]:
# Split on '|'; the trailing '|' produces an empty final item in each list.
concat_data.str.split('|')

0               [string one, string one, string one, ]
1         [ String-Two ,  String-Two ,  String-Two , ]
2    [  String:three  ,   String:three  ,   String:...
3                                             [, , , ]
4                                 [#453, #453, #453, ]
5                  [number%57, number%57, number%57, ]
dtype: object

The result is a series of lists by default. To return a DataFrame instead, pass expand=True:

In [45]:
# expand=True spreads the pieces across DataFrame columns instead of returning lists.
concat_data.str.split('|',
                      expand=True)

Unnamed: 0,0,1,2,3
0,string one,string one,string one,
1,String-Two,String-Two,String-Two,
2,String:three,String:three,String:three,
3,,,,
4,#453,#453,#453,
5,number%57,number%57,number%57,


#### Define number of splits
At most 2 splits are made, at the first 2 occurrences of '|'; the rest of each string is left intact in the last column

In [46]:
# n=2 limits the number of splits to two, so the result has three columns
# and the last one keeps the remaining '|'.
concat_data.str.split('|' , 
                      expand=True, 
                      n=2)

Unnamed: 0,0,1,2
0,string one,string one,string one|
1,String-Two,String-Two,String-Two |
2,String:three,String:three,String:three |
3,,,|
4,#453,#453,#453|
5,number%57,number%57,number%57|
