# Pandas - Working with Text Data

In [2]:
# import libraries
import pandas as pd
import numpy as np

In [4]:
# Sample data: plain names, a value containing '@', a missing value (NaN),
# a digits-only string, and a CamelCase name.
names = ['Tom', 'William Rick', 'John', 'Alber@t', np.nan, '1234', 'SteveSmith']
s = pd.Series(names)
print(s)

0             Tom
1    William Rick
2            John
3         Alber@t
4             NaN
5            1234
6      SteveSmith
dtype: object


In [6]:
# lower() - lowercase every string in the Series; the missing value stays NaN
lowered = s.str.lower()
print(lowered)

0             tom
1    william rick
2            john
3         alber@t
4             NaN
5            1234
6      stevesmith
dtype: object


In [8]:
# upper() - uppercase every string in the Series; the missing value stays NaN
uppercased = s.str.upper()
print(uppercased)

0             TOM
1    WILLIAM RICK
2            JOHN
3         ALBER@T
4             NaN
5            1234
6      STEVESMITH
dtype: object


In [10]:
# len() - number of characters in each string.
# The NaN entry has no length, so the result comes back as float64.
lengths = s.str.len()
print(lengths)

0     3.0
1    12.0
2     4.0
3     7.0
4     NaN
5     4.0
6    10.0
dtype: float64


In [14]:
# strip() - trim leading and trailing whitespace, newlines included
padded_names = ['Tom ', ' William Rick', 'John\n\n', 'Alber@t\n']
s = pd.Series(padded_names)
print(s, '\n')  # the extra '\n' argument leaves a blank line before the next print

# the same Series once the surrounding whitespace has been removed
stripped = s.str.strip()
print('After Stripping:')
print(stripped)

0             Tom 
1     William Rick
2         John\n\n
3        Alber@t\n
dtype: object 

After Stripping:
0             Tom
1    William Rick
2            John
3         Alber@t
dtype: object


In [20]:
# cat(sep=...) - join all elements of the Series into a single string,
# placing the separator between consecutive elements
s = pd.Series(['Tom', 'William Rick', 'John', 'Alber@t'])
for separator in (':', '_'):
    print(s.str.cat(sep=separator))

Tom:William Rick:John:Alber@t
Tom_William Rick_John_Alber@t


In [22]:
# get_dummies() - returns a DataFrame of one-hot (0/1) indicator columns;
# with this data there is one column per distinct string
s = pd.Series(['Tom', 'William Rick', 'John', 'Alber@t'])
dummies = s.str.get_dummies()
print(dummies)

   Alber@t  John  Tom  William Rick
0        0     0    1             0
1        0     0    0             1
2        0     1    0             0
3        1     0    0             0


In [24]:
# contains() - boolean mask telling whether each string contains the pattern
s = pd.Series(['Tom ', ' William Rick', 'John', 'Alber@t'])

has_space = s.str.contains(' ')
print(has_space, '\n')

has_at_sign = s.str.contains('@')
print(has_at_sign)

0     True
1     True
2    False
3    False
dtype: bool 

0    False
1    False
2    False
3     True
dtype: bool


In [29]:
# replace(a, b) - replace every occurrence of 'a' with 'b' in each string
s = pd.Series(['Tom ', ' William Rick', 'John', 'Alber@t'])
# regex=False makes this a literal text replacement. The default for `regex`
# is not the same across pandas versions, so it is spelled out rather than
# left implicit.
replaced = s.str.replace('@', '$', regex=False)
print(replaced)

0             Tom 
1     William Rick
2             John
3          Alber$t
dtype: object


In [35]:
# repeat(n) - concatenate each string with itself n times
# (trailing spaces are repeated too, as the second element shows)
s = pd.Series(['Tom', 'William Rick ', 'John', 'Alber@t'])
tripled = s.str.repeat(3)
print(tripled)

0                                  TomTomTom
1    William Rick William Rick William Rick 
2                               JohnJohnJohn
3                      Alber@tAlber@tAlber@t
dtype: object


In [43]:
# count() - number of occurrences of the pattern in each string
# NOTE: `s` defined here is reused by the cells that follow.
s = pd.Series(['Tom ', ' William Rim', 'John', 'Alber@mt'])
m_counts = s.str.count('m')
print(m_counts)

0    1
1    2
2    0
3    1
dtype: int64


In [44]:
# startswith() - True where the string begins with the given prefix
begins_with_T = s.str.startswith('T')
print(begins_with_T, '\n')

# endswith() - True where the string ends with the given suffix
ends_with_t = s.str.endswith('t')
print(ends_with_t)

0     True
1    False
2    False
3    False
dtype: bool 

0    False
1    False
2    False
3     True
dtype: bool


In [48]:
# find() - position of the first occurrence of the substring in each string,
# or -1 when the substring is absent
first_e_position = s.str.find('e')
print(first_e_position, '\n')

# findall() - list of every match of the pattern in each string
# (an empty list when nothing matches)
all_e_matches = s.str.findall('e')
print(all_e_matches)

0   -1
1   -1
2   -1
3    3
dtype: int64 

0     []
1     []
2     []
3    [e]
dtype: object


In [49]:
# swapcase() - flip the case of every letter (upper <-> lower);
# non-letter characters such as '@' and spaces are left as they are
case_swapped = s.str.swapcase()
print(case_swapped)

0            tOM 
1     wILLIAM rIM
2            jOHN
3        aLBER@MT
dtype: object


In [54]:
# islower() - True where all letters in the string are lowercase
# ('tom ' qualifies even with its trailing space)
s = pd.Series(['tom ', ' William Rick', 'John', 'Alber@t'])
lowercase_mask = s.str.islower()
print(lowercase_mask)

0     True
1    False
2    False
3    False
dtype: bool


In [56]:
# isupper() - True where all letters in the string are uppercase
s = pd.Series(['Tom ', ' William Rick', 'JOHN', 'Alber@t'])
uppercase_mask = s.str.isupper()
print(uppercase_mask)

0    False
1    False
2     True
3    False
dtype: bool


In [61]:
# isnumeric() - True where every character in the string is numeric.
# None of the sample strings qualify, so the whole mask is False.
s = pd.Series(['Tom ', ' William Rick', 'John', 'Alber@t'])
numeric_mask = s.str.isnumeric()
print(numeric_mask)

0    False
1    False
2    False
3    False
dtype: bool
