# Text Methods
* Often text data needs to be cleaned or manipulated for processing
* While we can always use a custom <b>apply()</b> function for these tasks, pandas comes with many built-in string method calls

In [36]:
import numpy as np
import pandas as pd

https://pandas.pydata.org/docs/user_guide/text.html


In [37]:
email = 'jose@email.com'

In [38]:
email.split('@')

['jose', 'email.com']

In [39]:
names = pd.Series(['andrew', 'bobo', 'claire', 'david', '5'])

In [40]:
names

0    andrew
1      bobo
2    claire
3     david
4         5
dtype: object

In [41]:
email = 'jose@email.com'

In [42]:
email.split('@')

['jose', 'email.com']

In [43]:
# Python strings expose many built-in methods, e.g. isdigit()
email.isdigit()

False

In [44]:
# isdigit() returns True for a string made up only of digits
'5'.isdigit()

True

In [45]:
# remember that '5' here is a string, not a number
names = pd.Series(['andrew', 'bobo', 'claire', 'david', '5'])

In [46]:
names


0    andrew
1      bobo
2    claire
3     david
4         5
dtype: object

In [47]:
# upper-case every element by calling the method through the .str accessor
names.str.upper()

0    ANDREW
1      BOBO
2    CLAIRE
3     DAVID
4         5
dtype: object

In [48]:
# .str methods mirror the built-in str methods, applied element-wise
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [49]:
tech_finance = ['GOOG,APPL,AMZN','JPM,BAC,GS']

In [50]:
len(tech_finance)

2

In [51]:
tickers = pd.Series(tech_finance)

In [52]:
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [53]:
# using the string methods:
# split each item of the tickers Series on ','
# remember to go through the .str accessor for every chained string operation
# (as with names.str.isdigit() above), then take the first piece of each split
tickers.str.split(',').str[0]

0    GOOG
1     JPM
dtype: object

In [54]:
tech = 'GOOG,APPL,AMZN'


In [55]:
tech.split(',')[0]

'GOOG'

In [56]:
# expand=True spreads the split pieces across separate columns
# (three here, since each row has three items) and returns a DataFrame
tickers.str.split(',', expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [57]:
# dealing with messy data
messy_name = pd.Series(['andrew ',"bo;bo",'    claire  '])

In [58]:
messy_name

0         andrew 
1           bo;bo
2        claire  
dtype: object

In [59]:
messy_name[0]


'andrew '

In [60]:
# fix the glaring error: remove the stray ';' by replacing it with ''
messy_name.str.replace(';','')

0         andrew 
1            bobo
2        claire  
dtype: object

In [61]:
# strip() removes the leading/trailing whitespace
messy_name.str.strip()[0]


'andrew'

In [62]:
# capitalize the first character of each string
# (leading whitespace prevents it, see '    claire  ' below)
messy_name.str.capitalize()

0         Andrew 
1           Bo;bo
2        claire  
dtype: object

In [63]:
# combine all the clean-up steps in one function to use with apply()
def cleanup(name):
    """Return a tidied copy of *name*.

    Semicolons are removed, surrounding whitespace is stripped, and the
    result is capitalized (first character upper-cased, the rest lower-cased).
    """
    return name.replace(";", "").strip().capitalize()

In [64]:
messy_name.apply(cleanup)


0    Andrew
1      Bobo
2    Claire
dtype: object

In [65]:
# compare the execution time of the apply() approach and the .str methods
# setup code shared by every timeit run
import timeit
setup = """
import pandas as pd
import numpy as np
messy_name = pd.Series(['andrew ',"bo;bo",'    claire  '])
def cleanup(name):
    name = name.replace(";","")
    name = name.strip()
    name = name.capitalize()
    return name
"""


In [66]:
# execution time of the chained .str methods
# note that stmt is passed to timeit as a string of code
# slowest approach in this comparison
stmt_str = """messy_name.str.replace(";",'').str.strip().str.capitalize()"""
timeit.timeit(setup=setup, stmt=stmt_str, number=10000)

4.146503800000001

In [67]:
# execution time of the pandas apply() method
stmt_apply = """messy_name.apply(cleanup)"""
timeit.timeit(setup=setup, stmt=stmt_apply, number=10000)

1.1213094999999988

In [68]:
# NumPy vectorize (np.vectorize)
# on this small 3-element Series, np.vectorize is the fastest of the three approaches
stmt_vectorized = """np.vectorize(cleanup)(messy_name)"""
timeit.timeit(setup=setup, stmt=stmt_vectorized, number=10000)

0.2281525000000002