In [1]:
import numpy as np
import pandas as pd

# Pandas and Text

Pandas can do a lot more than what we show here. Full online documentation on things like advanced string indexing and regular expressions with pandas can be found here: https://pandas.pydata.org/docs/user_guide/text.html

## Text Methods on Pandas String Column

In [2]:
email = "jose@email.com"

In [3]:
email.split("@")

['jose', 'email.com']

In [4]:
names = pd.Series(["andrew", "bobo", "claire", "david", "5"])

In [5]:
names

Unnamed: 0,0
0,andrew
1,bobo
2,claire
3,david
4,5


In [6]:
 names.str.upper() # returns a new Series; this does not permanently affect `names`.

Unnamed: 0,0
0,ANDREW
1,BOBO
2,CLAIRE
3,DAVID
4,5


In [7]:
names

Unnamed: 0,0
0,andrew
1,bobo
2,claire
3,david
4,5


In [8]:
email.isdigit()

False

In [9]:
"5".isdigit()

True

In [10]:
names.str.isdigit()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,True


## Splitting, Grabbing, and Expanding

In [15]:
tech_finance = ["GOOG,APPL,AMZN", "JPM,BAC,GS"]

In [16]:
len(tech_finance)

2

In [17]:
tickers = pd.Series(tech_finance)

In [18]:
tickers

Unnamed: 0,0
0,"GOOG,APPL,AMZN"
1,"JPM,BAC,GS"


In [24]:
tickers.str.split(",")

Unnamed: 0,0
0,"[GOOG, APPL, AMZN]"
1,"[JPM, BAC, GS]"


In [21]:
tech = "GOOG,APPL,AMZN"

In [23]:
tech.split(",")[0]

'GOOG'

In [27]:
tickers.str.split(",").str[1]

Unnamed: 0,0
0,APPL
1,BAC


In [28]:
tickers.str.split(",",expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


## Cleaning or Editing Strings

In [29]:
messy_names = pd.Series(["andrew  ","bo:bo","  claire  "])

In [32]:
messy_names[0]

'andrew  '

In [34]:
messy_names.str.replace(":","")

Unnamed: 0,0
0,andrew
1,bobo
2,claire


In [35]:
messy_names.str.replace(":","").str.strip().str.capitalize()

Unnamed: 0,0
0,Andrew
1,Bobo
2,Claire


## Alternative with Custom apply() call

In [39]:
def cleanup(name):
    """Return a tidied copy of a single name string.

    Every ":" is removed, leading and trailing whitespace is stripped, and
    the result is capitalized (first character upper-case, the rest
    lower-case). Intended to be applied element-wise, e.g. via Series.apply.
    """
    return name.replace(":", "").strip().capitalize()

In [40]:
messy_names.apply(cleanup)

Unnamed: 0,0
0,Andrew
1,Bobo
2,Claire


## Which one is more efficient?

In [41]:
import timeit

# Benchmark fixture: source code that timeit executes once, before the timed
# loop. It defines everything the timed statements below reference
# (`messy_names`, `cleanup`, `np`).
# Note: this copy of the data and of `cleanup` uses ";" as the stray
# character to remove.
# NOTE(review): the Series has only three elements, so the timings likely
# reflect fixed per-call overhead more than per-element cost — confirm on a
# larger Series before generalizing.
setup = '''
import pandas as pd
import numpy as np
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])
def cleanup(name):
    name = name.replace(";","")
    name = name.strip()
    name = name.capitalize()
    return name
'''

# Code snippets whose execution time is to be measured; each one is a source
# string handed to timeit as `stmt`.

# Variant 1: chain of pandas .str accessor methods.
stmt_pandas_str = '''
messy_names.str.replace(";","").str.strip().str.capitalize()
'''

# Variant 2: Series.apply with the plain-Python `cleanup` function.
stmt_pandas_apply = '''
messy_names.apply(cleanup)
'''

# Variant 3: np.vectorize wrapper around `cleanup`; the wrapper is re-created
# inside the timed statement on every run.
stmt_pandas_vectorize='''
np.vectorize(cleanup)(messy_names)
'''

In [42]:
# Total seconds for 10000 runs of the chained .str accessor variant.
timeit.timeit(
    stmt=stmt_pandas_str,
    setup=setup,
    number=10000,
)

3.8131107579999934

In [43]:
# Total seconds for 10000 runs of the Series.apply(cleanup) variant.
timeit.timeit(
    stmt=stmt_pandas_apply,
    setup=setup,
    number=10000,
)

1.13411392099988

In [44]:
# Total seconds for 10000 runs of the np.vectorize(cleanup) variant.
timeit.timeit(
    stmt=stmt_pandas_vectorize,
    setup=setup,
    number=10000,
)

0.5719171129999268

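Wow! While the `.str` accessor methods can be extremely convenient, when it comes to performance, don't forget about np.vectorize()! Keep in mind that this benchmark uses a tiny three-element Series, so the ranking may differ on larger data. Review the "Useful Methods" lecture for a deeper discussion on np.vectorize().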