In [1]:
import numpy as np
import pandas as pd

In [3]:
# A plain Python string (not a pandas object), used below to show the built-in str methods.
email = 'luis@email.com'

In [7]:
# `email` is a plain Python str, so its built-in methods apply directly;
# split('@') breaks the string at each '@' and returns the pieces.
email.split('@')

['luis', 'email.com']

In [6]:
# Confirm the return type of str.split: the pieces come back as a list.
type(email.split('@'))

list

In [9]:
# A Series of strings; the last entry '5' is digit-only, which the isdigit() example below relies on.
names = pd.Series(['andrew','bobo','claire','david','5'])

In [10]:
# Display the Series; its string values are reported with dtype object.
names

0    andrew
1      bobo
2    claire
3     david
4         5
dtype: object

In [11]:
# The .str accessor applies upper() to every element and returns a new Series;
# `names` itself is not modified. Reassign the result to make the change permanent.
names.str.upper()

0    ANDREW
1      BOBO
2    CLAIRE
3     DAVID
4         5
dtype: object

In [12]:
# Built-in str.isdigit() on a single Python string: False here, since email contains non-digit characters.
email.isdigit()

False

In [13]:
# A string made up only of digits returns True.
'5'.isdigit()

True

In [14]:
# Going through .str applies isdigit() to every string in the Series and
# returns a boolean Series with one result per element.
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [16]:
# Two strings, each holding several comma-separated ticker symbols.
tf = ['GOOG,APPL,AMZN','JPM,BAC,GS']

In [18]:
# The list has 2 elements: each element is one comma-joined string, not an individual ticker.
len(tf)

2

In [19]:
# Wrap the list in a Series so the .str accessor can be used on it.
tickers = pd.Series(tf)

In [20]:
# Each row is still one comma-joined string; the goal is to split every row on the commas.
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [21]:
# Split each string on ','; every row becomes a Python list of tickers (dtype stays object).
tickers.str.split(',')

0    [GOOG, APPL, AMZN]
1        [JPM, BAC, GS]
dtype: object

In [22]:
# After the split, .str[0] indexes into each row's list and picks its first element.
tickers.str.split(',').str[0]

0    GOOG
1     JPM
dtype: object

In [24]:
# expand=True spreads the split pieces across separate columns, returning a
# DataFrame (one column per piece) instead of a Series of lists.
tickers.str.split(',', expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [35]:
# Deliberately messy input: leading/trailing spaces and a stray ';' inside a name.
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])

In [36]:
# Show the raw values before cleaning.
messy_names

0      andrew  
1         bo;bo
2      claire  
dtype: object

In [38]:
# Clean every name in one vectorized chain:
#   1. remove the stray ';' -- regex=False forces a literal replacement, so the
#      result does not depend on the pandas version's default for `regex`
#   2. .str.strip() removes leading and trailing whitespace from each string
#   3. .str.capitalize() upper-cases the first character and lower-cases the rest
messy_names.str.replace(';', '', regex=False).str.strip().str.capitalize()

0    Andrew
1      Bobo
2    Claire
dtype: object

In [42]:
# We can achieve the same result by defining a clean-up function and passing it
# to the Series' apply method.

# Note that the function receives one plain Python string at a time, so it uses the
# built-in str methods directly and does not need the pandas .str accessor.

def cleanup(name):
    """Return ``name`` with every ';' removed, outer whitespace stripped, and capitalized.

    Operates on a single plain Python string using only built-in ``str``
    methods, applied in this order: replace, strip, capitalize.
    """
    return name.replace(";", "").strip().capitalize()

In [43]:
# Series.apply calls cleanup() once per element and collects the results into a new Series.
messy_names.apply(cleanup)

0    Andrew
1      Bobo
2    Claire
dtype: object

In [44]:
import timeit 

# Benchmark inputs for timeit: three ways of applying the same clean-up to messy_names.
#
# `setup` is executed once per timeit.timeit() call, before the timed loop starts.
# The timeit calls below do not pass `globals`, so the timed code cannot see the
# notebook's variables; the Series and cleanup() are therefore re-declared here.
#
# NOTE(review): the Series has only three elements, so these timings likely reflect
# fixed per-call overhead more than per-element cost -- re-measure on a larger
# Series before drawing general conclusions about which approach is fastest.
setup = '''
import pandas as pd
import numpy as np
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])
def cleanup(name):
    name = name.replace(";","")
    name = name.strip()
    name = name.capitalize()
    return name
'''

# Statements whose execution time is measured (each is run `number` times).

# Variant 1: chained pandas .str accessor methods.
stmt_pandas_str = ''' 
messy_names.str.replace(";","").str.strip().str.capitalize()
'''

# Variant 2: Series.apply with the plain-Python cleanup() function.
stmt_pandas_apply = '''
messy_names.apply(cleanup)
'''

# Variant 3: NumPy's vectorize wrapper around cleanup(). The wrapper is created
# inside the timed statement, so building it is part of what gets measured.
stmt_pandas_vectorize='''
np.vectorize(cleanup)(messy_names)
'''

In [45]:
# Total wall-clock seconds for 10,000 runs of the chained .str pipeline.
timeit.timeit(stmt=stmt_pandas_str, setup=setup, number=10000)

6.53777956399972

In [46]:
# Total wall-clock seconds for 10,000 runs of Series.apply(cleanup).
timeit.timeit(stmt=stmt_pandas_apply, setup=setup, number=10000)

2.0947352210000645

In [47]:
# Total wall-clock seconds for 10,000 runs of np.vectorize(cleanup) on the Series.
timeit.timeit(stmt=stmt_pandas_vectorize, setup=setup, number=10000)

0.631777005999993