In [2]:
import numpy as np
import pandas as pd

In [3]:
# create a new email
email = "jose@email.com"

In [4]:
# split the email
email.split("@")

['jose', 'email.com']

In [5]:
email.isdigit()

False

In [6]:
# remember the '5' is still string not numerical
# it only contains the numerical => True
'5'.isdigit()

True

In [7]:
# Create a name series
names = pd.Series(['andrew', 'bobo', 'claire', 'david', '5'])

In [8]:
# Isdigit method
# call out the string first
# The isdigit() method returns True if all the characters are digits, otherwise False.
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [9]:
names

0    andrew
1      bobo
2    claire
3     david
4         5
dtype: object

In [10]:
# Upper method
# call out the string first
names.str.upper()

0    ANDREW
1      BOBO
2    CLAIRE
3     DAVID
4         5
dtype: object

In [11]:
tech_finance = ['GOOG,APPL,AMZN','JPM,BAC,GS']

In [12]:
len(tech_finance)

2

In [13]:
tickers = pd.Series(tech_finance)

In [14]:
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [15]:
# Split the item of tickers Series by parenthesis
# Call method after the 'str'
tk = tickers.str.split(",")

In [16]:
# Access to new position
tk.str[0]

0    GOOG
1     JPM
dtype: object

In [17]:
tech = 'GOOG,APPL,AMZN'

In [18]:
tech.split(',')[0]

'GOOG'

In [19]:
# expand to three column in the case the have the same amount of items
# expand to different columns => return a dataframe
# Expand the split strings into separate columns.

# If True, return DataFrame/MultiIndex expanding dimensionality.

# If False, return Series/Index, containing lists of strings.
tickers.str.split(',', expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [20]:
tickers.str.split(',', expand=False)


0    [GOOG, APPL, AMZN]
1        [JPM, BAC, GS]
dtype: object

# Dealing with messy data



In [21]:
messy_name = pd.Series(['andrew ',"bo;bo",'    claire  '])

In [22]:
messy_name


0         andrew 
1           bo;bo
2        claire  
dtype: object

In [23]:
messy_name[0]

'andrew '

In [25]:
# glaring error
# replace ; by ''
# aware to use the string
messy_name.str.replace(';','')

0         andrew 
1            bobo
2        claire  
dtype: object

In [28]:
# replace the extra space
messy_name.str.strip()[2]

'claire'

In [29]:
# capitalize the first word
messy_name.str.capitalize()

0         Andrew 
1           Bo;bo
2        claire  
dtype: object

In [30]:
# using the function to deal with messy data
def cleanup(name):
    name = name.replace(';','')
    name = name.strip()
    name = name.capitalize()
    return name



In [32]:
# callout the function by using the apply()
messy_name.apply(cleanup)

0    Andrew
1      Bobo
2    Claire
dtype: object

* <b>Calculate the time execute between apply method and string method </b>

In [34]:
import timeit
setup = """
import pandas as pd
import numpy as np
messy_name = pd.Series(['andrew ',"bo;bo",'    claire  '])
def cleanup(name):
    name = name.replace(';','')
    name = name.strip()
    name = name.capitalize()
    return name
"""

In [35]:
# string time method
# remember stmt is a docstring
# slowest way
# setup which is the code that you run before running the stmt; it defaults to 'pass'
# stmt which is the statement you want to measure; it defaults to 'pass'
stmt_str = """messy_name.str.replace(';','').str.strip().str.capitalize()"""
timeit.timeit(setup=setup, stmt= stmt_str, number=10000)

3.817764899999929

In [36]:
# the apply method
stmt_fuc = """messy_name.apply(cleanup)"""
timeit.timeit(setup=setup, stmt=stmt_fuc, number=10000)

1.1078520000000935

In [37]:
# pandas vectorized
# this is faster than two previous one
stmt_vectorized = """np.vectorize(cleanup)"""
timeit.timeit(setup=setup, stmt=stmt_vectorized, number=10000)

0.01253999999971711