In [2]:
import pandas as pd

# str.isdigit() reports whether a string is made up entirely of digit characters.
# Try it on one string that mixes letters with a digit, then on one that is all digits.
for sample_string in ('take 5', '54675643'):
    print(sample_string.isdigit())

False
True


# .apply()

`.apply()` is a series method: call it on a series and pass it a function, and that function is applied to every element in the series.

reminder:

    a series is a one-dimensional, list-like sequence of values (each with an index label) created using pandas

In [3]:
# Build a messy series: mostly integers, plus a couple of awkward strings.
dirty_values = [400, 111, '$20', 57, 'Lots']
money = pd.Series(dirty_values)

# .isdigit() is a method of individual strings, _not_ of a whole series, so
# calling `money.isdigit()` raises an error. Uncomment the line below to see it.

# print(money.isdigit())

# Instead, let's define a new function that takes a string as an argument
# and returns True if the string is all digits, otherwise False.

def is_a_string(x):
    """Return True when the text form of ``x`` consists only of digits.

    Despite its name, this helper is a digit check: ``x`` is converted with
    ``str()`` first so that non-string values (such as ints) can be tested
    with the string method ``.isdigit()`` too.
    """
    text = str(x)
    return text.isdigit()

# .apply() calls is_a_string once for each element of the series and
# gathers the individual True/False answers into a new series.
digit_flags = money.apply(is_a_string)
print(digit_flags)

0     True
1     True
2    False
3     True
4    False
dtype: bool


# Lambda Function

Lambda functions are small, anonymous (unnamed) functions written inline; we typically use them for throwaway logic that is only needed once.

Note that the lambda function below performs the same operation as `is_a_string` in the previous section.

In [4]:
# The same messy values as before.
money = pd.Series([400, 111, '$20', 57, 'Lots'])

# This lambda does inline what the named is_a_string function did earlier:
# convert the value to a string, then ask whether it is all digits.
# Compare this call carefully with the earlier .apply(is_a_string) one.
all_digits = money.apply(lambda value: str(value).isdigit())
print(all_digits)

0     True
1     True
2    False
3     True
4    False
dtype: bool


# Filter Function

In [5]:
# filter() is lazy: it hands back an iterator, so we wrap it in list() to see the values.

print('Filtering the whole series:')
# Keep only the elements whose string form is entirely digits.
all_digit_values = filter(lambda value: str(value).isdigit(), money)
print(list(all_digit_values))

print('\nApplying filter() to each value in the series:')
# Within each element, keep only the digit characters and glue them back together.
digits_only = money.apply(lambda value: ''.join(filter(str.isdigit, str(value))))
print(digits_only)

Filtering the whole series:
[400, 111, 57]

Applying filter() to each value in the series:
0    400
1    111
2     20
3     57
4       
dtype: object


# .split()

In [6]:
# Messy strings: each one packs a name and an email address together, joined by '$'.
raw_records = [
    'MollyMalone$molmal@gmail.com',
    'JeffreyJones$jefjo@hotmail.com',
    'DeadParrot$fjords@gmail.com',
]
words = pd.Series(raw_records)

# Split on '$' with the pandas string accessor. expand=True spreads the
# pieces across columns (0 holds the name, 1 holds the email).
word_split = words.str.split('$', expand=True)
names, emails = word_split[0], word_split[1]
print(names, '\n')
print(emails)

0     MollyMalone
1    JeffreyJones
2      DeadParrot
Name: 0, dtype: object 

0     molmal@gmail.com
1    jefjo@hotmail.com
2     fjords@gmail.com
Name: 1, dtype: object


In [7]:
import re

# Break every CamelCase name into its capitalised chunks: each chunk is one
# capital letter followed by any run of lowercase letters.
name_parts = names.apply(lambda full_name: re.findall('[A-Z][a-z]*', full_name))

# The first chunk is taken as the first name ...
firstname = name_parts.apply(lambda parts: parts[0])

# ... and the second chunk as the last name.
lastname = name_parts.apply(lambda parts: parts[1])

print(firstname, '\n')
print(lastname)

0      Molly
1    Jeffrey
2       Dead
Name: 0, dtype: object 

0    Malone
1     Jones
2    Parrot
Name: 0, dtype: object


# Changing the content of strings

# Replace


In [8]:
# regex=False makes pandas treat the pattern as literal text. Without it,
# pandas versions before 2.0 default to regex matching, where the '.' in
# '.com' matches ANY character rather than only a literal dot.
print(emails.str.replace('@', ' at ', regex=False), '\n')

print(emails.str.replace('.com', '', regex=False))

0     molmal at gmail.com
1    jefjo at hotmail.com
2     fjords at gmail.com
Name: 1, dtype: object 

0     molmal@gmail
1    jefjo@hotmail
2     fjords@gmail
Name: 1, dtype: object


# Changing Case

In [9]:
# Show the names in three casings: all lowercase, all uppercase, then
# capitalized (first character upper-cased, everything after it lower-cased).
for recased in (names.str.lower(), names.str.upper()):
    print(recased, '\n')
print(names.str.capitalize())

0     mollymalone
1    jeffreyjones
2      deadparrot
Name: 0, dtype: object 

0     MOLLYMALONE
1    JEFFREYJONES
2      DEADPARROT
Name: 0, dtype: object 

0     Mollymalone
1    Jeffreyjones
2      Deadparrot
Name: 0, dtype: object


# Stripping Whitespace

In [10]:
# A single trailing space is enough to make otherwise identical strings unequal.
plain = "Hello, world"
trailing_space = "Hello, world "
plain == trailing_space

False

In [11]:
# str.strip() returns a copy with the whitespace removed from both ends;
# the original string is left unchanged.

spacy = '   What, on earth, is going on here?   '
# Print the padded original first, then the stripped version on the next line.
print(spacy, spacy.strip(), sep='\n')

   What, on earth, is going on here?   
What, on earth, is going on here?


In [13]:
# The same word padded with whitespace in different places.
words = pd.Series([' duck', 'duck ', ' duck ', 'goose'])

# Before cleaning, the first two entries differ only in where the space sits.
raw_first, raw_second = words[0], words[1]
print(raw_first == raw_second)

# .str.strip() trims both ends of every entry, after which the two match.
stripped = words.str.strip()
clean_first, clean_second = stripped[0], stripped[1]
print(clean_first == clean_second)

False
True
