# Converting data types

In [None]:
# Convert the sex column to type 'category'.
# Bracket indexing is used for the assignment: attribute-style assignment
# only works when the column already exists and can silently create a plain
# Python attribute otherwise.
tips['sex'] = tips['sex'].astype('category')

# Convert the smoker column to type 'category'
tips['smoker'] = tips['smoker'].astype('category')

# Show the info of tips.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
tips.info()

# Working with numeric data

In [None]:
# Convert 'total_bill' to a numeric dtype.
# errors='coerce' turns values that cannot be parsed into NaN instead of raising.
tips['total_bill'] = pd.to_numeric(tips['total_bill'], errors='coerce')

# Convert 'tip' to a numeric dtype (same coercion rule)
tips['tip'] = pd.to_numeric(tips['tip'], errors='coerce')

# Show the info of tips.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
tips.info()

# String parsing with regular expressions

In [None]:
# Import the regular expression module
import re

# Compile the pattern: prog
# The pattern matches a phone number of the format xxx-xxx-xxxx.
# It is written as a raw string so the backslashes reach the regex engine
# unchanged; in a normal string literal '\d' and '\-' are invalid escape
# sequences and trigger a warning on recent Python versions.
prog = re.compile(r'\d{3}\-\d{3}\-\d{4}')

# See if the pattern matches (expected: True)
result = prog.match('123-456-7890')
print(bool(result))

# See if the pattern matches (expected: False, the first group has four digits)
result2 = prog.match('1123-456-7890')
print(bool(result2))

# Extracting numerical values from strings

In [None]:
# Import the regular expression module
import re

# Find the numeric values: matches
# \d matches a single digit; the trailing + repeats it one or more times, so
# '10' is returned as one number rather than as '1' and '0'.
# The pattern is a raw string so that '\d' is not treated as an (invalid)
# string escape sequence.
matches = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana')

# Print the matches
print(matches)

# Pattern matching

In [None]:
# All patterns below are raw strings so the backslashes reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences.

# Write the first pattern: a phone number of the format xxx-xxx-xxxx
pattern1 = bool(re.match(pattern=r'\d{3}\-\d{3}\-\d{4}', string='123-456-7890'))
print(pattern1)

# Write the second pattern
# \$ matches the dollar sign, \d* matches an arbitrary number of digits,
# \. matches the decimal point, and \d{2} matches exactly two digits.
pattern2 = bool(re.match(pattern=r'\$\d*\.\d{2}', string='$123.45'))
print(pattern2)

# Write the third pattern
# [A-Z] matches one capital letter; \w* then matches an arbitrary number of
# word characters (letters, digits, underscore).
pattern3 = bool(re.match(pattern=r'[A-Z]\w*', string='Australia'))
print(pattern3)

# Custom functions to clean data

In [None]:
# Define recode_sex()
def recode_sex(sex_value):
    """Encode a sex label as a number.

    Returns 1 when sex_value equals 'Male', 0 when it equals 'Female',
    and np.nan for any other value.
    """
    # Recognised labels are handled by guard clauses
    if sex_value == 'Male':
        return 1
    if sex_value == 'Female':
        return 0

    # Anything else is reported as missing
    return np.nan

# Recode the sex column element by element.
# The function object itself is handed to .apply(), so it is written without
# call parentheses after its name.
tips['sex_recode'] = tips['sex'].apply(recode_sex)

# Show the first five rows of tips
print(tips.head())

# Lambda functions

In [None]:
# Write the lambda function using replace: strip the dollar sign
tips['total_dollar_replace'] = tips['total_dollar'].apply(lambda x: x.replace('$', ''))

# Write the lambda function using regular expressions.
# re.findall() returns a list, so it is indexed with [0] to get the first
# match. The pattern is a raw string so that '\d' and '\.' are not parsed as
# (invalid) string escape sequences.
tips['total_dollar_re'] = tips['total_dollar'].apply(lambda x: re.findall(r'\d+\.\d+', x)[0])

# Print the head of tips
print(tips.head())

# Dropping duplicate data

In [None]:
# Create the new DataFrame: tracks
tracks = billboard[['year', 'artist', 'track', 'time']]

# Show info of tracks.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
tracks.info()

# Drop the duplicates: tracks_no_duplicates
tracks_no_duplicates = tracks.drop_duplicates()

# Show info of tracks_no_duplicates for comparison with the report above
tracks_no_duplicates.info()

# Filling missing data

In [None]:
# Calculate the mean of the Ozone column: oz_mean
oz_mean = airquality['Ozone'].mean()

# Replace all the missing values in the Ozone column with the mean
airquality['Ozone'] = airquality['Ozone'].fillna(oz_mean)

# Show the info of airquality.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
airquality.info()

# Testing your data with asserts

In [None]:
# Assert that there are no missing values: the first .all() reduces each
# column to a single boolean, the second reduces those to one boolean
assert ebola.notnull().all().all()

# Assert that all values are >= 0, using the same two-step reduction
assert ebola.ge(0).all().all()

# Use the pd.notnull() function on ebola (or the .notnull() method of ebola) and chain two .all() methods 
# (that is, .all().all()). The first .all() method will return a True or False for each column, 
# while the second .all() method will return a single True or False.