## Start by reading in data

### Finding files that match a pattern

In [None]:
# Use the glob module to find all csv files in the workspace. 
# The glob module has a function called glob that takes a pattern and returns a list of the files in the working directory that match that pattern
# For example, if you know the pattern is part_ single digit number .csv, you can write the pattern as 'part_?.csv' (which would match part_1.csv, part_2.csv, part_3.csv, etc.)
# Similarly, you can find all .csv files with '*.csv', or all parts with 'part_*'
# The ? wildcard represents any 1 character, and the * wildcard represents any number of characters

# Import necessary modules
import glob

import numpy as np
import pandas as pd

# Write the pattern: pattern
# '*.csv' selects every file in the working directory whose name ends in .csv
pattern = '*.csv'

# Save all file matches: csv_files
# glob.glob() returns its matches in arbitrary order, so they are sorted to
# make "the second file" refer to the same file on every run
csv_files = sorted(glob.glob(pattern))

# Print the file names
print(csv_files)

# Load the second file into a DataFrame: csv2
# (this raises IndexError when fewer than two files matched the pattern)
csv2 = pd.read_csv(csv_files[1])

# Print the head of csv2
print(csv2.head())

### Iterating and concatenating all matches

In [None]:
# Read every matched file into its own DataFrame: frames
frames = [pd.read_csv(csv_path) for csv_path in csv_files]

# Stack the per-file DataFrames on top of each other: uber
uber = pd.concat(frames)

# Show the dimensions of uber, followed by its first rows
print(uber.shape)
print(uber.head())

## Converting data types

### Converting data types to categorical data to reduce memory usage (!)

In [None]:
# Convert the sex and smoker columns to type 'category'
for column in ('sex', 'smoker'):
    tips[column] = tips[column].astype('category')

# Show the info of tips
# .info() writes its report itself and returns None, so wrapping the call in
# print() would only add a stray "None" line after the report
tips.info()

### Working with numeric data

In [None]:
# You can use the pd.to_numeric() function to convert a column into a numeric data type
# If the function raises an error, you can be sure that there is a bad value within the column
# In which case either do some EDA to find the bad value, or ignore or coerce the value into a missing value, NaN

# Convert 'total_bill' and 'tip' to a numeric dtype
# errors='coerce' turns every value that cannot be parsed into NaN instead of raising
tips['total_bill'] = pd.to_numeric(tips['total_bill'], errors='coerce')
tips['tip'] = pd.to_numeric(tips['tip'], errors='coerce')

# Show the info of tips
# .info() prints its report directly and returns None, so it is not wrapped in print()
tips.info()

## String parsing with regular expressions

In [1]:
# When working with data, it is sometimes necessary to write a regular expression to look for properly entered values
# Phone numbers in a dataset is a common field that needs to be checked for validity
# Define a regular expression to match US phone numbers that fit the pattern of xxx-xxx-xxxx

# The regular expression module in python is re
# Since the pattern will be used for a match across multiple rows, it's better to compile the pattern first using re.compile()
# You can then use the compiled pattern to match values

# Import the regular expression module
import re

# Compile the pattern: prog
# The pattern is a raw string so the backslashes reach the regex engine unchanged
# It is anchored with ^ and $ so that trailing extra digits are rejected as well
prog = re.compile(r'^\d{3}-\d{3}-\d{4}$')

# A value in the form xxx-xxx-xxxx matches
result = prog.match('123-456-7890')
print(bool(result))

# A value with an extra leading digit does not match
result = prog.match('1123-456-7890')
print(bool(result))

True
False


### Extracting numerical values from strings

In [2]:
# Extracting numbers from strings is a common task, particularly when working with unstructured data or log files
# Say you have the following string: 'the recipe calls for 10 strawberries and 1 banana'

# It would be useful to extract the 10 and the 1 to be saved for later use when comparing strawberry to banana ratios
# When using a regular expression to extract multiple numbers (or multiple pattern matches, to be exact), you can use the re.findall() function
# You pass in a pattern and a string to re.findall(), and it will return a list of the matches

# Import the regular expression module
import re

# Find the numeric values: matches
# The pattern is a raw string so the backslash in \d+ reaches the regex engine unchanged
# re.findall() returns every run of digits as a string, in order of appearance
matches = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana')

# Print the matches
print(matches)

['10', '1']


### Pattern matching

In [3]:
# Use \$ to match the dollar sign, \d* to match an arbitrary number of digits, \. to match the decimal point, and \d{x} to match x number of digits
# Each pattern is written as a raw string so the backslash escapes reach the regex engine unchanged

# Write the first pattern
print(bool(re.match(pattern=r'\d{3}-\d{3}-\d{4}', string='123-456-7890')))

# Write the second pattern
print(bool(re.match(pattern=r'\$\d*\.\d{2}', string='$123.45')))

# Write the third pattern
print(bool(re.match(pattern=r'[A-Z]\w*', string='Australia')))

True
True
True


### Example for Gapminder data - looking at country spellings (!)

In [None]:
# See if there are any special or invalid characters you may need to deal with
# It is reasonable to assume that country names will contain:
# - The set of lower and upper case letters
# - Whitespace between words
# - Periods for any abbreviations

# To confirm that this is the case, you can leverage the power of regular expressions again
# For common operations like this, Python has a built-in string method - str.contains()
# This takes a regular expression pattern, and applies it to the Series, returning True if there is a match, and False otherwise

# Since here you want to find the values that do not match, you have to invert the boolean, which can be done using ~ 
# This Boolean series can then be used to get the Series of countries that have invalid names

# Create the series of unique country names: countries
countries = gapminder['country'].drop_duplicates()

# Write the regular expression: pattern
# ^ at the beginning and $ at the end anchor the pattern, so the whole name
# must consist of letters, periods and whitespace only
# The pattern is a raw string so \. and \s reach the regex engine unchanged
pattern = r'^[A-Za-z\.\s]*$'

# Create the Boolean vector: mask
# str.contains() marks the values that match the pattern
# na=False counts a missing name as "no match", which keeps the mask purely
# Boolean so that it can be inverted with ~
mask = countries.str.contains(pattern, na=False)

# Invert the mask: mask_inverse
mask_inverse = ~mask

# Subset countries using mask_inverse: invalid_countries
invalid_countries = countries.loc[mask_inverse]

# Print invalid_countries
print(invalid_countries)

93               Guinea-Bissau
    98            Hong Kong, China
    118    United Korea (former)\n
    131               Macao, China
    132             Macedonia, FYR
    145      Micronesia, Fed. Sts.
    161            Ngorno-Karabakh
    187             St. Barth?lemy
    193     St.-Pierre-et-Miquelon
    225                Timor-Leste
    251      Virgin Islands (U.S.)
    252       North Yemen (former)
    253       South Yemen (former)
    258                      ?land

## Writing functions to clean data

In [None]:
# Your job is to write a function that will recode 'Male' to 1, 'Female' to 0, and return np.nan for all entries of 'sex' that are neither 'Male' nor 'Female'.
# Recoding variables like this is a common data cleaning task
# Functions provide a mechanism for you to abstract away complex bits of code as well as reuse code
# This makes your code more readable and less error prone

# You can use the .apply() method to apply a function across entire rows or columns of DataFrames
# However, note that each column of a DataFrame is a pandas Series
# Functions can also be applied across Series. Here, you will apply your function over the 'sex' column

# Define recode_sex()
def recode_sex(sex_value):
    """Recode a single 'sex' entry as a number.

    Parameters
    ----------
    sex_value : object
        One entry of the 'sex' column.

    Returns
    -------
    int or float
        1 for 'Male', 0 for 'Female', and NaN for every other value.
    """
    if sex_value == 'Male':
        return 1

    if sex_value == 'Female':
        return 0

    # float('nan') is an ordinary float NaN, which pandas treats as a missing
    # value just like np.nan, and it does not require numpy to be imported
    return float('nan')

# Recode every entry of the 'sex' column and store the result as 'sex_recode'
tips['sex_recode'] = tips['sex'].apply(recode_sex)

### Lambda functions

In [None]:
# lambda functions can make your code concise and Pythonic
# Your job is to clean the 'total_dollar' column of tips by removing the dollar sign
# You'll do this using two different methods: With the .replace() method, and with regular expressions
# The regular expression module re has been pre-imported.

# Write the lambda function using replace
tips['total_dollar_replace'] = tips.total_dollar.apply(lambda x: x.replace('$', ''))

# Write the lambda function using regular expressions
# The pattern is a raw string so \d and \. reach the regex engine unchanged
# Note that re.findall() returns a list of matches, so every cell of the new
# column holds a list; index the result with [0] to keep only the matched string
tips['total_dollar_re'] = tips.total_dollar.apply(lambda x: re.findall(r'\d+\.\d+', x))

# Print the head of tips
print(tips.head())

<script.py> output:
       total_bill   tip     sex smoker  day    time  size total_dollar  \
    0       16.99  1.01  Female     No  Sun  Dinner     2       $16.99   
    1       10.34  1.66    Male     No  Sun  Dinner     3       $10.34   
    2       21.01  3.50    Male     No  Sun  Dinner     3       $21.01   
    3       23.68  3.31    Male     No  Sun  Dinner     2       $23.68   
    4       24.59  3.61  Female     No  Sun  Dinner     4       $24.59   
    
      total_dollar_replace total_dollar_re  
    0                16.99         [16.99]  
    1                10.34         [10.34]  
    2                21.01         [21.01]  
    3                23.68         [23.68]  
    4                24.59         [24.59]

## Data cleaning starts with basic exploratory data analysis

In [None]:
# Import pandas
import pandas as pd

# Load the job application filings into a DataFrame: df
df = pd.read_csv('dob_job_application_filings_subset.csv')

# Inspect df: first rows, last rows, dimensions and column labels
for summary in (df.head(), df.tail(), df.shape, df.columns):
    print(summary)

# Inspect both ends of df_subset
# NOTE(review): df_subset is not created in this cell; presumably it is pre-loaded (confirm)
for preview in (df_subset.head(), df_subset.tail()):
    print(preview)

In [None]:
# Show the info of df, then of df_subset
# .info() prints its report directly and returns None, so wrapping the calls
# in print() would only add a stray "None" line after each report
df.info()
df_subset.info()

## Dropping duplicate data

In [None]:
# Duplicate data causes a variety of problems
# From the point of view of performance, they use up unnecessary amounts of memory and cause unneeded calculations to be performed when processing data
# In addition, they can also bias any analysis results

# Create the new DataFrame: tracks
tracks = billboard[['year', 'artist', 'track', 'time']]

# Show info of tracks
# (.info() prints by itself and returns None, so it is not wrapped in print())
tracks.info()

# Drop the duplicates: tracks_no_duplicates
tracks_no_duplicates = tracks.drop_duplicates()

# Show info of tracks_no_duplicates to compare its row count with tracks
tracks_no_duplicates.info()

## Filling missing data

In [None]:
# It's rare to have a (real-world) dataset without any missing values
# Certain calculations cannot handle missing values while some calculations will, by default, skip over missing values
# Understand how much missing data you have, and where missing data comes from
# This will ensure you make unbiased interpretations of data

# Calculate the mean of the Ozone column: oz_mean
oz_mean = airquality.Ozone.mean()

# Replace all the missing values in the Ozone column with the mean
airquality['Ozone'] = airquality.Ozone.fillna(oz_mean)

# Show the info of airquality
# (.info() prints by itself and returns None, so it is not wrapped in print())
airquality.info()

### Dropping missing data - Gapminder example

In [None]:
# In general, it is not good to drop missing values, because you may end up throwing away useful information
# In this data, missing values refer to years where no estimate for life expectancy is available for a given country
# You could fill in, or guess what these life expectancies could be by looking at the average life expectancies for other countries in that year, for example
# Whichever strategy you go with, it is important to carefully consider all options and understand how they will affect your data

# Your job is to drop all the rows that have NaN in the life_expectancy column
# Before doing so, it would be valuable to use assert statements to confirm that year and country do not have any missing values

# If there is no reasonable way to fill in or impute missing values, then dropping the missing data may be the best solution

# Confirm that neither country nor year contains missing values
assert gapminder.country.notnull().all()
assert gapminder.year.notnull().all()

# Drop the rows where life_expectancy is missing
# Because country and year were just confirmed to be complete, any remaining
# missing value would have to be in the life_expectancy column, so .dropna()
# can be applied to the entire DataFrame with the keyword argument how='any'
gapminder = gapminder.dropna(how='any')

# Show the dimensions of gapminder after dropping
print(gapminder.shape)

## Testing your data with asserts

In [None]:
# use the .all() method together with the .notnull() DataFrame method to check for missing values in a column
# The .all() method returns True if all values are True
# When used on a DataFrame, it returns a Series of Booleans - one for each column in the DataFrame
# So if you are using it on a DataFrame, like in this exercise, you need to chain another .all() method so that you return only one True or False value

# Note: You can use pd.notnull(df) as an alternative to df.notnull()

# Assert that there are no missing values
# pd.notnull(ebola) yields a Boolean DataFrame; the first .all() reduces it to
# one True or False per column, and the second .all() to a single True or False
assert pd.notnull(ebola).all().all()

# Assert that all values are >= 0, using the same double reduction
assert ebola.ge(0).all().all()

In [None]:
# Another example!
# Convert the year column to numeric
gapminder.year = pd.to_numeric(gapminder.year)

# Test if country is of type object
# The builtin object is compared against because the np.object alias has been
# removed from NumPy and raises AttributeError there
assert gapminder.country.dtype == object

# Test if year is of type int64
assert gapminder.year.dtype == np.int64

# Test if life_expectancy is of type float64
assert gapminder.life_expectancy.dtype == np.float64

## Get in the habit of testing your data with asserts

In [None]:
# Write a function to check the following:
# - 'Life expectancy' is the first column (index 0) of the DataFrame
# - The other columns contain either null or numeric values
# - The numeric values are all greater than or equal to 0
# - There is only one instance of each country

def check_null_or_valid(row_data):
    """Check that every non-missing value of a row is non-negative.

    Parameters
    ----------
    row_data : pandas.Series
        One row of data whose values are numeric, numeric strings, or missing.

    Returns
    -------
    pandas.Series
        Boolean Series with one entry per non-missing value; an entry is True
        when that value is greater than or equal to 0.

    Raises
    ------
    ValueError
        If a non-missing value cannot be converted by pd.to_numeric().
    """
    # Every non-missing value is kept; slicing the result (e.g. with [1:-1])
    # would let an invalid first or last value go unchecked
    no_na = row_data.dropna()
    numeric = pd.to_numeric(no_na)
    return numeric >= 0

# Check whether the first column is 'Life expectancy'
assert g1800s.columns[0] == 'Life expectancy'

# Check whether the values in each row are valid
# The first column is excluded with .iloc[:, 1:] before the check is applied row by row
assert g1800s.iloc[:, 1:].apply(check_null_or_valid, axis=1).all().all()

# Check that there is only one instance of each country
# value_counts() sorts the counts in descending order, so the first count is the
# largest; .iloc[0] reads it by position, whereas [0] on a label-indexed Series
# relies on a deprecated positional fallback
assert g1800s['Life expectancy'].value_counts().iloc[0] == 1

## Exploratory data analysis

### Frequency counts for categorical data

In [None]:
# Print the value counts for 'Borough', 'State' and 'Site Fill' in turn
# dropna=False keeps missing values as their own entry in each count
for column in ('Borough', 'State', 'Site Fill'):
    print(df[column].value_counts(dropna=False))

## Visual exploratory data analysis

### Visualizing single variables with histograms

In [None]:
# You'll notice that there are extremely large differences between the min and max values, and the plot will need to be adjusted accordingly
# In such cases, it's good to look at the plot on a log scale
# The keyword arguments logx=True or logy=True can be passed in to .plot() depending on which axis you want to rescale

# Import matplotlib.pyplot
import matplotlib.pyplot as plt

# Draw the histogram of 'Existing Zoning Sqft' with both axes on a log scale
zoning_sqft = df['Existing Zoning Sqft']
zoning_sqft.plot.hist(rot=70, logx=True, logy=True)

# Render the figure
plt.show()

### Visualizing multiple variables with boxplots (!)

In [None]:
# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt

# Draw one box of 'initial_cost' per 'Borough', with the tick labels rotated by 90 degrees
boxplot_options = {'column': 'initial_cost', 'by': 'Borough', 'rot': 90}
df.boxplot(**boxplot_options)

# Render the figure
plt.show()

### Visualizing multiple variables with scatter plots

In [None]:
# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt

# Create and display the same scatter plot twice:
# first for the full DataFrame df, then for df_subset
for frame in (df, df_subset):
    frame.plot(kind='scatter', x='initial_cost', y='total_est_fee', rot=70)
    plt.show()

In [None]:
# Import matplotlib.pyplot
import matplotlib.pyplot as plt

# Scatter the '1800' column against the '1899' column
g1800s.plot.scatter(x='1800', y='1899')

# Label both axes
plt.xlabel('Life Expectancy by Country in 1800')
plt.ylabel('Life Expectancy by Country in 1899')

# Apply the same limits to both axes
axis_limits = (20, 55)
plt.xlim(*axis_limits)
plt.ylim(*axis_limits)

# Render the figure
plt.show()

In [None]:
# Group gapminder: gapminder_agg (mean life_expectancy for each year)
gapminder_agg = gapminder.groupby('year')['life_expectancy'].mean()

# Print the head and the tail of gapminder_agg
print(gapminder_agg.head())
print(gapminder_agg.tail())

# First subplot (top): histogram of life_expectancy
plt.subplot(2, 1, 1)
gapminder.life_expectancy.plot.hist()

# Second subplot (bottom): line plot of life expectancy per year
plt.subplot(2, 1, 2)
gapminder_agg.plot()

# The title and axis labels apply to the second subplot, which is the current one
plt.title('Life expectancy over the years')
plt.ylabel('Life expectancy')
plt.xlabel('Year')

# Display the plots
plt.tight_layout()
plt.show()

# Save the gapminder DataFrame to a csv file
gapminder.to_csv('gapminder.csv')

## Tidy data

### Reshaping your data using melt

In [None]:
# Melting data is the process of turning columns of your data into rows of data
Consider the DataFrames from the previous exercise. 
In the tidy DataFrame, the variables Ozone, Solar.R, Wind, and Temp each had their own column.
If, however, you wanted these variables to be in rows instead, you could melt the DataFrame. 
In doing so, however, you would make the data untidy! 
This is important to keep in mind: Depending on how your data is represented, you will have to reshape it differently.

In this exercise, you will practice melting a DataFrame using pd.melt(). 
There are two parameters you should be aware of: id_vars and value_vars. 
The id_vars represent the columns of the data you do not want to melt (i.e., keep it in its current shape), while the value_vars represent the columns you do wish to melt into rows. 
By default, if no value_vars are provided, all columns not set in the id_vars will be melted. 
This could save a bit of typing, depending on the number of columns that need to be melted.

The (tidy) DataFrame airquality has been pre-loaded. 
Your job is to melt its Ozone, Solar.R, Wind, and Temp columns into rows. 
Later in this chapter, you'll learn how to bring this melted DataFrame back into a tidy form.

In [None]:
# Show the first rows of airquality before reshaping
print(airquality.head())

# Melt airquality: airquality_melt
# 'Month' and 'Day' are kept as identifier columns; since no value_vars are
# given, every other column is melted into rows
airquality_melt = airquality.melt(id_vars=['Month', 'Day'])

# Show the first rows of the melted result
print(airquality_melt.head())

### Customizing melted data

In [None]:
# Show the first rows of airquality before reshaping
print(airquality.head())

# Melt airquality: airquality_melt
# var_name and value_name replace the default names of the two melted columns
airquality_melt = airquality.melt(
    id_vars=['Month', 'Day'],
    var_name='measurement',
    value_name='reading',
)

# Show the first rows of the melted result
print(airquality_melt.head())

### Reshaping your data - Gapminder data example

In [None]:
# Currently, the gapminder DataFrame has a separate column for each year
# What you want instead is a single column that contains the year, and a single column that represents the average life expectancy for each year and country
# By having year in its own column, you can use it as a predictor variable in a later analysis

# You can convert the DataFrame into the desired tidy format by melting it

# Melt gapminder: gapminder_melt
# 'Life expectancy' is kept as the identifier column; all other columns are melted
gapminder_melt = gapminder.melt(id_vars='Life expectancy')

# Rename the three resulting columns
tidy_names = ['country', 'year', 'life_expectancy']
gapminder_melt.columns = tidy_names

# Show the first rows of gapminder_melt
print(gapminder_melt.head())

### Pivoting data

In [None]:
# .pivot_table() has an index parameter which you can use to specify the columns that you don't want pivoted
# It is similar to the id_vars parameter of pd.melt()
# Two other parameters that you have to specify are columns (the name of the column you want to pivot), and values (the values to be used when the column is pivoted)

# Show the first rows of airquality_melt before pivoting
print(airquality_melt.head())

# Pivot airquality_melt: airquality_pivot
# 'Month' and 'Day' stay as the index, each distinct 'measurement' becomes a
# column, and the cells are filled from 'reading'
airquality_pivot = pd.pivot_table(
    airquality_melt,
    index=['Month', 'Day'],
    columns='measurement',
    values='reading',
)

# Show the first rows of airquality_pivot
print(airquality_pivot.head())

### Resetting the index of a DataFrame

In [None]:
# After pivoting airquality_melt in the previous exercise, you didn't quite get back the original DataFrame
# What you got back instead was a pandas DataFrame with a hierarchical index (also known as a MultiIndex)
# In essence, they allow you to group columns or rows by another variable - in this case, by 'Month' as well as 'Day'
# To get back the original DataFrame from the pivoted DataFrame use .reset_index()

# Show the index of airquality_pivot as left by the pivot
pivot_index = airquality_pivot.index
print(pivot_index)

# Reset the index of airquality_pivot: airquality_pivot
airquality_pivot = airquality_pivot.reset_index()

# Show the new index, followed by the first rows
flat_index = airquality_pivot.index
print(flat_index)
print(airquality_pivot.head())

### Pivoting duplicate values

In [None]:
# You can also use pivot tables to deal with duplicate values by providing an aggregation function through the aggfunc parameter
# Let's say your data collection method accidentally duplicated your dataset. 
# Such a dataset, in which each row is duplicated, has been pre-loaded as airquality_dup
# In addition, the airquality_melt DataFrame from the previous exercise has been pre-loaded

# By using .pivot_table() and the aggfunc parameter, you can not only reshape your data, but also remove duplicates!
# Finally, you can then flatten the columns of the pivoted DataFrame using .reset_index()

# Pivot airquality_dup: airquality_pivot
# Duplicated rows that share the same Month, Day and measurement are collapsed
# into a single cell by averaging their readings
# The aggregation is named with the string 'mean' so the cell does not depend
# on numpy being imported; newer pandas versions also warn when given the
# NumPy callable instead
airquality_pivot = airquality_dup.pivot_table(index=['Month', 'Day'], columns='measurement', values='reading', aggfunc='mean')

# Reset the index of airquality_pivot
airquality_pivot = airquality_pivot.reset_index()

# Print the head of airquality_pivot
print(airquality_pivot.head())

# Print the head of airquality for comparison
print(airquality.head())

### Splitting a column with .str

In [None]:
# You're going to tidy the 'm014' column, which represents males aged 0-14 years of age
# In order to parse this value, you need to extract the first letter into a new column for gender, and the rest into a column for age_group
# Here, since you can parse values by position, you can take advantage of pandas' vectorized string slicing by using the str attribute of columns of type object

# Melt tb: tb_melt
# 'country' and 'year' are kept as identifier columns
tb_melt = tb.melt(id_vars=['country', 'year'])

# The melted column names are stored in the 'variable' column
variable_codes = tb_melt['variable']

# Create the 'gender' column from the first character of each code
tb_melt['gender'] = variable_codes.str[0]

# Create the 'age_group' column from the remaining characters
tb_melt['age_group'] = variable_codes.str[1:]

# Show the first rows of tb_melt
print(tb_melt.head())

### Splitting a column with .split() and .get() (!)

In [None]:
# Melt ebola: ebola_melt
ebola_melt = ebola.melt(
    id_vars=['Date', 'Day'],
    var_name='type_country',
    value_name='counts',
)

# Split each 'type_country' value on the underscore and keep the pieces
# as the 'str_split' column
split_parts = ebola_melt['type_country'].str.split('_')
ebola_melt['str_split'] = split_parts

# The first piece becomes 'type' and the second piece becomes 'country'
ebola_melt['type'] = split_parts.str.get(0)
ebola_melt['country'] = split_parts.str.get(1)

# Show the first rows of ebola_melt
print(ebola_melt.head())

## Concatenating data

### Combining rows of data

In [None]:
# Stack uber1, uber2 and uber3 on top of each other: row_concat
uber_parts = [uber1, uber2, uber3]
row_concat = pd.concat(uber_parts)

# Show the dimensions of row_concat, followed by its first rows
print(row_concat.shape)
print(row_concat.head())

### Combining columns of data

In [None]:
# Place ebola_melt and status_country side by side: ebola_tidy
side_by_side = [ebola_melt, status_country]
ebola_tidy = pd.concat(side_by_side, axis='columns')

# Show the dimensions of ebola_tidy, followed by its first rows
print(ebola_tidy.shape)
print(ebola_tidy.head())

## Merging data

In [None]:
# Merge the DataFrames: o2o
# Rows are paired where site['name'] equals visited['site']
o2o = site.merge(visited, left_on='name', right_on='site')

# Show the merged result
print(o2o)

### Many-to-1 data merge

In [None]:
# Merge the DataFrames: m2o (the call is the same as before; only the data differs)
# Rows are paired where site['name'] equals visited['site']
m2o = site.merge(visited, left_on='name', right_on='site')

# Show the merged result
print(m2o)

### Many-to-many data merge

In [None]:
# Merge site and visited on name/site, then merge that result with survey
# on ident/taken: m2m
m2m = (
    site
    .merge(visited, left_on='name', right_on='site')
    .merge(survey, left_on='ident', right_on='taken')
)

# Show the first 20 rows of m2m
print(m2m.head(20))