### Data Cleaning

In [None]:
# Data cleaning means fixing bad data in your data set, such as:
# empty cells, data in the wrong format, wrong data, and duplicates.

### Empty Cells - Remove Rows

In [None]:
# The simplest fix for empty cells: throw away every row that has one.

import pandas as pd

# Variant 1 - keep the original untouched.
# dropna() builds and returns a cleaned copy; `df` itself still holds the
# rows with empty cells afterwards.
df = pd.read_csv('data.csv')
new_df = df.dropna(axis=0, how='any')

print(new_df.to_string())

# Variant 2 - clean the original itself.
# Reload the file so we start from the raw data again, then pass
# inplace=True so the rows are removed from `df` directly.
df = pd.read_csv('data.csv')
df.dropna(axis=0, how='any', inplace=True)

print(df.to_string())

# With inplace=True nothing new is returned: every row containing a null
# value is removed from the original dataframe.

### Replace Empty Values

In [None]:
# The fillna() method allows us to replace empty cells with a value.

# Replace null values with the number 130.
import pandas as pd

# Read in the csv file.
df = pd.read_csv('data.csv')

# Fill every null value in the dataframe with 130.
df.fillna(130, inplace=True)

# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
df.info()

# Every column should now report the same non-null count, since no empty
# cells remain (169 per column for the data set this note was written
# against - the exact number depends on data.csv).

### Replace Only for Specified Columns

In [None]:
# The example above filled empty cells across the whole dataframe.
# To restrict the replacement to a single column, key the fill value by
# that COLUMN NAME.

# Replace null values in the CALORIES column with the number 130.
import pandas as pd

# Load the data set.
df = pd.read_csv('data.csv')

# Only Calories is named in the mapping, so only Calories is filled.
df.fillna(value={'Calories': 130}, inplace=True)

print(df.loc[:, 'Calories'])

### Replace Using Mean, Median and Mode

In [None]:
# A common way to replace empty cells is to fill them with the mean, median
# or mode value of the column.

# Calculate the MEAN, and replace any empty values with it.
import pandas as pd

# Read in the file.
df = pd.read_csv('data.csv')

# Mean of the Calories column (by default pandas skips empty cells when
# averaging).
x = df['Calories'].mean()

# Fill the empty Calories cells with the computed mean itself rather than a
# hand-copied, rounded number, so the result stays correct if data.csv
# changes.
df.fillna({'Calories': x}, inplace=True)

print(df.to_string())

In [None]:
# Calculate the MEDIAN and replace empty values with it.
# The median is the middle value once all values are sorted.
import pandas as pd

# Read in the file.
df = pd.read_csv('data.csv')

# Median of the Calories column; printed so the fill value is visible.
y = df['Calories'].median()
print(y)

# Fill the empty Calories cells with the computed median instead of a
# hard-coded copy of it, so the two can never drift apart.
df.fillna({'Calories': y}, inplace=True)

print(df.to_string())

In [None]:
# Calculate the MODE and replace any empty values with it.
# The mode is the value that appears most frequently.
import pandas as pd

# Read in the file.
df = pd.read_csv('data.csv')

# mode() returns a Series because several values can tie for most frequent;
# [0] takes the first entry of that result.
z = df['Calories'].mode()[0]
print(z)

# Fill the empty Calories cells with the computed mode instead of a
# hard-coded copy of it, so the two can never drift apart.
df.fillna({'Calories': z}, inplace=True)
print(df.to_string())