# Data Cleaning

In [None]:
import pandas as pd

In [None]:
# Load the raw data set into a DataFrame.
df = pd.read_csv('data.csv')

# dropna() builds and returns a NEW DataFrame without the rows that contain
# empty cells; the original `df` is left unchanged.
new_df = df.dropna(axis="index", how="any")

# to_string() prints the entire frame rather than a truncated preview.
print(new_df.to_string())

In [None]:
# To modify the original DataFrame instead of getting a copy back, pass
# inplace=True.
#
# dropna(inplace=True) does NOT return a new DataFrame; it removes every row
# containing NULL values from the original DataFrame itself.
df.dropna(axis="index", how="any", inplace=True)

print(df.to_string())

In [None]:
# Replace every empty cell in the DataFrame, whatever its column, with 130.
# inplace=True modifies `df` itself instead of returning a filled copy.
df.fillna(value=130, inplace=True)

print(df)

In [None]:
# Replace Only For a Specified Column
#
# Assign the filled column back to the DataFrame. Calling
# df["Calories"].fillna(..., inplace=True) operates on an intermediate
# Series: pandas 2.x emits a FutureWarning for it, and with Copy-on-Write
# enabled the original DataFrame is never updated.

df["Calories"] = df["Calories"].fillna(130)

In [None]:
# Replace Using Mean, Median, or Mode
#
# The three strategies below are alternatives: once the first one has filled
# the empty cells of "Calories", the later fills find nothing left to replace.
#
# Each result is assigned back to the column rather than using
# df["Calories"].fillna(..., inplace=True). That chained in-place form acts on
# an intermediate Series: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled the original DataFrame is never updated.

# Mean: the average of the non-empty values.
x = df["Calories"].mean()
df["Calories"] = df["Calories"].fillna(x)

# Median: the middle value once the non-empty values are sorted.
med = df["Calories"].median()
df["Calories"] = df["Calories"].fillna(med)

# Mode: the most frequent value. mode() returns a Series because there can be
# ties, so [0] selects the first entry.
mod = df["Calories"].mode()[0]
df["Calories"] = df["Calories"].fillna(mod)

## Cleaning Data of Wrong Format

In [None]:
# Load the data set and convert the "Date" column to pandas datetimes in a
# single method chain. Values that are empty in the file become NaT.
# NOTE(review): if the column mixes several date formats, newer pandas
# versions may reject the conversion unless a format is given -- confirm
# against the actual contents of dirtydata.csv.
df = pd.read_csv("dirtydata.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

print(df)

In [None]:
# Remove, from the original DataFrame, every row whose "Date" cell is empty.
# Empty cells in the other columns are ignored because of `subset`.
df.dropna(axis="index", subset=["Date"], inplace=True)

## Fixing Wrong Data

In [None]:
# For small data sets
# A wrong value can be corrected one cell at a time: .loc addresses the cell
# by row label (7) and column name ("Duration") and overwrites it with 45.

df.loc[7, 'Duration'] = 45
print(df)

In [None]:
# For larger datasets
# Apply a rule to the whole column at once: every "Duration" above 120 is
# capped at 120. The boolean mask selects the offending rows and a single
# .loc assignment rewrites them, which avoids a slow per-row Python loop and
# also works when index labels are not unique. Empty (NaN) durations compare
# as False and are left untouched.
df.loc[df["Duration"] > 120, "Duration"] = 120

In [None]:
# Removing Rows
# Drop every row whose "Duration" is above 120 from the original DataFrame.
# The labels of the offending rows are collected with a boolean mask and
# removed in one drop() call instead of one call per row. Empty (NaN)
# durations compare as False, so those rows are kept.
# NOTE: drop() works by label, so if index labels are duplicated every row
# sharing a selected label is removed.
df.drop(df.index[df["Duration"] > 120], inplace=True)

## Removing Duplicates

In [None]:
# Duplicate rows can be found with the duplicated() method.
#
# duplicated() returns one Boolean value per row: True when the row repeats
# an earlier row, False otherwise (the first occurrence is not flagged).
print(df.duplicated(keep="first"))

In [None]:
# Removing Duplicates
# drop_duplicates() removes repeated rows, keeping the first occurrence of
# each; inplace=True applies the removal to the original DataFrame.
df.drop_duplicates(keep="first", inplace=True)

print(df)