# 2: Cleaning and Profiling

In [6]:
""" Notebook Setup/Imports
"""

# Project path setup
from pathlib import Path
# The project root is taken to be the parent of the kernel's working directory
# (assumes the notebook is run from a direct sub-folder of the project — TODO confirm).
# Anchoring on Path.cwd() says this explicitly instead of resolving a
# non-existent file named after `__name__` and climbing two levels.
project_dir = Path.cwd().resolve().parent

# Library import
import plotly
import pandas as pd
import numpy as np
import pandas_profiling
import plotly.figure_factory as ff

# Print strings as markdown
from IPython.display import Markdown

# Reload the dataset and its data dictionary pickled by the previous notebook
dataset_path = project_dir / "data/processed/0_data.pickle"
dictionary_path = project_dir / "data/processed/0_data_dictionary.pickle"

dataset = pd.read_pickle(dataset_path)
data_dictionary = pd.read_pickle(dictionary_path)

## Data Type Validation & Cleaning

In [7]:
""" Get the count for data types in the dataset
"""
# Per-column dtypes; kept in `d` because the next cell filters on it.
d = dataset.dtypes
# `DataFrame.get_dtype_counts()` is deprecated and was removed in pandas 1.0;
# counting the values of `.dtypes` yields the same per-dtype tally.
d.value_counts()

bool         4
float64    314
object       9
dtype: int64

In [8]:
""" Find columns as strings or other objects
"""
# Keep the name of every column whose dtype compares equal to 'object'
object_columns = [name for name, dtype in d.items() if dtype == 'object']
print(object_columns)

['2010 Census Population', 'Population Estimate, 2011', 'Population Estimate, 2012', 'Population Estimate, 2013', 'Population Estimate, 2014', 'Population Estimate, 2015', 'Population Estimate, 2016', 'School Breakfast Program participants FY 2011', 'School Breakfast Program participants, FY 2012']


In [9]:
""" Parse objects to float
"""
for column in object_columns:
    values = dataset[column]

    # Re-running this cell must not fail: once a column has been converted it
    # no longer supports the `.str` accessor, so leave numeric columns alone.
    if pd.api.types.is_numeric_dtype(values):
        continue

    # Strip thousands separators and hyphens, then surrounding whitespace.
    # `regex=False` pins literal matching regardless of the pandas default.
    # Note: every hyphen is removed, so a lone "-" becomes an empty string
    # (presumably a missing-value marker — verify against the raw data) and a
    # leading minus sign would be dropped as well.
    stripped = (
        values.str.replace(",", "", regex=False)
        .str.replace("-", "", regex=False)
        .str.strip()
    )

    # Empty strings cannot be cast to float; turn them into NaN first.
    dataset[column] = stripped.mask(stripped == "").astype(float)

In [10]:
""" Check types
"""
# `DataFrame.get_dtype_counts()` is deprecated and was removed in pandas 1.0;
# `dtypes.value_counts()` reports the same per-dtype column counts.
dataset.dtypes.value_counts()

bool         4
float64    323
dtype: int64

In [None]:
""" Save the cleaned data
"""
# Persist the cleaned dataset and the data dictionary
cleaned_dataset_path = project_dir / "data/processed/1_data.pickle"
cleaned_dictionary_path = project_dir / "data/processed/1_data_dictionary.pickle"

dataset.to_pickle(cleaned_dataset_path)
data_dictionary.to_pickle(cleaned_dictionary_path)

## Data Profiling
[View externally](_static/profile.html)

In [19]:
""" Create the data profile object and save it to an HTML file
"""
profile = pandas_profiling.ProfileReport(dataset, check_recoded=True)

# Anchor the report location on `project_dir`, like the data files in this
# notebook, rather than on a hard-coded "../" relative path.
profile_path = project_dir / "docs/_static/profile.html"
# Writing fails if the target folder is missing, so create it when needed.
profile_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): newer pandas-profiling releases appear to name this keyword
# `output_file` — confirm against the installed version.
profile.to_file(outputfile=str(profile_path))

In [30]:
# Embed the generated profile report in the notebook output
report_html = profile.to_html()
Markdown(report_html)

