# 2: Cleaning and Profiling

In [1]:
""" Notebook Setup/Imports
"""

# Library imports (standard library first, then third-party)
from pathlib import Path

import numpy as np
import pandas as pd
import pandas_profiling

# Print strings as markdown
from IPython.display import Markdown

# Project path setup: the project directory is the parent of the directory the
# notebook is run from. Path.cwd() is used directly so the result is always an
# absolute path, instead of resolving a placeholder relative path.
project_dir = Path.cwd().resolve().parent

# Data variables from previous notebook
dataset = pd.read_pickle(project_dir / "data/processed/0_data.pickle")
data_dictionary = pd.read_pickle(project_dir / "data/processed/0_data_dictionary.pickle")

## Data Type Validation & Cleaning

In [2]:
""" Get the count for data types in the dataset
"""
# Per-column dtypes; kept in `d` because a later cell filters on it.
d = dataset.dtypes
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0.
# Counting the dtype names gives the same table; sorting the index keeps the
# rows ordered by dtype name.
d.astype(str).value_counts().sort_index()

bool         4
float64    314
object       9
dtype: int64

In [3]:
""" Find columns as strings or other objects
"""
# Walk the (column name, dtype) pairs and keep the names whose dtype is object.
object_columns = [name for name, column_dtype in d.items() if column_dtype == 'object']
print(object_columns)

['2010 Census Population', 'Population Estimate, 2011', 'Population Estimate, 2012', 'Population Estimate, 2013', 'Population Estimate, 2014', 'Population Estimate, 2015', 'Population Estimate, 2016', 'School Breakfast Program participants FY 2011', 'School Breakfast Program participants, FY 2012']


In [4]:
""" Parse objects to float
"""
for column in object_columns:
    # Skip columns that are no longer object dtype so the cell can be re-run
    # safely; the .str accessor below is only available on string columns.
    if dataset[column].dtype != object:
        continue
    cleaned = (
        dataset[column]
        .str.replace(",", "", regex=False)  # drop thousands separators
        .str.strip()
        # Blank cells and hyphen-only placeholders ("-", "--") become missing
        # values; a leading minus sign on a real number is left intact.
        .replace(r"^[\s-]*$", np.nan, regex=True)
    )
    # Any value that is still not numeric raises here rather than being
    # silently altered.
    dataset[column] = cleaned.astype(float)

In [5]:
""" Check types
"""
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0.
# Counting the dtype names gives the same table, ordered by dtype name.
dataset.dtypes.astype(str).value_counts().sort_index()

bool         4
float64    323
dtype: int64

In [6]:
""" Save the cleaned data
"""
# Build both output locations under the project directory, then write each pickle.
cleaned_data_path = project_dir / "data/processed/1_data.pickle"
cleaned_dictionary_path = project_dir / "data/processed/1_data_dictionary.pickle"

dataset.to_pickle(cleaned_data_path)
data_dictionary.to_pickle(cleaned_dictionary_path)

In [7]:
""" Create the data profile object and save it to an HTML file
"""
profile = pandas_profiling.ProfileReport(dataset, check_recoded=True)
# Anchor the report location on project_dir, like the data paths in this
# notebook, instead of a bare "../" string. str() keeps the argument a plain
# string, as it was before.
profile.to_file(outputfile=str(project_dir / "docs/_static/profile.html"))

## Variable Selection

In [15]:
""" Create a list of rejected variables with > .975 correlation from the profile analysis
"""
# Correlation cutoff passed to the profile when asking for rejected variables.
CORRELATION_THRESHOLD = .975
rejected_variables = profile.get_rejected_variables(CORRELATION_THRESHOLD)

In [16]:
# Display the list of rejected variable names returned by the profile.
rejected_variables

['BERRY_ACRES12',
 'BERRY_ACRESPTH12',
 'CONVS14',
 'Child and Adult Care participants, FY 2012',
 'Child and Adult Care participants, FY 2013',
 'Child and Adult Care participants, FY 2014',
 'Child and Adult Care participants, FY 2015',
 'Child and Adult Care particpants FY 2011',
 'FFR14',
 'FMRKT_BAKED16',
 'FMRKT_OTHERFOOD16',
 'FOOD_TAX14',
 'FRESHVEG_ACRES12',
 'FSR14',
 'GROC14',
 'LACCESS_CHILD15',
 'LACCESS_LOWI15',
 'LACCESS_POP15',
 'LACCESS_SENIORS15',
 'National School Lunch Program participants FY 2011',
 'National School Lunch Program participants, FY 2012',
 'National School Lunch Program participants, FY 2013',
 'National School Lunch Program participants, FY 2014',
 'National School Lunch Program participants, FY 2015',
 'ORCHARD_ACRES12',
 'ORCHARD_FARMS12',
 'PCH_BERRY_ACRESPTH_07_12',
 'PCH_CONVSPTH_09_14',
 'PCH_FFRPTH_09_14',
 'PCH_FMRKTPTH_09_16',
 'PCH_FRESHVEG_ACRESPTH_07_12',
 'PCH_FSRPTH_09_14',
 'PCH_GHVEG_SQFTPTH_07_12',
 'PCH_GROCPTH_09_14',
 'PCH_LACCES