# Test Data Cleaning Class

Unit tests for the `DataCleaning` class defined in `Data_Cleaning_Class.py`.

#### Setup

In [1]:
# Imports for testing
import unittest
import pandas.testing as pd_testing # Using assert_frame_equal() for unit tests

In [2]:
# Mount Google Drive (Colab only) so that files under /content/drive are accessible
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Import class with functions for data cleaning
# Add the project's code folder on Drive to the module search path so that
# Data_Cleaning_Class.py can be imported
import sys
sys.path.append("/content/drive/My Drive/fe-eda/Python Code/")

# NOTE: the wildcard import is relied on for more than DataCleaning -- the tests
# below also use `pd` and `np`, which are not imported anywhere else in this
# notebook (presumably they are re-exported by Data_Cleaning_Class; verify)
from Data_Cleaning_Class import *

#### Create Testing Class with Unit Tests

Includes unit tests for each function, plus tests for potential edge cases (e.g. an empty string or a missing (NaN) value in the State column).

In [6]:
# Create class that inherits from unittest.TestCase
class DataCleaning_Test(unittest.TestCase):
  """Unit tests for the DataCleaning class.

  Most tests run against a small long-format sample (one row per
  FIPS / Variable_Code pair) containing two state-level FIPS codes (1, 20),
  one real county (1001) and one made-up county (99999). Some State and
  County values deliberately carry trailing whitespace or a ' County'
  suffix so that the cleaning steps have something to fix.
  """

  def setUp(self):
    """Create a fresh DataCleaning instance and build its FIPS lookup table."""
    # Small dataset to use for testing
    sample_df = pd.DataFrame({'FIPS':[1,1,20,20,1001,1001,99999, 99999],
                              'State': ['AL','AL','KS','KS ', 'AL', 'AL ', 'ZZ', 'ZZ'],
                              'County': ['Total','Total','Total','Total','Autauga','Autauga County','Fake','Fake'],
                              'Variable_Code':['Var1', 'Var2', 'Var1', 'Var2', 'Var1', 'Var2', 'Var1', 'Var2'],
                              'Value': [5,np.nan,10,np.nan,np.nan,20,np.nan,15]})
    # Create instance of class using sample data
    self.clean1 = DataCleaning(sample_df)

    # Perform webscraping
    # NOTE(review): setUp runs before every test, so the lookup table is
    # scraped once per test -- presumably the main reason the suite is slow.
    # Consider caching the table if DataCleaning allows it.
    self.clean1.webscrape_fips_lookup()

  def _reformat_and_split(self):
    """Pivot the sample data, then split it into state and county frames."""
    self.clean1.reformat_data()
    self.clean1.split_state_county_data()

  def test_clean_state_col(self):
    """Whitespace is removed from the State column."""
    # Clean state column
    self.clean1.clean_state_column()
    # Test: using assertEqual() method
    self.assertEqual(list(self.clean1.df.State), ['AL','AL','KS','KS', 'AL', 'AL', 'ZZ','ZZ'])

  def test_state_name_col(self):
    """After full cleaning, the state code for FIPS 20 is 'KS'."""
    self.clean1.full_data_cleaning()
    df = self.clean1.df_state
    result = df[df['FIPS'] == 20].State.to_string(index=False)
    # Test: using assertEqual() method
    # strip() removes any padding that to_string() puts around the value
    self.assertEqual(result.strip(), 'KS')

  def test_county_name_col(self):
    """After full cleaning, the county name for FIPS 1001 is 'Autauga'."""
    self.clean1.full_data_cleaning()
    df = self.clean1.df_county
    result = df[df['FIPS'] == 1001].County.to_string(index=False)
    # Test: using assertEqual() method
    # strip() removes any padding that to_string() puts around the value
    self.assertEqual(result.strip(), 'Autauga')

  def test_clean_state_col_handle_empty_string(self):
    """An empty-string state stays an empty string after cleaning."""
    # Create test dataframe with empty string for state
    empty_state_df = pd.DataFrame({'State':['']})
    # Create instance of class with this test data
    empty_state_clean = DataCleaning(empty_state_df)
    # Clean state column
    empty_state_clean.clean_state_column()
    # Test: using assertEqual() method
    self.assertEqual(list(empty_state_clean.df.State), [''])

  def test_clean_state_col_handle_nan(self):
    """A missing (np.nan) state is converted to an empty string."""
    # Create test dataframe with nan for state
    nan_state_df = pd.DataFrame({'State':[np.nan]})
    # Create instance of class with this test data
    nan_state_clean = DataCleaning(nan_state_df)
    # Clean state column
    nan_state_clean.clean_state_column()
    # Test: using assertEqual() method
    # Should change any nan states to empty string
    self.assertEqual(list(nan_state_clean.df.State), [''])

  def test_webscraping_returns_expected_columns(self):
    """The scraped FIPS lookup table has exactly the expected columns."""
    # Test: using assertEqual() method
    self.assertEqual(list(self.clean1.fips_table.columns), ['FIPS', 'County', 'State'])

  def test_webscraped_fips_codes_are_ints(self):
    """FIPS codes in the lookup table are stored with an integer dtype."""
    # Comparing the dtype to the builtin `int` is platform dependent (the
    # default NumPy integer width differs between platforms and NumPy
    # versions), so check for "any integer dtype" instead.
    self.assertTrue(pd.api.types.is_integer_dtype(self.clean1.fips_table.FIPS))

  def test_webscraping_returns_correct_info_for_example(self):
    """The lookup table row for FIPS 1001 is Autauga, AL."""
    # Test: using assert_frame_equal() method from pandas.testing to check if dataframe is what we expect
    expected = pd.DataFrame({'FIPS':[1001],
                             'County': ['Autauga'],
                             'State': ['AL']})
    actual = self.clean1.fips_table[self.clean1.fips_table['FIPS']==1001]
    # Drop the original row label so the check does not depend on where the
    # row happens to sit in the lookup table
    pd_testing.assert_frame_equal(actual.reset_index(drop=True), expected)

  def test_are_missing_fips_added(self):
    """A FIPS code absent from the lookup table is added from our data."""
    # Add missing fips
    self.clean1.add_missing_fips()
    # Test: is fake 99999 code added?
    expected = pd.DataFrame({'FIPS':[99999],
                             'County': ['Fake'],
                             'State': ['ZZ']})
    actual = self.clean1.fips_table[self.clean1.fips_table['FIPS']==99999]
    pd_testing.assert_frame_equal(actual.reset_index(drop=True), expected)

  def test_are_all_fips_in_lookup(self):
    """Every FIPS code in the data is in the lookup table once missing codes are added."""
    # Add missing fips
    self.clean1.add_missing_fips()
    # Compare as sets: fast membership checks, and a failure lists the
    # offending codes instead of just "False is not true"
    original_fips = set(self.clean1.df.FIPS.to_list())
    lookup_fips = set(self.clean1.fips_table.FIPS.to_list())
    missing = original_fips - lookup_fips
    self.assertEqual(missing, set(), 'FIPS codes missing from the lookup table')

  def test_are_cols_correct_after_reformat(self):
    """Reformatting yields one column per variable code."""
    # Reformat sample data
    self.clean1.reformat_data()
    # Are the new columns what we expect?
    self.assertEqual(list(self.clean1.pivot.columns), ['FIPS', 'State', 'County', 'Var1', 'Var2'])

  def test_are_num_rows_correct_after_reformat(self):
    """Reformatting yields one row per unique FIPS code."""
    # Reformat sample data
    self.clean1.reformat_data()
    # Should have same number of rows as unique FIPS codes
    self.assertEqual(self.clean1.pivot.shape[0], self.clean1.df.FIPS.nunique())

  def test_are_county_fips_split_out(self):
    """All county-level FIPS codes end up in df_county."""
    self._reformat_and_split()
    self.assertEqual(self.clean1.df_county.FIPS.to_list(), [1001, 99999])

  def test_are_state_fips_split_out(self):
    """All state-level FIPS codes end up in df_state."""
    self._reformat_and_split()
    self.assertEqual(self.clean1.df_state.FIPS.to_list(), [1, 20])

  def test_are_cols_with_all_missing_droped(self):
    """Columns that are entirely missing for counties are dropped from df_county."""
    self._reformat_and_split()
    # Var1 only has values for state-level FIPS codes in the sample data, so
    # it is all-NaN at county level and should be dropped from df_county
    self.assertEqual(list(self.clean1.df_county.columns), ['FIPS', 'State', 'County', 'Var2'])

#### Execute Tests

In [7]:
# Run the tests inside the notebook. The usual script entry point
# (`if __name__ == '__main__': unittest.main()`) is not used here:
#   argv=['']  stops unittest from parsing the notebook kernel's command-line arguments
#   exit=False stops unittest from calling sys.exit() when the run finishes
unittest.main(argv=[''],exit=False) 

...............
----------------------------------------------------------------------
Ran 15 tests in 198.445s

OK


<unittest.main.TestProgram at 0x7f25bfc54050>