In [None]:
"""
countyAnalysis class used for county level analysis of food environment dataset

"""
import pandas as pd
import numpy as np
import random

# Define a class for county level analysis
class countyAnalysis:
    """Helper for county-level exploration of the food environment dataset.

    The constructor stores a copy of the county dataframe together with an
    optional target variable and an optional list of variables. The methods
    filter or clean that copy and save their results as attributes:
    pct_missing, corrs_with_var, state_zero_var_cols, list_recent_cols,
    list_non_recent_cols and average_by.
    """

    def __init__(self, df_county, target_var=None, var_list=None):
        '''
        Class constructor
        df_county: county level dataframe (pre-formatted and cleaned)
        target_var: optional, specify variable of interest for future analysis
        var_list: optional, specific list of variables you're interested in exploring (i.e. their relation to target)
        '''
        # Copy so that changes made inside the class don't change the caller's dataframe
        self.df_county = df_county.copy()
        self.target_var = target_var
        self.var_list = var_list

    def select_state(self, state):
        '''
        Filter the dataframe to counties from a specific state.
        state: value to match against the 'State' column
        '''
        # .copy() makes the filtered frame independent, so the later in-place
        # dropna / column assignments don't operate on a slice of the old frame
        # (which triggers pandas chained-assignment warnings).
        self.df_county = self.df_county[self.df_county['State'] == state].copy()

    def calculate_na_summary(self):
        '''
        Store the percent of missing values per column in self.pct_missing
        and print the 10 columns with the highest percent missing.
        '''
        # Calculate percent missing
        self.pct_missing = self.df_county.isnull().sum() * 100 / len(self.df_county)
        # Find variables with largest % missing
        print('Missing values summary: ')
        print('The 10 variables with the highest percent missing are: ')
        print(self.pct_missing.nlargest(10))

    def remove_missing_threshold_cols(self, threshold):
        '''
        Drop columns whose fraction of missing values is greater than threshold.
        threshold: fraction between 0 and 1 (e.g. 0.8 for 80%)
        '''
        print(f"Removing columns with greater than {(threshold)*100}% missing")
        # dropna's thresh is the minimum number of non-missing observations a
        # column needs to be kept, so convert the missing fraction to a row count
        min_non_missing = self.df_county.shape[0] * (1 - threshold)
        self.df_county.dropna(thresh=min_non_missing, axis=1, inplace=True)

    def remove_missing_threshold_rows(self, threshold):
        '''
        Drop rows (counties) whose fraction of missing values is greater than threshold.
        threshold: fraction between 0 and 1 (e.g. 0.25 for 25%)
        '''
        print(f"Removing rows (counties) with greater than {(threshold)*100}% missing")
        # dropna's thresh is the minimum number of non-missing observations a
        # row needs to be kept, so convert the missing fraction to a column count
        min_non_missing = self.df_county.shape[1] * (1 - threshold)
        self.df_county.dropna(thresh=min_non_missing, axis=0, inplace=True)

    def calculation_correlations_with_variable(self, num_pos_corr=10, num_neg_corr=10, display_all=False):
        '''
        Calculate the correlation of each variable with the target variable,
        store the result in self.corrs_with_var and print a summary.
        num_pos_corr: number of top positive correlations to display. Default 10
        num_neg_corr: number of top negative correlations to display. Default 10
        display_all: (default false), but if true will display all correlations (not just top pos/neg)
        '''
        # todo: option to remove high correlations

        # If haven't defined variable of interest, prompt user to select
        if self.target_var is None:
            self.select_target_var()
        target = self.df_county[self.target_var]

        if self.var_list is not None:
            # If provided a list of variables only calculate correlations for those
            candidates = self.df_county[self.var_list]
        else:
            # Otherwise use all other numeric columns
            numeric_cols = self.df_county.select_dtypes(include=np.number).columns.tolist()
            candidates = self.df_county[numeric_cols].drop([self.target_var], axis=1)
        self.corrs_with_var = candidates.apply(lambda x: x.corr(target))

        if not display_all:
            # Display top correlations
            print(f'Top {num_pos_corr} largest (positives) correlations with {self.target_var}: ')
            print(self.corrs_with_var.nlargest(num_pos_corr))
            print(f'Top {num_neg_corr} smallest (negative) correlations {self.target_var}: ')
            print(self.corrs_with_var.nsmallest(num_neg_corr))
        else:
            print(f"Top correlations with {self.target_var}:")
            print(self.corrs_with_var.sort_values())

    def find_zero_variance_state_cols(self, drop=False):
        '''
        Identify columns that are really at a state level (i.e. all counties
        within the state have the same value) and store their names in
        self.state_zero_var_cols.
        drop: if true, also drop those columns from the dataframe

        Note: the within-state standard deviation is NaN when a state has fewer
        than two non-missing values for a column. sum() skips NaN, so a column
        is also flagged when every state's standard deviation is 0 or NaN.
        '''
        # Only numeric columns have a standard deviation; passing string columns
        # such as 'County' to std() raises on current pandas versions.
        numeric = self.df_county.select_dtypes(include=np.number)
        # The grouping column itself is never a candidate
        numeric = numeric.drop(columns=['State'], errors='ignore')
        state_stdev = numeric.groupby(self.df_county['State']).std()

        self.state_zero_var_cols = [
            col for col in state_stdev.columns if state_stdev[col].sum() == 0
        ]

        # If desired, drop columns that are really state level
        if drop:
            self.df_county.drop(self.state_zero_var_cols, axis=1, inplace=True)

    def find_most_recent_data(self, drop=False):
        '''
        Some of the columns in this dataset measure the same thing but for different years.
        For example: 'LACCESS_POP10' and 'LACCESS_POP15'.
        For some analysis we may be interested in both of these columns while for others
        we may only want to consider the most recent year.
        This function finds the columns that contain data for multiple years,
        stores them in self.list_recent_cols / self.list_non_recent_cols,
        and gives the option of dropping any that are not the most recent.
        drop: if true, drop the non-recent columns from the dataframe
        '''
        # Map each column name stub (name minus its last two characters) to the
        # (suffix, year) pairs found for it, in column order.
        stub_years = {}
        for col in self.df_county.columns:
            if not isinstance(col, str):
                continue  # Only string column names can carry a year suffix
            stub, suffix = col[:-2], col[-2:]
            try:
                year = int(suffix)
            except ValueError:
                # Last two characters are not a year (e.g. 'State'), so this
                # column cannot be part of a multi-year group
                continue
            stub_years.setdefault(stub, []).append((suffix, year))

        # Split each multi-year group into the latest column and the older ones
        self.list_recent_cols = []
        self.list_non_recent_cols = []
        for stub, entries in stub_years.items():
            if len(entries) < 2:
                continue  # Only one year available for this stub
            latest = max(year for _, year in entries)
            for suffix, year in entries:
                if year == latest:
                    self.list_recent_cols.append(f'{stub}{suffix}')
                else:
                    self.list_non_recent_cols.append(f'{stub}{suffix}')

        # Keep only the most recent data points (if desired)
        if drop:
            self.df_county.drop(self.list_non_recent_cols, axis=1, inplace=True)

    def select_variables_to_analyze(self, n):
        '''
        Randomly choose n columns (excluding FIPS, State and County) and store them in self.var_list.
        n: number of variables to select
        '''
        print(f"Randomly selecting {n} variables for analysis...")
        all_vars = [var for var in self.df_county.columns if var not in ["FIPS", "State", "County"]]
        self.var_list = random.sample(all_vars, n)
        print(f"Selected variables: {self.var_list}")

    def select_target_var(self):
        '''
        Prompt the user for the variable of interest and store it in self.target_var.
        Falls back to "LACCESS_POP15" when the entered name is not a column.
        '''
        print("Please select a variable of interest: ")
        self.target_var = input()
        if self.target_var not in self.df_county.columns:
            print("This variable is not in this dataset.")
            print("Will be using default (LACCESS_POP15) until a valid variable is chosen.")
            self.target_var = "LACCESS_POP15"

    def append_region(self, data_path):
        '''
        Join the region lookup onto the county dataframe by 'State'.
        data_path: directory containing 'State and Region.csv'
                   (with or without a trailing path separator; '' means the current directory)
        '''
        import os  # Local import so this method does not rely on the notebook's import cell

        # Read in region data
        df_region = pd.read_csv(os.path.join(data_path, 'State and Region.csv'))
        # Join region data to df_county (inner join: counties whose state is not in the lookup are dropped)
        self.df_county = pd.merge(self.df_county, df_region, how='inner', on='State')

    def average_by_category(self, by_col, new_var_list=None):
        '''
        Calculate average value of variables by another column.
        by_col: column to group by
        new_var_list: optional list of variables to average (takes priority over self.var_list)
        Returns the resulting dataframe (also stored in self.average_by).
        '''
        grouped = self.df_county.groupby(by_col, as_index=False)
        if new_var_list is not None:  # If provided a variable list for this function use it
            self.average_by = grouped[new_var_list].mean()
        elif self.var_list is not None:  # Otherwise use var_list for the class
            self.average_by = grouped[self.var_list].mean()
        else:
            # Otherwise calculate for all numeric variables; string columns such
            # as 'County' have no mean and raise on current pandas versions.
            self.average_by = grouped.mean(numeric_only=True)

        return self.average_by

    def labeled_categorical_cols(self):
        '''
        Add readable label columns for the 0/1 coded METRO13 and PERPOV10 variables.
        '''
        # Create column that labels 0/1 in METRO13 variable (anything other than 0 is labeled "Metro")
        self.df_county['Metro'] = np.where(self.df_county['METRO13'] == 0, "Non-metro", "Metro")
        # Create column that labels 0/1 in PERPOV10 variable (anything other than 1 is labeled "Other")
        self.df_county['Persistent_Poverty'] = np.where(self.df_county['PERPOV10'] == 1, "Persistent-Poverty", "Other")


'''
Other ideas:
- return county with largest/smallest value for a specified variable of interest
- more plotting/visualizing
- exploring/visualizing across years (2012/2017) - like side by side graphs grace had

Other to do:
- Build in a check that target_var and every entry of var_list are actual column names; consider fuzzy matching to map user input to the closest column name
'''

'\nOther ideas:\n- return county with largest/smallest value for a specified variable of interest\n- more plotting/visualizing\n- exploring/visualizing across years (2012/2017) - like side by side graphs grace had\n\nOther to do:\n- Build in check for whether or not variables are in columns (for target_var and var_list)....maybe do some fuzzy matching to match user input to variable column\n'

In [None]:
# Import libraries
import os
import tempfile
import unittest
from unittest import mock

import numpy as np
import pandas as pd
import pandas.testing as pd_testing # Using assert_frame_equal() for unit tests
import requests
from bs4 import BeautifulSoup

In [None]:
# Testing class that inherits from unittest.
# The tests are self-contained: user input is simulated by patching input(),
# and the region lookup file is written to a temporary directory, so no manual
# typing or machine-specific data_path is needed.
class countyAnalysis_Test(unittest.TestCase):
    """Unit tests for the countyAnalysis class."""

    def setUp(self):
        # Create a small dataframe for testing (7 counties, 17 columns)
        df_county = pd.DataFrame({'State': ['CA', 'VA', 'VA', 'MA', 'NY', 'TX', 'TN'],
                                 'County': ['Autauga', 'Baldwin', 'Orange', 'King George', 'Stafford', 'Los Angeles', 'Carroll'],
                                 '2010_Census_Population': [1928344.0, 1928345.0, 2734.0, 58392.0, 182394.0, 39328.0, 3827.0],
                                 'VLFOODSEC_12_14': [7.2, 7.2, 7.2, 7.2, 7.2, 7.2, 7.2],
                                 'BERRY_ACRES07': [np.nan, 81.0, np.nan, 79.0, np.nan, 28.0, np.nan],
                                 'variable1': [1.2, 2.6, np.nan, 3.6, 7.8, 9.1, 8.1],
                                 'LACCESS_POP15': [1400000.0, 5434000.0, 320000.0, 1230000.0, np.nan, np.nan, 304000.0],
                                 'AGRITRSM_OPS07': [10.0, 16.0, 32.0, 6.0, 8.0, 10.0, 5.0],
                                 'AGRITRSM_OPS12': [7.1, 7.1, 7.1, 7.1, 7.1, 7.1, 7.1],
                                 'AGRITRSM_RCT12': [123943.0, 23954.0, 239234.0, 1234593.0, 98734.0, 93456.0, 92347.0],
                                 'BERRY_ACRES12': [12.2, 5.3, 7.8, 0.1, 39.4, 3.4, 10.2],
                                 'VEG_ACRESPTH07': [34.0, 51.0, 11.0, 10.0, 67.0, 45.0, 91.0],
                                 'VEG_ACRESPTH12': [1.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                                 'VEG_FARMS07': [np.nan, 2.0, 10.0, 5.0, 4.3, np.nan, np.nan],
                                 'VEG_FARMS12': [234.0, 2358.0, 5734.0, 9485.0, 983.0, np.nan, np.nan],
                                 'METRO13': [0, 1, 0, 1, 0, 1, 0],
                                 'PERPOV10': [0, 1, 0, 1, 0 , 1, 0]})
        self.analysis1 = countyAnalysis(df_county)

    def _write_region_csv(self):
        # Write a small 'State and Region.csv' lookup into a temporary directory
        # and return that directory as a data_path ending in a path separator
        # (append_region builds the file name from data_path).
        tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(tmp_dir.cleanup)
        df_region = pd.DataFrame({'State': ['CA', 'VA', 'MA', 'NY', 'TX', 'TN'],
                                  'Region': ['West', 'South', 'Northeast', 'Northeast', 'South', 'South']})
        df_region.to_csv(os.path.join(tmp_dir.name, 'State and Region.csv'), index=False)
        return tmp_dir.name + os.sep

    def test_select_state(self):
        # Does it grab the right state
        self.analysis1.select_state('VA')
        # Both VA counties (and only those) should remain
        self.assertEqual(len(self.analysis1.df_county), 2)
        self.assertEqual(self.analysis1.df_county['State'].iloc[0], 'VA')

    def test_select_state_not_equal(self):
        # Does it not grab a state if it is not selected
        self.analysis1.select_state('VA')
        # Should return 'VA' not 'NJ'
        self.assertNotEqual(self.analysis1.df_county['State'].iloc[0], 'NJ')

    def test_pct_missing(self):
        # Test if na percentage is calculated correctly
        self.analysis1.calculate_na_summary()
        # VEG_ACRESPTH12 is missing in 6 of 7 rows -> 85.71428571428571 percent
        self.assertAlmostEqual(self.analysis1.pct_missing.nlargest(1).iloc[0], 85.71428571428571)

    def test_calculate_na_summary(self):
        # Test that a top-10 summary can be taken from pct_missing
        self.analysis1.calculate_na_summary()
        # There are 17 columns, so nlargest(10) should return 10 items
        self.assertEqual(len(self.analysis1.pct_missing.nlargest(10)), 10)

    def test_remove_missing_threshold_cols(self):
        # Test if it removes columns with more than the given fraction missing
        self.analysis1.remove_missing_threshold_cols(0.8)
        # Should only remove 1 column (VEG_ACRESPTH12), leaving 16
        self.assertEqual(len(self.analysis1.df_county.columns), 16)

    def test_remove_missing_threshold_cols_no_cols(self):
        # Test if the threshold is too high none will be dropped
        self.analysis1.remove_missing_threshold_cols(0.9)
        # Shouldn't remove any columns
        self.assertEqual(len(self.analysis1.df_county.columns), 17)

    def test_remove_missing_threshold_rows(self):
        # Test if it removes rows with more than the given fraction missing
        self.analysis1.remove_missing_threshold_rows(0.2)
        # The TX and TN rows are missing 4 of 17 values (~23.5%), so 5 rows remain
        self.assertEqual(len(self.analysis1.df_county), 5)

    def test_remove_missing_threshold_rows_no_rows(self):
        # Test if the threshold is too high none will be dropped
        self.analysis1.remove_missing_threshold_rows(0.3)
        # No row is more than 30% missing, so shouldn't remove any rows
        self.assertEqual(len(self.analysis1.df_county), 7)

    def test_target_variable(self):
        # Test if the target variable can be set
        target_var = '2010_Census_Population'
        self.analysis1.target_var = target_var
        # target_var should be 2010_Census_Population
        self.assertEqual(self.analysis1.target_var, '2010_Census_Population')

    def test_calculation_correlation_with_variable(self):
        # Test if method is calculating correlations
        # Set the target up front so the method does not prompt for input
        self.analysis1.target_var = '2010_Census_Population'
        self.analysis1.calculation_correlations_with_variable()
        # The largest correlation (LACCESS_POP15) should be 0.7243301990584146
        self.assertAlmostEqual(self.analysis1.corrs_with_var.nlargest(1).iloc[0], 0.7243301990584146)

    def test_calculation_correlation_with_variable_length(self):
        # Test if method calculates all correlations
        # Set the target up front so the method does not prompt for input
        self.analysis1.target_var = '2010_Census_Population'
        self.analysis1.calculation_correlations_with_variable()
        # There are 15 numeric columns, so 14 besides the target
        self.assertEqual(len(self.analysis1.corrs_with_var), 14)

    def test_find_zero_variance_state_cols(self):
        # Test if find_zero_variance_state_cols adds cols to list
        self.analysis1.find_zero_variance_state_cols()
        # VLFOODSEC_12_14 should be the first column to have zero variance
        self.assertEqual(self.analysis1.state_zero_var_cols[0], 'VLFOODSEC_12_14')

    def test_find_zero_variance_state_cols_length(self):
        # Test if method adds multiple cols to list
        self.analysis1.find_zero_variance_state_cols()
        # Length of list should be 5
        self.assertEqual(len(self.analysis1.state_zero_var_cols), 5)

    def test_select_variables_to_analyze_list(self):
        # Test if method selects the right number of variables
        self.analysis1.select_variables_to_analyze(3)
        # The list should have 3 variables
        self.assertEqual(len(self.analysis1.var_list), 3)

    def test_select_variables_to_analyze_exclusions(self):
        # Make sure FIPS, State, and County aren't in the list
        self.analysis1.select_variables_to_analyze(3)
        self.assertNotIn('FIPS', self.analysis1.var_list)
        self.assertNotIn('State', self.analysis1.var_list)
        self.assertNotIn('County', self.analysis1.var_list)

    def test_select_target_var_not_in_dataset(self):
        # Test that the default is used if the entered variable is not in the dataset
        with mock.patch('builtins.input', return_value='Census_Population'):
            self.analysis1.select_target_var()
        # Default target_var should be LACCESS_POP15
        self.assertEqual(self.analysis1.target_var, 'LACCESS_POP15')

    def test_select_target_var_in_dataset(self):
        # Test that a valid entered variable is kept
        with mock.patch('builtins.input', return_value='2010_Census_Population'):
            self.analysis1.select_target_var()
        # target_var should be 2010_Census_Population
        self.assertEqual(self.analysis1.target_var, '2010_Census_Population')

    def test_append_region_merge(self):
        # Test if region column was added
        self.analysis1.append_region(self._write_region_csv())
        # Region should be a column of the dataset
        self.assertIn('Region', self.analysis1.df_county.columns)

    def test_append_region_first(self):
        # Test if region is correctly added
        self.analysis1.append_region(self._write_region_csv())
        # The first row is a CA county, so its Region should be 'West'
        self.assertEqual(self.analysis1.df_county['Region'].iloc[0], 'West')

    def test_average_by_category_new_var_list(self):
        # Test if average works with a new_var_list
        self.analysis1.average_by_category('variable1', new_var_list = ['AGRITRSM_OPS07', 'AGRITRSM_OPS12', 'AGRITRSM_RCT12'])
        # 'AGRITRSM_OPS07' should be a column of the new average_by data frame
        self.assertIn('AGRITRSM_OPS07', self.analysis1.average_by.columns)

    def test_average_by_category_no_new_var_list(self):
        # Test if method works with no new_var_list
        self.analysis1.average_by_category('variable1')
        # variable1 has 6 distinct non-missing values, so average_by should have 6 rows
        self.assertEqual(len(self.analysis1.average_by), 6)

    def test_labeled_categorical_cols_metro(self):
        # Test if Metro column is added
        self.analysis1.labeled_categorical_cols()
        # First row has METRO13 == 0
        self.assertEqual(self.analysis1.df_county['Metro'].iloc[0], 'Non-metro')

    def test_labeled_categorical_cols_perpov(self):
        # Test if 'Persistent_Poverty' column is added
        self.analysis1.labeled_categorical_cols()
        # First row has PERPOV10 == 0
        self.assertEqual(self.analysis1.df_county['Persistent_Poverty'].iloc[0], 'Other')

    def test_find_most_recent_data_new_data(self):
        # Test if 'BERRY_ACRES12' is in list_recent_cols
        self.analysis1.find_most_recent_data()
        # 'BERRY_ACRES12' should be in list_recent_cols and not in list_non_recent_cols
        self.assertIn('BERRY_ACRES12', self.analysis1.list_recent_cols)
        self.assertNotIn('BERRY_ACRES12', self.analysis1.list_non_recent_cols)

    def test_find_most_recent_data_old_data(self):
        # Test if 'BERRY_ACRES07' is in list_non_recent_cols
        self.analysis1.find_most_recent_data()
        # 'BERRY_ACRES07' should not be in list_recent_cols and should be in list_non_recent_cols
        self.assertNotIn('BERRY_ACRES07', self.analysis1.list_recent_cols)
        self.assertIn('BERRY_ACRES07', self.analysis1.list_non_recent_cols)
      
            
    

In [None]:
# Run every TestCase defined in this notebook session.
# argv=[''] gives unittest an explicit argument list instead of the notebook
# kernel's own sys.argv; exit=False stops unittest from calling sys.exit()
# when the run finishes, so the kernel keeps running.
unittest.main(argv=[''],exit=False) 

EE...

Missing values summary: 
The 10 variables with the highest percent missing are: 
VEG_ACRESPTH12            85.714286
BERRY_ACRES07             57.142857
VEG_FARMS07               42.857143
LACCESS_POP15             28.571429
VEG_FARMS12               28.571429
variable1                 14.285714
State                      0.000000
County                     0.000000
2010_Census_Population     0.000000
VLFOODSEC_12_14            0.000000
dtype: float64
Please select a variable of interest: 
2010_Census_Population


  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
.

Top 10 largest (positives) correlations with 2010_Census_Population: 
LACCESS_POP15      7.243302e-01
BERRY_ACRES07      5.359893e-01
METRO13            8.531247e-02
PERPOV10           8.531247e-02
AGRITRSM_OPS07     1.852800e-02
VLFOODSEC_12_14   -1.177987e-16
AGRITRSM_OPS12    -1.177987e-16
VEG_ACRESPTH07    -2.515006e-02
BERRY_ACRES12     -7.339985e-02
AGRITRSM_RCT12    -3.176141e-01
dtype: float64
Top 10 smallest (negative) correlations 2010_Census_Population: 
variable1         -8.129570e-01
VEG_FARMS07       -7.050619e-01
VEG_FARMS12       -6.216922e-01
AGRITRSM_RCT12    -3.176141e-01
BERRY_ACRES12     -7.339985e-02
VEG_ACRESPTH07    -2.515006e-02
VLFOODSEC_12_14   -1.177987e-16
AGRITRSM_OPS12    -1.177987e-16
AGRITRSM_OPS07     1.852800e-02
METRO13            8.531247e-02
dtype: float64
Please select a variable of interest: 
2010_Census_Population


..............

Top 10 largest (positives) correlations with 2010_Census_Population: 
LACCESS_POP15      7.243302e-01
BERRY_ACRES07      5.359893e-01
METRO13            8.531247e-02
PERPOV10           8.531247e-02
AGRITRSM_OPS07     1.852800e-02
VLFOODSEC_12_14   -1.177987e-16
AGRITRSM_OPS12    -1.177987e-16
VEG_ACRESPTH07    -2.515006e-02
BERRY_ACRES12     -7.339985e-02
AGRITRSM_RCT12    -3.176141e-01
dtype: float64
Top 10 smallest (negative) correlations 2010_Census_Population: 
variable1         -8.129570e-01
VEG_FARMS07       -7.050619e-01
VEG_FARMS12       -6.216922e-01
AGRITRSM_RCT12    -3.176141e-01
BERRY_ACRES12     -7.339985e-02
VEG_ACRESPTH07    -2.515006e-02
VLFOODSEC_12_14   -1.177987e-16
AGRITRSM_OPS12    -1.177987e-16
AGRITRSM_OPS07     1.852800e-02
METRO13            8.531247e-02
dtype: float64
Missing values summary: 
The 10 variables with the highest percent missing are: 
VEG_ACRESPTH12            85.714286
BERRY_ACRES07             57.142857
VEG_FARMS07               42.857143
LACCES

.

This variable is not in this dataset.
Will be using default (LACCESS_POP15) until a valid variable is chosen.
Please select a variable of interest: 
2010_Census_Population


....

Randomly selecting 3 variables for analysis...
Selected variables: ['variable1', 'AGRITRSM_OPS07', 'VEG_FARMS12']
Randomly selecting 3 variables for analysis...
Selected variables: ['VEG_ACRESPTH07', 'PERPOV10', 'BERRY_ACRES12']



ERROR: test_append_region_first (__main__.countyAnalysis_Test)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-4-a25b7e0797a5>", line 148, in test_append_region_first
    self.analysis1.append_region('')
  File "<ipython-input-1-be46f0cb36dc>", line 149, in append_region
    df_region = pd.read_csv(data_path + 'State and Region.csv')
  File "/usr/local/lib/python3.7/dist-packages/pandas/io/parsers.py", line 688, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/usr/local/lib/python3.7/dist-packages/pandas/io/parsers.py", line 454, in _read
    parser = TextFileReader(fp_or_buf, **kwds)
  File "/usr/local/lib/python3.7/dist-packages/pandas/io/parsers.py", line 948, in __init__
    self._make_engine(self.engine)
  File "/usr/local/lib/python3.7/dist-packages/pandas/io/parsers.py", line 1180, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/usr/local/lib/python

<unittest.main.TestProgram at 0x7f64d09c9b50>