# Data Analysis


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The google.colab module is only importable inside Google Colab, and the
# data directory used later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory (on the mounted Drive) where the cleaned data was saved.
# Keep the trailing slash: file paths are built below by plain string
# concatenation (data_dest + file name).
data_dest =  "/content/drive/My Drive/DS 5100 Food Environment Project/Data_Cleaned/"

In [None]:
# Load the cleaned state-level and county-level tables from data_dest
df_state = pd.read_csv(f'{data_dest}State_Level_Data.csv')
df_county = pd.read_csv(f'{data_dest}County_Level_Data.csv')

In [None]:
# Define a class for county level analysis
class countyAnalysis:
  '''
  Exploratory helpers for a county-level dataframe.

  Fields:
    df_county: dataframe currently being analysed. Starts as the class's own copy
      of the dataframe passed to the constructor; replaced by select_state.
    pct_missing: percent of missing values per column. None until
      calculate_na_summary has been called.
    corrs_with_var: correlation of each analysed column with the most recently
      requested target variable. None until
      calculation_correlations_with_variable has been called.
  '''

  # Identifier columns that are never correlated against a target variable
  _ID_COLUMNS = ['FIPS', 'State', 'County']

  def __init__(self, df_county):
    '''
    df_county: county-level dataframe. It is copied so that changes made within
      the class do not change the caller's dataframe.
    '''
    # Keep the full copy so select_state can always filter from every county
    self._df_county_all = df_county.copy()
    self.df_county = self._df_county_all
    self.pct_missing = None
    self.corrs_with_var = None

  def select_state(self, state):
    '''
    Filter df_county to the counties of one specific state.

    state: value to match against the 'State' column (e.g. 'MA').

    The filter is applied to the full dataframe given to the constructor, not to
    the result of a previous selection, so calling this again with a different
    state switches states instead of leaving an empty dataframe.
    '''
    all_counties = self._df_county_all
    self.df_county = all_counties[all_counties['State'] == state]

  def calculate_na_summary(self):
    '''
    Compute the percent of missing values for every column of df_county, store
    it in pct_missing, and print the 10 columns with the highest percent missing.
    '''
    # Calculate percent missing
    self.pct_missing = self.df_county.isnull().sum() * 100 / len(self.df_county)
    # Show the variables with the largest % missing
    print('Missing values summary: ')
    print('The 10 variables with the highest percent missing are: ')
    print(self.pct_missing.nlargest(10))

  def calculation_correlations_with_variable(self, target_var, var_list = None, num_pos_corr = 10, num_neg_corr = 10):
    '''
    Compute the correlation of other variables with target_var, store the result
    in corrs_with_var, and print the strongest positive and negative ones.

    target_var: variable of interest, want to see how other variables are correlated to this one variable
    var_list: list of variables to calculate their correlation with target_var. Optional, if not specified will consider all other variables.
      It does not need to contain target_var or the identifier columns; they are excluded if present.
    num_pos_corr: number of top positive correlations to display. Default 10
    num_neg_corr: number of top negative correlations to display. Default 10

    Raises KeyError if target_var (or a name in var_list) is not a column of df_county.
    '''
    target = self.df_county[target_var]
    if var_list is not None: # If provided a list of variables only calculate correlations for those
      candidates = self.df_county[var_list]
    else: # Otherwise use all other variables
      candidates = self.df_county
    # errors='ignore': a caller-supplied var_list usually does not include the
    # target or the identifier columns, and dropping absent labels would raise
    candidates = candidates.drop(columns=[target_var] + self._ID_COLUMNS, errors='ignore')
    self.corrs_with_var = candidates.apply(lambda col: col.corr(target))
    # Display top positive correlations
    print(f'Top {num_pos_corr} largest (positives) correlations: ')
    print(self.corrs_with_var.nlargest(num_pos_corr))
    # Display top negative correlations
    print(f'Top {num_neg_corr} smallest (negative) correlations: ')
    print(self.corrs_with_var.nsmallest(num_neg_corr))

  # Other ideas:
  # - threshold for % missing (i.e. choose to remove variables with more than X% missing)
  # - plotting
  # - randomly select a few variables to visualize/analyze
  # - return county with largest/smallest value for a specified variable of interest
  # - create urban vs rural indicator and look at averages/variables for those groups

In [None]:
# Create instance of county level analysis class.
# The constructor copies df_county, so `analysis` works on its own copy.
analysis = countyAnalysis(df_county)

In [None]:
# Preview the first rows of the county-level table
df_county.head()

Unnamed: 0,FIPS,State,County,2010_Census_Population,AGRITRSM_OPS07,AGRITRSM_OPS12,AGRITRSM_RCT07,AGRITRSM_RCT12,BERRY_ACRES07,BERRY_ACRES12,BERRY_ACRESPTH07,BERRY_ACRESPTH12,BERRY_FARMS07,BERRY_FARMS12,CHILDPOVRATE15,CHIPSTAX_STORES14,CHIPSTAX_VENDM14,CH_FOODINSEC_14_17,CH_VLFOODSEC_14_17,CONVS11,CONVS16,CONVSPTH11,CONVSPTH16,CSA07,CSA12,DIRSALES07,DIRSALES12,DIRSALES_FARMS07,DIRSALES_FARMS12,FARM_TO_SCHOOL13,FARM_TO_SCHOOL15,FDPIR12,FDPIR15,FFR11,FFR16,FFRPTH11,FFRPTH16,FMRKT13,FMRKT18,FMRKTPTH13,...,REDEMP_WICS11,REDEMP_WICS16,SLHOUSE07,SLHOUSE12,SNAPS12,SNAPS17,SNAPSPTH12,SNAPSPTH17,SNAP_BBCE09,SNAP_BBCE16,SNAP_CAP09,SNAP_CAP16,SNAP_OAPP09,SNAP_OAPP16,SNAP_PART_RATE11,SNAP_PART_RATE16,SNAP_REPORTSIMPLE09,SNAP_REPORTSIMPLE16,SODATAX_STORES14,SODATAX_VENDM14,SPECS11,SPECS16,SPECSPTH11,SPECSPTH16,SUPERC11,SUPERC16,SUPERCPTH11,SUPERCPTH16,VEG_ACRES07,VEG_ACRES12,VEG_ACRESPTH07,VEG_ACRESPTH12,VEG_FARMS07,VEG_FARMS12,VLFOODSEC_12_14,VLFOODSEC_15_17,WICS11,WICS16,WICSPTH11,WICSPTH16
0,1001,AL,Autauga,54571.0,7.0,10.0,228000.0,146000.0,,5.0,,0.090621,3.0,5.0,18.8,4.0,4.0,-0.5,-0.1,31.0,31.0,0.561604,0.560802,2.0,3.0,100.0,308.0,25.0,51.0,,0.0,0.0,0.0,34.0,44.0,0.615953,0.795977,1.0,1.0,0.018277,...,172391.75,161530.2969,0.0,0.0,37.416667,44.666667,0.674004,0.804747,0.0,1.0,0.0,0.0,0.0,1.0,84.02,86.898,1.0,1.0,4.0,4.0,1.0,1.0,0.018116,0.01809,1.0,1.0,0.018116,0.01809,948.0,1230.0,18.089877,17.181695,34.0,45.0,7.2,7.1,5.0,5.0,0.090567,0.090511
1,1003,AL,Baldwin,182265.0,18.0,16.0,124000.0,204000.0,79.0,93.0,0.458226,0.488456,36.0,41.0,19.6,4.0,4.0,-0.5,-0.1,107.0,118.0,0.573622,0.56865,13.0,7.0,715.0,648.0,80.0,103.0,0.0,1.0,0.0,0.0,121.0,156.0,0.648675,0.751775,4.0,4.0,0.020525,...,122739.7109,102920.0859,1.0,1.0,138.333333,189.416667,0.725055,0.890836,0.0,1.0,0.0,0.0,0.0,1.0,84.02,86.898,1.0,1.0,4.0,4.0,20.0,27.0,0.107219,0.130115,6.0,7.0,0.032166,0.033733,2280.0,1958.0,13.224751,11.975041,51.0,50.0,7.2,7.1,26.0,28.0,0.13938,0.134802
2,1005,AL,Barbour,27457.0,27.0,32.0,163000.0,304000.0,,42.0,,1.546449,3.0,5.0,45.2,4.0,4.0,-0.5,-0.1,22.0,19.0,0.804358,0.737177,1.0,0.0,11.0,13.0,18.0,13.0,1.0,0.0,0.0,0.0,19.0,23.0,0.694673,0.892372,3.0,4.0,0.111342,...,85699.46875,103414.9219,0.0,0.0,34.833333,36.0,1.28059,1.424614,0.0,1.0,0.0,0.0,0.0,1.0,84.02,86.898,1.0,1.0,4.0,4.0,3.0,2.0,0.109685,0.077598,0.0,1.0,0.0,0.038799,32.0,41.0,1.152862,1.178247,11.0,7.0,7.2,7.1,7.0,6.0,0.255942,0.232387
3,1007,AL,Bibb,22915.0,5.0,6.0,,21000.0,6.0,,0.267404,,6.0,2.0,29.3,4.0,4.0,-0.5,-0.1,19.0,15.0,0.835348,0.662749,2.0,3.0,46.0,20.0,12.0,13.0,0.0,0.0,0.0,0.0,6.0,7.0,0.263794,0.309283,1.0,1.0,0.044413,...,81445.39844,99703.79688,0.0,0.0,16.25,18.166667,0.719122,0.801423,0.0,1.0,0.0,0.0,0.0,1.0,84.02,86.898,1.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,1.0,1.0,0.043966,0.044183,65.0,14.0,2.896871,2.870771,10.0,11.0,7.2,7.1,6.0,5.0,0.263771,0.221474
4,1009,AL,Blount,57322.0,10.0,8.0,293000.0,30000.0,28.0,38.0,0.497866,0.657713,14.0,25.0,22.2,4.0,4.0,-0.5,-0.1,30.0,27.0,0.521177,0.469059,7.0,4.0,429.0,495.0,84.0,88.0,1.0,0.0,0.0,0.0,20.0,23.0,0.347451,0.399569,1.0,1.0,0.017358,...,123098.5625,99651.75781,0.0,0.0,38.0,40.166667,0.657144,0.692374,0.0,1.0,0.0,0.0,0.0,1.0,84.02,86.898,1.0,1.0,4.0,4.0,1.0,0.0,0.017373,0.0,1.0,1.0,0.017373,0.017373,585.0,677.0,10.401849,10.125312,67.0,64.0,7.2,7.1,8.0,8.0,0.139,0.139089


In [None]:
# Print the variables most positively / negatively correlated with CHILDPOVRATE15
# (no state filter has been applied to `analysis` at this point in the notebook)
analysis.calculation_correlations_with_variable('CHILDPOVRATE15')

Top 10 largest (positives) correlations: 
POVRATE15                0.938094
PCT_FREE_LUNCH15         0.845508
PCT_FREE_LUNCH10         0.841873
PC_SNAPBEN12             0.826849
PC_SNAPBEN17             0.797352
PERCHLDPOV10             0.678418
PCT_DIABETES_ADULTS08    0.625580
PC_WIC_REDEMP11          0.612297
PC_WIC_REDEMP16          0.609494
PCT_DIABETES_ADULTS13    0.607158
dtype: float64
Top 10 smallest (negative) correlations: 
MEDHHINC15            -0.814689
PCT_NHWHITE10         -0.450372
PCT_REDUCED_LUNCH15   -0.305613
RECFACPTH11           -0.291240
RECFACPTH16           -0.290947
PCH_SNAP_12_17        -0.265927
METRO13               -0.241876
FSRPTH16              -0.226397
DIRSALES_FARMS12      -0.218137
DIRSALES_FARMS07      -0.213527
dtype: float64


In [None]:
# Print the columns with the highest percent of missing values
analysis.calculate_na_summary()

Missing values summary: 
The variable(s) with the highest percent missing are: 
FOODHUB18                      94.276630
PCH_GHVEG_SQFTPTH_07_12        89.125596
PCH_GHVEG_SQFT_07_12           89.125596
PCH_FRESHVEG_ACRESPTH_07_12    66.550079
PCH_FRESHVEG_ACRES_07_12       66.550079
PCH_GHVEG_FARMS_07_12          61.812401
PCH_AGRITRSM_RCT_07_12         59.713831
PCH_BERRY_ACRESPTH_07_12       59.523052
PCH_BERRY_ACRES_07_12          59.523052
PCH_PC_WIC_REDEMP_11_16        42.511924
dtype: float64


In [None]:
# Restrict `analysis` to the rows whose State is 'MA'.
# NOTE: this replaces analysis.df_county, so any method called on `analysis`
# after this cell (including re-running the cells above) sees only those rows.
analysis.select_state('MA')