In [53]:

# Importing the libraries
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.stats import pearsonr
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier


In [21]:

# Load the consolidated dataset (an Excel workbook) into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; it presumably
# has to be adjusted before the notebook can run anywhere else.

data_path = r"C:\Users\SwBatta\Documents\ML Guild contest\Hackathon\Consolidated dataset piece.xlsx"
raw_data = pd.read_excel(data_path)


In [22]:

# Store the FIPS code as text so that it is treated as an identifier and
# not picked up as a numeric feature further down.

fips_as_text = raw_data['FIPS'].astype(str)
raw_data['FIPS'] = fips_as_text


In [23]:

# One-hot encode the State column; every other column is carried over as is.
categorical = ['State']
data_N = pd.get_dummies(data=raw_data, columns=categorical)


In [24]:

# Names of all numeric columns of the encoded frame. Non-numeric columns
# (e.g. FIPS, which was cast to text above) are left out.

numeric_frame = data_N.select_dtypes(include=[np.number])
numerical_features = list(numeric_frame.columns)


In [25]:

# Number of missing values per column before imputation, worst column first.

null_counts = data_N.isnull().sum()
Missing_values = null_counts.sort_values(ascending=False)


In [26]:

# Replace missing values in every numeric column with that column's median.
# The medians are taken over the full dataset (this cell runs before the
# train/test split).

column_medians = data_N[numerical_features].median()
for column, column_median in column_medians.items():
    data_N[column] = data_N[column].fillna(column_median)


In [27]:

# Number of missing values per column after imputation, worst column first.

null_counts_after = data_N.isnull().sum()
Missing_values_new = null_counts_after.sort_values(ascending=False)


In [28]:

# Build the binary dependent variable from the 2009-14 grocery-store change:
#   0 -> the change is >= 0
#   1 -> anything else (a negative change; a NaN would also land here,
#        because `NaN >= 0` is False)
Target = [
    0 if pct_change >= 0 else 1
    for pct_change in data_N['Grocery stores (% change), 2009-14']
]

data_N['Target'] = Target
data_N['Target'].describe()

# y is kept as a single-column DataFrame (not a Series).
dep_var = ["Target"]
y = data_N[dep_var]


In [29]:
# Restrict the frame to the numeric columns identified earlier. The Target
# column was added after that list was built, so it is not part of data_T.

data_T = data_N.loc[:, numerical_features]


In [30]:

# Univariate view: distribution of the binary Target after imputation.

sns.set(color_codes=True)
target_values = data_N['Target']
sns.distplot(target_values)

# Bivariate view: Target against the grocery-store change it was derived from.

sns.lmplot(data=data_N, x='Grocery stores (% change), 2009-14', y="Target");


In [31]:

# Standardise every numeric feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split below -- confirm that this is intended.

sc = StandardScaler()
sc.fit(data_T)
X = sc.transform(data_T)

# Wrap the scaled array in a DataFrame again so the column names survive.
X_T = DataFrame(data=X, columns=numerical_features)


In [32]:

# Hold out 33% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.

X_train, X_test, y_train, y_test = train_test_split(
    X_T,
    y,
    test_size=0.33,
    random_state=42,
)


In [33]:

# Pairwise Pearson correlations between the training features.

corrmat = X_train.corr(method='pearson')

# Optional heatmap of the correlation matrix (left disabled):
#sns.heatmap(corrmat, vmax=1., square=False).xaxis.tick_top()


In [None]:

# Inputs for the feature-vs-target correlation analysis.

# Name of the (only) column of the target frame.
target = y_train.columns[0]

# Every training column is a candidate feature at this point.
# 'SNAP simplified reporting, 2016*' has the same value for all counties
# (zero variance, adds no value); if it is kept, the Pearson coefficient is
# nan and a warning is emitted. It is dropped through the `remove` list
# instead of here:
#features.remove('SNAP simplified reporting, 2016*')
features = list(X_train.columns)

#Features to be removed based on the number of missing values

#remove = ['WIC redemptions per capita (% change), 2008-12', 'WIC redemptions/WIC-authorized stores (% change), 2008-12', 'WIC redemptions/WIC-authorized stores, 2012', 'WIC redemptions per capita, 2012', 'WIC redemptions/WIC-authorized stores, 2008', 'WIC redemptions per capita, 2008', 'SNAP redemptions/SNAP-authorized stores (% change), 2012-16', 'SNAP redemptions/SNAP-authorized stores, 2012', 'Students eligible for free lunch (%), 2014', 'Students eligible for reduced-price lunch (%), 2014', 'Specialized food stores (% change), 2009-14', 'Specialized food stores/1,000 pop (% change), 2009-14', 'SNAP redemptions/SNAP-authorized stores, 2016', 'Supercenters & club stores (% change), 2009-14', 'Supercenters & club stores/1,000 pop (% change), 2007-14', 'Recreation & fitness facilities (% change), 2009-14', 'Recreation & fitness facilities/1,000 pop (% change), 2009-14', 'SNAP benefits per capita, 2015', 'SNAP benefits per capita (% change), 2010-15', 'SNAP benefits per capita, 2010', 'Students eligible for free lunch (%), 2009', 'Students eligible for reduced-price lunch (%), 2009', 'SNAP-authorized stores/1,000 pop (% change), 2012-16', 'SNAP-authorized stores (% change), 2012-16', 'SNAP-authorized stores/1,000 pop, 2016', 'Low income & low access to store (% change), 2010 - 15', 'Population, low access to store (% change), 2010 -15', 'SNAP-authorized stores, 2016', 'Convenience stores (% change), 2009-14', 'SNAP households, low access to store, 2015', 'Grocery stores (% change), 2009-14', 'Low income & low access to store (%), 2015', 'Low income & low access to store, 2015', 'SNAP households, low access to store (%), 2015', 'Convenience stores/1,000 pop (% change), 2009-14', 'Population, low access to store, 2015', 'Population, low access to store (%), 2015', 'White, low access to store, 2015', 'WIC-authorized stores (% change), 2008-12', 'WIC-authorized stores/1,000 pop (% change), 2008-12', 'Asian, low access to store (%), 2015', 'White, low access to store 
(%), 2015', 'American Indian or Alaska Native, low access to store, 2015', 'Multiracial, low access to store (%), 2015', 'Multiracial, low access to store, 2015', 'Black, low access to store, 2015', 'Hawaiian or Pacific Islander, low access to store, 2015', 'American Indian or Alaska Native, low access to store (%), 2015', 'Hawaiian or Pacific Islander, low access to store (%), 2015', 'Hispanic ethnicity, low access to store (%), 2015', 'Black, low access to store (%), 2015', 'Asian, low access to store, 2015', 'Hispanic ethnicity, low access to store, 2015', 'Grocery stores/1,000 pop (% change), 2009-14', 'Households, no car & low access to store (% change), 2010 - 15', 'Adult obesity rate, 2008', 'Adult diabetes rate, 2008', 'Poverty rate, 2015', 'Median household income, 2015', 'Child poverty rate, 2015', 'Households, no car & low access to store, 2015', 'Households, no car & low access to store (%), 2015', 'Population-loss counties, 2010', 'Adult obesity rate, 2013', 'Adult diabetes rate, 2013', 'Population, low access to store (%), 2010', 'SNAP participants (% pop), 2016*', 'Population, low access to store, 2010', 'SNAP participants (% pop), 2012*', 'Low income & low access to store, 2010', 'SNAP participants (change % pop), 2012-16*', 'Specialized food stores/1,000 pop, 2009', 'Specialized food stores/1,000 pop, 2014', 'Specialized food stores, 2014', 'Specialized food stores, 2009', 'Convenience stores/1,000 pop, 2014', 'Convenience stores/1,000 pop, 2009', 'Convenience stores, 2014', 'Convenience stores, 2009', 'Supercenters & club stores/1,000 pop, 2014', 'Supercenters & club stores/1,000 pop, 2009', 'SNAP participants (% eligible pop), 2013*', 'Supercenters & club stores, 2014', 'Supercenters & club stores, 2009', 'Grocery stores/1,000 pop, 2014', 'Grocery stores/1,000 pop, 2009', 'Grocery stores, 2014', 'Grocery stores, 2009', 'Households, no car & low access to store (%), 2010', 'Households, no car & low access to store, 2010', 'SNAP-authorized stores, 
2012', 'Low income & low access to store (%), 2010', 'SNAP-authorized stores/1,000 pop, 2012', 'WIC-authorized stores, 2008', 'WIC-authorized stores, 2012', 'WIC-authorized stores/1,000 pop, 2008', 'WIC-authorized stores/1,000 pop, 2012', 'SNAP participants (% eligible pop), 2008*', 'State_WY', 'SNAP online application, 2009*', 'State_MI', 'State_ME', 'State_MD', 'State_MA', 'State_LA', 'State_KY', 'State_KS', 'State_IN', 'State_IL', 'State_ID', 'State_IA', 'State_HI', 'State_GA', 'State_FL', 'State_DE', 'State_DC', 'State_CT', 'State_CO', 'State_CA', 'State_AZ', 'State_AR', 'State_AL', 'State_AK', 'State_MN', 'State_MO', 'State_MS', 'State_PA', 'State_WI', 'State_WA', 'State_VT', 'State_VA', 'State_UT', 'State_TX', 'State_TN', 'State_SD', 'State_SC', 'State_RI', 'State_OR', 'State_MT', 'State_OK', 'State_OH', 'State_NY', 'State_NV', 'State_NM', 'State_NJ', 'State_NH', 'State_NE', 'State_ND', 'State_NC', 'Metro/nonmetro counties, 2010', 'Persistent-poverty counties, 2010', 'SNAP online application, 2016*', 'Household food insecurity (%, three-year average), 2010-12*', 'FDPIR Sites, 2012', 'Child & Adult Care (change % pop), 2009-15*', 'Child & Adult Care (% pop), 2015*', 'Child & Adult Care (% pop), 2009*', 'WIC participants (change % pop), 2009-15*', 'WIC participants (% pop), 2015*', 'WIC participants (% pop), 2009*', 'Summer Food Program participants (change % pop), 2009-15*', 'Summer Food Service Program participants (% pop), 2015*', 'Summer Food Service Program participants (% pop), 2009*', 'School Breakfast Program participants (change % pop), 2009-15*', 'School Breakfast Program participants (% pop), 2015*', 'School Breakfast Program participants (% pop), 2009*', 'National School Lunch Program participants (change % pop), 2009-15*', 'National School Lunch Program participants (% pop), 2015*', 'National School Lunch Program participants (% pop), 2009*', 'SNAP simplified reporting, 2016*', 'SNAP simplified reporting, 2009*', 'SNAP Broad-based Categorical 
Eligibility, 2016*', 'SNAP Broad-based Categorical Eligibility, 2009*', 'SNAP Combined Application Project , 2016*', 'SNAP Combined Application Project , 2009*', 'Household food insecurity (%, three-year average), 2013-15*', 'Household food insecurity (change %),2010-12 to 2013-15*', 'Household very low food security (%, three-year average), 2010-12*', 'Recreation & fitness facilities, 2009', '% Hawaiian or Pacific Islander, 2010', '% American Indian or Alaska Native, 2010', '% Asian, 2010', '% Hispanic, 2010', '% Black, 2010', '% White, 2010', 'Recreation & fitness facilities/1,000 pop, 2014', 'Recreation & fitness facilities/1,000 pop, 2009', 'Recreation & fitness facilities, 2014', 'Household very low food security (%, three-year average), 2013-15*', 'State_WV', 'General food sales tax, retail stores, 2014*', 'Child food insecurity (% households, multiple-year average), 2003-11*', 'Child food insecurity (% households, multiple-year average), 2001-07*', 'Household very low food security (change %),2010-12 to 2013-15*']
# Column names excluded from the candidate feature list.
#  - It contains 'Grocery stores (% change), 2009-14', the column Target is
#    derived from, so that column cannot be used as a predictor.
#  - 'SNAP simplified reporting, 2016*' is listed twice; this is harmless
#    because the list is converted to a set before it is used.
#  - Names that are not among the training columns (presumably 'County' and
#    'FIPS', which are not numeric) simply have no effect.
# NOTE(review): in this export the literal is wrapped in the middle of some
# strings; presumably it is a single line in the original notebook.
remove = ['SNAP simplified reporting, 2016*','Grocery stores (% change), 2009-14','Grocery stores/1,000 pop (% change), 2009-14','Greenhouse veg and fresh herb sq feet/1,000 pop (% change), 2007 - 12', 'Greenhouse veg and fresh herb sq feet (% change), 2007 - 12', 'Vegetable acres harvested for fresh market/1,000 pop (% change), 2007 - 12', 'Vegetable acres harvested for fresh market (% change), 2007 - 12', 'Greenhouse vegetable and fresh herb farms (% change), 2007 - 12', 'Agritourism receipts (% change), 2007 - 12', 'Berry acres (% change), 2007 - 12', 'Berry acres/1,000 pop (% change), 2007 - 12', 'Vegetable acres harvested for fresh market/1,000 pop, 2007', 'Vegetable acres harvested for fresh market, 2007', 'Vegetable acres harvested for fresh market/1,000 pop, 2012', 'Vegetable acres harvested for fresh market, 2012', 'Agritourism receipts, 2007', 'WIC redemptions/WIC-authorized stores, 2012', 'Orchard acres/1,000 pop (% change), 2007 - 12', 'Orchard acres (% change), 2007 - 12', 'High schoolers physically active (%), 2015*', 'Vegetable acres harvested (% change), 2007 - 12', 'WIC redemptions/WIC-authorized stores, 2008', 'WIC redemptions per capita, 2008', 'Agritourism receipts, 2012', 'Berry farms (% change), 2007 - 12', 'Vegetable acres harvested/1,000 pop (% change), 2007 - 12', 'Greenhouse veg and fresh herb sq feet/1,000 pop, 2012', 'Farmers markets that report accepting WIC Cash (%), 2016', 'Farmers markets that report selling other food products (%), 2016', 'Greenhouse veg and fresh herb sq feet, 2012', 'Farmers markets that report accepting SNAP, 2016', 'Farmers markets that report accepting SNAP (%), 2016', 'Farmers markets that report accepting WIC, 2016', 'Farmers markets that report accepting SFMNP, 2016', 'Farmers markets that report accepting WIC (%), 2016', 'Farmers markets that report accepting credit cards, 2016', 'Farmers markets that report selling other food products, 2016', 'Farmers markets that report selling fruit & vegetables (%), 
2016', 'Farmers markets that report accepting SFMNP (%), 2016', 'Farmers markets that report accepting WIC Cash, 2016', 'Farmers markets that report accepting credit cards (%), 2016', 'Farmers markets that report selling fruit & vegetables, 2016', 'Farmers markets that report selling animal products, 2016', 'Farmers markets that report selling animal products (%), 2016', 'Farmers markets that report selling baked/prepared food products, 2016', 'Farmers markets that report selling baked/prepared food products (%), 2016', 'Greenhouse veg and fresh herb sq feet, 2007', 'Greenhouse veg and fresh herb sq feet/1,000 pop, 2007', 'Berry acres, 2007', 'Berry acres/1,000 pop, 2007', 'Berry acres/1,000 pop, 2012', 'Berry acres, 2012', 'CSA farms (% change), 2007 - 12', 'Vegetable acres harvested/1,000 pop, 2012', 'Vegetable acres harvested, 2007', 'Vegetable acres harvested/1,000 pop, 2007', 'Orchard acres/1,000 pop, 2012', 'Orchard acres, 2012', 'Vegetable acres harvested, 2012', 'Farmers markets (% change), 2009-16', 'Farmers markets/1,000 pop (% change), 2009-16', 'Orchard acres/1,000 pop, 2007', 'Orchard acres, 2007', 'Direct farm sales (% change), 2007 - 12', 'Direct farm sales per capita (% change), 2007 - 12', 'Agritourism operations (% change), 2007 - 12', 'Orchard farms (% change), 2007 - 12', 'Farms with vegetables harvested for fresh market (% change), 2007 - 12', 'Vegetable farms (% change), 2007 - 12', 'Direct farm sales (%), 2007', 'Direct farm sales per capita, 2007', 'Direct farm sales, 2007', 'Direct farm sales (%), 2012', 'Direct farm sales per capita, 2012', 'SNAP redemptions/SNAP-authorized stores, 2012', 'Direct farm sales, 2012', 'Students eligible for free lunch (%), 2014', 'Students eligible for reduced-price lunch (%), 2014', 'Farm to school program, 2013', 'SNAP redemptions/SNAP-authorized stores, 2016', 'Small slaughterhouse facilities (% change), 2007 - 12', 'Farms with direct sales (% change), 2007 - 12', 'SNAP benefits per capita, 2015', 'SNAP 
benefits per capita, 2010', 'Students eligible for free lunch (%), 2009', 'Students eligible for reduced-price lunch (%), 2009', 'Farms with direct sales (%), 2007', 'Farms with direct sales (%), 2012', 'Vegetable farms, 2007', 'CSA farms, 2012', 'Farms with direct sales, 2007', 'Berry farms, 2007', 'Berry farms, 2012', 'Orchard farms, 2012', 'Orchard farms, 2007', 'Agritourism operations, 2007', 'CSA farms, 2007', 'Greenhouse vegetable and fresh herb farms, 2012', 'Farms with direct sales, 2012', 'Farms with vegetables harvested for fresh market, 2012', 'Farms with vegetables harvested for fresh market, 2007', 'Vegetable farms, 2012', 'Agritourism operations, 2012', 'Greenhouse vegetable and fresh herb farms, 2007', 'Fast-food restaurants (% change), 2009-14', 'Fast-food restaurants/1,000 pop (% change), 2009-14', 'Full-service restaurants (% change), 2009-14', 'Price of low-fat milk/price of sodas, 2010**', 'Price of low-fat milk/national average, 2010**', 'Price of sodas/national average, 2010**', 'Full-service restaurants/1,000 pop (% change), 2009-14', 'SNAP-authorized stores/1,000 pop, 2016', 'Change children low access to store(2010-15)(%)', 'SNAP-authorized stores, 2016', 'Seniors, low access to store (% change), 2010 -15', 'SNAP households, low access to store, 2015', 'Low income & low access to store (%), 2015', 'Low income & low access to store, 2015', 'SNAP households, low access to store (%), 2015', 'Seniors, low access to store, 2015', 'Children, low access to store (%), 2015', 'Children, low access to store, 2015', 'Population, low access to store, 2015', 'Population, low access to store (%), 2015', 'White, low access to store, 2015', 'Seniors, low access to store (%), 2015', 'White, low access to store (%), 2015', 'American Indian or Alaska Native, low access to store, 2015', 'Multiracial, low access to store (%), 2015', 'Multiracial, low access to store, 2015', 'Black, low access to store, 2015', 'Hawaiian or Pacific Islander, low access to store, 
2015', 'American Indian or Alaska Native, low access to store (%), 2015', 'Hawaiian or Pacific Islander, low access to store (%), 2015', 'Hispanic ethnicity, low access to store (%), 2015', 'Black, low access to store (%), 2015', 'Asian, low access to store, 2015', 'Hispanic ethnicity, low access to store, 2015', 'Farmers markets/1,000 pop, 2009', 'Farmers markets, 2009', 'Farm to school program, 2009', 'Farmers markets/1,000 pop, 2016', 'Small slaughterhouse facilities, 2007', 'Small slaughterhouse facilities, 2012', 'Fast-food restaurants/1,000 pop, 2014', 'SNAP participants (% pop), 2016*', 'Full-service restaurants, 2009', 'Full-service restaurants, 2014', 'County', 'Expenditures per capita, fast food, 2007*', 'Expenditures per capita, restaurants, 2007*', 'Expenditures per capita, fast food, 2012*', 'Full-service restaurants/1,000 pop, 2014', 'SNAP participants (% pop), 2012*', 'Fast-food restaurants/1,000 pop, 2009', 'Full-service restaurants/1,000 pop, 2009', 'Expenditures per capita, restaurants, 2012*', 'Children, low access to store (%), 2010', 'Fast-food restaurants, 2014', 'Fast-food restaurants, 2009', 'Specialized food stores/1,000 pop, 2009', 'Specialized food stores/1,000 pop, 2014', 'Specialized food stores, 2014', 'Specialized food stores, 2009', 'Convenience stores/1,000 pop, 2014', 'Convenience stores/1,000 pop, 2009', 'Convenience stores, 2014', 'Convenience stores, 2009', 'Supercenters & club stores/1,000 pop, 2014', 'Supercenters & club stores/1,000 pop, 2009', 'SNAP participants (% eligible pop), 2013*', 'Supercenters & club stores, 2014', 'Supercenters & club stores, 2009', 'Grocery stores/1,000 pop, 2009', 'Grocery stores, 2014', 'Grocery stores, 2009', 'Seniors, low access to store (%), 2010', 'Seniors, low access to store, 2010', 'Children, low access to store, 2010', 'SNAP-authorized stores, 2012', 'SNAP-authorized stores/1,000 pop, 2012', 'WIC-authorized stores, 2008', 'WIC-authorized stores, 2012', 'WIC-authorized stores/1,000 pop, 
2008', 'WIC-authorized stores/1,000 pop, 2012', 'SNAP participants (% eligible pop), 2008*', 'SNAP online application, 2009*', 'Persistent-child-poverty counties, 2010', 'SNAP online application, 2016*', '% Population under age 18, 2010', 'FDPIR Sites, 2012', 'Child & Adult Care (% pop), 2015*', 'Child & Adult Care (% pop), 2009*', 'WIC participants (% pop), 2015*', 'WIC participants (% pop), 2009*', 'Summer Food Service Program participants (% pop), 2015*', 'Summer Food Service Program participants (% pop), 2009*', 'School Breakfast Program participants (% pop), 2015*', 'School Breakfast Program participants (% pop), 2009*', 'National School Lunch Program participants (% pop), 2015*', 'National School Lunch Program participants (% pop), 2009*', 'SNAP simplified reporting, 2016*', 'SNAP simplified reporting, 2009*', 'SNAP Broad-based Categorical Eligibility, 2016*', 'SNAP Broad-based Categorical Eligibility, 2009*', 'SNAP Combined Application Project , 2016*', 'SNAP Combined Application Project , 2009*', '% Population 65 years or older, 2010', 'Food hubs, 2016', 'Farmers markets, 2016', 'Chip & pretzel sales tax, vending, 2014*', 'Chip & pretzel sales tax, retail stores, 2014*', 'Soda sales tax, vending, 2014*', 'Soda sales tax, retail stores, 2014*', 'FIPS']


# Keep every candidate feature that is not on the exclusion list.
# The filter walks `features` in its original order instead of subtracting
# two sets: iteration order of a set of strings depends on per-process hash
# randomisation, so the old `list(set(...) - set(...))` could produce a
# different column order (and therefore a differently fitted model) in every
# kernel session.
excluded = set(remove)
features_N = [f for f in features if f not in excluded]

# Pearson correlation of each retained feature with the target.
# The target vector does not change inside the loop, so it is built once.
x2 = y_train[target].values

correlations = {}
for f in features_N:
    x1 = X_train[f].values
    key = f + ' vs ' + target

    try:
        correlations[key] = pearsonr(x1, x2)[0]
    except Exception as exc:
        # Best effort: a feature whose coefficient cannot be computed is left
        # out of the table. Report which feature failed and why instead of
        # swallowing every error (including KeyboardInterrupt) silently.
        print("Skipping correlation for '%s': %s" % (f, exc))


# One row per '<feature> vs <target>' pair, ordered by the absolute value of
# the correlation (strongest first).
data_correlations = pd.DataFrame(correlations, index=['Value']).T
data_correlations = data_correlations.loc[data_correlations['Value'].abs().sort_values(ascending=False).index]


In [39]:

# Restrict the train and test matrices to the retained features.

X_train_new = X_train.loc[:, features_N]
X_test_new = X_test.loc[:, features_N]


In [58]:

# Random Forest classification in two stages:
#   1. fit on all retained features to obtain feature importances;
#   2. refit on the features whose importance exceeds a threshold, then
#      cross-validate on the training set and evaluate on the test set.

# y_train / y_test are single-column DataFrames; scikit-learn expects a 1-D
# label vector. Passing the DataFrame directly to fit() triggers a
# data-conversion warning, so the flattened labels are used everywhere.
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

# Stage 1: rank all candidate features.
classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 42)
classifier.fit(X_train_new, y_train_1d)

feature_importances = pd.DataFrame(classifier.feature_importances_,
                                   index = X_train_new.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)

# Stage 2: keep only features with importance strictly above the threshold.
importance_threshold = 0.02
new_features = feature_importances[feature_importances.importance > importance_threshold].index.values
X_train_new1 = X_train_new[new_features]
X_test_new1 = X_test_new[new_features]

classifier.fit(X_train_new1, y_train_1d)

#Applying Cross Validation; printing the mean score and the 95% confidence interval of the score estimate
scores = cross_val_score(classifier, X_train_new1, y_train_1d, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Predicting the Test set results
y_pred = classifier.predict(X_test_new1)

# Making the Confusion Matrix
cm = confusion_matrix(y_test_1d, y_pred)

# Accuracy = correctly classified (diagonal) / all samples. Using the trace
# avoids hard-coding a 2x2 matrix, so this also works if a class is missing.
test_set_accuracy = cm.trace() / cm.sum()
print("test set accuracy:",test_set_accuracy)


  from ipykernel import kernelapp as app


Accuracy: 0.61 (+/- 0.01)
test set accuracy: 0.61753371869
