In [1]:
import pandas as pd
import seaborn as sns

In [2]:
### Read in the data

In [3]:
# Open the USDA workbook once and display the names of the sheets it contains
workbook_path = 'DataDownload.xls'
xl = pd.ExcelFile(workbook_path)
xl.sheet_names

In [4]:
# Load the STORES sheet of the USDA workbook into a DataFrame and preview it
stores = pd.read_excel(io='DataDownload.xls', sheet_name='STORES')
stores.head(5)

In [5]:
# Display the column labels of the freshly loaded STORES sheet
stores.columns

In [6]:
# Build the binary target `decline`: 1 where PCH_GROC_09_14 is negative,
# 0 everywhere else (a missing value compares False, so it is labelled 0).
stores['decline'] = (stores['PCH_GROC_09_14'] < 0).astype('int64')
# Show how many rows fall into each class
stores['decline'].value_counts()

In [7]:
# Summary statistics of PCH_GROC_09_14, the column the `decline` target is derived from
stores['PCH_GROC_09_14'].describe()

In [8]:
# List the available columns (now including `decline`) to pick candidate features from
stores.columns

Percent change per 1,000 population:  
PCH_GROCPTH_09_14  
PCH_SUPERCPTH_09_14  
PCH_CONVSPTH_09_14  
PCH_SPECSPTH_09_14  
PCH_SNAPSPTH_12_16  
PCH_WICSPTH_08_12  

Percent change:  
PCH_SUPERC_09_14  
PCH_CONVS_09_14  
PCH_SPECS_09_14  
PCH_SNAPS_12_16  
PCH_WICS_08_12  

In [9]:
# Keep only rows that are complete in every column.
# NOTE(review): this also discards rows that are missing only in columns the
# model never uses — confirm that is intended.
stores = stores.dropna(axis=0, how='any')

## Exploratory Data Analysis

In [10]:
# Summary statistics of PCH_WICSPTH_08_12 after rows with missing values were dropped
stores['PCH_WICSPTH_08_12'].describe()

In [11]:
# Grocery stores per 1,000 pop (% change, 2009-14), one bar per `decline` class
sns.barplot(data=stores, x='decline', y='PCH_GROCPTH_09_14');

In [12]:
# Supercenters & club stores per 1,000 pop (% change, 2009-14 per the _09_14
# column suffix), one bar per `decline` class
sns.barplot(data=stores, x='decline', y='PCH_SUPERCPTH_09_14');

In [13]:
# Convenience stores per 1,000 pop (% change, 2009-14), one bar per `decline` class
sns.barplot(data=stores, x='decline', y='PCH_CONVSPTH_09_14');

In [14]:
# Specialized food stores per 1,000 pop (% change, 2009-14), one bar per `decline` class
sns.barplot(data=stores, x='decline', y='PCH_SPECSPTH_09_14');

In [15]:
# SNAP-authorized stores per 1,000 pop (% change, 2012-16), one bar per `decline` class
sns.barplot(data=stores, x='decline', y='PCH_SNAPSPTH_12_16');

In [16]:
# WIC-authorized stores per 1,000 pop (% change, 2008-12), one bar per `decline` class
sns.barplot(data=stores, x='decline', y='PCH_WICSPTH_08_12');

## Modeling

In [17]:
import sklearn 

In [18]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

In [19]:
# Re-display the column labels before selecting the model inputs
stores.columns

In [20]:
# Predictor columns: the six per-1,000-population percent-change measures.
# NOTE(review): PCH_GROCPTH_09_14 looks closely related to PCH_GROC_09_14,
# the column that defines `decline` — check for target leakage.
feature_cols = [
    'PCH_GROCPTH_09_14',
    'PCH_SUPERCPTH_09_14',
    'PCH_CONVSPTH_09_14',
    'PCH_SPECSPTH_09_14',
    'PCH_SNAPSPTH_12_16',
    'PCH_WICSPTH_08_12',
]

# Binary label created earlier in the notebook
y = stores['decline']
# Feature matrix, columns in the order listed above
X = stores[feature_cols]

In [22]:
# Hold out 20% of the rows for evaluation. test_size must be a float here:
# an integer is read as an absolute row count, so `test_size=20` would
# leave only 20 rows in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,  # fixed seed so the split is reproducible on re-run
)

In [23]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split;
# fit() returns the estimator itself, so the fitted model is bound to `model`.
model = LogisticRegression().fit(X_train, y_train)
