In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import *
import csv


Read merged data file with charging stations and ACS data

In [2]:
# Load the merged charging-station / ACS data, shuffle it, and split it into
# a 75% training set and a 25% development set.
#
# The input file was preprocessed to remove rows with missing values (an
# earlier version of this cell read 'data/add_0_stations.csv' directly).
# TODO: use an imputer to handle missing values in scikit-learn instead:
# http://stackoverflow.com/questions/30317119/classifiers-in-scikit-learn-that-handle-nan-null?lq=1
from sklearn.utils import shuffle

# Use a context manager so the file handle is closed as soon as it is read.
with open('data/missing_values_removed.csv') as data_file:
    lines = data_file.read().splitlines()
# Remove the first row, it's just the headings
lines.pop(0)

# Column 0 is the zip code (kept as a string); columns 1-7 are numeric.
zip_code, count = [], []
population, male_pct, age, college_pct, income, home_pct = [], [], [], [], [], []
numeric_columns = (count, population, male_pct, age, college_pct, income, home_pct)
# Data rows start on file line 2 because the header was line 1.
for line_number, line in enumerate(lines, 2):
    if not line.strip():
        continue  # tolerate blank lines instead of crashing on them
    items = line.split(',')
    if len(items) < 8:
        raise ValueError('line %d: expected at least 8 fields, got %d'
                         % (line_number, len(items)))
    zip_code.append(items[0])
    for column, field in zip(numeric_columns, items[1:8]):
        column.append(float(field))

# Labels: station count per zip code. Features: one row per zip code with
# columns (population, male_pct, age, college_pct, income, home_pct).
Y = np.array(count)
X = np.transpose(np.vstack((population, male_pct, age, college_pct, income, home_pct)))
print('%s %s %s %s' % (Y.shape, X.shape, X[0], Y[0]))

# Shuffle features and labels in unison before splitting; the fixed
# random_state keeps the split reproducible across runs.
X, Y = shuffle(X, Y, random_state=0)
print('%s %s %s %s' % (X.shape, Y.shape, X[0], Y[0]))
# determine the data type of the two arrays
print('%s %s' % (X.dtype, Y.dtype))

# NOTE: the former `X[X==''] = '0'` step was removed. Every field already went
# through float() above, so X can never contain an empty string; comparing a
# float array with '' is at best a no-op and, depending on the NumPy version,
# may index row 0 instead of selecting nothing.
# Builtin float is what the removed alias np.float referred to.
X1 = X.astype(float)
print(X1.dtype)

# Split the imported data into training 75% and development data 25%.
# Floor division keeps `split` an integer on both Python 2 and Python 3.
split = 3 * len(X) // 4

print('%s %s' % (len(X), split))

train_data, train_labels = X[:split], Y[:split]
dev_data, dev_labels = X[split:], Y[split:]

print('%s %s' % (train_data.shape, train_labels.shape))
print('%s %s' % (dev_data.shape, dev_labels.shape))
print('%s %s' % (train_data[1], train_labels[1]))
print('%s %s' % (dev_data[0], dev_labels[0]))

(30514,) (30514, 6) [  6.40200000e+03   4.97188379e-01   1.94000000e+01   6.00000000e-01
   0.00000000e+00   0.00000000e+00] 1.0
(30514, 6) (30514,) [  1.21300000e+03   4.50123660e-01   3.97000000e+01   0.00000000e+00
   3.52120000e+04   8.70689655e-01] 0.0
float64 float64
float64
30514 22885
(22885, 6) (22885,)
(7629, 6) (7629,)
[  2.64000000e+03   4.96969697e-01   4.05000000e+01   1.06000000e+01
   2.84090000e+04   7.18568665e-01] 0.0
[  3.23560000e+04   5.01730745e-01   3.64000000e+01   2.18000000e+01
   4.57570000e+04   6.60303392e-01] 2.0


In [22]:
# Baseline model: ordinary least-squares linear regression of the charging
# station count on the six demographic features.
from sklearn import datasets, linear_model

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(train_data, train_labels)

# One coefficient per feature column. Format into a single string: under
# Python 2, print('a', b) prints the repr of a tuple rather than two values.
print('Coefficients: \n%s' % (regr.coef_,))

# LinearRegression.score returns the coefficient of determination (R^2),
# evaluated here on the held-out development set.
print('Variance score: %.2f' % regr.score(dev_data, dev_labels))

# Predicted charging station count for a hypothetical region.
# predict() expects a 2-D (n_samples, n_features) array, so the single sample
# is reshaped to one row; the stored vectors themselves stay 1-D.
X1 = np.array([5000, 0.60, 45, 30, 38400, 0.62])
print(regr.predict(X1.reshape(1, -1)))

# predicting charging station count for Coppell Texas
X_Coppell = np.array([39551, 0.488938333, 37.9, 22.1, 64572, 0.721044394])
print(regr.predict(X_Coppell.reshape(1, -1)))

('Coefficients: \n', array([  2.06276543e-05,   7.74429448e-01,   7.38819633e-03,
         9.43223599e-03,   1.42421493e-05,  -1.71728883e+00]))
Variance score: 0.19
[ 0.7910512]
[ 1.49000013]


Interpretation of the Linear Regression model: The number of charging stations in each region is positively associated with population, percentage of male population, median age, percentage of the population that is college educated, and income, and negatively associated with the percentage of home owners. The model predicts about two additional charging stations for each 100,000 increase in population, ~0.08 charging stations for a 10-percentage-point increase in male percentage, ~0.07 charging stations for every 10-year increase in median age, ~0.1 charging stations for every 10-percentage-point increase in the college-educated share, and ~0.14 charging stations for every $10,000 increase in median income; and -0.17 charging stations for every 10-percentage-point increase in home ownership. As an example, the simple linear model predicts 1.5 charging stations for Coppell, Texas (vs. 3 actual charging stations). The underestimate is due to the combination of the high percentage of home ownership (72 percent) and relatively small population (less than 40,000).

In [21]:
# Fit an L1-regularised logistic regression that treats each distinct station
# count as its own class.
# The solver is named explicitly: liblinear supports the L1 penalty, whereas
# the default solver of newer scikit-learn releases does not.
logreg = LogisticRegression(penalty='l1', C=1.0, tol=0.01, solver='liblinear')
logreg.fit(train_data, train_labels)

# examine predict prob for test data: one row per dev sample, one column per class
dev_proba = logreg.predict_proba(dev_data)
print('%s %s' % (dev_proba.shape, dev_labels.shape))

# coef_ has one row per class and one column per predictor:
# 0th=pop, 1st=male pct, 2nd=age, 3rd=college pct, 4th income, 5th home owning pct.
# Rows follow logreg.classes_; row i corresponds to "i charging stations" only
# if the observed counts are contiguous from 0 -- TODO confirm via logreg.classes_.
# Format into a single string: under Python 2, print('a', b) prints a tuple repr.
print('Population vs Count of Charging Stations: \n%s' % (logreg.coef_[:, 0],))

print('Home Ownership Percentage vs Count of Charging Stations: \n%s' % (logreg.coef_[:, 5],))

# Mean classification accuracy on the development set.
print(logreg.score(dev_data, dev_labels))

# predicting Coppell Texas with this LogReg model
X_Coppell = np.array([39551, 0.488938333, 37.9, 22.1, 64572, 0.721044394])

# predict_proba() expects a 2-D (n_samples, n_features) array, so the single
# sample is reshaped to one row.
print(logreg.predict_proba(X_Coppell.reshape(1, -1)))

(7629, 22) (7629,)
('Population vs Count of Charging Stations: \n', array([ -6.24965092e-05,   3.84905880e-05,   4.11146155e-05,
         4.20765276e-05,   3.71488656e-05,   4.27376529e-05,
         3.00211929e-05,   3.59851999e-05,   2.33181054e-05,
         2.95837782e-05,   3.71766846e-05,   2.16639585e-05,
         1.47126372e-05,   2.06326063e-05,   1.09043719e-05,
         2.35479542e-05,   1.23702781e-05,   3.01320246e-05,
         8.13223412e-05,  -1.79392722e-05,   3.47318109e-05,
        -2.34569037e-05]))
('Home Ownership Percentage vs Count of Charging Stations: \n', array([ 3.53924789, -1.20996458, -2.34742788, -2.25901343, -1.95431473,
       -2.76163876, -3.74625369, -1.82988655, -4.0288416 , -3.6799433 ,
       -2.17591589, -3.01125168, -2.97108184,  0.        , -0.2805755 ,
       -2.18403424, -3.53738405,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ]))
0.860663258618
[[  3.05071818e-01   3.19601599e-01   1.71346812e-01   9.17765977e-02
    4.

Logistic Regression showed similar results, although its score on the development data set is much higher at 86%. Note that this score is classification accuracy, so it is not directly comparable to the linear model's R² of 0.19. In reality, the predictive power of the model is limited. E.g., for Coppell, Texas, the model predicts a 30.5% probability of 0, 32.0% of 1, 17.1% of 2 charging stations, etc. The example is slightly unfair, as Coppell is a small town of less than 40,000 residents with moderately high income (a positive factor) and a high home-owning percentage (a negative factor).

Possible improvements to the model: experiment with the class weight parameter, perhaps to down-weight data points with 0 charging stations and up-weight those with higher counts of charging stations.