In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
import csv

In [141]:
solar=pd.read_csv('../deepsolar_tract.csv',encoding = "ISO-8859-1")
solar.head()
solar_fields=pd.read_csv('../deepsolar fields.csv')
solar_fields.head()

Unnamed: 0,Field,Description,Unit,Data Type,Formula,Possible Values,Observed Max,Observed Min,Theoretical Min,Theoretical Max,Relevant Feature,Mentioned in Supplemental Info,API documentation,API Field Name,API In Line With Collected Data
0,Unnamed: 0,Index,,Numeric,,,72537.0,0.0,,,0,,,,
1,tile_count,total number of tiles in census tract,,Numeric,,,4468.0,0.0,0.0,,0,,,,
2,solar_system_count,Total number of solar systems in census tract,,Numeric,,,1535.0,0.0,0.0,,0,,,,
3,total_panel_area,,,Numeric,,,592031.075,0.0,0.0,,0,,,,
4,fips,FIPS identifier for the census tract,,String,,,,,,,0,,,,


In [142]:
features=solar_fields.loc[(solar_fields['Mentioned in Supplemental Info']==1)]['Field'].tolist()
all_variables=features+['number_of_solar_system_per_household']

#all_variables=features+['number_of_solar_system_per_household']+['incentive_count_residential']+['incentive_residential_state_level']

solar2=solar[all_variables]

#Create binary version of number_of_solar_system_per_household for RF classifier

solar2['solar_flag']=solar2['number_of_solar_system_per_household'].apply(lambda x: int(x>0))
solar2=solar2.loc[np.isfinite(solar2['number_of_solar_system_per_household'])]

#designate independent variable frame
independent_vars=solar2.loc[:,~solar2.columns.isin(['number_of_solar_system_per_household','solar_flag'])]
X=independent_vars.values
yc=solar2['solar_flag'].values
yr=solar2['number_of_solar_system_per_household'].values

#impute missing values
missing_val_imputer=Imputer(strategy='median')
X=missing_val_imputer.fit_transform(X)

#loop through different max depths and numbers of estimators

#max_depths=[15, 20, 30, None]
#n_estimators=[100,150,200,300]

n_estimators=[100,150,200]
max_depths=[15, 30, None]


model_params=[(x,y,a,b) for x in max_depths for y in n_estimators for a in max_depths for b in n_estimators]

# split the data 3 times into train/test folds

X_trains=[]
X_tests=[]
yr_trains=[]
yr_tests=[]
yc_trains=[]
yc_tests=[]

folds=KFold(n_splits=3, random_state=None, shuffle=True)
for train_index, test_index in folds.split(X):

    X_trains.append(X[train_index])
    X_tests.append(X[test_index])
    yr_trains.append(yr[train_index])
    yc_trains.append(yc[train_index])
    yr_tests.append(yr[test_index])
    yc_tests.append(yc[test_index])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [8]:
X_trains[2].shape

(47844, 95)

In [10]:
regressor=RandomForestRegressor(max_depth=None,n_estimators=25, n_jobs=-1)
regressor.fit(X_trains[0],yr_trains[0])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [63]:
regressor_preds=regressor.predict(X_tests[0])
r2_score(yr_tests[0],regressor_preds)

-0.027264847800063219

In [17]:
abs_diffs=np.absolute(regressor_preds-yr_tests[0])

In [56]:
full_frame=np.hstack([abs_diffs.reshape([-1,1]),yr_tests[0].reshape([-1,1]),regressor_preds.reshape([-1,1]),X_tests[0]])

In [88]:
colnames=['absolute_prediction_error']+['number_of_solar_system_per_household']+['prediction']+list(independent_vars.columns)

In [89]:
prediction_frame=pd.DataFrame(full_frame,columns=colnames)

In [90]:
sorted_frame=prediction_frame.sort_values('absolute_prediction_error',ascending=False)
sorted_frame

Unnamed: 0,absolute_prediction_error,number_of_solar_system_per_household,prediction,average_household_income,population_density,education_less_than_high_school_rate,education_high_school_graduate_rate,education_college_rate,education_bachelor_rate,education_master_rate,...,voting_2016_gop_percentage,number_of_years_of_education,diversity,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate
20407,2.018298e+00,0.000000,2.018298,64270.939490,0.904260,0.000000,0.000000,1.000000,0.000000,0.000000,...,0.707196,14.000000,0.000000,0.0,0.0,0.0,7.0,0.0,0.0,9.27
2232,1.791005e+00,0.200000,1.991005,64270.939490,14.768990,0.000000,0.000000,1.000000,0.000000,0.000000,...,0.400340,14.000000,0.000000,12.0,0.0,0.0,0.0,0.0,0.0,10.26
10684,1.412173e+00,0.000000,1.412173,64270.939490,1.881980,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.553913,12.000000,0.000000,9.0,0.0,0.0,4.0,20.0,0.0,10.35
5493,1.102116e+00,0.000000,1.102116,41088.095240,6.380721,0.280630,0.316016,0.368787,0.027204,0.006341,...,0.491262,11.771119,0.436005,8.0,0.0,0.0,11.0,20.0,0.0,10.15
11873,9.301663e-01,1.000000,0.069834,64270.939490,2624.838000,0.246400,0.381600,0.325800,0.031400,0.010000,...,0.645781,11.894800,0.570922,18.0,0.0,0.0,0.0,0.0,0.0,9.57
10920,7.788359e-01,1.000000,0.221164,64270.939490,2969.011000,0.000000,0.178161,0.482759,0.333333,0.005747,...,0.438250,14.333333,0.424401,19.0,0.0,0.0,40.0,10.0,0.0,16.65
16054,6.587851e-01,0.571429,1.230214,64270.939490,5.182328,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.487735,12.000000,0.000000,0.0,0.0,0.0,36.0,0.0,0.0,8.66
17692,6.495759e-01,0.750000,0.100424,64270.939490,860.882900,0.400613,0.369443,0.163771,0.035003,0.030148,...,0.752461,11.055187,0.520050,1.0,0.0,0.0,0.0,0.0,0.0,9.10
170,6.206666e-01,0.000000,0.620667,64270.939490,8.223341,0.000000,0.000000,0.250000,0.250000,0.500000,...,0.233887,16.500000,0.632231,21.0,9.0,0.0,38.0,0.0,10.0,14.72
5072,5.835685e-01,0.622540,0.038972,45645.259390,1484.845000,0.087473,0.491361,0.312095,0.050756,0.058315,...,0.452763,12.827214,0.109260,21.0,9.0,0.0,38.0,0.0,10.0,14.72


In [86]:
for i in sorted_frame.columns:
    print((i,sorted_frame.iloc[0][i],sorted_frame.iloc[1][i],sorted_frame.iloc[2][i],sorted_frame.iloc[3][i]))

('absolute_prediction_error', 2.0182978247599999, 1.7910053712000003, 1.4121728089199999, 1.1021163824799998)
('solar_density', 0.0, 0.20000000000000001, 0.0, 0.0)
('prediction', 2.0182978247599999, 1.9910053712000002, 1.4121728089199999, 1.1021163824799998)
('average_household_income', 64270.939489999997, 64270.939489999997, 64270.939489999997, 41088.095239999995)
('population_density', 0.90426019999999985, 14.768989999999999, 1.88198, 6.3807210000000003)
('education_less_than_high_school_rate', 0.0, 0.0, 0.0, 0.280629986)
('education_high_school_graduate_rate', 0.0, 0.0, 1.0, 0.31601554500000001)
('education_college_rate', 1.0, 1.0, 0.0, 0.36878707299999997)
('education_bachelor_rate', 0.0, 0.0, 0.0, 0.027203927000000003)
('education_master_rate', 0.0, 0.0, 0.0, 0.0063407649999999999)
('education_professional_school_rate', 0.0, 0.0, 0.0, 0.0)
('education_doctoral_rate', 0.0, 0.0, 0.0, 0.0010227039999999999)
('race_white_rate', 1.0, 1.0, 1.0, 0.731429573)
('race_black_africa_rate', 0.

In [81]:
r2_score(sorted_frame.iloc[2:]['solar_density'],sorted_frame.iloc[2:]['prediction'])

0.30311574826409571

In [71]:
r2_score(sorted_frame.iloc[0:]['solar_density'],sorted_frame.iloc[0:]['prediction'])

-0.027264847800063663

In [60]:
regressor1=RandomForestRegressor(max_depth=None,n_estimators=25, n_jobs=-1)
regressor1.fit(X_trains[1],yr_trains[1])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [62]:
regressor_preds1=regressor1.predict(X_tests[1])
r2_score(yr_tests[1],regressor_preds1)

0.32154773215626253

In [83]:
solar2.loc[solar2['education_college_rate']==0]

Unnamed: 0,average_household_income,population_density,education_less_than_high_school_rate,education_high_school_graduate_rate,education_college_rate,education_bachelor_rate,education_master_rate,education_professional_school_rate,education_doctoral_rate,race_white_rate,...,diversity,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate,number_of_solar_system_per_household,solar_flag
473,,9.046628,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,21,9,0,38,0,10,14.72,0.0,0
3949,55176.13636,39.29414,0.466165,0.24812,0.0,0.082707,0.203008,0.0,0.0,0.9875,...,0.024688,9,0,0,4,20,0,10.35,0.0,0
6545,,48.53258,0.0,0.0,0.0,0.722222,0.277778,0.0,0.0,1.0,...,0.0,12,0,0,0,0,0,10.26,0.111111,1
6557,,6.4596,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,12,0,0,0,0,0,10.26,1.0,1
6658,,7.906332,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,12,0,0,0,0,0,10.26,1.333333,1
10629,,625.8911,0.0,0.378378,0.0,0.0,0.216216,0.405405,0.0,0.513514,...,0.581446,20,0,0,40,12,7,15.32,0.0,0
11062,,58.15681,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,20,0,0,40,12,7,15.32,0.0625,1
11219,,16.88204,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,20,0,0,40,12,7,15.32,0.0,0
11520,,3.783903,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.5,...,0.5,9,0,0,4,20,0,10.35,0.0,0
12635,91663.63636,254.3293,0.0,0.533333,0.0,0.466667,0.0,0.0,0.0,0.733333,...,0.391111,20,0,0,40,12,7,15.32,0.0,0


In [97]:
for i in solar2.columns:
    print(i,solar2.loc[solar2[i]==sorted_frame.iloc[1][i]].shape)

average_household_income (1, 97)
population_density (1, 97)
education_less_than_high_school_rate (318, 97)
education_high_school_graduate_rate (64, 97)
education_college_rate (10, 97)
education_bachelor_rate (93, 97)
education_master_rate (865, 97)
education_professional_school_rate (11923, 97)
education_doctoral_rate (17670, 97)
race_white_rate (205, 97)
race_black_africa_rate (5884, 97)
race_indian_alaska_rate (34284, 97)
race_asian_rate (14272, 97)
race_islander_rate (61838, 97)
race_other_rate (15579, 97)
race_two_more_rate (3863, 97)
employ_rate (231, 97)
poverty_family_below_poverty_level_rate (0, 97)
heating_fuel_gas_rate (335, 97)
heating_fuel_electricity_rate (82, 97)
heating_fuel_fuel_oil_kerosene_rate (38651, 97)
heating_fuel_coal_coke_rate (37539, 97)
heating_fuel_solar_rate (67000, 97)
heating_fuel_other_rate (47476, 97)
heating_fuel_none_rate (39840, 97)
average_household_size (764, 97)
housing_unit_median_value (36, 97)
housing_unit_median_gross_rent (68, 97)
elevation (

KeyError: 'solar_flag'

In [98]:
sorted_frame.iloc[1]['age_65_74_rate']

1.0

In [112]:
solar.loc[solar['fips'].isin(['47053980100','42003981100'])]['population']

6675      5
61960    18
Name: population, dtype: int64

In [117]:
solar.loc[np.isnan(solar['number_of_solar_system_per_household'])]['population']

256      2142
458         0
470         0
471         0
633         0
635         0
637         0
2631        0
2682        0
2803        0
2840        0
3086        0
3204        0
3232        0
3350        0
3548        0
3554        0
3558        0
3640        0
3676     2872
3703        0
4084        0
4085        0
4159        0
4265        0
4539        0
4754        0
4913        0
4923        0
4967        0
         ... 
68854       0
69020       0
69259       0
69301       0
69335       0
69357    4026
69388       0
69759       0
70028       0
70080       0
70081       0
70105       0
70639    1229
70710       0
70741       0
70742       0
70827       0
70854       0
70961       0
70968       0
70984       0
71166    1456
71193     410
71241       0
71286       0
71805       0
71832    1354
71974       0
72416    3038
72425       0
Name: population, Length: 546, dtype: int64

In [118]:
solar.loc[solar['population']==0]['number_of_solar_system_per_household']

383      inf
456      inf
457      inf
458      NaN
460      inf
461      inf
462      inf
468      inf
470      NaN
471      NaN
472      inf
475      inf
477      inf
478      inf
480      inf
633      NaN
635      NaN
637      NaN
667      inf
822      inf
2631     NaN
2682     NaN
2803     NaN
2840     NaN
3085     inf
3086     NaN
3204     NaN
3232     NaN
3350     NaN
3548     NaN
        ... 
68295    NaN
68308    NaN
68337    NaN
68795    NaN
68816    NaN
68817    NaN
68854    NaN
69020    NaN
69259    NaN
69301    NaN
69335    NaN
69388    NaN
69759    NaN
70028    NaN
70080    NaN
70081    NaN
70105    NaN
70710    NaN
70741    NaN
70742    NaN
70827    NaN
70854    NaN
70961    NaN
70968    NaN
70984    NaN
71241    NaN
71286    NaN
71805    NaN
71974    NaN
72425    NaN
Name: number_of_solar_system_per_household, Length: 597, dtype: float64

In [125]:
solar.loc[(~np.isfinite(solar['number_of_solar_system_per_household']))
         | (solar['population']<20)]

Unnamed: 0.1,Unnamed: 0,tile_count,solar_system_count,total_panel_area,fips,average_household_income,county,education_bachelor,education_college,education_doctoral,...,incentive_count_nonresidential,incentive_residential_state_level,incentive_nonresidential_state_level,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate
256,256,33,6,3767.692177,6037574601,,Los Angeles County,18,30,0,...,85,15,16,21,9,0,38,0,10,14.72
259,259,20,6,2076.412403,6037574700,,Los Angeles County,17,66,0,...,85,15,16,21,9,0,38,0,10,14.72
383,383,12,4,691.239085,6037320000,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72
456,456,17,11,1273.364989,6037980001,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72
457,457,20,16,743.642371,6037980002,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72
458,458,0,0,0.000000,6037980003,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72
459,459,1463,249,216732.195600,6037980004,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72
460,460,106,27,18315.877960,6037980005,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72
461,461,36,6,3454.940635,6037980006,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72
462,462,4,4,79.332979,6037980007,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72


In [126]:
solar.loc[(solar['population']>0) & (solar['population']<20)]

Unnamed: 0.1,Unnamed: 0,tile_count,solar_system_count,total_panel_area,fips,average_household_income,county,education_bachelor,education_college,education_doctoral,...,incentive_count_nonresidential,incentive_residential_state_level,incentive_nonresidential_state_level,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate
459,459,1463,249,216732.1956,6037980004,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72
464,464,10,3,899.305968,6037980009,,Los Angeles County,1,0,0,...,85,15,16,21,9,0,38,0,10,14.72
473,473,0,0,0.0,6037980023,,Los Angeles County,0,0,0,...,85,15,16,21,9,0,38,0,10,14.72
4912,4912,9,5,410.593979,36047070202,,Kings County,4,9,0,...,36,17,16,20,0,0,40,12,7,15.32
4974,4974,0,0,0.0,36047085200,,Kings County,0,8,0,...,36,17,16,20,0,0,40,12,7,15.32
6557,6557,15,6,767.322074,42003980500,,Allegheny County,0,0,0,...,31,6,9,12,0,0,0,0,0,10.26
6631,6631,0,0,0.0,42003980700,,Allegheny County,0,4,0,...,31,6,9,12,0,0,0,0,0,10.26
6658,6658,4,4,112.677877,42003980600,,Allegheny County,0,0,0,...,31,6,9,12,0,0,0,0,0,10.26
6675,6675,1,1,15.584607,42003981100,,Allegheny County,0,5,0,...,31,6,9,12,0,0,0,0,0,10.26
11062,11062,1,1,14.965209,36081060701,,Queens County,16,0,0,...,36,17,16,20,0,0,40,12,7,15.32


## Maybe Exclude FIPS Codes 47053980100 and 42003981100

In [100]:
regressor2=RandomForestRegressor(max_depth=None,n_estimators=25, n_jobs=-1)
regressor2.fit(X_trains[2],yr_trains[2])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [101]:
regressor_preds2=regressor2.predict(X_tests[2])
r2_score(yr_tests[2],regressor_preds2)

0.23632139058628387

# Try Again Excluding Rows Where Population < 20

In [177]:
features=solar_fields.loc[(solar_fields['Mentioned in Supplemental Info']==1)]['Field'].tolist()
all_variables=features+['number_of_solar_system_per_household']

#all_variables=features+['number_of_solar_system_per_household']+['incentive_count_residential']+['incentive_residential_state_level']

solar2=solar[all_variables].loc[(np.isfinite(solar['number_of_solar_system_per_household']))
                               & (solar['population']>100)]

#Create binary version of number_of_solar_system_per_household for RF classifier

solar2['solar_flag']=solar2['number_of_solar_system_per_household'].apply(lambda x: int(x>0))
#solar2=solar2.loc[np.isfinite(solar2['number_of_solar_system_per_household'])]

#designate independent variable frame
independent_vars=solar2.loc[:,~solar2.columns.isin(['number_of_solar_system_per_household','solar_flag'])]
X=independent_vars.values
yc=solar2['solar_flag'].values
yr=solar2['number_of_solar_system_per_household'].values

#impute missing values
missing_val_imputer=Imputer(strategy='median')
X=missing_val_imputer.fit_transform(X)

#loop through different max depths and numbers of estimators

#max_depths=[15, 20, 30, None]
#n_estimators=[100,150,200,300]

n_estimators=[100,150,200]
max_depths=[15, 30, None]


model_params=[(x,y,a,b) for x in max_depths for y in n_estimators for a in max_depths for b in n_estimators]

# split the data 3 times into train/test folds

X_trains=[]
X_tests=[]
yr_trains=[]
yr_tests=[]
yc_trains=[]
yc_tests=[]

folds=KFold(n_splits=3, random_state=None, shuffle=True)
for train_index, test_index in folds.split(X):

    X_trains.append(X[train_index])
    X_tests.append(X[test_index])
    yr_trains.append(yr[train_index])
    yc_trains.append(yc[train_index])
    yr_tests.append(yr[test_index])
    yc_tests.append(yc[test_index])


In [178]:
regressor=RandomForestRegressor(max_depth=None,n_estimators=25, n_jobs=-1)
regressor.fit(X_trains[0],yr_trains[0])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [179]:
regressor_preds=regressor.predict(X_tests[0])
r2_score(yr_tests[0],regressor_preds)

0.46096283814683836

In [150]:
regressor1=RandomForestRegressor(max_depth=None,n_estimators=25, n_jobs=-1)
regressor1.fit(X_trains[1],yr_trains[1])
regressor_preds1=regressor1.predict(X_tests[1])
r2_score(yr_tests[1],regressor_preds1)

0.37480951599843859

In [151]:
regressor2=RandomForestRegressor(max_depth=None,n_estimators=25, n_jobs=-1)
regressor2.fit(X_trains[2],yr_trains[2])
regressor_preds2=regressor2.predict(X_tests[2])
r2_score(yr_tests[2],regressor_preds2)

0.4773959984584325

In [134]:
X_tests[0].shape

(23908, 95)

In [180]:
classifier=RandomForestClassifier(max_depth=None,n_estimators=100, n_jobs=-1)
classifier.fit(X_trains[0],yc_trains[0])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [181]:
classifier_preds=classifier.predict(X_tests[0])
accuracy_score(yc_tests[0],classifier_preds)

0.81257324627490368

In [182]:
final_preds=regressor_preds*classifier_preds

In [183]:
r2_score(yr_tests[0],final_preds)

0.46831814893492663

In [158]:
solar2.shape

(71675, 97)