# DeepSolar Modeling

## Introduction

## Importing Libraries

In [1]:
import calendar
import math
import os
import re
import time
from datetime import datetime
from statistics import median

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from lightgbm import LGBMRegressor
from matplotlib import cm
from numpy import mean
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
# Apply the 'ggplot' matplotlib style sheet to figures drawn after this point.
plt.style.use('ggplot')
# Turn off pandas' chained-assignment warning (default='warn').
pd.set_option('mode.chained_assignment', None)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load in the data.
# The CSV location can be overridden with the DEEPSOLAR_CSV environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset the original absolute path is used unchanged.
DEEPSOLAR_CSV = os.environ.get(
    "DEEPSOLAR_CSV",
    "/Users/nick/Desktop/PythonData/deepsolar/deepsolar_tract.csv",
)
data = pd.read_csv(DEEPSOLAR_CSV, engine='python')


In [3]:
# data.info

In [4]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = data.shape
print('The data has {} rows and {} columns'.format(n_rows, n_cols))


The data has 72537 rows and 169 columns


In [5]:
# Preview the first ten rows of the loaded frame.
data.head(n=10)

Unnamed: 0.1,Unnamed: 0,tile_count,solar_system_count,total_panel_area,fips,average_household_income,county,education_bachelor,education_college,education_doctoral,...,incentive_count_nonresidential,incentive_residential_state_level,incentive_nonresidential_state_level,net_metering,feedin_tariff,cooperate_tax,property_tax,sales_tax,rebate,avg_electricity_retail_rate
0,0,0.0,0.0,0.0,27145011200,70352.789869,Stearns County,569,1690,13,...,39,11,13,34,0,0,25,12,0,9.46
1,1,25.0,21.0,1133.436461,27145011301,61727.085202,Stearns County,674,1434,108,...,39,11,13,34,0,0,25,12,0,9.46
2,2,3.0,3.0,64.505776,27145011302,71496.886583,Stearns County,854,1459,31,...,39,11,13,34,0,0,25,12,0,9.46
3,3,0.0,0.0,0.0,27145011304,86840.152755,Stearns County,640,1116,68,...,39,11,13,34,0,0,25,12,0,9.46
4,4,5.0,5.0,164.583303,27145011400,89135.315597,Stearns County,654,1314,15,...,39,11,13,34,0,0,25,12,0,9.46
5,5,0.0,0.0,0.0,27145011500,62225.903614,Stearns County,522,1395,24,...,39,11,13,34,0,0,25,12,0,9.46
6,6,2.0,2.0,25.299013,27145011600,41068.93617,Stearns County,49,278,32,...,39,11,13,34,0,0,25,12,0,9.46
7,7,0.0,0.0,0.0,27145010500,74073.833671,Stearns County,242,867,10,...,39,11,13,34,0,0,25,12,0,9.46
8,8,0.0,0.0,0.0,27145011100,69412.192435,Stearns County,527,1665,6,...,39,11,13,34,0,0,25,12,0,9.46
9,9,11.0,10.0,415.36535,27145010102,82502.407069,Stearns County,1582,1949,6,...,39,11,13,34,0,0,25,12,0,9.46


In [6]:
# Count NaN entries per column, then print the resulting Series under a header.
missing_per_column = data.isna().sum()
print("Dataset missing values:\n", missing_per_column)

Dataset missing values:
 Unnamed: 0                     0
tile_count                     0
solar_system_count             0
total_panel_area               0
fips                           0
                              ..
cooperate_tax                  0
property_tax                   0
sales_tax                      0
rebate                         0
avg_electricity_retail_rate    0
Length: 169, dtype: int64


In [7]:
# Materialise the column labels as a plain Python list and print them.
cols = list(data.columns)
print(cols)


['Unnamed: 0', 'tile_count', 'solar_system_count', 'total_panel_area', 'fips', 'average_household_income', 'county', 'education_bachelor', 'education_college', 'education_doctoral', 'education_high_school_graduate', 'education_less_than_high_school', 'education_master', 'education_population', 'education_professional_school', 'employed', 'gini_index', 'heating_fuel_coal_coke', 'heating_fuel_electricity', 'heating_fuel_fuel_oil_kerosene', 'heating_fuel_gas', 'heating_fuel_housing_unit_count', 'heating_fuel_none', 'heating_fuel_other', 'heating_fuel_solar', 'land_area', 'per_capita_income', 'population', 'population_density', 'poverty_family_below_poverty_level', 'poverty_family_count', 'race_asian', 'race_black_africa', 'race_indian_alaska', 'race_islander', 'race_other', 'race_two_more', 'race_white', 'state', 'total_area', 'unemployed', 'water_area', 'education_less_than_high_school_rate', 'education_high_school_graduate_rate', 'education_college_rate', 'education_bachelor_rate', 'edu

In [8]:
# Feature separation
# Column names used as training features. The order is significant and is kept
# exactly as originally written; the comments only label runs of related names.
train_features = [
    # solar totals, tract id, income
    'tile_count', 'solar_system_count', 'total_panel_area', 'fips',
    'average_household_income',
    # education_*
    'education_bachelor', 'education_college', 'education_doctoral',
    'education_high_school_graduate', 'education_less_than_high_school',
    'education_master', 'education_population',
    'education_professional_school',
    'employed', 'gini_index',
    # heating_fuel_*
    'heating_fuel_coal_coke', 'heating_fuel_electricity',
    'heating_fuel_fuel_oil_kerosene', 'heating_fuel_gas',
    'heating_fuel_housing_unit_count', 'heating_fuel_none',
    'heating_fuel_other', 'heating_fuel_solar',
    'land_area', 'per_capita_income', 'population', 'population_density',
    'poverty_family_below_poverty_level', 'poverty_family_count',
    # race_*
    'race_asian', 'race_black_africa', 'race_indian_alaska', 'race_islander',
    'race_other', 'race_two_more', 'race_white',
    'state', 'total_area', 'unemployed', 'water_area',
    # education_*_rate
    'education_less_than_high_school_rate',
    'education_high_school_graduate_rate',
    'education_college_rate', 'education_bachelor_rate',
    'education_master_rate', 'education_professional_school_rate',
    'education_doctoral_rate',
    # race_*_rate
    'race_white_rate', 'race_black_africa_rate', 'race_indian_alaska_rate',
    'race_asian_rate', 'race_islander_rate', 'race_other_rate',
    'race_two_more_rate',
    'employ_rate', 'poverty_family_below_poverty_level_rate',
    # heating_fuel_*_rate
    'heating_fuel_gas_rate', 'heating_fuel_electricity_rate',
    'heating_fuel_fuel_oil_kerosene_rate', 'heating_fuel_coal_coke_rate',
    'heating_fuel_solar_rate', 'heating_fuel_other_rate',
    'heating_fuel_none_rate',
    # solar panel area / tile / system breakdowns
    'solar_panel_area_divided_by_area', 'solar_panel_area_per_capita',
    'tile_count_residential', 'tile_count_nonresidential',
    'solar_system_count_residential', 'solar_system_count_nonresidential',
    'total_panel_area_residential', 'total_panel_area_nonresidential',
    'median_household_income',
    # electricity_price_*
    'electricity_price_residential', 'electricity_price_commercial',
    'electricity_price_industrial', 'electricity_price_transportation',
    'electricity_price_overall',
    # electricity_consume_*
    'electricity_consume_residential', 'electricity_consume_commercial',
    'electricity_consume_industrial', 'electricity_consume_total',
    # household / housing
    'household_count', 'average_household_size', 'housing_unit_count',
    'housing_unit_occupied_count', 'housing_unit_median_value',
    'housing_unit_median_gross_rent',
    # location and climate
    'lat', 'lon', 'elevation',
    'heating_design_temperature', 'cooling_design_temperature',
    'earth_temperature_amplitude', 'frost_days', 'air_temperature',
    'relative_humidity', 'daily_solar_radiation', 'atmospheric_pressure',
    'wind_speed', 'earth_temperature',
    'heating_degree_days', 'cooling_degree_days',
    # age_*_rate
    'age_18_24_rate', 'age_25_34_rate', 'age_more_than_85_rate',
    'age_75_84_rate', 'age_35_44_rate', 'age_45_54_rate', 'age_65_74_rate',
    'age_55_64_rate', 'age_10_14_rate', 'age_15_17_rate', 'age_5_9_rate',
    'household_type_family_rate', 'dropout_16_19_inschool_rate',
    # occupation_*_rate
    'occupation_construction_rate', 'occupation_public_rate',
    'occupation_information_rate', 'occupation_finance_rate',
    'occupation_education_rate', 'occupation_administrative_rate',
    'occupation_manufacturing_rate', 'occupation_wholesale_rate',
    'occupation_retail_rate', 'occupation_transportation_rate',
    'occupation_arts_rate', 'occupation_agriculture_rate',
    'occupancy_vacant_rate', 'occupancy_owner_rate', 'mortgage_with_rate',
    # transportation_*_rate
    'transportation_home_rate', 'transportation_car_alone_rate',
    'transportation_walk_rate', 'transportation_carpool_rate',
    'transportation_motorcycle_rate', 'transportation_bicycle_rate',
    'transportation_public_rate',
    # travel_time_*_rate
    'travel_time_less_than_10_rate', 'travel_time_10_19_rate',
    'travel_time_20_29_rate', 'travel_time_30_39_rate',
    'travel_time_40_59_rate', 'travel_time_60_89_rate',
    # health_insurance_*_rate
    'health_insurance_public_rate', 'health_insurance_none_rate',
    'age_median', 'travel_time_average',
    # voting_*_percentage
    'voting_2016_dem_percentage', 'voting_2016_gop_percentage',
    'voting_2012_dem_percentage', 'voting_2012_gop_percentage',
    'number_of_years_of_education', 'diversity',
    'number_of_solar_system_per_household',
    # incentive_*
    'incentive_count_residential', 'incentive_count_nonresidential',
    'incentive_residential_state_level',
    'incentive_nonresidential_state_level',
    # policy / tax / rate columns
    'net_metering', 'feedin_tariff', 'cooperate_tax', 'property_tax',
    'sales_tax', 'rebate', 'avg_electricity_retail_rate',
]
# Starts out empty.
log_features = list()
# Column names to drop.
# Each name needs its own comma: Python silently concatenates adjacent string
# literals, so 'Unnamed: 0' 'county' would become the single (non-existent)
# name 'Unnamed: 0county' and neither column would be matched.
cols_to_drop = ['Unnamed: 0', 'county', 'voting_2016_dem_win', 'voting_2012_dem_win']
