# Machine Learning Models For Solar Cost Datasets

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import plotly.express as px

In [12]:
# Read the SDGE records from the project-relative CSV into a DataFrame
sdge_csv_path = "DataFrames/df_SDGE.csv"
sdge_df = pd.read_csv(sdge_csv_path)

In [13]:
# Store zip codes as zero-padded, 5-character strings rather than numbers
zip_as_int = sdge_df['Service_Zip'].astype(int)
sdge_df['Service_Zip'] = zip_as_int.astype(str).str.zfill(5)

In [15]:
# Replace missing Storage_Size_kW_AC values with 0, then count the nulls
# remaining in every column (displayed as the cell output)
storage_size_filled = sdge_df['Storage_Size_kW_AC'].fillna(0)
sdge_df['Storage_Size_kW_AC'] = storage_size_filled
sdge_df.isna().sum()

Utility                   0
Service_City              0
Service_Zip               0
Service_County            0
Technology_Type           0
System_Size_AC            0
Storage_Size_kW_AC        0
Inverter_Size_kW_AC       0
Mounting_Method           0
App_Received_Date         0
Installer_Name            0
Third_Party_Owned         0
Electric_Vehicle          0
Total_System_Cost         0
Generator_Manufacturer    0
Inverter_Manufacturer     0
Generator_Quantity        0
Inverter_Quantity         0
dtype: int64

In [16]:
# Report how many distinct zip codes, cities and installers appear in the data
for label, column in (
    ("Zipcodes", "Service_Zip"),
    ("Cities", "Service_City"),
    ("Installers", "Installer_Name"),
):
    print(f"{label}:", sdge_df[column].nunique())

Zipcodes: 116
Cities: 70
Installers: 48


In [17]:
# Preview the first five rows
sdge_df.head(n=5)

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity
0,SDGE,SAN DIEGO,92130,SAN DIEGO,Storage,10.0,10.0,0.01,Other,2019-05-21,Other,No,No,18415.0,Other,Tesla,0.0,2.0
1,SDGE,SAN DIEGO,92105,SAN DIEGO,Solar,3.676,0.0,3.8,Rooftop,2019-02-14,Tesla,No,No,21709.0,Kyocera,ABB,16.0,1.0
2,SDGE,SAN DIEGO,92110,SAN DIEGO,Solar,7.639,0.0,5.052,Rooftop,2019-09-24,Self-installed,No,No,38000.0,Hanwha,SolarEdge,24.0,1.0
3,SDGE,LEMON GROVE,91945,SAN DIEGO,Solar,4.76,0.0,15.0,Rooftop,2018-11-30,Self-installed,No,No,18000.0,Other,SunPower,24.0,12.0
4,SDGE,BORREGO SPRINGS,92004,ORANGE,Solar,6.687,0.0,7.625,Rooftop,2018-01-02,Other,No,No,35000.0,Hanwha,SolarEdge,29.0,1.0


In [21]:
# One-hot encode every object-dtype column with scikit-learn's OneHotEncoder
# (dense output, one indicator column per distinct value of each column).
# NOTE(review): the object columns appear to include Service_Zip (cast to str
# earlier) and App_Received_Date (shown as date strings in the preview), so each
# distinct zip code / date gets its own indicator column — confirm this is
# intended, or derive numeric date features instead.
cat_columns = sdge_df.dtypes[sdge_df.dtypes == "object"].index.tolist()
enc = OneHotEncoder(sparse_output=False)
enc_data = enc.fit_transform(sdge_df[cat_columns])
enc_columns = enc.get_feature_names_out().tolist()

# Reuse sdge_df's index: the encoded frame is later concatenated column-wise,
# and pandas aligns on index labels, so a fresh 0..n-1 index would misalign
# rows (and introduce NaNs) whenever sdge_df's index is not the default range.
encode_df = pd.DataFrame(enc_data, columns=enc_columns, index=sdge_df.index)
encode_df.head()

Unnamed: 0,Utility_SDGE,Service_City_AGUANGA,Service_City_ALISO VIEJO,Service_City_ALPINE,Service_City_BONITA,Service_City_BONSALL,Service_City_BORREGO SPRINGS,Service_City_BOULEVARD,Service_City_CAMPO,Service_City_CAPISTRANO BCH,...,Inverter_Manufacturer_Sanyo,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Sungrow,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Append the one-hot columns to the right of the existing columns
# (rows are matched on index labels), then preview the widened frame
sdge_df = pd.concat((sdge_df, encode_df), axis="columns")
sdge_df.head()

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,...,Inverter_Manufacturer_Sanyo,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Sungrow,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,SDGE,SAN DIEGO,92130,SAN DIEGO,Storage,10.0,10.0,0.01,Other,2019-05-21,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,SDGE,SAN DIEGO,92105,SAN DIEGO,Solar,3.676,0.0,3.8,Rooftop,2019-02-14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SDGE,SAN DIEGO,92110,SAN DIEGO,Solar,7.639,0.0,5.052,Rooftop,2019-09-24,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SDGE,LEMON GROVE,91945,SAN DIEGO,Solar,4.76,0.0,15.0,Rooftop,2018-11-30,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,SDGE,BORREGO SPRINGS,92004,ORANGE,Solar,6.687,0.0,7.625,Rooftop,2018-01-02,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Remove the columns listed in cat_columns now that their encoded versions exist
sdge_df = sdge_df.drop(cat_columns, axis="columns")

In [25]:
# Total_System_Cost is the regression target; every other column is a feature
target_column = 'Total_System_Cost'
y = sdge_df[target_column]
X = sdge_df.drop(columns=[target_column])

In [26]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [27]:
# Build a 100-tree random forest regressor with a fixed seed (not yet fitted)
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)


In [28]:
# Train the forest on the training split
rf_model.fit(X=X_train, y=y_train)

In [29]:
# Predict the target for the held-out test rows
predictions = rf_model.predict(X=X_test)

In [30]:
# Score the test-set predictions with mean absolute error and R^2, then print both
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
r_squared = r2_score(y_true=y_test, y_pred=predictions)
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r_squared}')


Mean Absolute Error: 9714.729173681728
R-squared: 0.8000414778618595


In [31]:
# Get feature importances
feature_importances = rf_model.feature_importances_
feature_names = X.columns