# Machine Learning Models For Solar Cost Datasets

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import plotly.express as px

In [2]:
# Read the cleaned SDGE records from disk into the working DataFrame
df = pd.read_csv(filepath_or_buffer="clean_data/df_SDGE.csv")

In [3]:
# Normalise zip codes: cast to integer, render as text, left-pad with zeros to 5 characters
df["Service_Zip"] = (
    df["Service_Zip"]
    .astype(int)
    .astype(str)
    .str.zfill(5)
)

In [4]:
# Report how many distinct zip codes, cities and installers appear in the data
print("Zipcodes:", df["Service_Zip"].nunique())
print("Cities:", df["Service_City"].nunique())
print("Installers:", df["Installer_Name"].nunique())

Zipcodes: 116
Cities: 70
Installers: 48


In [5]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity,Year
0,SDGE,SAN DIEGO,92130,SAN DIEGO,Storage,10.0,10.0,0.01,Other,2019-05-21,Other,No,No,18415.0,Other,Tesla,0.0,2.0,2019
1,SDGE,SAN DIEGO,92105,SAN DIEGO,Solar,3.676,0.0,3.8,Rooftop,2019-02-14,Tesla,No,No,21709.0,Kyocera,ABB,16.0,1.0,2019
2,SDGE,SAN DIEGO,92110,SAN DIEGO,Solar,7.639,0.0,5.052,Rooftop,2019-09-24,Self-installed,No,No,38000.0,Hanwha,SolarEdge,24.0,1.0,2019
3,SDGE,LEMON GROVE,91945,SAN DIEGO,Solar,4.76,0.0,15.0,Rooftop,2018-11-30,Self-installed,No,No,18000.0,Other,SunPower,24.0,12.0,2018
4,SDGE,BORREGO SPRINGS,92004,ORANGE,Solar,6.687,0.0,7.625,Rooftop,2018-01-02,Other,No,No,35000.0,Hanwha,SolarEdge,29.0,1.0,2018


In [6]:
# Print each column's name, non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168543 entries, 0 to 168542
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Utility                 168543 non-null  object 
 1   Service_City            168543 non-null  object 
 2   Service_Zip             168543 non-null  object 
 3   Service_County          168543 non-null  object 
 4   Technology_Type         168543 non-null  object 
 5   System_Size_AC          168543 non-null  float64
 6   Storage_Size_kW_AC      168543 non-null  float64
 7   Inverter_Size_kW_AC     168543 non-null  float64
 8   Mounting_Method         168543 non-null  object 
 9   App_Received_Date       168543 non-null  object 
 10  Installer_Name          168543 non-null  object 
 11  Third_Party_Owned       168543 non-null  object 
 12  Electric_Vehicle        168543 non-null  object 
 13  Total_System_Cost       168543 non-null  float64
 14  Generator_Manufactur

In [7]:
# Results for SDGE - not scaled, n_estimators=100, negative and small values not dropped

In [8]:
#columns_to_drop = ['Service_Zip', 'App_Received_Date'] #79% here
#columns_to_drop = ['App_Received_Date'] #Mean Absolute Error: 10005.585347959266 and R-squared: 0.7975549071933703
#columns_to_drop = ['Year'] #Mean Absolute Error: 9714.463670793002 and R-squared: 0.8023303546949349
#df.drop(columns=columns_to_drop, inplace=True)
#df.head()

In [9]:
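# Results for SDGE - not scaled, n_estimators=100, negative and small values not dropped
# NOTE(review): the scores logged in the next cell match the ones logged under the PGE
# heading further down - confirm which utility they actually belong to.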

In [10]:
# Candidate sets of columns to exclude before modelling; the trailing comment on each
# line records the scores noted for that set. Only the first assignment is live code.
# NOTE(review): the `df.drop(...)` call below is commented out, so `columns_to_drop`
# is assigned here but never applied to `df` in this cell.
columns_to_drop = ['Utility', 'Service_Zip', 'App_Received_Date', "Year"] #Mean Absolute Error: 6335.3067956234045 and R-squared: 0.6231565771055633
#columns_to_drop = ['App_Received_Date'] #Mean Absolute Error: 6291.18175942384 and R-squared: 0.6249063767730472
#columns_to_drop = ['Year'] # Mean Absolute Error: 6264.777424996714 and R-squared: 0.6271341735008127
#columns_to_drop = ['Utility', 'App_Received_Date'] #Mean Absolute Error: 6285.821611756003 and R-squared: 0.6281995925685182
#df.drop(columns=columns_to_drop, inplace=True) #Mean Absolute Error: 6291.18175942384 and R-squared: 0.6249063767730472
#df.head()

In [11]:
# Results for PGE - not scaled, n_estimators=100, negative and small values not dropped

In [12]:
#columns_to_drop = ['Service_Zip', 'App_Received_Date'] #Mean Absolute Error: 6335.3067956234045 and R-squared: 0.6231565771055634
#columns_to_drop = ['App_Received_Date'] #Mean Absolute Error: 6291.18175942384 and R-squared: 0.6249063767730472
#columns_to_drop = ['Year'] # Mean Absolute Error: 6264.777424996714 and R-squared: 0.6271341735008127
#columns_to_drop = ['Utility', 'App_Received_Date'] #Mean Absolute Error: 6285.821611756003 and R-squared: 0.6281995925685182
#columns_to_drop = ['Utility', 'App_Received_Date', 'Inverter_Size_kW_AC'] #Mean Absolute Error: 6387.598217884039
# and R-squared: 0.6197025210399145
#df.drop(columns=columns_to_drop, inplace=True) 
#df.head()

In [13]:
# One-hot encode every object-dtype column of `df` with scikit-learn's OneHotEncoder.
# NOTE(review): the column-drop calls in the earlier cells are commented out, so every
# object column still present in `df` (e.g. `App_Received_Date`, `Service_Zip`) is
# encoded here - confirm that is intended for the high-cardinality ones.
cat_columns = df.dtypes[df.dtypes == "object"].index.tolist()
enc = OneHotEncoder(sparse_output=False)  # dense array so it can be wrapped in a DataFrame
enc_data = enc.fit_transform(df[cat_columns])
enc_columns = enc.get_feature_names_out().tolist()

# Reuse df's index so that a column-wise concat with `df` pairs each encoded row with
# its source row even when df's index is not the default 0..n-1 range (for example
# after rows have been filtered out); a fresh RangeIndex would misalign in that case.
encode_df = pd.DataFrame(enc_data, columns=enc_columns, index=df.index)
encode_df.head()

Unnamed: 0,Utility_SDGE,Service_City_AGUANGA,Service_City_ALISO VIEJO,Service_City_ALPINE,Service_City_BONITA,Service_City_BONSALL,Service_City_BORREGO SPRINGS,Service_City_BOULEVARD,Service_City_CAMPO,Service_City_CAPISTRANO BCH,...,Inverter_Manufacturer_Sanyo,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Sungrow,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Append the one-hot columns to the right of the existing columns (aligned on index)
df = pd.concat(objs=[df, encode_df], axis="columns")
df.head()

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,...,Inverter_Manufacturer_Sanyo,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Sungrow,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,SDGE,SAN DIEGO,92130,SAN DIEGO,Storage,10.0,10.0,0.01,Other,2019-05-21,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,SDGE,SAN DIEGO,92105,SAN DIEGO,Solar,3.676,0.0,3.8,Rooftop,2019-02-14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SDGE,SAN DIEGO,92110,SAN DIEGO,Solar,7.639,0.0,5.052,Rooftop,2019-09-24,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SDGE,LEMON GROVE,91945,SAN DIEGO,Solar,4.76,0.0,15.0,Rooftop,2018-11-30,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,SDGE,BORREGO SPRINGS,92004,ORANGE,Solar,6.687,0.0,7.625,Rooftop,2018-01-02,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Remove the raw categorical columns now that their one-hot versions are in the frame
df = df.drop(labels=cat_columns, axis="columns")

In [16]:
# Separate the target column (Total_System_Cost) from the predictor columns
y = df["Total_System_Cost"]
X = df.drop(columns=["Total_System_Cost"])

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Configure a 100-tree random forest regressor with a fixed seed; n_jobs=-1 asks
# scikit-learn to use all available processors
rf_model = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)


In [None]:
# Train the forest on the training split
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test rows
predictions = rf_model.predict(X=X_test)

In [None]:
# Score the test-set predictions: mean absolute error and coefficient of determination
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
r_squared = r2_score(y_true=y_test, y_pred=predictions)
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r_squared}')


In [None]:
# Collect the predictor names and the fitted forest's importance score for each one
feature_names = X.columns
feature_importances = rf_model.feature_importances_
# NOTE(review): despite its name this is just another reference to the importance
# array, not a DataFrame - confirm whether a names/importances table was intended.
feature_importance_df = feature_importances
# Display the predictor names
feature_names