# Machine Learning Models For Solar Cost Datasets

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import plotly.express as px

In [2]:
# Load the SDGE dataset
sdge_df = pd.read_csv("clean_data/df_SCE.csv")

In [3]:
# Convert zip code to string
sdge_df['Service_Zip'] = sdge_df['Service_Zip'].astype(int).astype(str).str.zfill(5)

In [4]:
# Impute missing values for Storage_Size_kW_AC
sdge_df['Storage_Size_kW_AC'] = sdge_df['Storage_Size_kW_AC'].fillna(0)
sdge_df.isnull().sum()

Utility                        0
Service_City                   0
Service_Zip                    0
Service_County                 0
Technology_Type                0
System_Size_AC                 0
Storage_Size_kW_AC             0
Inverter_Size_kW_AC       347358
Mounting_Method                0
App_Received_Date              0
Installer_Name                 0
Third_Party_Owned              0
Electric_Vehicle               0
Total_System_Cost              0
Generator_Manufacturer         0
Inverter_Manufacturer          0
Generator_Quantity             0
Inverter_Quantity              0
dtype: int64

In [5]:
# see how many installers there are
print("Zipcodes:", sdge_df.Service_Zip.nunique())
print("Cities:", sdge_df.Service_City.nunique())
print("Installers:", sdge_df.Installer_Name.nunique())

Zipcodes: 663
Cities: 382
Installers: 51


In [6]:
sdge_df.head()

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity
0,SCE,DAGGETT,92327,San Bernardino,Solar,137.21,0.0,,Other,2019-01-15,Shorebreak,Yes,No,1653333.0,SolarWorld,SMA America,508.0,6.0
1,SCE,MONROVIA,91016,Los Angeles,Solar,6.903,0.0,,Rooftop,2020-01-23,Infinity Energy,No,No,41258.0,Longi Green Energy,SolarEdge,25.0,1.0
2,SCE,STRATHMORE,93267,Tulare,Solar,8.64,0.0,,Rooftop,2020-08-18,Semper,No,No,42500.0,Sanyo,SolarEdge,16.0,1.0
3,SCE,LINDSAY,93247,Tulare,Solar,13.975,0.0,,Rooftop,2020-09-14,Other,No,No,39950.0,Other,Fronius,22.0,1.0
4,SCE,SANTA CLARITA,91390,Los Angeles,Solar,14.472,0.0,,Rooftop,2020-08-31,Other,No,No,39000.0,Other,Enphase,50.0,50.0


In [7]:
# Convert categorical data to numeric with `pd.get_dummies`
cat_columns = sdge_df.dtypes[sdge_df.dtypes == "object"].index.tolist()
enc = OneHotEncoder(sparse_output=False)
enc_data = enc.fit_transform(sdge_df[cat_columns])
enc_columns = enc.get_feature_names_out().tolist()

encode_df = pd.DataFrame(enc_data, columns=enc_columns)
encode_df.head()

Unnamed: 0,Utility_SCE,Service_City_ACTON,Service_City_ADELANTO,Service_City_AGOURA,Service_City_AGOURA HILLS,Service_City_AGUA DULCE,Service_City_AGUANGA,Service_City_ALHAMBRA,Service_City_ALISO VIEJO,Service_City_ALTA LOMA,...,Inverter_Manufacturer_SMA America,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Sungrow,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Merge the encoded data back to the original dataframe
sdge_df = pd.concat([sdge_df, encode_df], axis=1)
sdge_df.head()

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,...,Inverter_Manufacturer_SMA America,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Sungrow,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,SCE,DAGGETT,92327,San Bernardino,Solar,137.21,0.0,,Other,2019-01-15,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SCE,MONROVIA,91016,Los Angeles,Solar,6.903,0.0,,Rooftop,2020-01-23,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SCE,STRATHMORE,93267,Tulare,Solar,8.64,0.0,,Rooftop,2020-08-18,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SCE,LINDSAY,93247,Tulare,Solar,13.975,0.0,,Rooftop,2020-09-14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SCE,SANTA CLARITA,91390,Los Angeles,Solar,14.472,0.0,,Rooftop,2020-08-31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Drop the original categorical columns
sdge_df = sdge_df.drop(columns=cat_columns)

In [10]:
# Split the data into features and target variable
X = sdge_df.drop(columns='Total_System_Cost')
y = sdge_df['Total_System_Cost']

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Create the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)


In [13]:
# Fit the model
rf_model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Make predictions on the test set
predictions = rf_model.predict(X_test)

In [None]:
# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
r_squared = r2_score(y_test, predictions)
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r_squared}')


In [None]:
# Get feature importances
feature_importances = rf_model.feature_importances_
feature_names = X.columns