# Machine Learning Models For Solar Cost Datasets

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import plotly.express as px

In [2]:
# Load the PGE dataset from the clean_data directory
# (the file read here is df_PGE.csv, not an SDGE file)
df = pd.read_csv("clean_data/df_PGE.csv")

In [3]:
# Normalise zip codes: cast to int, then to text, then left-pad with zeros to 5 characters
df['Service_Zip'] = (
    df['Service_Zip']
    .astype(int)
    .astype(str)
    .str.zfill(5)
)

In [4]:
# Report the number of distinct zip codes, cities and installers
print(f"Zipcodes: {df['Service_Zip'].nunique()}")
print(f"Cities: {df['Service_City'].nunique()}")
print(f"Installers: {df['Installer_Name'].nunique()}")

Zipcodes: 842
Cities: 741
Installers: 56


In [5]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity,Year
0,PGE,KNIGHTSEN,94548,CONTRA COSTA,Solar,2.827,0.0,4.93,Rooftop,2018-10-17,Self-installed,No,No,12000.0,Suntech Power,Enphase,17.0,17.0,2018
1,PGE,STONYFORD,95979,COLUSA,Solar,6.845,0.0,6.5,Other,2018-05-20,Other,No,No,30000.0,Other,Altenergy,25.0,13.0,2018
2,PGE,DANVILLE,94506,CONTRA COSTA,Solar,12.651,0.0,17.64,Other,2019-08-07,Sky Power,No,No,55200.0,SunPower,SunPower,56.0,56.0,2019
3,PGE,ARROYO GRANDE,93420,SAN LUIS OBISPO,Solar,4.768,0.0,5.0,Rooftop,2019-07-15,Self-installed,No,Yes,20000.0,SolarWorld,SMA America,18.0,1.0,2019
4,PGE,ROCKLIN,95765,PLACER,Solar,2.71,0.0,2.88,Rooftop,2019-01-17,SunPower,No,No,12814.0,SunPower,SunPower,9.0,9.0,2019


In [6]:
# Show column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355991 entries, 0 to 355990
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Utility                 355991 non-null  object 
 1   Service_City            355991 non-null  object 
 2   Service_Zip             355991 non-null  object 
 3   Service_County          355991 non-null  object 
 4   Technology_Type         355991 non-null  object 
 5   System_Size_AC          355991 non-null  float64
 6   Storage_Size_kW_AC      355991 non-null  float64
 7   Inverter_Size_kW_AC     355991 non-null  float64
 8   Mounting_Method         355991 non-null  object 
 9   App_Received_Date       355991 non-null  object 
 10  Installer_Name          355991 non-null  object 
 11  Third_Party_Owned       355991 non-null  object 
 12  Electric_Vehicle        355991 non-null  object 
 13  Total_System_Cost       355991 non-null  float64
 14  Generator_Manufactur

In [7]:
# Optionally remove rows whose Installer_Name is "Other" (disabled: the line below is commented out)
#df = df[df['Installer_Name'] != 'Other']

In [8]:
# Columns excluded from the feature set.
# Alternative drop sets that were tried earlier (kept for reference):
#   ['Utility', 'Service_Zip', 'App_Received_Date', 'Year', 'Service_County',
#    'Inverter_Size_kW_AC', 'Inverter_Manufacturer', 'Inverter_Quantity']
#   ['Utility', 'Year', 'Service_County']
#   ['Utility', 'App_Received_Date']
columns_to_drop = ['Utility', 'Service_Zip', 'App_Received_Date', 'Service_County']

# Rebind instead of mutating in place, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()

Unnamed: 0,Service_City,Service_Zip,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity
0,KNIGHTSEN,94548,Solar,2.827,0.0,4.93,Rooftop,2018-10-17,Self-installed,No,No,12000.0,Suntech Power,Enphase,17.0,17.0
1,STONYFORD,95979,Solar,6.845,0.0,6.5,Other,2018-05-20,Other,No,No,30000.0,Other,Altenergy,25.0,13.0
2,DANVILLE,94506,Solar,12.651,0.0,17.64,Other,2019-08-07,Sky Power,No,No,55200.0,SunPower,SunPower,56.0,56.0
3,ARROYO GRANDE,93420,Solar,4.768,0.0,5.0,Rooftop,2019-07-15,Self-installed,No,Yes,20000.0,SolarWorld,SMA America,18.0,1.0
4,ROCKLIN,95765,Solar,2.71,0.0,2.88,Rooftop,2019-01-17,SunPower,No,No,12814.0,SunPower,SunPower,9.0,9.0


In [9]:
# Results for PGE (features not scaled, n_estimators=100, negative and small values not dropped)

In [10]:
#columns_to_drop = ['Service_Zip', 'App_Received_Date'] #Mean Absolute Error: 6335.3067956234045 and R-squared: 0.6231565771055634
#columns_to_drop = ['App_Received_Date'] #Mean Absolute Error: 6291.18175942384 and R-squared: 0.6249063767730472
#columns_to_drop = ['Year'] # Mean Absolute Error: 6264.777424996714 and R-squared: 0.6271341735008127
#columns_to_drop = ['Utility', 'App_Received_Date'] #Mean Absolute Error: 6285.821611756003 and R-squared: 0.6281995925685182
#columns_to_drop = ['Utility', 'App_Received_Date', 'Inverter_Size_kW_AC'] #Mean Absolute Error: 6387.598217884039
# and R-squared: 0.6197025210399145
#df.drop(columns=columns_to_drop, inplace=True) 
#df.head()

In [11]:
# One-hot encode every categorical (object-dtype) column with scikit-learn's OneHotEncoder
cat_columns = df.dtypes[df.dtypes == "object"].index.tolist()
enc = OneHotEncoder(sparse_output=False)
enc_data = enc.fit_transform(df[cat_columns])
enc_columns = enc.get_feature_names_out().tolist()

# Reuse df's index so the column-wise concat that follows lines up row-for-row even
# when rows were filtered out of df beforehand; a fresh RangeIndex would misalign
# with a non-contiguous index and introduce NaN-filled rows.
encode_df = pd.DataFrame(enc_data, columns=enc_columns, index=df.index)
encode_df.head()

Unnamed: 0,Service_City_ACAMPO,Service_City_ADELAIDE,Service_City_AHWAHNEE,Service_City_ALAMO,Service_City_ALBANY,Service_City_ALBION,Service_City_ALDERPOINT,Service_City_ALLENSWORTH,Service_City_ALPAUGH,Service_City_ALTA,...,Inverter_Manufacturer_Sanyo,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_Sharp,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [12]:
# Append the one-hot columns to the original frame, side by side
df = pd.concat([df, encode_df], axis='columns')
df.head()

Unnamed: 0,Service_City,Service_Zip,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,...,Inverter_Manufacturer_Sanyo,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_Sharp,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,KNIGHTSEN,94548,Solar,2.827,0.0,4.93,Rooftop,2018-10-17,Self-installed,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,STONYFORD,95979,Solar,6.845,0.0,6.5,Other,2018-05-20,Other,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DANVILLE,94506,Solar,12.651,0.0,17.64,Other,2019-08-07,Sky Power,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,ARROYO GRANDE,93420,Solar,4.768,0.0,5.0,Rooftop,2019-07-15,Self-installed,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ROCKLIN,95765,Solar,2.71,0.0,2.88,Rooftop,2019-01-17,SunPower,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [13]:
# The raw categorical columns are now redundant with their one-hot versions; remove them
df = df.drop(labels=cat_columns, axis='columns')

In [14]:
# Target is the total system cost; every remaining column is a feature
y = df['Total_System_Cost']
X = df.drop(columns=['Total_System_Cost'])

In [15]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
# TODO: try other split ratios for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Create the StandardScaler (unfitted here; it is fitted on the training split in the next cell)
scaler = StandardScaler()

In [17]:
# Learn the scaling parameters from the training features only, then apply them to that same split
X_train_scaled = scaler.fit(X_train).transform(X_train)

In [18]:
# Transform the test data using the same fitted scaler
# (transform only, no refit, so the scaler is never fitted on the test split)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Random forest regressor: 50 trees, fixed seed, n_jobs=-1 to use all available cores
rf_model = RandomForestRegressor(
    n_estimators=50,
    n_jobs=-1,
    random_state=42,
)


In [20]:
# Train the forest on the scaled training features and their costs
rf_model.fit(X=X_train_scaled, y=y_train)

In [29]:
# Predict total system cost for the scaled held-out features
predictions = rf_model.predict(X=X_test_scaled)

In [30]:
# Score the held-out predictions with mean absolute error and R-squared
r_squared = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error: {mae}', f'R-squared: {r_squared}', sep='\n')


Mean Absolute Error: 6329.810707714982
R-squared: 0.6234910364980804


In [31]:
# Collect the feature names and the importances reported by the fitted forest
feature_names = X.columns
feature_importances = rf_model.feature_importances_

In [32]:
# Two-column table: one row per feature with its importance score
feature_importance_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': feature_importances,
    }
)

In [33]:
# Rank features from most to least important
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

In [34]:
# Bar chart of the first ten rows of the importance table
# (the table was sorted in descending order by the previous cell)
top_features = feature_importance_df.iloc[:10]
fig_top = px.bar(
    top_features,
    x='Feature',
    y='Importance',
    title='Top 10 Most Important Features',
)
fig_top.show()