# Machine Learning Models For Solar Cost Datasets

In [1]:
# Import our dependencies
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
import pandas as pd
import tensorflow as tf
import numpy as np

# import xgboost dependencies
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import plot_importance

# disable the gpu on mac_m2 so it runs correctly
# https://github.com/keras-team/tf-keras/issues/140
# https://github.com/tensorflow/tensorflow/issues/62361
# Hide all GPUs explicitly rather than keeping only the first visible device,
# which only worked when the CPU happened to be listed first.
# NOTE: this has to run before TensorFlow initializes its devices.
tf.config.set_visible_devices([], 'GPU')

In [44]:
# import the Utility dataset
utility = 'SDGE'

# The directory holding df_<utility>.csv can be overridden with the
# SOLAR_DATA_DIR environment variable (e.g. SOLAR_DATA_DIR=DataFrames) so the
# notebook is not tied to one machine; the default is the original location.
data_dir = os.environ.get("SOLAR_DATA_DIR", "/Users/ns96/Documents/ML_Project")
data_file = os.path.join(data_dir, "df_" + utility + ".csv")

utility_df = pd.read_csv(data_file)

In [45]:
# Summary statistics of the regression target (Total_System_Cost)
utility_df['Total_System_Cost'].describe()

count    1.678570e+05
mean     2.915975e+04
std      2.564340e+04
min      7.000000e+03
25%      1.733000e+04
50%      2.450000e+04
75%      3.500000e+04
max      2.976760e+06
Name: Total_System_Cost, dtype: float64

In [46]:
# Reduce the number of distinct cities: a city with fewer than 200 rows is set
# to 'Other'. First count how many rows each city has.
city_counts = utility_df['Service_City'].value_counts().to_dict()

def check_count(city, counts=None, min_count=200):
    """Return `city` if it is frequent enough, otherwise the label 'Other'.

    Parameters
    ----------
    city : hashable
        City name to look up.
    counts : mapping, optional
        Mapping of city name -> number of rows. Defaults to the notebook-level
        `city_counts` dictionary.
    min_count : int, optional
        Minimum number of rows a city needs to keep its own name (default 200).

    Returns
    -------
    The original `city` when its count is at least `min_count`, else 'Other'.
    A city that is absent from `counts` (for example a missing value, which
    `value_counts` does not tally) is treated as having zero rows.
    """
    if counts is None:
        counts = city_counts
    # .get avoids a KeyError for cities that never appeared in the counts
    if counts.get(city, 0) >= min_count:
        return city
    return 'Other'

# Relabel every low-volume city using check_count
utility_df['Service_City'] = utility_df['Service_City'].map(check_count)

In [47]:
# Row count per city after the low-volume cities were grouped
utility_df.Service_City.value_counts()

Service_City
SAN DIEGO          63455
CHULA VISTA        11088
OCEANSIDE           9488
ESCONDIDO           8597
EL CAJON            5973
CARLSBAD            5659
SAN CLEMENTE        5555
SAN MARCOS          5176
VISTA               4779
FALLBROOK           3787
LA MESA             3270
SPRING VALLEY       3255
SANTEE              3235
POWAY               3036
ENCINITAS           2994
RAMONA              2327
LAKESIDE            2010
VALLEY CENTER       1818
LAGUNA NIGUEL       1780
LA JOLLA            1444
SAN JUAN CAPO       1266
MISSION VIEJO       1123
LADERA RANCH        1116
LEMON GROVE         1098
ALPINE              1054
BONITA              1037
RANCHO SANTA FE     1019
RCH MSN VIEJO       1018
Other                974
NATIONAL CITY        871
DEL MAR              820
LAGUNA HILLS         785
DANA POINT           700
COTO DE CAZA         680
CARDIFF              591
CORONADO             567
SOLANA BEACH         567
JAMUL                531
RANCHO LA COSTA      504
IMPERIAL BEA

In [48]:
# Store zip codes as zero-padded 5-character strings instead of numbers
zip_as_int = utility_df['Service_Zip'].astype(int)
utility_df['Service_Zip'] = zip_as_int.astype(str).str.zfill(5)
utility_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167857 entries, 0 to 167856
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Utility                 167857 non-null  object 
 1   Service_City            167857 non-null  object 
 2   Service_Zip             167857 non-null  object 
 3   Service_County          167857 non-null  object 
 4   Technology_Type         167857 non-null  object 
 5   System_Size_AC          167857 non-null  float64
 6   Storage_Size_kW_AC      167857 non-null  float64
 7   Inverter_Size_kW_AC     167857 non-null  float64
 8   Mounting_Method         167857 non-null  object 
 9   App_Received_Date       167857 non-null  object 
 10  Installer_Name          167857 non-null  object 
 11  Third_Party_Owned       167857 non-null  object 
 12  Electric_Vehicle        167857 non-null  object 
 13  Total_System_Cost       167857 non-null  float64
 14  Generator_Manufactur

In [49]:
# Impute missing values column by column:
#   Storage_Size_kW_AC  -> 0
#   Inverter_Size_kW_AC -> 4.9 (mean value imputation)
#   Third_Party_Owned   -> 'No'
fill_values = {
    'Storage_Size_kW_AC': 0,
    'Inverter_Size_kW_AC': 4.9,
    'Third_Party_Owned': 'No',
}
for column_name, fill_value in fill_values.items():
    utility_df[column_name] = utility_df[column_name].fillna(fill_value)

# confirm that no nulls remain
utility_df.isna().sum()

Utility                   0
Service_City              0
Service_Zip               0
Service_County            0
Technology_Type           0
System_Size_AC            0
Storage_Size_kW_AC        0
Inverter_Size_kW_AC       0
Mounting_Method           0
App_Received_Date         0
Installer_Name            0
Third_Party_Owned         0
Electric_Vehicle          0
Total_System_Cost         0
Generator_Manufacturer    0
Inverter_Manufacturer     0
Generator_Quantity        0
Inverter_Quantity         0
dtype: int64

In [50]:
# Report how many distinct zip codes, cities and installers are present
for label, column_name in (
    ("Zipcodes:", "Service_Zip"),
    ("Cities:", "Service_City"),
    ("Installers:", "Installer_Name"),
):
    print(label, utility_df[column_name].nunique())

Zipcodes: 116
Cities: 47
Installers: 48


In [51]:
# Drop the columns that are not used as model features: the utility name,
# the zip code (Service_City is kept instead) and the application date.
# Generator_Manufacturer and Inverter_Manufacturer are kept.
columns_to_drop = ['Utility', 'Service_Zip', 'App_Received_Date']

utility_df = utility_df.drop(columns=columns_to_drop)
utility_df.head()

Unnamed: 0,Service_City,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity
0,SAN DIEGO,SAN DIEGO,Storage,10.0,10.0,0.01,Other,Other,No,No,18415.0,Other,Tesla,0.0,2.0
1,SAN DIEGO,SAN DIEGO,Solar,3.676,0.0,3.8,Rooftop,Tesla,No,No,21709.0,Kyocera,ABB,16.0,1.0
2,SAN DIEGO,SAN DIEGO,Solar,7.639,0.0,5.052,Rooftop,Self-installed,No,No,38000.0,Hanwha,SolarEdge,24.0,1.0
3,LEMON GROVE,SAN DIEGO,Solar,4.76,0.0,15.0,Rooftop,Self-installed,No,No,18000.0,Other,SunPower,24.0,12.0
4,BORREGO SPRINGS,ORANGE,Solar,6.687,0.0,7.625,Rooftop,Other,No,No,35000.0,Hanwha,SolarEdge,29.0,1.0


In [52]:
# Convert categorical data to numeric with scikit-learn's `OneHotEncoder`
cat_columns = utility_df.dtypes[utility_df.dtypes == "object"].index.tolist()

enc = OneHotEncoder(sparse_output=False)
enc_data = enc.fit_transform(utility_df[cat_columns])
enc_columns = enc.get_feature_names_out().tolist()

# Reuse the source frame's index so each encoded row stays attached to the row
# it came from, even if utility_df does not have a default 0..n-1 index.
encode_df = pd.DataFrame(enc_data, columns=enc_columns, index=utility_df.index)
#display(encode_df.head())

# Replace the original categorical columns with their one-hot encoded versions
utility_df = pd.concat([utility_df.drop(columns=cat_columns), encode_df], axis=1)

print(utility_df.columns)

utility_df.head()

Index(['System_Size_AC', 'Storage_Size_kW_AC', 'Inverter_Size_kW_AC',
       'Total_System_Cost', 'Generator_Quantity', 'Inverter_Quantity',
       'Service_City_ALPINE', 'Service_City_BONITA', 'Service_City_BONSALL',
       'Service_City_BORREGO SPRINGS',
       ...
       'Inverter_Manufacturer_Sanyo', 'Inverter_Manufacturer_Schneider',
       'Inverter_Manufacturer_SolarBridge', 'Inverter_Manufacturer_SolarEdge',
       'Inverter_Manufacturer_Solaria', 'Inverter_Manufacturer_Solectria',
       'Inverter_Manufacturer_SunPower', 'Inverter_Manufacturer_Sungrow',
       'Inverter_Manufacturer_Tesla', 'Inverter_Manufacturer_Xantrex'],
      dtype='object', length=172)


Unnamed: 0,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Total_System_Cost,Generator_Quantity,Inverter_Quantity,Service_City_ALPINE,Service_City_BONITA,Service_City_BONSALL,Service_City_BORREGO SPRINGS,...,Inverter_Manufacturer_Sanyo,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Sungrow,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,10.0,10.0,0.01,18415.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.676,0.0,3.8,21709.0,16.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7.639,0.0,5.052,38000.0,24.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.76,0.0,15.0,18000.0,24.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,6.687,0.0,7.625,35000.0,29.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Split our preprocessed data into our features and target arrays
y = utility_df['Total_System_Cost']
X = utility_df.drop(columns='Total_System_Cost')

# Split the preprocessed data into a training and testing dataset.
# A fixed random_state makes the shuffle, and therefore every metric reported
# below, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.50, shuffle=True, random_state=42
)

In [54]:
# Standardize the features: fit the scaler on the training split only, then
# apply the same transformation to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train, X_test = X_scaler.transform(X_train), X_scaler.transform(X_test)

# record the dimensions of the scaled training matrix
number_input_records, number_input_features = X_train.shape

print("Number of Input Features:", number_input_features)
print("Number of Input Records:", number_input_records)

Number of Input Features: 171
Number of Input Records: 83928


In [55]:
# Baseline: an XGBoost regressor with default hyperparameters, trained on the
# training split
model = xgb.XGBRegressor().fit(X_train, y_train)

# Predict the held-out test split
predictions = model.predict(X_test)

In [56]:
# Score the baseline predictions against the test targets
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)
rmse = np.sqrt(mse)  # square root of the MSE

print("Root Mean Squared Error:", rmse)
print("R-squared Score:", r2)

Root Mean Squared Error: 17860.74248391006
R-squared Score: 0.5956383641802186


In [28]:
# specify model and parameter range for grid search CV
# https://www.kaggle.com/code/jayatou/xgbregressor-with-gridsearchcv
# https://github.com/albertkklam/XGBRegressor/blob/master/XGBRegressor.ipynb
estimator = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.01,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1,
    # random_state is the scikit-learn-style seed parameter of the wrapper;
    # the previous `seed=` was passed through **kwargs, which xgboost does not
    # guarantee to interact properly with scikit-learn.
    random_state=42,
)

# hyperparameters to search: 8 tree depths x 9 ensemble sizes
parameters = {
    'max_depth': range(2, 10, 1),
    'n_estimators': range(50, 500, 50),
}

# specify the grid search object (3-fold CV, scored by negative MSE,
# using all available cores)
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=3,
    verbose=True,
)

In [None]:
%%time
# Run the grid search on the scaled training split. The grid defined above has
# 8 max_depth values x 9 n_estimators values, each evaluated with 3-fold CV;
# %%time reports how long the cell took.
grid_search.fit(X_train, y_train)

In [None]:
# see the best parameters https://www.projectpro.io/recipes/find-optimal-parameters-using-gridsearchcv
# best_score_ is reported in the search's scoring metric (neg_mean_squared_error)
print(" Results from Grid Search")
report = (
    ("\n The best estimator across ALL searched params:\n", grid_search.best_estimator_),
    ("\n The best score across ALL searched params:\n", grid_search.best_score_),
    ("\n The best parameters across ALL searched params:\n", grid_search.best_params_),
)
for heading, value in report:
    print(heading, value)

In [None]:
# Evaluate the best estimator found by the grid search on the test split
model = grid_search.best_estimator_
# X_test already holds the scaled test features (the StandardScaler cell
# overwrote it), so there is no separate X_test_scaled variable.
predictions = model.predict(X_test)

# Calculate the mean squared error and R-squared score
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("Root Mean Squared Error:", rmse)
print("R-squared Score:", r2)

In [None]:
# plot feature importance
n_top_features = 20
sorted_idx = model.feature_importances_.argsort()[::-1]  # most important first

# X_test is a NumPy array after scaling and has no `.columns`; the feature
# names come from X, the DataFrame the train/test split was made from.
feature_names = X.columns
top_features = feature_names[sorted_idx][:n_top_features].tolist()
top_importances = model.feature_importances_[sorted_idx][:n_top_features]

plt.barh(top_features, top_importances)
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.xlabel("Feature importance")
plt.title("Top " + str(n_top_features) + " features")
plt.show()

In [None]:
# save the model for future use
model_file = "models/xgb_model-" + utility + ".json"
# make sure the target directory exists, otherwise save_model fails
os.makedirs(os.path.dirname(model_file), exist_ok=True)
model.save_model(model_file)