# Machine Learning Models For Solar Cost Datasets

In [1]:
from pprint import pprint

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the cleaned SDGE dataset from the clean_data folder
df = pd.read_csv(
    "clean_data/df_SDGE.csv",
)

In [3]:
# Normalise zip codes: cast to int, then to a zero-padded 5-character string
df['Service_Zip'] = (
    df['Service_Zip']
    .astype(int)
    .astype(str)
    .str.zfill(5)
)

In [4]:
# Count the distinct zip codes, cities and installers in the dataset
print("Zipcodes:", df["Service_Zip"].nunique())
print("Cities:", df["Service_City"].nunique())
print("Installers:", df["Installer_Name"].nunique())

Zipcodes: 116
Cities: 70
Installers: 48


In [5]:
# Preview the first rows of the raw dataset
df.head()

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity,Year
0,SDGE,SAN DIEGO,92130,SAN DIEGO,Storage,10.0,10.0,0.01,Other,2019-05-21,Other,No,No,18415.0,Other,Tesla,0.0,2.0,2019
1,SDGE,SAN DIEGO,92105,SAN DIEGO,Solar,3.676,0.0,3.8,Rooftop,2019-02-14,Tesla,No,No,21709.0,Kyocera,ABB,16.0,1.0,2019
2,SDGE,SAN DIEGO,92110,SAN DIEGO,Solar,7.639,0.0,5.052,Rooftop,2019-09-24,Self-installed,No,No,38000.0,Hanwha,SolarEdge,24.0,1.0,2019
3,SDGE,LEMON GROVE,91945,SAN DIEGO,Solar,4.76,0.0,15.0,Rooftop,2018-11-30,Self-installed,No,No,18000.0,Other,SunPower,24.0,12.0,2018
4,SDGE,BORREGO SPRINGS,92004,ORANGE,Solar,6.687,0.0,7.625,Rooftop,2018-01-02,Other,No,No,35000.0,Hanwha,SolarEdge,29.0,1.0,2018


In [6]:
# Summarise column dtypes and non-null counts of the raw dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168543 entries, 0 to 168542
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Utility                 168543 non-null  object 
 1   Service_City            168543 non-null  object 
 2   Service_Zip             168543 non-null  object 
 3   Service_County          168543 non-null  object 
 4   Technology_Type         168543 non-null  object 
 5   System_Size_AC          168543 non-null  float64
 6   Storage_Size_kW_AC      168543 non-null  float64
 7   Inverter_Size_kW_AC     168543 non-null  float64
 8   Mounting_Method         168543 non-null  object 
 9   App_Received_Date       168543 non-null  object 
 10  Installer_Name          168543 non-null  object 
 11  Third_Party_Owned       168543 non-null  object 
 12  Electric_Vehicle        168543 non-null  object 
 13  Total_System_Cost       168543 non-null  float64
 14  Generator_Manufactur

In [7]:
# List every column that contains at least one missing value
nan_columns = list(df.columns[df.isna().any()])
print("Columns with NaN values:", nan_columns)

Columns with NaN values: []


In [8]:
# Drop the rows whose Installer_Name is 'Other'
df = df.loc[df['Installer_Name'].ne('Other')]

In [9]:
# Results below: negative and small values were not dropped

In [10]:
# Drop the utility, location-code, date and inverter columns so they are not
# used as model features.
# The result is assigned back instead of using inplace=True: df comes from a
# boolean filter, and mutating such a frame in place can emit pandas'
# SettingWithCopyWarning in some versions.
columns_to_drop = [
    'Utility', 'Service_Zip', 'App_Received_Date', 'Year', 'Service_County',
    'Inverter_Size_kW_AC', 'Inverter_Manufacturer', 'Inverter_Quantity',
]
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,Service_City,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Mounting_Method,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Generator_Quantity
1,SAN DIEGO,Solar,3.676,0.0,Rooftop,Tesla,No,No,21709.0,Kyocera,16.0
2,SAN DIEGO,Solar,7.639,0.0,Rooftop,Self-installed,No,No,38000.0,Hanwha,24.0
3,LEMON GROVE,Solar,4.76,0.0,Rooftop,Self-installed,No,No,18000.0,Other,24.0
5,LA MESA,Solar,6.776,0.0,Rooftop,Self-installed,No,No,22000.0,LG,22.0
8,SAN DIEGO,Storage,5.0,5.0,Other,Solaire Energy,No,No,12178.0,Other,0.0


In [11]:
# Collect the names of all object-dtype columns; these are the categoricals to encode
cat_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]
cat_columns

['Service_City',
 'Technology_Type',
 'Mounting_Method',
 'Installer_Name',
 'Third_Party_Owned',
 'Electric_Vehicle',
 'Generator_Manufacturer']

In [12]:
# One-hot encode the categorical columns, dropping the first level of each
encode_df = pd.get_dummies(
    df[cat_columns],
    drop_first=True,
)


In [13]:
# Remove the raw categorical columns, leaving only the numeric ones
df = df.drop(cat_columns, axis="columns")
df.head()

Unnamed: 0,System_Size_AC,Storage_Size_kW_AC,Total_System_Cost,Generator_Quantity
1,3.676,0.0,21709.0,16.0
2,7.639,0.0,38000.0,24.0
3,4.76,0.0,18000.0,24.0
5,6.776,0.0,22000.0,22.0
8,5.0,5.0,12178.0,0.0


In [14]:
# Append the one-hot encoded columns to the numeric columns, side by side
df = pd.concat(
    [df, encode_df],
    axis="columns",
)


In [15]:
# Re-check the combined frame for columns containing missing values
nan_columns = list(df.columns[df.isna().any()])
print("Columns with NaN values:", nan_columns)

Columns with NaN values: []


In [16]:
# Summarise the encoded frame (row count, column count, dtypes, memory)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 85823 entries, 1 to 168518
Columns: 149 entries, System_Size_AC to Generator_Manufacturer_Yingli Energy
dtypes: bool(145), float64(4)
memory usage: 15.1 MB


In [17]:
# Separate the target (Total_System_Cost) from the predictor columns
y = df['Total_System_Cost']
X = df.drop(columns=['Total_System_Cost'])

In [18]:
# Random Search Cross Validation 

In [19]:
# Hold out 30% of the rows as a test set; shuffled with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=True,
    random_state=42,
)

In [20]:
# Standardise the features: fit the scaler on the training split only,
# then apply that same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random search

In [21]:
# Candidate hyperparameter values sampled by RandomizedSearchCV
random_param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [22]:
# Base random forest with default hyperparameters, seeded, using all CPU cores
rf_model = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
)

In [23]:
# Look at parameters used by our current forest
# (pprint is imported in the notebook's top import cell)
print('Parameters currently in use:\n')
pprint(rf_model.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [24]:
# Configure the randomized search: 10 sampled settings, 5-fold CV, scored by R^2
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=random_param_grid,
    n_iter=10,
    scoring='r2',
    cv=5,
    random_state=42,
    n_jobs=-1,
)

In [25]:
# Run the randomized search on the scaled training data
random_search.fit(X=X_train_scaled, y=y_train)

In [26]:
# Keep the best-scoring hyperparameter combination found by the random search
best_params_random = random_search.best_params_

In [27]:
# Show the hyperparameters selected by the random search
print("Best Hyperparameters (Random Search):", best_params_random, sep="\n")


Best Hyperparameters (Random Search):
{'n_estimators': 150, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True}


In [28]:
# Saving the result in case need to rerun
# Best Hyperparameters (Random Search):
# {'n_estimators': 150, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True}

In [29]:
# Build a fresh forest configured with the random-search hyperparameters
best_model_random = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    **best_params_random,
)

In [30]:
# Fit on the full (scaled) training split, not on CV folds
best_model_random.fit(X=X_train_scaled, y=y_train)

In [31]:
# Predict on the held-out (scaled) test split
predictions = best_model_random.predict(X=X_test_scaled)


In [32]:
# Score the random-search model's test-set predictions
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE

In [33]:
# Report the test-set metrics for the random-search model
print("Best R-squared Score (Random Search) Model Performance:")
for label, value in (
    ("Root Mean Squared Error:", rmse),
    ("Mean Absolute Error:", mae),
    ("R-squared Score:", r2),
):
    print(label, value)

Best R-squared Score (Random Search) Model Performance:
Root Mean Squared Error: 21173.238178403317
Mean Absolute Error: 9237.659603928232
R-squared Score: 0.8052501317812715


# Grid Search

In [47]:
# Narrow grid around the random-search result
# (n_estimators=150, max_depth=10, min_samples_split=5, min_samples_leaf=1)
grid_param_grid = dict(
    n_estimators=[120, 150, 180],
    max_depth=[5, 10, 20],
    min_samples_split=[3, 5, 7],
    min_samples_leaf=[1, 2, 3],
    bootstrap=[True],  # the random search selected bootstrap=True
)

In [48]:
# Configure the exhaustive grid search (5-fold CV, scored by R^2).
# The random-search model is the base estimator; the grid supplies values for
# each of the five hyperparameters that the random search tuned.
grid_search = GridSearchCV(
    estimator=best_model_random,
    param_grid=grid_param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

In [49]:
# Run the grid search on the scaled training data
grid_search.fit(X=X_train_scaled, y=y_train)

In [50]:
# Keep the best-scoring hyperparameter combination found by the grid search
best_params_grid = grid_search.best_params_

In [51]:
# Show the hyperparameters selected by the grid search
print("\nBest Hyperparameters (Grid Search):", best_params_grid, sep="\n")



Best Hyperparameters (Grid Search):
{'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 180}


In [52]:
# Build a fresh forest configured with the grid-search hyperparameters
best_model_grid = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    **best_params_grid,
)


In [53]:
# Fit on the full (scaled) training split, not on CV folds
best_model_grid.fit(X=X_train_scaled, y=y_train)

In [54]:
# Predict on the held-out (scaled) test split with the grid-search model
predictions = best_model_grid.predict(X=X_test_scaled)

In [55]:
# Score the grid-search model's test-set predictions
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE


In [56]:
# Report the test-set metrics for the final (grid-search) model
print("\nFinal Model Performance:")
for label, value in (
    ("Root Mean Squared Error:", rmse),
    ("Mean Absolute Error:", mae),
    ("R-squared Score:", r2),
):
    print(label, value)


Final Model Performance:
Root Mean Squared Error: 20907.257183803693
Mean Absolute Error: 8685.921553655708
R-squared Score: 0.810112345628897


In [57]:
# Pair the feature names (columns of X) with the fitted forest's importances
feature_names = X.columns
feature_importances = best_model_grid.feature_importances_

In [58]:
# Tabulate one row per feature with its importance score (in column order of X)
feature_importance_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': feature_importances,
    }
)

In [59]:
# Plot the ten features with the highest importance.
# feature_importance_df is in the column order of X, so it has to be ranked by
# 'Importance' first; head(10) on its own would only show the first ten columns.
top_features = (
    feature_importance_df
    .sort_values('Importance', ascending=False)
    .head(10)
)
fig_top = px.bar(top_features, x='Feature', y='Importance', title='Top 10 Most Important Features')
fig_top.show()