In [2]:
# import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
import pandas as pd
import tensorflow as tf
import numpy as np

# import GBR dependencies 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm



In [3]:
# import the Utility dataset
utility = 'PGE'

data_file = f'../clean_data/df_{utility}.csv'
utility_df = pd.read_csv(data_file)
utility_df.head()




Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity,Year
0,PGE,KNIGHTSEN,94548.0,CONTRA COSTA,Solar,2.827,0.0,4.93,Rooftop,2018-10-17,Self-installed,No,No,12000.0,Suntech Power,Enphase,17.0,17.0,2018
1,PGE,STONYFORD,95979.0,COLUSA,Solar,6.845,0.0,6.5,Other,2018-05-20,Other,No,No,30000.0,Other,Altenergy,25.0,13.0,2018
2,PGE,DANVILLE,94506.0,CONTRA COSTA,Solar,12.651,0.0,17.64,Other,2019-08-07,Sky Power,No,No,55200.0,SunPower,SunPower,56.0,56.0,2019
3,PGE,ARROYO GRANDE,93420.0,SAN LUIS OBISPO,Solar,4.768,0.0,5.0,Rooftop,2019-07-15,Self-installed,No,Yes,20000.0,SolarWorld,SMA America,18.0,1.0,2019
4,PGE,ROCKLIN,95765.0,PLACER,Solar,2.71,0.0,2.88,Rooftop,2019-01-17,SunPower,No,No,12814.0,SunPower,SunPower,9.0,9.0,2019


# Data Cleaning

In [4]:
utility_df.loc[utility_df['Installer_Name'] != "Other"]

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity,Year
0,PGE,KNIGHTSEN,94548.0,CONTRA COSTA,Solar,2.827,0.0,4.930,Rooftop,2018-10-17,Self-installed,No,No,12000.00,Suntech Power,Enphase,17.0,17.0,2018
2,PGE,DANVILLE,94506.0,CONTRA COSTA,Solar,12.651,0.0,17.640,Other,2019-08-07,Sky Power,No,No,55200.00,SunPower,SunPower,56.0,56.0,2019
3,PGE,ARROYO GRANDE,93420.0,SAN LUIS OBISPO,Solar,4.768,0.0,5.000,Rooftop,2019-07-15,Self-installed,No,Yes,20000.00,SolarWorld,SMA America,18.0,1.0,2019
4,PGE,ROCKLIN,95765.0,PLACER,Solar,2.710,0.0,2.880,Rooftop,2019-01-17,SunPower,No,No,12814.00,SunPower,SunPower,9.0,9.0,2019
5,PGE,ROCKLIN,95765.0,PLACER,Solar,2.710,0.0,2.880,Rooftop,2019-04-16,SunPower,No,No,12814.00,SunPower,SunPower,9.0,9.0,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355984,PGE,NOVATO,94947.0,MARIN,Solar,2.486,0.0,3.870,Rooftop,2023-12-27,Sunrun,No,No,12558.00,Other,Delta,7.0,1.0,2023
355986,PGE,SAN JOSE,95124.0,SANTA CLARA,Solar,4.794,0.0,5.700,Rooftop,2023-12-27,Tesla,No,Yes,16545.39,Hanwha,Tesla,13.0,1.0,2023
355988,PGE,ROHNERT PARK,94928.0,SONOMA,Solar,3.285,0.0,3.747,Rooftop,2023-12-27,Sunrun,No,No,10259.02,JA,SolarEdge,9.0,1.0,2023
355989,PGE,UNION CITY,94587.0,ALAMEDA,Solar,5.840,0.0,6.046,Rooftop,2023-12-27,Sunrun,No,No,24016.00,JA,SolarEdge,16.0,1.0,2023


In [5]:
# extract the year for modeling
utility_df.App_Received_Date = pd.to_datetime(utility_df.App_Received_Date)

In [6]:
utility_df['Year'] = utility_df.App_Received_Date.dt.year

In [7]:
# lets see what values of total cost are
utility_df.Total_System_Cost.describe()

count    3.559910e+05
mean     3.129718e+04
std      2.309740e+04
min      7.000000e+03
25%      1.806300e+04
50%      2.700000e+04
75%      3.900000e+04
max      2.995626e+06
Name: Total_System_Cost, dtype: float64

In [8]:
# reduce the number of cities i.e if less than 200 then just set it to other
city_counts = utility_df.Service_City.value_counts().to_dict()

def check_count(city):
    if city_counts[city] >= 200:
        return city
    else:
        return 'Other'

utility_df['Service_City'] = utility_df['Service_City'].apply(check_count)

In [9]:
utility_df['Service_City'].value_counts()

Service_City
BAKERSFIELD      22976
FRESNO           20045
SAN JOSE         19534
Other            17930
STOCKTON          9156
                 ...  
BUELLTON           203
MORRO BAY          202
GROVELAND          202
SANTA YNEZ         202
CARMEL VALLEY      201
Name: count, Length: 230, dtype: int64

In [10]:
# convert zip code to string
utility_df['Service_Zip'] = utility_df['Service_Zip'].astype(int).astype(str).str.zfill(5)
utility_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355991 entries, 0 to 355990
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   Utility                 355991 non-null  object        
 1   Service_City            355991 non-null  object        
 2   Service_Zip             355991 non-null  object        
 3   Service_County          355991 non-null  object        
 4   Technology_Type         355991 non-null  object        
 5   System_Size_AC          355991 non-null  float64       
 6   Storage_Size_kW_AC      355991 non-null  float64       
 7   Inverter_Size_kW_AC     355991 non-null  float64       
 8   Mounting_Method         355991 non-null  object        
 9   App_Received_Date       355991 non-null  datetime64[ns]
 10  Installer_Name          355991 non-null  object        
 11  Third_Party_Owned       355991 non-null  object        
 12  Electric_Vehicle        355991

In [11]:
# imput missing values for Storage_Size_kW_AC
utility_df['Storage_Size_kW_AC'] = utility_df['Storage_Size_kW_AC'].fillna(0)
utility_df['Inverter_Size_kW_AC'] = utility_df['Inverter_Size_kW_AC'].fillna(4.9) # mean value imputation
utility_df['Third_Party_Owned'] = utility_df['Third_Party_Owned'].fillna('No')
utility_df.isnull().sum()

Utility                   0
Service_City              0
Service_Zip               0
Service_County            0
Technology_Type           0
System_Size_AC            0
Storage_Size_kW_AC        0
Inverter_Size_kW_AC       0
Mounting_Method           0
App_Received_Date         0
Installer_Name            0
Third_Party_Owned         0
Electric_Vehicle          0
Total_System_Cost         0
Generator_Manufacturer    0
Inverter_Manufacturer     0
Generator_Quantity        0
Inverter_Quantity         0
Year                      0
dtype: int64

In [12]:
# see how many installers, cities, and zip codes we have
print("Zipcodes:", utility_df.Service_Zip.nunique())
print("Cities:", utility_df.Service_City.nunique())
print("Installers:", utility_df.Installer_Name.nunique())

Zipcodes: 842
Cities: 230
Installers: 56


In [13]:
#for now lest drop the Generator_Manufacturer and Inverter_Manufacturer manufacturers
#columns_to_drop = ['Utility', 'Service_Zip', 'App_Received_Date', 'Generator_Manufacturer', 'Inverter_Manufacturer']
columns_to_drop = ['Utility', 'Service_Zip', 'App_Received_Date']
#columns_to_drop = ['Utility', 'Service_City', 'App_Received_Date']
#columns_to_drop = ['Utility', 'App_Received_Date']
#columns_to_drop = ['Utility']
utility_df.drop(columns=columns_to_drop, inplace=True)
utility_df.head()

Unnamed: 0,Service_City,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity,Year
0,Other,CONTRA COSTA,Solar,2.827,0.0,4.93,Rooftop,Self-installed,No,No,12000.0,Suntech Power,Enphase,17.0,17.0,2018
1,Other,COLUSA,Solar,6.845,0.0,6.5,Other,Other,No,No,30000.0,Other,Altenergy,25.0,13.0,2018
2,DANVILLE,CONTRA COSTA,Solar,12.651,0.0,17.64,Other,Sky Power,No,No,55200.0,SunPower,SunPower,56.0,56.0,2019
3,ARROYO GRANDE,SAN LUIS OBISPO,Solar,4.768,0.0,5.0,Rooftop,Self-installed,No,Yes,20000.0,SolarWorld,SMA America,18.0,1.0,2019
4,ROCKLIN,PLACER,Solar,2.71,0.0,2.88,Rooftop,SunPower,No,No,12814.0,SunPower,SunPower,9.0,9.0,2019


In [14]:
# Convert categorical data to numeric with `pd.get_dummies`
cat_columns = utility_df.dtypes[utility_df.dtypes == "object"].index.tolist()

enc = OneHotEncoder(sparse_output=False)
enc_data = enc.fit_transform(utility_df[cat_columns])
enc_columns = enc.get_feature_names_out().tolist()

encode_df = pd.DataFrame(enc_data, columns=enc_columns)
#display(encode_df.head())

# now lets merge the into the application dataframe then drop original columns
utility_df = utility_df.merge(encode_df, left_index=True, right_index=True)
utility_df = utility_df.drop(columns=cat_columns)

print(utility_df.columns)

utility_df.head()

Index(['System_Size_AC', 'Storage_Size_kW_AC', 'Inverter_Size_kW_AC',
       'Total_System_Cost', 'Generator_Quantity', 'Inverter_Quantity', 'Year',
       'Service_City_ACAMPO', 'Service_City_ALAMO', 'Service_City_ALBANY',
       ...
       'Inverter_Manufacturer_Sanyo', 'Inverter_Manufacturer_Schneider',
       'Inverter_Manufacturer_Sharp', 'Inverter_Manufacturer_SolarBridge',
       'Inverter_Manufacturer_SolarEdge', 'Inverter_Manufacturer_Solaria',
       'Inverter_Manufacturer_Solectria', 'Inverter_Manufacturer_SunPower',
       'Inverter_Manufacturer_Tesla', 'Inverter_Manufacturer_Xantrex'],
      dtype='object', length=438)


Unnamed: 0,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Total_System_Cost,Generator_Quantity,Inverter_Quantity,Year,Service_City_ACAMPO,Service_City_ALAMO,Service_City_ALBANY,...,Inverter_Manufacturer_Sanyo,Inverter_Manufacturer_Schneider,Inverter_Manufacturer_Sharp,Inverter_Manufacturer_SolarBridge,Inverter_Manufacturer_SolarEdge,Inverter_Manufacturer_Solaria,Inverter_Manufacturer_Solectria,Inverter_Manufacturer_SunPower,Inverter_Manufacturer_Tesla,Inverter_Manufacturer_Xantrex
0,2.827,0.0,4.93,12000.0,17.0,17.0,2018,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6.845,0.0,6.5,30000.0,25.0,13.0,2018,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12.651,0.0,17.64,55200.0,56.0,56.0,2019,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,4.768,0.0,5.0,20000.0,18.0,1.0,2019,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.71,0.0,2.88,12814.0,9.0,9.0,2019,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Functions

In [15]:
#make function to prep data

def PrepData(utility):
    
    data_file = f'../clean_data/df_{utility}.csv'
    utility_df = pd.read_csv(data_file)
    utility_df.head()
    utility_df.loc[utility_df['Installer_Name'] != "Other"]
    utility_df.App_Received_Date = pd.to_datetime(utility_df.App_Received_Date)
    utility_df['Year'] = utility_df.App_Received_Date.dt.year
    city_counts = utility_df.Service_City.value_counts().to_dict()
    def check_count(city):
        if city_counts[city] >= 200:
            return city
        else:
            return 'Other'
    utility_df['Service_City'] = utility_df['Service_City'].apply(check_count)
    utility_df['Service_Zip'] = utility_df['Service_Zip'].astype(int).astype(str).str.zfill(5)
    utility_df['Storage_Size_kW_AC'] = utility_df['Storage_Size_kW_AC'].fillna(0)
    utility_df['Inverter_Size_kW_AC'] = utility_df['Inverter_Size_kW_AC'].fillna(4.9) # mean value imputation
    utility_df['Third_Party_Owned'] = utility_df['Third_Party_Owned'].fillna('No')
    utility_df.isnull().sum()
    columns_to_drop = ['Utility', 'Service_Zip', 'App_Received_Date']
    utility_df.drop(columns=columns_to_drop, inplace=True)
    # Convert categorical data to numeric with `pd.get_dummies`
    cat_columns = utility_df.dtypes[utility_df.dtypes == "object"].index.tolist()

    enc = OneHotEncoder(sparse_output=False)
    enc_data = enc.fit_transform(utility_df[cat_columns])
    enc_columns = enc.get_feature_names_out().tolist()

    encode_df = pd.DataFrame(enc_data, columns=enc_columns)
    #display(encode_df.head())

    # now lets merge the into the application dataframe then drop original columns
    utility_df = utility_df.merge(encode_df, left_index=True, right_index=True)
    utility_df = utility_df.drop(columns=cat_columns)
    #sample 10% of data
    sample_data = utility_df.sample(frac=0.05)
    return sample_data

In [16]:
def SplitData(sample_data):
    y = sample_data['Total_System_Cost']
    X = sample_data.drop(columns='Total_System_Cost')

    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)
    # Create a StandardScaler instances
    scaler = StandardScaler()

    # Fit the StandardScaler
    X_scaler = scaler.fit(X_train)

    # Scale the data
    X_train = X_scaler.transform(X_train)
    X_test = X_scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [17]:
def OptimizeModel(X_train, y_train):
    model = GradientBoostingRegressor()
    parameters = {
    'learning_rate': [0.03, 0.04, 0.05, 0.06, 0.07, 0.1],
    'subsample':[0.9, 0.5, 0.2],
    'n_estimators': [100, 200, 300, 500, 700, 1000],
    'max_depth': [4,6,8,10,20],
    'loss': ['squared_error'],
    'n_iter_no_change': [5], 
    
    

    }

    grid = GridSearchCV(estimator=model, param_grid=parameters, cv=2, n_jobs=-1)
    grid.fit(X_train, y_train)


    print("="*50)
    print("Results from grid search")
    print("="*50)
    print("\n The best estimator facross all searched params: \n", grid.best_estimator_)
    print("\n The best score across all searched params \n", grid.best_score_)
    print("\n The best parameters across all searched params: \n", grid.best_params_)
    print('='*50)


# PGE Models

In [18]:
sample_data_pge = PrepData('PGE')
X_train, X_test, y_train, y_test = SplitData(sample_data_pge)
OptimizeModel(X_train, y_train)

Results from grid search

 The best estimator facross all searched params: 
 GradientBoostingRegressor(learning_rate=0.04, max_depth=4, n_iter_no_change=5,
                          subsample=0.9)

 The best score across all searched params 
 0.4410138240140895

 The best parameters across all searched params: 
 {'learning_rate': 0.04, 'loss': 'squared_error', 'max_depth': 4, 'n_estimators': 100, 'n_iter_no_change': 5, 'subsample': 0.9}


In [19]:
# MakeModel
# gbm model 
model = GradientBoostingRegressor()
parameters = {
'learning_rate': [0.05],
'subsample':[0.9],
'n_estimators': [1000],
'max_depth': [6],
'n_iter_no_change': [5],
'loss':['squared_error']
}

#train the model on the training data
model.fit(X_train, y_train)

#make predictions on the test data
predictions = model.predict(X_test)
# Calculate the mean squared error and R-squared score
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("Root Mean Squared Error:", rmse)
print("R-squared Score:", r2)
print("Mean Absolute Error:", mae)

Root Mean Squared Error: 14409.35618255719
R-squared Score: 0.3994594950512156
Mean Absolute Error: 7471.6376928919935


# SCE Models

In [20]:
sample_data_pge = PrepData('SCE')
X_train, X_test, y_train, y_test = SplitData(sample_data_pge)
OptimizeModel(X_train, y_train)

Results from grid search

 The best estimator facross all searched params: 
 GradientBoostingRegressor(max_depth=4, n_estimators=700, n_iter_no_change=5,
                          subsample=0.9)

 The best score across all searched params 
 0.565590391676809

 The best parameters across all searched params: 
 {'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 4, 'n_estimators': 700, 'n_iter_no_change': 5, 'subsample': 0.9}


In [23]:
# MakeModel
# gbm model 
sce_model = GradientBoostingRegressor()
parameters = {
'learning_rate': [0.06],
'subsample':[0.9],
'n_estimators': [200],
'max_depth': [8],
'n_iter_no_change': [5],
'loss':['squared_error']
}

#train the model on the training data
sce_model.fit(X_train, y_train)

#make predictions on the test data
sce_predictions = sce_model.predict(X_test)
# Calculate the mean squared error and R-squared score
mse = mean_squared_error(y_test, sce_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, sce_predictions)
mae = mean_absolute_error(y_test, sce_predictions)

print("Root Mean Squared Error:", rmse)
print("R-squared Score:", r2)
print("Mean Absolute Error:", mae)

Root Mean Squared Error: 33163.68060344845
R-squared Score: 0.5901001262815746
Mean Absolute Error: 13989.457396343216


# SDGE Models

In [24]:
sample_data_sdge = PrepData('SDGE')
X_train, X_test, y_train, y_test = SplitData(sample_data_pge)
OptimizeModel(X_train, y_train)

Results from grid search

 The best estimator facross all searched params: 
 GradientBoostingRegressor(learning_rate=0.07, max_depth=4, n_estimators=700,
                          n_iter_no_change=5, subsample=0.9)

 The best score across all searched params 
 0.5668150601398109

 The best parameters across all searched params: 
 {'learning_rate': 0.07, 'loss': 'squared_error', 'max_depth': 4, 'n_estimators': 700, 'n_iter_no_change': 5, 'subsample': 0.9}


In [25]:
# MakeModel
# gbm model 
model_sdge = GradientBoostingRegressor()
parameters = {
'learning_rate': [0.04],
'subsample':[0.9],
'n_estimators': [700],
'max_depth': [4],
'n_iter_no_change': [5],
'loss':['squared_error']
}

#train the model on the training data
model_sdge.fit(X_train, y_train)

#make predictions on the test data
sdge_predictions = model_sdge.predict(X_test)
# Calculate the mean squared error and R-squared score
mse = mean_squared_error(y_test, sdge_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, sdge_predictions)
mae = mean_absolute_error(y_test, sdge_predictions)

print("Root Mean Squared Error:", rmse)
print("R-squared Score:", r2)
print("Mean Absolute Error:", mae)

Root Mean Squared Error: 29314.146135882544
R-squared Score: 0.6190294629612987
Mean Absolute Error: 14140.095928155662
