In [1]:
# Import our dependencies
import os

import numpy as n
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [2]:
# Import the utility dataset.
# The data directory can be overridden through the ML_PROJECT_DATA_DIR
# environment variable so the notebook runs on machines other than the
# author's; when the variable is unset the original location is used.
utility = 'all_utilities'

data_dir = os.environ.get(
    "ML_PROJECT_DATA_DIR",
    "/Users/cameroncullen/Documents/Github/AAA/Homework/ML_Project/clean_data/",
)
data_file = os.path.join(data_dir, utility + ".csv")

all_df = pd.read_csv(data_file)
all_df.head()

Unnamed: 0,Utility,Service_City,Service_Zip,Service_County,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Inverter_Size_kW_AC,Mounting_Method,App_Received_Date,Installer_Name,Third_Party_Owned,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Inverter_Manufacturer,Generator_Quantity,Inverter_Quantity
0,PGE,KNIGHTSEN,94548,CONTRA COSTA,Solar,2.827,0.0,4.93,Rooftop,10/17/18,Self-installed,No,No,12000.0,Suntech Power,Enphase,17,17
1,PGE,STONYFORD,95979,COLUSA,Solar,6.845,0.0,6.5,Other,5/20/18,Other,No,No,30000.0,Other,Altenergy,25,13
2,PGE,DANVILLE,94506,CONTRA COSTA,Solar,12.651,0.0,17.64,Other,8/7/19,Sky Power,No,No,55200.0,SunPower,SunPower,56,56
3,PGE,ARROYO GRANDE,93420,SAN LUIS OBISPO,Solar,4.768,0.0,5.0,Rooftop,7/15/19,Self-installed,No,Yes,20000.0,SolarWorld,SMA America,18,1
4,PGE,ROCKLIN,95765,PLACER,Solar,2.71,0.0,2.88,Rooftop,1/17/19,SunPower,No,No,12814.0,SunPower,SunPower,9,9


In [3]:
# Normalize ZIP codes: cast to int, then to text, then left-pad with zeros to five characters
all_df['Service_Zip'] = (
    all_df['Service_Zip']
    .astype(int)
    .astype(str)
    .str.zfill(5)
)

In [4]:
# Report how many distinct ZIP codes, cities, and installers appear in the data
for label, column in (
    ("Zipcodes:", "Service_Zip"),
    ("Cities:", "Service_City"),
    ("Installers:", "Installer_Name"),
):
    print(label, all_df[column].nunique())

Zipcodes: 1562
Cities: 1177
Installers: 66


In [5]:
# Summarize column dtypes and non-null counts of the loaded frame
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871892 entries, 0 to 871891
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Utility                 871892 non-null  object 
 1   Service_City            871892 non-null  object 
 2   Service_Zip             871892 non-null  object 
 3   Service_County          871892 non-null  object 
 4   Technology_Type         871892 non-null  object 
 5   System_Size_AC          871892 non-null  float64
 6   Storage_Size_kW_AC      412381 non-null  float64
 7   Inverter_Size_kW_AC     524534 non-null  float64
 8   Mounting_Method         871892 non-null  object 
 9   App_Received_Date       871892 non-null  object 
 10  Installer_Name          871892 non-null  object 
 11  Third_Party_Owned       871858 non-null  object 
 12  Electric_Vehicle        871892 non-null  object 
 13  Total_System_Cost       871892 non-null  float64
 14  Generator_Manufactur

In [6]:
# List every column that contains at least one missing value
has_missing = all_df.isnull().any()
nan_columns = has_missing[has_missing].index.tolist()
print("Columns with NaN values:", nan_columns)

Columns with NaN values: ['Storage_Size_kW_AC', 'Inverter_Size_kW_AC', 'Third_Party_Owned']


In [7]:
# Columns removed from the frame before modelling
columns_to_drop = ['Service_Zip', 'App_Received_Date',  'Service_County', 'Inverter_Size_kW_AC',
                   'Inverter_Manufacturer', 'Inverter_Quantity', 'Third_Party_Owned']

# Reassign rather than mutating in place, and tolerate columns that are
# already gone, so re-running this cell does not raise a KeyError.
all_df = all_df.drop(columns=columns_to_drop, errors="ignore")
all_df.head()

Unnamed: 0,Utility,Service_City,Technology_Type,System_Size_AC,Storage_Size_kW_AC,Mounting_Method,Installer_Name,Electric_Vehicle,Total_System_Cost,Generator_Manufacturer,Generator_Quantity
0,PGE,KNIGHTSEN,Solar,2.827,0.0,Rooftop,Self-installed,No,12000.0,Suntech Power,17
1,PGE,STONYFORD,Solar,6.845,0.0,Other,Other,No,30000.0,Other,25
2,PGE,DANVILLE,Solar,12.651,0.0,Other,Sky Power,No,55200.0,SunPower,56
3,PGE,ARROYO GRANDE,Solar,4.768,0.0,Rooftop,Self-installed,Yes,20000.0,SolarWorld,18
4,PGE,ROCKLIN,Solar,2.71,0.0,Rooftop,SunPower,No,12814.0,SunPower,9


In [8]:
# Count the distinct values in each remaining column
all_df.nunique()

Utility                        3
Service_City                1177
Technology_Type                4
System_Size_AC             21779
Storage_Size_kW_AC           580
Mounting_Method                2
Installer_Name                66
Electric_Vehicle               2
Total_System_Cost         264658
Generator_Manufacturer        60
Generator_Quantity           516
dtype: int64

In [9]:
# Re-check which columns still contain missing values
null_flags = all_df.isnull().any()
nan_columns = list(all_df.columns[null_flags])
print("Columns with NaN values:", nan_columns)

Columns with NaN values: ['Storage_Size_kW_AC']


In [10]:
# Fill missing Storage_Size_kW_AC entries with 0, then show the per-column null counts
storage_column = 'Storage_Size_kW_AC'
all_df[storage_column] = all_df[storage_column].fillna(0)
all_df.isnull().sum()

Utility                   0
Service_City              0
Technology_Type           0
System_Size_AC            0
Storage_Size_kW_AC        0
Mounting_Method           0
Installer_Name            0
Electric_Vehicle          0
Total_System_Cost         0
Generator_Manufacturer    0
Generator_Quantity        0
dtype: int64

In [11]:
# Collect the names of the object-dtype columns; these are treated as categorical
cat_columns = [
    column_name
    for column_name, column_dtype in all_df.dtypes.items()
    if column_dtype == "object"
]
cat_columns

['Utility',
 'Service_City',
 'Technology_Type',
 'Mounting_Method',
 'Installer_Name',
 'Electric_Vehicle',
 'Generator_Manufacturer']

In [12]:
# Summarize column dtypes and non-null counts of the current frame
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871892 entries, 0 to 871891
Data columns (total 11 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Utility                 871892 non-null  object 
 1   Service_City            871892 non-null  object 
 2   Technology_Type         871892 non-null  object 
 3   System_Size_AC          871892 non-null  float64
 4   Storage_Size_kW_AC      871892 non-null  float64
 5   Mounting_Method         871892 non-null  object 
 6   Installer_Name          871892 non-null  object 
 7   Electric_Vehicle        871892 non-null  object 
 8   Total_System_Cost       871892 non-null  float64
 9   Generator_Manufacturer  871892 non-null  object 
 10  Generator_Quantity      871892 non-null  int64  
dtypes: float64(3), int64(1), object(7)
memory usage: 73.2+ MB


In [13]:
# One-hot encode the categorical columns, dropping the first level of each
encode_df = pd.get_dummies(
    data=all_df.loc[:, cat_columns],
    drop_first=True,
)
encode_df.head()

Unnamed: 0,Utility_SCE,Utility_SDGE,Service_City_ACTON,Service_City_ADELAIDE,Service_City_ADELANTO,Service_City_AGOURA,Service_City_AGOURA HILLS,Service_City_AGUA DULCE,Service_City_AGUANGA,Service_City_AHWAHNEE,...,Generator_Manufacturer_Solectria,Generator_Manufacturer_SunEdison,Generator_Manufacturer_SunPower,Generator_Manufacturer_Suniva,Generator_Manufacturer_Sunspark,Generator_Manufacturer_Suntech Power,Generator_Manufacturer_Tesla,Generator_Manufacturer_Trina,Generator_Manufacturer_Xantrex,Generator_Manufacturer_Yingli Energy
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [14]:
# Remove the original categorical columns, leaving only the numeric ones
all_df = all_df.drop(cat_columns, axis="columns")
all_df.head()

Unnamed: 0,System_Size_AC,Storage_Size_kW_AC,Total_System_Cost,Generator_Quantity
0,2.827,0.0,12000.0,17
1,6.845,0.0,30000.0,25
2,12.651,0.0,55200.0,56
3,4.768,0.0,20000.0,18
4,2.71,0.0,12814.0,9


In [15]:
# Place the one-hot encoded columns alongside the numeric columns
all_df = pd.concat(
    (all_df, encode_df),
    axis="columns",
)
all_df.head()

Unnamed: 0,System_Size_AC,Storage_Size_kW_AC,Total_System_Cost,Generator_Quantity,Utility_SCE,Utility_SDGE,Service_City_ACTON,Service_City_ADELAIDE,Service_City_ADELANTO,Service_City_AGOURA,...,Generator_Manufacturer_Solectria,Generator_Manufacturer_SunEdison,Generator_Manufacturer_SunPower,Generator_Manufacturer_Suniva,Generator_Manufacturer_Sunspark,Generator_Manufacturer_Suntech Power,Generator_Manufacturer_Tesla,Generator_Manufacturer_Trina,Generator_Manufacturer_Xantrex,Generator_Manufacturer_Yingli Energy
0,2.827,0.0,12000.0,17,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,6.845,0.0,30000.0,25,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,12.651,0.0,55200.0,56,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,4.768,0.0,20000.0,18,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2.71,0.0,12814.0,9,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [16]:
# Check the combined frame for any column that still holds missing values
missing_by_column = all_df.isnull().any()
nan_columns = all_df.columns[missing_by_column].tolist()
print("Columns with NaN values:", nan_columns)

Columns with NaN values: []


In [17]:
# Discard any row that still contains a missing value, then summarize the frame
all_df = all_df.dropna(axis="index")
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871892 entries, 0 to 871891
Columns: 1311 entries, System_Size_AC to Generator_Manufacturer_Yingli Energy
dtypes: bool(1307), float64(3), int64(1)
memory usage: 1.1 GB


In [18]:
# Target: the total system cost
y = all_df['Total_System_Cost'].values

# Features: every other column.
# The frame mixes bool (one-hot) columns with float/int columns, and `.values`
# on such a frame produces an object-dtype array. Requesting a numeric dtype
# explicitly gives a compact numeric matrix instead; float32 is used because
# it halves the memory of float64 and is Keras' default float type.
X = all_df.drop(columns=['Total_System_Cost']).to_numpy(dtype='float32')

X.shape, y.shape

((871892, 1310), (871892,))

In [19]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


In [None]:
scaler = StandardScaler()

# Fit the StandardScaler once, on the training split only
X_scaler = scaler.fit(X_train)

# Apply the already-fitted scaler to both splits. The original called
# fit_transform here, which re-fitted the scaler on the same data a second time.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



In [None]:
# Rows are records and columns are features in the training matrix
number_input_records, number_input_features = X_train.shape

print("Number of Input Features:", number_input_features)
print("Number of Input Records:", number_input_records)

Number of Input Features: 1310
Number of Input Records: 329904


In [None]:
# Deep neural net: three hidden layers whose widths are 3x, 1x, and one third
# (rounded down) of the number of input features, followed by one linear output unit.
hidden_nodes_layer1 = 3 * number_input_features
hidden_nodes_layer2 = number_input_features
hidden_nodes_layer3 = number_input_features // 3

nn = tf.keras.models.Sequential([
    # First hidden layer (also declares the input width)
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation='gelu',
    ),
    # Second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'),
    # Third hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="gelu"),
    # Output layer: a single unit with a linear activation
    tf.keras.layers.Dense(units=1, activation="linear"),
])

# Show the layer-by-layer structure and parameter counts
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 3930)              5152230   
                                                                 
 dense_1 (Dense)             (None, 1310)              5149610   
                                                                 
 dense_2 (Dense)             (None, 436)               571596    
                                                                 
 dense_3 (Dense)             (None, 1)                 437       
                                                                 
Total params: 10,873,873
Trainable params: 10,873,873
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer, MAE as the loss, with MSE and MAE tracked as metrics
nn.compile(
    optimizer='adam',
    loss='mae',
    metrics=['mse', 'mae'],
)

In [None]:
# Train the model
# Early stopping now watches the loss on held-out validation data rather than
# the training loss, so training halts when the model stops generalizing.
# restore_best_weights rolls the model back to the epoch with the best
# validation loss instead of keeping the weights from `patience` epochs later.
earlyStop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# validation_split sets aside a fraction of the supplied training arrays for validation
fit_model = nn.fit(
    X_train_scaled,
    y_train,
    validation_split=0.1,
    epochs=100,
    batch_size=32,
    callbacks=[earlyStop],
)

In [None]:
# Make predictions on the test set
predictions = nn.predict(X_test_scaled)

# Compute each metric exactly once (the original computed all four twice,
# and the second RMSE was derived before the second MSE was recomputed).
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
rmse = n.sqrt(mse)  # `n` is this notebook's alias for numpy
r2 = r2_score(y_test, predictions)

print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R-squared Score:", r2)