## Notebook For Data Exploration from SQL Databases and Making Predictions

In [21]:
# import required packages
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sqlalchemy import create_engine, text

# set plotting theme to seaborn defaults
# (set_theme is the documented name; sns.set is only kept as a legacy alias)
sns.set_theme()

In [2]:
# Build the database engine.
# The connection URL is read from the SOLAR_DB_URL environment variable so that
# credentials do not have to live in the notebook. The literal below is only a
# backward-compatible fallback for the original local setup.
# NOTE(review): the fallback still embeds a username/password -- remove it once
# SOLAR_DB_URL is set everywhere this notebook is run.
db_url = os.environ.get(
  'SOLAR_DB_URL',
  'postgresql+psycopg2://ns96:java100@localhost/SolarCostData'
)
engine = create_engine(db_url)

# make sure we can connect to the database, otherwise stop here:
# every later cell needs a working connection, so the error is re-raised
# after being reported instead of being silently swallowed
try:
  # the context manager closes the probe connection even if an error occurs
  with engine.connect():
    pass
  print("Successfully Connected to DB")
except Exception as e:
  print("DB Connection Error\n")
  print(e)
  raise

Successfully Connected to DB


In [3]:
# Per city and installer: number of installs and rounded average system cost.
# Installers recorded as 'Other' are excluded; rows are ordered by city, then
# by install count (largest first).
query = text(' '.join([
    'SELECT "Service_City", "Installer_Name", COUNT("Installer_Name"),',
    'ROUND(AVG("Total_System_Cost"))',
    'FROM "CA"',
    'GROUP BY "Service_City", "Installer_Name"',
    'HAVING "Installer_Name" != \'Other\'',
    'ORDER BY "Service_City", COUNT("Installer_Name") DESC',
]))
print(query)

with engine.connect() as conn:
    # fetchall() materialises every row, so they stay usable after the
    # connection has been closed
    results = conn.execute(query).fetchall()

records = {}

# show only the city and installer name of each row
for row in results:
    print(*row[:2])

SELECT "Service_City", "Installer_Name", COUNT("Installer_Name"), ROUND(AVG("Total_System_Cost")) FROM "CA" GROUP BY "Service_City", "Installer_Name" HAVING "Installer_Name" != 'Other' ORDER BY "Service_City", COUNT("Installer_Name") DESC
ACAMPO Infinity Energy
ACAMPO Sunrun
ACAMPO Self-installed
ACAMPO Tesla
ACAMPO Nexus Energy
ACAMPO Sierra Pacific
ACAMPO West Coast Solar
ACAMPO Semper
ACAMPO SunWorks
ACAMPO 1st Light
ACAMPO Sun Solar Energy
ACAMPO Westhaven
ACAMPO SunPower
ACAMPO Sunrise
ACTON Sunrun
ACTON Tesla
ACTON Self-installed
ACTON SunPower
ACTON SolarMax
ACTON Sun Solar Energy
ACTON Semper
ACTON LA Solar
ACTON Complete
ACTON NRG
ACTON Infinity Energy
ACTON Nexus Energy
ACTON Horizon
ACTON Smart Energy
ADELANTO Sunrun
ADELANTO SunPower
ADELANTO Complete
ADELANTO Sunstreet
ADELANTO Tesla
ADELANTO Infinity Energy
ADELANTO Solcius
ADELANTO Semper
ADELANTO SolarMax
ADELANTO Grid Alternatives
ADELANTO Horizon
ADELANTO Sungevity
ADELANTO 1st Light
ADELANTO Self-installed
ADELANTO S

In [13]:
# given a city zipcode return all installers and their average install cost
zip_code = '92130'

# The zip code is supplied as a bound parameter (:zip_code) instead of being
# concatenated into the SQL text, so quoting/escaping is handled by the driver
# and an arbitrary zip_code value cannot alter the statement.
query_params = {'zip_code': zip_code}

utility_query = text('SELECT "Utility" FROM "CA" WHERE "Service_Zip" = :zip_code LIMIT 1')
# sub-query resolving the zip code to its city; embedded in the main query below
city_query = 'SELECT "Service_City" FROM "CA" WHERE "Service_Zip" = :zip_code LIMIT 1'

# top 10 installers (by install count) in that city, with rounded averages of
# system size, generator quantity and total system cost; 'Other' is excluded
query = text('SELECT "Service_City", "Installer_Name", COUNT("Installer_Name"), '
             'ROUND (AVG("System_Size_AC")), ROUND (AVG("Generator_Quantity")), '
             'ROUND(AVG("Total_System_Cost")) '
             'FROM "CA" '
             'WHERE "Service_City" = (' + city_query + ') '
             'GROUP BY "Service_City", "Installer_Name" '
             'HAVING "Installer_Name" != \'Other\' '
             'ORDER BY COUNT("Installer_Name") DESC LIMIT 10')
print(query)

with engine.connect() as conn:
    # get the utility; first() returns None when no row matches the zip code
    utility_row = conn.execute(utility_query, query_params).first()
    if utility_row is None:
        raise ValueError('No records found for zip code ' + repr(zip_code))
    utility = utility_row[0]
    print('Utility: ', utility)

    results = conn.execute(query, query_params).fetchall()
    for row in results:
        print(row)

SELECT "Service_City", "Installer_Name", COUNT("Installer_Name"), ROUND (AVG("System_Size_AC")), ROUND (AVG("Generator_Quantity")), ROUND(AVG("Total_System_Cost")) FROM "CA" WHERE "Service_City" = (SELECT "Service_City" FROM "CA" WHERE "Service_Zip" = '92130' LIMIT 1) GROUP BY "Service_City", "Installer_Name" HAVING "Installer_Name" != 'Other' ORDER BY COUNT("Installer_Name") DESC LIMIT 10
Utility:  SDGE
('SAN DIEGO', 'Tesla', 5974, 7.0, 12.0, 28104.0)
('SAN DIEGO', 'Baker', 5679, 7.0, 11.0, 37402.0)
('SAN DIEGO', 'Semper', 4822, 6.0, 8.0, 29833.0)
('SAN DIEGO', 'Stellar', 2528, 6.0, 9.0, 31281.0)
('SAN DIEGO', 'SunPower', 2320, 5.0, 11.0, 32268.0)
('SAN DIEGO', 'Sunrun', 2287, 5.0, 13.0, 27808.0)
('SAN DIEGO', 'Sunline Energy', 1552, 6.0, 13.0, 28561.0)
('SAN DIEGO', 'Sunnova', 1275, 3.0, 8.0, 11381.0)
('SAN DIEGO', 'Self-installed', 1143, 6.0, 15.0, 20552.0)
('SAN DIEGO', 'Sullivan', 951, 6.0, 16.0, 26045.0)


In [46]:
# load the scaler and optimized model for the selected utility
utility = "SDGE"
scaler_file = "../models/scaler-" + utility + ".pkl"
model_file = "../models/xgb_model-" + utility + ".pkl"

# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
# The files are opened with context managers so the handles are closed as soon
# as the objects have been loaded.
with open(scaler_file, "rb") as scaler_handle:
    scaler = pickle.load(scaler_handle)

with open(model_file, "rb") as model_handle:
    model = pickle.load(model_handle)

In [50]:
# Sample input for a prediction: two rows that are identical except for the
# installer name. Each key is a column name and each list holds one value per row.
test_data = {
    'Service_City': ['SAN DIEGO'] * 2,
    'Technology_Type': ['Solar'] * 2,
    'System_Size_AC': [7.0] * 2,
    'Storage_Size_kW_AC': [0] * 2,
    'Mounting_Method': ['Rooftop'] * 2,
    'Installer_Name': ['Tesla', 'Baker'],  # the only field that differs
    'Third_Party_Owned': ['No'] * 2,
    'Electric_Vehicle': ['No'] * 2,
    'Generator_Manufacturer': ['Other'] * 2,
    'Generator_Quantity': [12] * 2,
}

In [51]:
# function to one hot encode and add all the needed columns for the scaler to work
def one_hot_encode(df, train_features):
    """One-hot encode the object-dtype columns of ``df`` and align it to ``train_features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to encode. Columns with ``object`` dtype are treated as categorical.
    train_features : sequence of str
        Column names, in order, that the returned frame must have.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df`` and exactly the columns in
        ``train_features``, in that order. Columns listed in ``train_features``
        but not produced from ``df`` are filled with 0; columns produced from
        ``df`` but not listed are dropped.
    """
    cat_columns = df.dtypes[df.dtypes == "object"].index.tolist()

    if cat_columns:
        enc = OneHotEncoder(sparse_output=False)
        enc_data = enc.fit_transform(df[cat_columns])
        enc_columns = enc.get_feature_names_out().tolist()

        # build the encoded frame on df's own index so rows stay aligned even
        # when df does not carry the default 0..n-1 index
        encode_df = pd.DataFrame(enc_data, columns=enc_columns, index=df.index)

        # replace the original categorical columns with their encoded versions
        encoded = pd.concat([df.drop(columns=cat_columns), encode_df], axis=1)
    else:
        # nothing to encode; the encoder cannot be fitted on zero columns
        encoded = df

    # one pass: add every missing training feature as a 0 column, drop columns
    # the scaler/model never saw, and put the columns in the training order
    return encoded.reindex(columns=list(train_features), fill_value=0)
    
# function to make a prediction from a dictionary of input variables
def estimate(data):
    """Predict with the loaded model for the rows described by ``data``.

    ``data`` is turned into a DataFrame, one-hot encoded and aligned to the
    feature names stored on the notebook-level ``scaler``, scaled with that
    scaler, and passed to the notebook-level ``model``.

    Returns whatever ``model.predict`` returns for the scaled rows.
    """
    raw_frame = pd.DataFrame(data)

    # the scaler records the feature names (and order) it saw during fitting
    expected_columns = scaler.feature_names_in_
    encoded_frame = one_hot_encode(raw_frame, expected_columns)

    # scale, then predict
    return model.predict(scaler.transform(encoded_frame))

# for testing: run the two sample rows in test_data through estimate();
# as the cell's last expression, the returned predictions are displayed
estimate(test_data)

array([22770.86, 36573.4 ], dtype=float32)