## Notebook For Data Exploration from SQL Databases and Making Predictions

In [1]:
# import required packages and connect to the database
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sqlalchemy import create_engine, text

# apply the default seaborn theme to all matplotlib figures
# (set_theme is the preferred name; sns.set is only kept as a legacy alias)
sns.set_theme()

In [2]:
# Build the database engine. The connection URL is taken from the SOLAR_DB_URL
# environment variable so credentials do not have to live in the notebook.
# NOTE(review): the literal fallback keeps existing setups working, but it embeds a
# username/password in the notebook -- rotate that password and remove the fallback.
DB_URL = os.environ.get(
    'SOLAR_DB_URL',
    'postgresql+psycopg2://ns96:java100@localhost/SolarCostData',
)
engine = create_engine(DB_URL)

# Open and immediately release one connection to check that the database is reachable.
# A failure is only reported (printed), not raised, so execution continues past this cell.
try:
    with engine.connect():
        pass
    print("Successfully Connected to DB")
except Exception as e:
    print("DB Connection Error\n")
    print(e)

Successfully Connected to DB


In [None]:
# List every (city, installer) pair with its installation count and rounded average
# system cost, skipping the 'Other' installer; rows are ordered by city and then by
# installation count, highest first.
query = text(
    'SELECT "Service_City", "Installer_Name", COUNT("Installer_Name"), '
    'ROUND(AVG("Total_System_Cost")) '
    'FROM "CA" '
    'GROUP BY "Service_City", "Installer_Name" '
    'HAVING "Installer_Name" != \'Other\' '
    'ORDER BY "Service_City", COUNT("Installer_Name") DESC'
)
print(query)

with engine.connect() as conn:
    # fetch everything up front, then print just the city and installer of each row
    results = conn.execute(query).fetchall()

    records = {}

    for row in results:
        city_and_installer = (row[0], row[1])
        print(*city_and_installer)

In [3]:
# create a lookup table which indicates the most common generator used by each installer
def make_generator_table():
    """Rebuild the global ``generator_table`` lookup from the "CA" table.

    The query groups rows by installer and generator manufacturer (the 'Other'
    installer is excluded) and orders each installer's manufacturers by how many
    rows reference them, most frequent first. Only the first row seen for an
    installer is kept, i.e. its most frequently used manufacturer.

    After the call ``generator_table`` maps
    ``installer name -> (manufacturer, quantity)`` where ``quantity`` is the rounded
    average "Generator_Quantity" for that installer/manufacturer pair as an ``int``
    (0 when the database returns no average).

    Returns:
        None. The result is published through the ``generator_table`` global.
    """
    global generator_table

    generator_table = dict()
    query = text('SELECT "Installer_Name", "Generator_Manufacturer", '
                 'COUNT("Generator_Manufacturer"), ROUND(AVG("Generator_Quantity")) '
                 'FROM "CA" '
                 'GROUP BY "Installer_Name", "Generator_Manufacturer" '
                 'HAVING "Installer_Name" != \'Other\' '
                 'ORDER BY "Installer_Name", COUNT("Generator_Manufacturer") DESC')

    with engine.connect() as conn:
        results = conn.execute(query).fetchall()

    for row in results:
        installer = row[0]

        # rows are sorted most-used manufacturer first, so only the first row per installer counts
        if installer in generator_table:
            continue

        # row[2] is how many rows use this manufacturer; the quantity wanted here is
        # row[3], the rounded average "Generator_Quantity" (NULL when no quantity is recorded)
        avg_quantity = row[3]
        quantity = int(avg_quantity) if avg_quantity is not None else 0
        generator_table[installer] = (row[1], quantity)

# build the lookup table so later cells can use it
make_generator_table()

In [4]:
# given a zip code return the utility and the top 10 installers (with their average
# system size and cost) for the city that zip code belongs to
def get_installers(zip_code = '92130'):
    """Look up the utility and the ten busiest installers for a zip code's city.

    The zip code is sent to the database as a bound parameter rather than being
    spliced into the SQL text, so a crafted value cannot alter the statement.

    Args:
        zip_code: Value matched against "Service_Zip"; converted with ``str()``.

    Returns:
        ``(utility, records)`` where ``utility`` is the "Utility" value of one row
        with that zip code and ``records`` is a list of up to 10 rows
        ``(Service_City, Installer_Name, install count, rounded average
        System_Size_AC, rounded average Total_System_Cost)`` ordered by install
        count, highest first. The 'Other' installer is excluded.

    Raises:
        IndexError: if no row in "CA" has the given zip code.
    """
    params = {'zip_code': str(zip_code)}

    utility_query = text('SELECT "Utility" FROM "CA" WHERE "Service_Zip" = :zip_code LIMIT 1')

    # the sub-select resolves the zip code to its city; :zip_code is bound at execution time
    query = text('SELECT "Service_City", "Installer_Name", COUNT("Installer_Name"), '
                 'ROUND (AVG("System_Size_AC")), '
                 'ROUND(AVG("Total_System_Cost")) '
                 'FROM "CA" '
                 'WHERE "Service_City" = ('
                 'SELECT "Service_City" FROM "CA" WHERE "Service_Zip" = :zip_code LIMIT 1) '
                 'GROUP BY "Service_City", "Installer_Name" '
                 'HAVING "Installer_Name" != \'Other\' '
                 'ORDER BY COUNT("Installer_Name") DESC LIMIT 10')

    # shows the statement with its :zip_code placeholder (the value is passed separately)
    print(query)

    with engine.connect() as conn:
        # get the utility
        utility_rows = conn.execute(utility_query, params).fetchall()
        if not utility_rows:
            raise IndexError("no records found for zip code " + repr(params['zip_code']))
        utility = utility_rows[0][0]

        records = list(conn.execute(query, params).fetchall())

    # return the utility and installer records
    return utility, records

# test the function
get_installers()

SELECT "Service_City", "Installer_Name", COUNT("Installer_Name"), ROUND (AVG("System_Size_AC")), ROUND(AVG("Total_System_Cost")) FROM "CA" WHERE "Service_City" = (SELECT "Service_City" FROM "CA" WHERE "Service_Zip" = '92130' LIMIT 1) GROUP BY "Service_City", "Installer_Name" HAVING "Installer_Name" != 'Other' ORDER BY COUNT("Installer_Name") DESC LIMIT 10


('SDGE',
 [('SAN DIEGO', 'Tesla', 5974, 7.0, 28104.0),
  ('SAN DIEGO', 'Baker', 5679, 7.0, 37402.0),
  ('SAN DIEGO', 'Semper', 4822, 6.0, 29833.0),
  ('SAN DIEGO', 'Stellar', 2528, 6.0, 31281.0),
  ('SAN DIEGO', 'SunPower', 2320, 5.0, 32268.0),
  ('SAN DIEGO', 'Sunrun', 2287, 5.0, 27808.0),
  ('SAN DIEGO', 'Sunline Energy', 1552, 6.0, 28561.0),
  ('SAN DIEGO', 'Sunnova', 1275, 3.0, 11381.0),
  ('SAN DIEGO', 'Self-installed', 1143, 6.0, 20552.0),
  ('SAN DIEGO', 'Sullivan', 951, 6.0, 26045.0)])

In [5]:
# fitted scalers and trained models, keyed by utility name (filled in by loadModels)
scalers = dict()
models = dict()

# function to load the machine learning models
def loadModels(model_dir = "../models", utilities = ('SDGE', 'PGE', 'SCE')):
    """Load the pickled scaler and model of every utility into ``scalers`` / ``models``.

    For each utility name ``U`` the files ``scaler-U.pkl`` and ``xgb_model-U.pkl``
    are read from ``model_dir`` and stored under key ``U`` in the module-level
    ``scalers`` and ``models`` dictionaries (existing entries are overwritten).

    Args:
        model_dir: Directory containing the pickle files.
        utilities: Iterable of utility names to load.

    Raises:
        OSError: if one of the files cannot be opened (e.g. it does not exist).
    """
    for utility in utilities:
        scaler_file = os.path.join(model_dir, "scaler-" + utility + ".pkl")
        model_file = os.path.join(model_dir, "xgb_model-" + utility + ".pkl")

        # NOTE(review): pickle.load can execute arbitrary code contained in the file,
        # so only point this at model files from a trusted source.
        # The with-blocks make sure each file handle is closed right after loading.
        with open(scaler_file, "rb") as scaler_fh:
            scalers[utility] = pickle.load(scaler_fh)
        with open(model_file, "rb") as model_fh:
            models[utility] = pickle.load(model_fh)

# load the pickled scalers and trained XGBoost models into the `scalers` / `models` dicts
loadModels()

In [6]:
# function to one hot encode and add all the needed columns for the scaler to work
def one_hot_encode(df, train_features):
    """One-hot encode the object columns of ``df`` and align it to ``train_features``.

    Every column with dtype ``object`` is replaced by its one-hot columns (named by
    the encoder's ``get_feature_names_out``). The result is then reshaped to exactly
    the columns in ``train_features``, in that order: features missing from ``df``
    are added filled with 0 and columns not listed in ``train_features`` are dropped.

    Args:
        df: DataFrame holding the rows to encode; it is not modified.
        train_features: Sequence of column names that the returned frame must have.

    Returns:
        A new DataFrame with the same index as ``df`` and columns ``train_features``.
    """
    cat_columns = df.dtypes[df.dtypes == "object"].index.tolist()

    # skip the encoder entirely when there is nothing to encode
    if cat_columns:
        enc = OneHotEncoder(sparse_output=False)
        enc_data = enc.fit_transform(df[cat_columns])
        enc_columns = enc.get_feature_names_out().tolist()

        # reuse df's own index so the encoded rows line up with the original rows
        # even when df does not carry a default 0..n-1 index
        encode_df = pd.DataFrame(enc_data, columns=enc_columns, index=df.index)

        # swap the original categorical columns for their encoded counterparts
        df = pd.concat([df.drop(columns=cat_columns), encode_df], axis=1)

    # one pass: add the missing training features as 0, drop unknown columns and
    # put everything in the order the scaler saw during training
    return df.reindex(columns=train_features, fill_value=0)
    
# function to make a prediction from a dictionary of input variables
def get_estimate(utility, data):
    """Return the model predictions for ``data`` using the given utility's scaler and model.

    Args:
        utility: Key into the module-level ``scalers`` and ``models`` dictionaries.
        data: Anything ``pd.DataFrame`` accepts; the callers in this notebook pass a
            dict mapping column names to equal-length lists.

    Returns:
        Whatever the model's ``predict`` returns for the scaled rows.

    Raises:
        KeyError: if ``utility`` has no entry in ``scalers`` or ``models``.
    """
    fitted_scaler, trained_model = scalers[utility], models[utility]

    # encode the raw rows and align them to the columns the scaler was fitted on
    features = one_hot_encode(pd.DataFrame(data), fitted_scaler.feature_names_in_)

    # scale, then predict
    return trained_model.predict(fitted_scaler.transform(features))

In [7]:
# Smoke-test get_estimate: two rows that are identical except for the installer
utility = 'SDGE'

test_data = dict(
    Service_City=['SAN DIEGO'] * 2,
    Technology_Type=['Solar'] * 2,
    System_Size_AC=[7.0] * 2,
    Storage_Size_kW_AC=[0] * 2,
    Mounting_Method=['Rooftop'] * 2,
    Installer_Name=['Tesla', 'Baker'],
    Third_Party_Owned=['No'] * 2,
    Electric_Vehicle=['No'] * 2,
    Generator_Manufacturer=['Other'] * 2,
    Generator_Quantity=[12] * 2,
)

get_estimate(utility, test_data)

array([22770.86, 36573.4 ], dtype=float32)

In [12]:
# making predictions using zip codes
def make_predications(zipcode, kw, ecar, return_summary = False):
    """Estimate the system cost for each top installer of a zip code's city.

    The installers come from ``get_installers(zipcode)``. For each one a model input
    row is built with the requested size ``kw``, the ``ecar`` flag, fixed values
    ('Solar', 'Rooftop', no storage, not third-party owned) and the generator
    manufacturer/quantity recorded for that installer in ``generator_table``.
    All rows are passed to ``get_estimate`` in a single call.

    Args:
        zipcode: Zip code forwarded to ``get_installers``.
        kw: Value used for 'System_Size_AC' in every row.
        ecar: Value used for 'Electric_Vehicle' in every row (e.g. 'No').
        return_summary: When False (default) only the predictions are returned.
            When True a DataFrame is returned with one row per installer and the
            columns Service_City, Installer_Name, Installation_Count, Avg_Size_AC,
            Size_AC, Avg_Cost and Est_Cost.

    Returns:
        The value returned by ``get_estimate`` (one prediction per installer), or
        the summary DataFrame when ``return_summary`` is True.

    Raises:
        KeyError: if an installer has no entry in ``generator_table``.
    """
    # get the utility and top 10 installers for the particular zipcode
    utility, installers = get_installers(zipcode)

    # model inputs, one list entry per installer
    pred_data = {
        'Service_City': [],
        'Technology_Type': [],
        'System_Size_AC': [],
        'Storage_Size_kW_AC': [],
        'Mounting_Method': [],
        'Installer_Name': [],
        'Third_Party_Owned': [],
        'Electric_Vehicle': [],
        'Generator_Manufacturer': [],
        'Generator_Quantity': []
    }

    # per-installer summary: historical figures next to the requested size and estimate
    estimate_data = {
        'Service_City': [],
        'Installer_Name': [],
        'Installation_Count': [],
        'Avg_Size_AC': [],
        'Size_AC': [],
        'Avg_Cost': [],
        'Est_Cost': []
    }

    for installer in installers:
        print("Installer Info:", installer)
        city, installer_name, install_count, avg_size, avg_cost = installer
        manufacturer, quantity = generator_table[installer_name]

        # summary row; Size_AC must be filled too so every column has the same length
        estimate_data['Service_City'].append(city)
        estimate_data['Installer_Name'].append(installer_name)
        estimate_data['Installation_Count'].append(install_count)
        estimate_data['Avg_Size_AC'].append(avg_size)
        estimate_data['Size_AC'].append(kw)
        estimate_data['Avg_Cost'].append(avg_cost)

        # model input row
        pred_data['Service_City'].append(city)
        pred_data['Technology_Type'].append('Solar')
        pred_data['System_Size_AC'].append(kw)
        pred_data['Storage_Size_kW_AC'].append(0)
        pred_data['Mounting_Method'].append('Rooftop')
        pred_data['Installer_Name'].append(installer_name)
        pred_data['Third_Party_Owned'].append('No')
        pred_data['Electric_Vehicle'].append(ecar)
        pred_data['Generator_Manufacturer'].append(manufacturer) # generator recorded for this installer
        pred_data['Generator_Quantity'].append(quantity) # quantity recorded alongside that generator

    # predict all installers at once and attach the estimates to the summary
    estimates = get_estimate(utility, pred_data)
    estimate_data['Est_Cost'] = estimates

    if return_summary:
        return pd.DataFrame(estimate_data)
    return estimates

# test the function
estimates = make_predications('92130', 7.0, 'No')

SELECT "Service_City", "Installer_Name", COUNT("Installer_Name"), ROUND (AVG("System_Size_AC")), ROUND(AVG("Total_System_Cost")) FROM "CA" WHERE "Service_City" = (SELECT "Service_City" FROM "CA" WHERE "Service_Zip" = '92130' LIMIT 1) GROUP BY "Service_City", "Installer_Name" HAVING "Installer_Name" != 'Other' ORDER BY COUNT("Installer_Name") DESC LIMIT 10
Installer Info: ('SAN DIEGO', 'Tesla', 5974, 7.0, 28104.0)
Installer Info: ('SAN DIEGO', 'Baker', 5679, 7.0, 37402.0)
Installer Info: ('SAN DIEGO', 'Semper', 4822, 6.0, 29833.0)
Installer Info: ('SAN DIEGO', 'Stellar', 2528, 6.0, 31281.0)
Installer Info: ('SAN DIEGO', 'SunPower', 2320, 5.0, 32268.0)
Installer Info: ('SAN DIEGO', 'Sunrun', 2287, 5.0, 27808.0)
Installer Info: ('SAN DIEGO', 'Sunline Energy', 1552, 6.0, 28561.0)
Installer Info: ('SAN DIEGO', 'Sunnova', 1275, 3.0, 11381.0)
Installer Info: ('SAN DIEGO', 'Self-installed', 1143, 6.0, 20552.0)
Installer Info: ('SAN DIEGO', 'Sullivan', 951, 6.0, 26045.0)
Data to predict {'Servi