# ETL Notebook For Preparing Data For Tableau Exploration

In [1]:
# import pandas
import pandas as pd
import glob

pd.set_option('display.max_columns', None)

In [3]:
# Specify the data files location
data_dir = "/Users/ns96/Documents/ML_Project/"

df_list = []
for file in glob.glob(data_dir + "*_year.csv"):
    print(file)
    df_list.append(pd.read_csv(file, skipfooter=1))

df = pd.concat(df_list)

print(df.shape)

/Users/ns96/Documents/ML_Project/projects_sce_5_year.csv


  df_list.append(pd.read_csv(file, skipfooter=1))


/Users/ns96/Documents/ML_Project/projects_pge_5_year.csv


  df_list.append(pd.read_csv(file, skipfooter=1))


/Users/ns96/Documents/ML_Project/projects_sdge_5_year.csv


  df_list.append(pd.read_csv(file, skipfooter=1))


(1017690, 151)


In [4]:
# drop all rows will all nulls
df.dropna(axis=1, how='all', inplace=True)

In [5]:
# drop all rows which does not have a Total System Cost and Itc Cost Basis
df.dropna(subset=['Total System Cost', 'Itc Cost Basis'], how='all', inplace=True)

# lets fill any na in the remaining rwos with zero
df['Total System Cost'] = df['Total System Cost'].fillna(0)
df['Itc Cost Basis'] = df['Itc Cost Basis'].fillna(0)
df["TOTAL_COST"] = df['Total System Cost'] + df['Itc Cost Basis']

print(df.shape)

(1006696, 152)


In [6]:
# Delete repetitions
df = df[~df['Application Id'].duplicated(keep=False)]

In [7]:
# Even though we downloaded data for the last five years from the website, the files still have old values.
# Saving onlt last 5 years (2018-2023)

# Convert 'App Received Date' column to datetime format
df['App Received Date'] = pd.to_datetime(df['App Received Date'], errors='coerce')

# Filter the DataFrame to keep only rows with 'App Received Date' on or after January 1, 2018
df = df[df['App Received Date'] >= '2018-01-01']
print(df.shape)

(1005739, 152)


In [8]:
# select only the Solar and Residential
valid_technology_types = ['Solar PV', 'Solar', 'Solar PV, Storage', 'Solar PV;Storage', 'Advanced Energy Storage', 'Storage', 'Energy Storage']

# Filter the DataFrame to keep only the rows with valid technology types
df = df[df['Technology Type'].isin(valid_technology_types)]

In [9]:
# Remaming technology type datapoints
df['Technology Type'] = df['Technology Type'].replace({'Solar PV': 'Solar', 'Solar PV;Storage': 'Solar, Storage',\
                                                       'Advanced Energy Storage': 'Storage', \
                                                      'Energy Storage': 'Storage'})

In [10]:
# Filter the DataFrame to keep only the rows with valid customer sectors
valid_customer_sectors = ['Residential']
df = df[df['Customer Sector'].isin(valid_customer_sectors)]

In [11]:
# Delete redundant columns 
columns_to_remove = ['Matched CSI Application Number', 'Application Status', 'System Size DC', 'App Complete Date',\
                    'App Approved Date', 'Installer Phone', 'Installer City', 'Installer State', 'Installer Zip',\
                    'Pace Financed', 'Pace Financier', 'Previous Application', 'Previous Application Ids',\
                     'VNEM, NEM-V, NEM-Agg', 'NEMPV or nonNEMPV', 'VNEM ID', 'Match Somah Application']

# Remove specified columns
df = df.drop(columns=columns_to_remove)

# Display the DataFrame after removing columns
df.head()

Unnamed: 0,Application Id,Utility,Service City,Service Zip,Service County,Technology Type,System Size AC,Storage Capacity (kWh),Storage Size (kW AC),Inverter Size (kW AC),Tilt,Azimuth,Mounting Method,Tracking,Customer Sector,App Received Date,Self Installer,Installer Name,CSLB Number,Third Party Owned,Third Party Owned Type,Third Party Name,Electric Vehicle,Electric Vehicle Count,System Output Monitoring,System Output Reports To Vendor?,System Output Monitoring Provider,Total System Cost,Itc Cost Basis,NEM Tariff,Interconnection Program,"Project is VNEM, NEM-V, NEM-Agg?",Generator Model 1,Generator Manufacturer 1,Generator Quantity 1,Generator Model 2,Generator Manufacturer 2,Generator Quantity 2,Generator Model 3,Generator Manufacturer 3,Generator Quantity 3,Generator Model 4,Generator Manufacturer 4,Generator Quantity 4,Generator Model 5,Generator Manufacturer 5,Generator Quantity 5,Generator Model 6,Generator Manufacturer 6,Generator Quantity 6,Generator Model 7,Generator Manufacturer 7,Generator Quantity 7,Generator Model 8,Generator Manufacturer 8,Generator Quantity 8,Generator Model 9,Generator Manufacturer 9,Generator Quantity 9,Generator Model 10,Generator Manufacturer 10,Generator Quantity 10,Generator Model 11,Generator Manufacturer 11,Generator Quantity 11,Generator Model 12,Generator Manufacturer 12,Generator Quantity 12,Inverter Model 1,Inverter Manufacturer 1,Inverter Quantity 1,Inverter Model 2,Inverter Manufacturer 2,Inverter Quantity 2,Inverter Model 3,Inverter Manufacturer 3,Inverter Quantity 3,Inverter Model 4,Inverter Manufacturer 4,Inverter Quantity 4,Inverter Model 5,Inverter Manufacturer 5,Inverter Quantity 5,Inverter Model 6,Inverter Manufacturer 6,Inverter Quantity 6,Inverter Model 7,Inverter Manufacturer 7,Inverter Quantity 7,Inverter Model 8,Inverter Manufacturer 8,Inverter Quantity 8,Inverter Model 9,Inverter Manufacturer 9,Inverter Quantity 9,Inverter Model 10,Inverter Manufacturer 10,Inverter Quantity 10,Inverter Model 11,Inverter Manufacturer 11,Inverter Quantity 11,Inverter Model 12,Inverter Manufacturer 12,Inverter Quantity 12,Inverter Model 13,Inverter Manufacturer 13,Inverter Quantity 13,Inverter Model 14,Inverter Manufacturer 14,Inverter Quantity 14,Inverter Model 15,Inverter Manufacturer 15,Inverter Quantity 15,Inverter Model 16,Inverter Manufacturer 16,Inverter Quantity 16,Inverter Model 17,Inverter Manufacturer 17,Inverter Quantity 17,Inverter Model 18,Inverter Manufacturer 18,Inverter Quantity 18,Inverter Model 19,Inverter Manufacturer 19,Inverter Quantity 19,Inverter Model 20,Inverter Manufacturer 20,Inverter Quantity 20,Inverter Model 21,Inverter Manufacturer 21,Inverter Quantity 21,Inverter Model 22,Inverter Manufacturer 22,Inverter Quantity 22,TOTAL_COST
3,SCE-INT-501116044,SCE,DAGGETT,92327.0,San Bernardino,Solar,137.21,,,,25.0,180.0,Ground,Fixed,Residential,2019-01-15,No,Shorebreak Energy Developers LLC,972616.0,Yes,PPA,Shorebreak Energy Developers LLC,No,,Yes,No,,0.0,496000.0,1.0,NEM,No,SW315XL,SolarWorld,508.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,STP24000TL-US-10 (480V),SMA America,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,496000.0
8,SCE-INT-501164421,SCE,TULARE,93274.0,Tulare,Solar,26.197,,,,18.0,180.0,Rooftop,Fixed,Residential,2019-06-20,No,CENTRAL CALIFORNIA SOLAR ELECTRIC,941526.0,No,,,No,,No,,,1.0,0.0,1.0,NEM,No,Generic Module Model,Generic Module Mfr,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Generic Inverter Model,Generic Inverter Mfr,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
10,SCE-INT-501176158,SCE,SPRINGVILLE,93265.0,Tulare,Solar,18.444,,,,15.0,210.0,Rooftop,Fixed,Residential,2019-10-02,No,Good Energy Renewables,988879.0,No,,,No,,No,,,1.0,0.0,1.0,NEM,No,Generic Module Model,Generic Module Mfr,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Generic Inverter Model,Generic Inverter Mfr,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
12,SCE-INT-501186076,SCE,LONG BEACH,90814.0,Los Angeles,Solar,3.459,,,,28.0,180.0,Rooftop,Fixed,Residential,2019-11-12,No,SUNERGY CONSTRUCTION INC,1005730.0,No,,,No,,No,,,1.0,0.0,1.0,NEM,No,Generic Module Model,Generic Module Mfr,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Generic Inverter Model,Generic Inverter Mfr,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
14,SCE-INT-501194903,SCE,WESTMINSTER,92683.0,Orange,Solar,7.407,,,,18.0,180.0,Rooftop,Fixed,Residential,2019-11-01,No,TESLA ENERGY OPERATIONS INC,888104.0,No,,,No,,No,,,1.0,0.0,1.0,NEM,No,Generic Module Model,Generic Module Mfr,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Generic Inverter Model,Generic Inverter Mfr,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0


## Clean up the installer names

In [12]:
# clean up installer names
df_installer_names = df[['Installer Name']].dropna().copy()
df_installer_names = df_installer_names.rename(columns={'Installer Name': 'installer_name'})

df_installer_names = df_installer_names.drop_duplicates()
df_installer_names.shape

(7755, 1)

In [13]:
df_installer_names_counts = df_installer_names.groupby('installer_name').size().reset_index()
df_installer_names_counts = df_installer_names_counts.rename(columns={0: 'counts'})
df_installer_names_counts['pct'] = df_installer_names_counts['counts']/df_installer_names_counts['counts'].sum()
df_installer_names_counts

Unnamed: 0,installer_name,counts,pct
0,"""GOT WATTS ELECTRIC, SOLAR & HVAC""",1,0.000129
1,"""INFINIUM SOLAR, INC.""",1,0.000129
2,0.0,1,0.000129
3,1 ON 1 TECHNOLOGY,1,0.000129
4,1 ST CHOICE ENERGY BUILDERS INC,1,0.000129
...,...,...,...
7750,west coast solar,1,0.000129
7751,western sierra,1,0.000129
7752,wiring innovations,1,0.000129
7753,your energy solutions,1,0.000129


In [14]:
# base list of installers
installers = {'Tesla': 1,
             'SolarCity': 1,
             'Sunrun': 1, 
             'Vivint': 1,
             'SunPower': 1,
             'PetersenDean': 2,
             'Sungevity': 2,
             'Spectrum': 2,
             'Sunnova': 2,
             'Baker': 2,
             'Spruce': 2,
             'Kilowatt': 2,
             'CPF': 2,
             'Sullivan': 2,
             'Verengo': 2,
             'ASI': 2,
             'Semper': 2,
             'Horizon': 2,
             'Lennar': 2,
             'SolarMax': 2,
             'SunWorks': 2,
             'The Solar Company': 2,
             'Alternative Energy': 2,
             'Stellar': 2,
             'Westhaven': 2,
             'Suncrest': 2,
             'A1 Solar': 2,
             'West Coast Solar': 2,
             'Future Energy': 2,
             'Smart Energy': 2,
             'Enver Solar': 2,
             'Bland': 3,
             'Solar Universe': 3,
             'Solcius': 3,
             'Grid Alternatives': 3,
             'Revolve Solar': 3,
             'Solare Energy': 3,
             'Helio': 3,
             'NRG': 3,
             'Clean Solar': 3,
             'Sierra Pacific': 3,
             '1st Light': 3,
             'Cobalt': 3,
             'Shorebreak': 3,
             'Renova': 3,
             'Arise': 3,
             'Infinity Energy': 3,
             'Planer': 3,
             'Solartec': 3,
             'LA Solar': 3,
             'Fidelity': 3,
             'Cosmic': 3,
             'Fralick Homes': 3,
             'Bay Area': 3,
             'Solar Technologies': 3,
             'Natural Energy': 3,
             'GCI': 3,
             'Complete': 3,
             'Secure Roofing': 3,
             'Palomar Solar': 3,
             'Solaire Energy': 3,
             'Sun Solar Energy': 3,
             'Sunline Energy': 3,
             'Kuykendall': 3,
             'Elevate': 3,
             'Nexus Energy': 3,
             'Sky Power': 3,
             'Sunstreet': 3,
             'Sunrise': 3,
              'Alterra': 3,
              'Millholland': 3,
              'New Day Solar': 3,
              'Hot Purple Energy': 3,
              'Divine Power': 3,
              'Precis': 3,
              'SunFusion Solar': 3,
              'SolarCraft Services': 3,
              'Solarponics': 3,
              'North State Solar': 3,
              'Self-installed': 3}

In [15]:
COMPANY_SUFFIXES = ['com', 'in', 'int', 'international', 'inc', 'incorporated',
                    'incorporation', 'corp', 'corporation', 'cos', 'co', '& co',
                    'intl', 'ltd', 'limited', 'plc', 'llc', 'holdings', 'hldgs',
                    'partners', 'cl', 'pl', 'technology', 'technologies', 'energy']
import re
def preprocess(text):
    # lowercase
    text = text.lower()

    # removing punctuations in string
    text = re.sub(r'[^\w\s]', '', text)

    # remove company suffixes:
    text = ' '.join([p for p in text.split() if p not in COMPANY_SUFFIXES])
    return text.strip()

# preprocess candidate column
installers_preprocessed = {preprocess(txt): txt for txt in installers.keys()}
installers_preprocessed

# preprocess target
df_installer_names['installer_name_preprocessed'] = df_installer_names['installer_name'].apply(lambda x: preprocess(x))

# try fuzzy matching
from thefuzz import process, fuzz

df_installer_names['fuzzy_match'] = df_installer_names['installer_name_preprocessed'].apply(
    lambda x: process.extractOne(x, installers_preprocessed.keys(), scorer=fuzz.token_set_ratio))

df_installer_names[['fuzzy_match', 'fuzzy_match_score']] = df_installer_names['fuzzy_match'].to_list()
df_installer_names

df_installer_names['fuzzy_match_score'].value_counts()

# merge with counts to
df_installer_names = df_installer_names.merge(df_installer_names_counts, on='installer_name')
df_installer_names.sort_values(['counts', 'fuzzy_match_score'], ascending=False)

Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '']


Unnamed: 0,installer_name,installer_name_preprocessed,fuzzy_match,fuzzy_match_score,counts,pct
0,Shorebreak Energy Developers LLC,shorebreak developers,shorebreak,100,1,0.000129
1,CENTRAL CALIFORNIA SOLAR ELECTRIC,central california solar electric,solar,100,1,0.000129
4,TESLA ENERGY OPERATIONS INC,tesla operations,tesla,100,1,0.000129
7,Infinity Energy,infinity,infinity,100,1,0.000129
8,One Love Solar,one love solar,solar,100,1,0.000129
...,...,...,...,...,...,...
188,ENERGY +,,tesla,0,1,0.000129
1425,ENERGY,,tesla,0,1,0.000129
2045,JJ ENERGY INC,jj,tesla,0,1,0.000129
3045,0.0,00,tesla,0,1,0.000129


In [16]:
fuzzy_match_map = df_installer_names[df_installer_names['fuzzy_match_score'] == 100].copy()
fuzzy_match_map['standard_name'] = fuzzy_match_map['fuzzy_match'].map(installers_preprocessed)

In [17]:
col = "Installer Name"
df = df.merge(fuzzy_match_map[['installer_name', 'standard_name']], 
              left_on=col, right_on='installer_name',
             how='left').drop(columns=['installer_name'])
df = df.rename(columns={'standard_name': f"{col}_standard_name"})

In [18]:
# Rename Vivint to Sunrun and  Lennar solar to Sunnova due to acquisitions
df['Installer Name_standard_name'] = df['Installer Name_standard_name'].fillna('Other')
df['Installer Name'] = df['Installer Name_standard_name'].replace({'Vivint': 'Sunrun', 'Lennar': 'Sunnova'})

In [19]:
#drop the Installer Name_standard_name
df.drop(columns=['Installer Name_standard_name'], inplace=True)
df.columns.to_list()

['Application Id',
 'Utility',
 'Service City',
 'Service Zip',
 'Service County',
 'Technology Type',
 'System Size AC',
 'Storage Capacity (kWh)',
 'Storage Size (kW AC)',
 'Inverter Size (kW AC)',
 'Tilt',
 'Azimuth',
 'Mounting Method',
 'Tracking',
 'Customer Sector',
 'App Received Date',
 'Self Installer',
 'Installer Name',
 'CSLB Number',
 'Third Party Owned',
 'Third Party Owned Type',
 'Third Party Name',
 'Electric Vehicle',
 'Electric Vehicle Count',
 'System Output Monitoring',
 'System Output Reports To Vendor?',
 'System Output Monitoring Provider',
 'Total System Cost',
 'Itc Cost Basis',
 'NEM Tariff',
 'Interconnection Program',
 'Project is VNEM, NEM-V, NEM-Agg?',
 'Generator Model 1',
 'Generator Manufacturer 1',
 'Generator Quantity 1',
 'Generator Model 2',
 'Generator Manufacturer 2',
 'Generator Quantity 2',
 'Generator Model 3',
 'Generator Manufacturer 3',
 'Generator Quantity 3',
 'Generator Model 4',
 'Generator Manufacturer 4',
 'Generator Quantity 4',
 'G

## Export CSV for Tablaue Exploration

In [20]:
# drop all rows with cost less than 1000? We have to figure cut off point!
df = df[df['TOTAL_COST'] > 1000]

In [21]:
df['TOTAL_COST'].describe()

count    9.758630e+05
mean     3.058590e+04
std      2.300458e+04
min      1.000140e+03
25%      1.785000e+04
50%      2.618084e+04
75%      3.811800e+04
max      2.995626e+06
Name: TOTAL_COST, dtype: float64

In [22]:
# let write out three different csv for each untility
df['Utility'].value_counts()

Utility
PGE     445673
SCE     354076
SDGE    176114
Name: count, dtype: int64

In [23]:
# seperate out by utility and export to seperate csv files
df_PGE = df[df['Utility'] == 'PGE']
df_SCE = df[df['Utility'] == 'SCE']
df_SDGE = df[df['Utility'] == 'SDGE']

df_PGE.to_csv(data_dir + "PGE.csv", index=False)
df_SCE.to_csv(data_dir + "SCE.csv", index=False)
df_SDGE.to_csv(data_dir + "SDGE.csv", index=False)

In [24]:
# create a stratified dataset
#https://www.geeksforgeeks.org/stratified-sampling-in-pandas/