# ETL Notebook For Preparing Data For Tableau Exploration and ML

In [None]:
import pandas as pd
import glob

In [None]:
# Merging files together

# Specify the data files location
data_dir = "/Users/ns96/Documents/ML_Project/"

# Read every "*_year.csv" file in data_dir. sorted() makes the concatenation
# order reproducible, because glob returns matches in arbitrary order.
df_list = []
for file in sorted(glob.glob(data_dir + "*_year.csv")):
    print(file)
    # skipfooter drops the last line of each file. It is only supported by the
    # python parser, so request that engine explicitly instead of relying on
    # pandas' warn-and-fall-back behaviour.
    df_list.append(pd.read_csv(file, skipfooter=1, engine="python"))

# ignore_index gives the merged frame a unique 0..n-1 index. Without it every
# file restarts at 0, and later index-aligned assignments see duplicate labels.
df = pd.concat(df_list, ignore_index=True)
df

In [None]:
# Drop columns with all NaN values
df_cleaned = df.dropna(axis=1, how='all')

In [None]:
# Some ids are repeated; although that's not a big number: 730 ids have repetitions across all files
a = df['Application Id'].value_counts()
a[a>1]

In [None]:
# Delete repetitions
df = df[~df['Application Id'].duplicated(keep=False)]

In [None]:
# Check for repetitions again
a = df['Application Id'].value_counts()
a[a>1]

In [None]:
# The data was downloaded for the last five years, but the files still contain older rows.
# Keep only the last 5 years (2018-2023).

# Parse 'App Received Date'; values that cannot be parsed become NaT
df['App Received Date'] = pd.to_datetime(df['App Received Date'], errors='coerce')

# Keep rows received on or after January 1, 2018
cutoff = '2018-01-01'
on_or_after_cutoff = df['App Received Date'] >= cutoff
df = df.loc[on_or_after_cutoff]

# Report the earliest remaining date as a check on the filter
earliest_date_after_filtering = df['App Received Date'].min()
print("Earliest Date after Filtering:", earliest_date_after_filtering)


In [None]:
# Technology Type columns exploration
# Top technology types
top_technologies = df['Technology Type'].value_counts().head(25)
top_technologies

In [None]:
# Removing redundant datapoints
valid_technology_types = ['Solar PV', 'Solar', 'Solar PV, Storage', 'Solar PV;Storage', 'Advanced Energy Storage', 'Storage', 'Energy Storage']

# Filter the DataFrame to keep only the rows with valid technology types
df = df[df['Technology Type'].isin(valid_technology_types)]

# Display the updated DataFrame
print(df['Technology Type'].value_counts().head(25))

In [None]:
# Renaming technology type datapoints so that each technology has a single label.
# 'Solar PV, Storage' and 'Solar PV;Storage' are the same combination written with
# different separators, so both are mapped to 'Solar, Storage'; leaving one of them
# unmapped would keep two categories for the same thing.
technology_type_renames = {
    'Solar PV': 'Solar',
    'Solar PV, Storage': 'Solar, Storage',
    'Solar PV;Storage': 'Solar, Storage',
    'Advanced Energy Storage': 'Storage',
    'Energy Storage': 'Storage',
}
df['Technology Type'] = df['Technology Type'].replace(technology_type_renames)

# Display the updated value counts
print(df['Technology Type'].value_counts().head(25))

In [None]:
# Customer Sector columns exploration
customer_sectors = df['Customer Sector'].value_counts().head(25)
customer_sectors

In [None]:
# Removing redundant datapoints
valid_customer_sectors = ['Residential']

# Filter the DataFrame to keep only the rows with valid customer sectors
df = df[df['Customer Sector'].isin(valid_customer_sectors)]

# Display the updated DataFrame
print(df['Customer Sector'].value_counts().head(25))

In [None]:
# Print all column names
column_names = df.columns.tolist()
print("Column Names:", column_names)

In [None]:
# Columns that are removed from the frame before further processing
columns_to_remove = [
    'Application Id',
    'Storage Capacity (kWh)',
    'Tilt',
    "Azimuth",
    'Matched CSI Application Number',
    'Electric Vehicle Count',
    'System Output Monitoring',
    'System Output Reports To Vendor?',
    'System Output Monitoring Provider',
    'NEM Tariff',
    'Tracking',
    'Interconnection Program',
    'Application Status',
    'System Size DC',
    'Project is VNEM, NEM-V, NEM-Agg?',
    'Customer Sector',
    'App Complete Date',
    'App Approved Date',
    'Installer Phone',
    'Installer City',
    'Installer State',
    'Installer Zip',
    'Pace Financed',
    'Pace Financier',
    'Previous Application',
    'Previous Application Ids',
    'VNEM, NEM-V, NEM-Agg',
    'NEMPV or nonNEMPV',
    'VNEM ID',
    'Match Somah Application',
]

# Drop them (raises KeyError if any listed column is missing)
df = df.drop(columns_to_remove, axis='columns')

# Preview the frame after removing the columns
df.head()

In [None]:
# Fill missing "Installer Name" values with "Self-installed" for rows whose
# "Self Installer" flag is yes. Existing installer names are left untouched.
# The flag is matched case-insensitively (and ignoring surrounding spaces) so that
# 'Yes', 'yes' and 'YES' are all recognised; an exact-case comparison silently
# skips every row when the data uses a different capitalisation.
is_self_installed = df['Self Installer'].astype(str).str.strip().str.lower() == 'yes'
df.loc[is_self_installed, 'Installer Name'] = df.loc[is_self_installed, 'Installer Name'].fillna('Self-installed')

# Display the updated dataframe
df

In [None]:
# Create a copy of the column
df_installer_names = df[['Installer Name']].copy()

# Replace NaN values with 'Other'
df_installer_names['Installer Name'].fillna('Other', inplace=True)

df_installer_names = df_installer_names.rename(columns={'Installer Name': 'installer_name'})
df_installer_names

In [None]:
df_installer_names_counts = df_installer_names.groupby('installer_name').size().reset_index()
df_installer_names_counts = df_installer_names_counts.rename(columns={0: 'counts'})
df_installer_names_counts['pct'] = df_installer_names_counts['counts']/df_installer_names_counts['counts'].sum()
df_installer_names_counts

In [None]:
# base list of installers
# Each name maps to an integer (1, 2 or 3). The names are listed per integer value,
# in the order in which they appear in the resulting dict.
# NOTE(review): the meaning of the integer is not shown in this notebook - presumably a size tier.
_GROUP_1_INSTALLERS = ['Tesla', 'SolarCity', 'Sunrun', 'Vivint', 'SunPower']

_GROUP_2_INSTALLERS = [
    'PetersenDean', 'Sungevity', 'Spectrum', 'Sunnova', 'Baker', 'Spruce',
    'Kilowatt', 'CPF', 'Sullivan', 'Verengo', 'ASI', 'Semper', 'Horizon',
    'Lennar', 'SolarMax', 'SunWorks', 'The Solar Company', 'Alternative Energy',
    'Stellar', 'Westhaven', 'Suncrest', 'A1 Solar', 'West Coast Solar',
    'Future Energy', 'Smart Energy', 'Enver Solar',
]

_GROUP_3_INSTALLERS = [
    'Bland', 'Solar Universe', 'Solcius', 'Grid Alternatives', 'Revolve Solar',
    'Solare Energy', 'Helio', 'NRG', 'Clean Solar', 'Sierra Pacific', '1st Light',
    'Cobalt', 'Shorebreak', 'Renova', 'Arise', 'Infinity Energy', 'Planer',
    'Solartec', 'LA Solar', 'Fidelity', 'Cosmic', 'Fralick Homes', 'Bay Area',
    'Natural Energy', 'GCI', 'Complete', 'Secure Roofing', 'Palomar Solar',
    'Solaire Energy', 'Sun Solar Energy', 'Sunline Energy', 'Kuykendall',
    'Elevate', 'Nexus Energy', 'Sky Power', 'Sunstreet', 'Sunrise', 'Alterra',
    'Millholland', 'New Day Solar', 'Hot Purple Energy', 'Divine Power', 'Precis',
    'SunFusion Solar', 'SolarCraft Services', 'Solarponics', 'North State Solar',
    'Other', 'Self-installed',
]

installers = {
    name: group
    for group, names in (
        (1, _GROUP_1_INSTALLERS),
        (2, _GROUP_2_INSTALLERS),
        (3, _GROUP_3_INSTALLERS),
    )
    for name in names
}

# Keep one row per distinct installer name (the first occurrence), then show the frame's shape
df_installer_names = df_installer_names.drop_duplicates(keep='first')
df_installer_names.shape

In [None]:
import re

# Tokens removed from company names before matching.
# NOTE(review): '& co' can never match, because punctuation is stripped and names
# are split on whitespace before tokens are compared against this list.
COMPANY_SUFFIXES = [
    'com', 'in', 'int', 'international', 'inc', 'incorporated',
    'incorporation', 'corp', 'corporation', 'cos', 'co', '& co',
    'intl', 'ltd', 'limited', 'plc', 'llc', 'holdings', 'hldgs',
    'partners', 'cl', 'pl', 'technology', 'technologies', 'energy',
]


def preprocess(text):
    """Normalise a company name for matching.

    The name is lower-cased, every character that is neither a word character
    nor whitespace is deleted, and whitespace-separated tokens listed in
    COMPANY_SUFFIXES are removed. The remaining tokens are joined with single
    spaces.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    kept_tokens = []
    for token in cleaned.split():
        if token in COMPANY_SUFFIXES:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens).strip()

from thefuzz import process, fuzz

# Map every preprocessed candidate name back to its original spelling
installers_preprocessed = {preprocess(name): name for name in installers}
installers_preprocessed

# Preprocess the observed installer names in the same way
df_installer_names['installer_name_preprocessed'] = df_installer_names['installer_name'].apply(preprocess)


def _best_installer_match(name):
    """Return the best-scoring candidate for *name* using the token_set_ratio scorer."""
    return process.extractOne(name, installers_preprocessed.keys(), scorer=fuzz.token_set_ratio)


# Fuzzy-match every observed name against the candidates
df_installer_names['fuzzy_match'] = df_installer_names['installer_name_preprocessed'].apply(_best_installer_match)

# Split each match result into the matched candidate and its score
match_results = df_installer_names['fuzzy_match'].to_list()
df_installer_names[['fuzzy_match', 'fuzzy_match_score']] = match_results
df_installer_names

df_installer_names['fuzzy_match_score'].value_counts()

# Attach the per-name row counts, then list the most frequent names first
df_installer_names = df_installer_names.merge(df_installer_names_counts, on='installer_name')
df_installer_names.sort_values(['counts', 'fuzzy_match_score'], ascending=False)


In [None]:
### Step 2: Taking those with 100 fuzzy match score

In [None]:
fuzzy_match_map = df_installer_names[df_installer_names['fuzzy_match_score'] == 100].copy()
fuzzy_match_map['standard_name'] = fuzzy_match_map['fuzzy_match'].map(installers_preprocessed)
fuzzy_match_map

In [None]:
# Merging two dataframe with updated installer names
col = "Installer Name"
df = df.merge(fuzzy_match_map[['installer_name', 'standard_name']], 
              left_on=col, right_on='installer_name',
             how='left').drop(columns=['installer_name'])
df = df.rename(columns={'standard_name': f"{col}_standard_name"})

df.head()

In [None]:
df.info(max_cols=200)

In [None]:
#Checking how many unique names are in the dataframe 
unique_installers_count = df['Installer Name_standard_name'].nunique()
print("Number of unique installers:", unique_installers_count)

In [None]:
installer_names = df['Installer Name_standard_name'].unique()

# Display the names of all installers
print("Names of all installers:")
for installer_name in installer_names:
    print(installer_name)

In [None]:
# Rename Vivint to Sunrun, Lennar solar to Sunnova, Solarcity to Tesla due to acquisitions

df['Installer Name_standard_name'] = df['Installer Name_standard_name'].replace({'Vivint': 'Sunrun', 'Lennar': 'Sunnova', 'SolarCity': 'Tesla'})


In [None]:
# Checking if cleaning worked properly 
top_installers_names = df['Installer Name_standard_name'].value_counts().head(50)
top_installers_names

In [None]:
# Replace rows where 'Installer Name' is not available with "Other"
# Create a copy of the dataframe
df3 = df.copy()

# Replace NaN values in 'Installer Name_standard_name' with 'Other'
df3['Installer Name_standard_name'].fillna('Other', inplace=True)

# Display the updated dataframe
df3

In [None]:
# Move "Installer Name_standard_name" column next to "Installer Name"
columns = df3.columns.tolist()
columns.remove("Installer Name_standard_name")
columns.insert(columns.index("Installer Name") + 1, "Installer Name_standard_name")
df4 = df3[columns]


In [None]:
# Drop not used columns
df4 = df4.drop(columns=['CSLB Number', 'Third Party Owned Type', 'Third Party Name'])

In [None]:
# Replace non-"Rooftop" values in "Mounting Method" column with "Other"
df4.loc[df4['Mounting Method'] != 'Rooftop', 'Mounting Method'] = 'Other'

In [None]:
# Removing all Generator Model columns 1 through 12
# Define the columns to be removed
columns_to_remove = [f'Generator Model {i}' for i in range(1, 13)]

# Drop the columns
df4 = df4.drop(columns=columns_to_remove)

In [None]:
# Same process for Inverter models columns 1 through 23

# Define the columns to be removed
columns_to_remove = [f'Inverter Model {i}' for i in range(1, 23)]

# Drop the columns
df4 = df4.drop(columns=columns_to_remove)

In [None]:
# Convert 'Total System Cost' to numeric; values that cannot be parsed become NaN
df4['Total System Cost'] = pd.to_numeric(df4['Total System Cost'], errors='coerce')

# Where 'Total System Cost' is missing, fall back to 'Itc Cost Basis' unchanged.
# (An earlier variant divided the cost basis by 0.3; it is kept below for reference.)
#df4.loc[df4['Total System Cost'].isna(), 'Total System Cost'] = df4['Itc Cost Basis']/0.3
# The fallback is coerced to numeric as well, so a non-numeric cost basis becomes NaN
# instead of putting strings into the numeric cost column.
missing_cost = df4['Total System Cost'].isna()
itc_cost_basis = pd.to_numeric(df4['Itc Cost Basis'], errors='coerce')
df4.loc[missing_cost, 'Total System Cost'] = itc_cost_basis[missing_cost]

In [None]:
# Remove all Generator manufacturer columns 2 through 12
# Define the columns to be removed
columns_to_remove = [f'Generator Manufacturer {i}' for i in range(2, 13)]

# Drop the columns
df4 = df4.drop(columns=columns_to_remove)

In [None]:
# Rename the specific column
df4.rename(columns={'Generator Manufacturer 1': 'Generator_Manufacturer'}, inplace=True)

In [None]:
# Same process for Inverter Manufacturer columns 1 through 23
# Define the columns to be removed
columns_to_remove = [f'Inverter Manufacturer {i}' for i in range(2, 23)]

# Drop the columns
df4 = df4.drop(columns=columns_to_remove)

In [None]:
# Rename the specific column
df4.rename(columns={'Inverter Manufacturer 1': 'Inverter_Manufacturer'}, inplace=True)

In [None]:
column_names = df4.columns.tolist()
print("Column Names:", column_names)

In [None]:
# Create the new column and fill it with the sum of values in "Generator Quantity 1" through "Generator Quantity 12"
df4['Generator_Quantity'] = df4.loc[:, 'Generator Quantity 1':'Generator Quantity 12'].sum(axis=1)

# Drop the columns "Generator Quantity 1" through "Generator Quantity 12"
df4 = df4.drop(columns=[f'Generator Quantity {i}' for i in range(1, 13)])

In [None]:
# Create the new column and fill it with the sum of values in "Inverter Quantity 1" through "Inverter Quantity 22"
df4['Inverter_Quantity'] = df4.loc[:, 'Inverter Quantity 1':'Inverter Quantity 22'].sum(axis=1)

# Drop the columns "Inverter Quantity 1" through "Inverter Quantity 22"
df4 = df4.drop(columns=[f'Inverter Quantity {i}' for i in range(1, 23)])

In [None]:
# Replace spaces with underscores in column names
df4.columns = df4.columns.str.replace(' ', '_')
df4

In [None]:
column_names = df4.columns.tolist()
print("Column Names:", column_names)

In [None]:
# Drop the old "Installer_Name" column
df4 = df4.drop(columns=['Installer_Name'])

# Rename new "Installer_Name_standard_name" into "Installer Name"
df4.rename(columns={'Installer_Name_standard_name': 'Installer_Name'}, inplace=True)

In [None]:
# Drop ITC Cost Basis column
df4 = df4.drop(columns=['Itc_Cost_Basis'])

In [None]:
column_names = df4.columns.tolist()
print("Column Names:", column_names)

In [None]:
# Label self-installed systems: wherever "Self_Installer" is yes, "Installer_Name" is set
# to "Self-installed" (this overwrites any existing name; it does not only fill empty cells).
# The flag is matched case-insensitively (and ignoring surrounding spaces) so that
# 'Yes', 'yes' and 'YES' are all recognised.
is_self_installed = df4['Self_Installer'].astype(str).str.strip().str.lower() == 'yes'
df4.loc[is_self_installed, 'Installer_Name'] = 'Self-installed'

# Drop the "Self_Installer" column
df4 = df4.drop(columns=['Self_Installer'])

# Display the updated dataframe
df4

In [None]:
# Filter the DataFrame for the system cost that is likely to be wrongly recorded 
min_acceptable_cost = 7000
filtered_system_cost = df4[df4['Total_System_Cost'] < min_acceptable_cost]

count = filtered_system_cost.shape[0]
count

In [None]:
# Dropping unreosanable datapoints
df4 = df4[df4['Total_System_Cost'] >= min_acceptable_cost]

In [None]:
# Cleaning "Inverter_Manufacturer" column

In [None]:
top_inverters_names = df4['Inverter_Manufacturer'].value_counts().head(50)
top_inverters_names

In [None]:
# Create a copy of the column
df_inverter_manufacturer = df4[['Inverter_Manufacturer']].copy()

# Replace NaN values with 'Other'
df_inverter_manufacturer['Inverter_Manufacturer'].fillna('Other', inplace=True)

df_inverter_manufacturer = df_inverter_manufacturer.rename(columns={'Inverter_Manufacturer': 'inverter_manufacturer'})
df_inverter_manufacturer

In [None]:
df_inverter_manufacturer_counts = df_inverter_manufacturer.groupby('inverter_manufacturer').size().reset_index()
df_inverter_manufacturer_counts = df_inverter_manufacturer_counts.rename(columns={0: 'counts'})
df_inverter_manufacturer_counts['pct'] = df_inverter_manufacturer_counts['counts']/df_inverter_manufacturer_counts['counts'].sum()
df_inverter_manufacturer_counts

#df_inverter_manufacturer_counts.to_csv('data_files/df_inverter_manufacturer_counts.csv', index=False)

In [None]:
# Candidate inverter manufacturer names used as fuzzy-matching targets.
# Only the keys are used for matching; every value is 1.
# Two defects of the earlier literal are fixed here:
#   * "Candian Solar" was a misspelling of "Canadian Solar", so a correctly spelled
#     name in the data could not match this candidate exactly.
#   * "Generac" was listed twice; a repeated key in a dict literal is silently collapsed.
inverters = {
    "SolarEdge": 1,
    "Enphase": 1,
    "SMA America": 1,
    "SunPower": 1,
    "ABB": 1,
    "Power-One": 1,
    "Fronius": 1,
    "Delta": 1,
    "Ningbo Ginlong": 1,
    "Xantrex": 1,
    "Tesla": 1,
    "PV Powered": 1,
    "Advanced Energy": 1,
    "Altenergy": 1,
    "Kaco": 1,
    "Maxeon": 1,
    "Generac": 1,
    "Pika": 1,
    "SolarCity": 1,
    "LG": 1,
    "Panasonic": 1,
    "Sanyo": 1,
    "APSystems": 1,
    "Chilicon": 1,
    "Sungrow": 1,
    "Solaria": 1,
    "Canadian Solar": 1,
    "Sharp": 1,
    "Huawei": 1,
    "Schneider": 1,
    "Solectria": 1,
    "SatCon": 1,
    "SolarBridge": 1,
    "Chint": 1,
}

# Keep one row per distinct manufacturer name (the first occurrence), then show the frame's shape
df_inverter_manufacturer = df_inverter_manufacturer.drop_duplicates(keep='first')
df_inverter_manufacturer.shape


In [None]:
import re

# Tokens stripped from company names before they are compared
COMPANY_SUFFIXES = [
    'com', 'in', 'int', 'international', 'inc',
    'incorporated', 'incorporation', 'corp', 'corporation', 'cos',
    'co', '& co', 'intl', 'ltd', 'limited',
    'plc', 'llc', 'holdings', 'hldgs', 'partners',
    'cl', 'pl', 'technology', 'technologies', 'energy',
]


def preprocess(text):
    """Return a simplified form of a company name for fuzzy matching.

    Steps: lower-case, delete characters that are neither word characters nor
    whitespace, discard tokens listed in COMPANY_SUFFIXES, join what is left
    with single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    remaining = filter(lambda word: word not in COMPANY_SUFFIXES, without_punctuation.split())
    return ' '.join(remaining).strip()

from thefuzz import process, fuzz

# Preprocessed candidate name -> original candidate spelling
inverters_preprocessed = {preprocess(candidate): candidate for candidate in inverters}
inverters_preprocessed

# Preprocess the observed manufacturer names in the same way
observed_inverter_names = df_inverter_manufacturer['inverter_manufacturer']
df_inverter_manufacturer['inverter_name_preprocessed'] = observed_inverter_names.apply(preprocess)


def _best_inverter_match(name):
    """Return the best-scoring inverter candidate for *name* (token_set_ratio scorer)."""
    return process.extractOne(name, inverters_preprocessed.keys(), scorer=fuzz.token_set_ratio)


# Fuzzy-match every observed name against the candidates
df_inverter_manufacturer['fuzzy_match'] = (
    df_inverter_manufacturer['inverter_name_preprocessed'].apply(_best_inverter_match)
)

# Split each match result into the matched candidate and its score
df_inverter_manufacturer[['fuzzy_match', 'fuzzy_match_score']] = df_inverter_manufacturer['fuzzy_match'].to_list()
df_inverter_manufacturer

df_inverter_manufacturer['fuzzy_match_score'].value_counts()

# Attach the per-name row counts, then list the most frequent names first
df_inverter_manufacturer = df_inverter_manufacturer.merge(
    df_inverter_manufacturer_counts, on='inverter_manufacturer')
df_inverter_manufacturer.sort_values(['counts', 'fuzzy_match_score'], ascending=False)


In [None]:
df_inverter_manufacturer.info()

In [None]:
fuzzy_match_map = df_inverter_manufacturer[df_inverter_manufacturer['fuzzy_match_score'] == 100].copy()
fuzzy_match_map['standard_name'] = fuzzy_match_map['fuzzy_match'].map(inverters_preprocessed)
fuzzy_match_map

In [None]:
# Merging two dataframe with updated installer names
col = "Inverter_Manufacturer"
df5 = df4.merge(fuzzy_match_map[['inverter_manufacturer', 'standard_name']], 
              left_on=col, right_on='inverter_manufacturer',
             how='left').drop(columns=['inverter_manufacturer'])
df5 = df5.rename(columns={'standard_name': f"{col}_standard_name"})

df5

In [None]:
# Rename Vivint to Sunrun and  Lennar solar to Sunnova due to acquisitions, Solarcity to Tesla

df5['Inverter_Manufacturer_standard_name'] = df5['Inverter_Manufacturer_standard_name'].replace({'Vivint': 'Sunrun', 'Lennar': 'Sunnova', 'SolarCity': 'Tesla'})


In [None]:
df5.info()

In [None]:
# Replace NaN values in 'Inverters_Manufacturer_standard_name' with 'Other'
df5['Inverter_Manufacturer_standard_name'].fillna('Other', inplace=True)

In [None]:
# Move "Inverter_Manufacturer_standard_name" column next to "Inverter_Manufacturer"
columns = df5.columns.tolist()
columns.remove("Inverter_Manufacturer_standard_name")
columns.insert(columns.index("Inverter_Manufacturer") + 1, "Inverter_Manufacturer_standard_name")
df5 = df5[columns]


In [None]:
# Drop old column
df5 = df5.drop(columns=["Inverter_Manufacturer"])

In [None]:
# Rename "Generator_Manufacturer_standard_name" into 'Generator_Manufacturer'
df5 = df5.rename(columns={'Inverter_Manufacturer_standard_name': 'Inverter_Manufacturer'})

In [None]:
# Cleaning "Generator_Manufacturer" column

In [None]:
top_generator_names = df5['Generator_Manufacturer'].value_counts().head(50)
top_generator_names

In [None]:
# Create a copy of the column
df_generator_manufacturer = df4[['Generator_Manufacturer']].copy()

# Replace NaN values with 'Other'
df_generator_manufacturer['Generator_Manufacturer'].fillna('Other', inplace=True)

df_generator_manufacturer = df_generator_manufacturer.rename(columns={'Generator_Manufacturer': 'generator_manufacturer'})
df_generator_manufacturer

In [None]:
df_generator_manufacturer_counts = df_generator_manufacturer.groupby('generator_manufacturer').size().reset_index()
df_generator_manufacturer_counts = df_generator_manufacturer_counts.rename(columns={0: 'counts'})
df_generator_manufacturer_counts['pct'] = df_generator_manufacturer_counts['counts']/df_generator_manufacturer_counts['counts'].sum()
df_generator_manufacturer_counts

#df_generator_manufacturer_counts.to_csv('data_files/df_generator_manufacturer_counts.csv', index=False)

In [None]:
# Candidate generator manufacturer names used as fuzzy-matching targets.
# Only the keys are used for matching; every value is 1.
# Two defects of the earlier literal are fixed here:
#   * "Candian Solar" was a misspelling of "Canadian Solar", so a correctly spelled
#     name in the data could not match this candidate exactly.
#   * "Sanyo" and "Generac" were each listed twice; a repeated key in a dict literal
#     is silently collapsed.
# NOTE(review): "Solar World" and "SolarWorld" are both listed and normalise to
# different strings, so they yield two separate standard names - confirm this is intended.
generators = {
    "SolarEdge": 1,
    "Generic Manufacturer": 1,
    "REC Solar": 1,
    "Hanwha": 1,
    "Trina": 1,
    "Kyocera": 1,
    "Jinko": 1,
    "Longi Green Energy": 1,
    "Yingli Energy": 1,
    "Hyundai": 1,
    "Sanyo": 1,
    "Mission": 1,
    "Silfab": 1,
    "Solar World": 1,
    "BP": 1,
    "Suniva": 1,
    "AU Optronics": 1,
    "SunEdison": 1,
    "Suntech Power": 1,
    "Changzhou": 1,
    "Boviet": 1,
    "ET": 1,
    "Evergreen": 1,
    "Mitsubishi": 1,
    "Renesola": 1,
    "Axitec": 1,
    "Phono": 1,
    "Sunspark": 1,
    "Aptos": 1,
    "CertainTeed": 1,
    "S-Energy": 1,
    "Enphase": 1,
    "JA": 1,
    "CSI": 1,
    "SolarWorld": 1,
    "SMA America": 1,
    "Schuco": 1,
    "SunPower": 1,
    "ABB": 1,
    "Power-One": 1,
    "Fronius": 1,
    "Delta": 1,
    "Ningbo Ginlong": 1,
    "Xantrex": 1,
    "Tesla": 1,
    "PV Powered": 1,
    "Advanced Energy": 1,
    "Altenergy": 1,
    "Kaco": 1,
    "Maxeon": 1,
    "Generac": 1,
    "Pika": 1,
    "SolarCity": 1,
    "LG": 1,
    "Panasonic": 1,
    "APSystems": 1,
    "Chilicon": 1,
    "Sungrow": 1,
    "Solaria": 1,
    "Canadian Solar": 1,
    "Sharp": 1,
    "Huawei": 1,
    "Schneider": 1,
    "Solectria": 1,
    "SatCon": 1,
    "SolarBridge": 1,
    "Chint": 1,
    "PowerLight": 1,
}

# Keep one row per distinct manufacturer name (the first occurrence), then show the frame's shape
df_generator_manufacturer = df_generator_manufacturer.drop_duplicates(keep='first')
df_generator_manufacturer.shape

In [None]:
import re

# Words that carry no identifying information and are removed from company names
COMPANY_SUFFIXES = [
    'com', 'in', 'int', 'international',
    'inc', 'incorporated', 'incorporation',
    'corp', 'corporation',
    'cos', 'co', '& co',
    'intl', 'ltd', 'limited', 'plc', 'llc',
    'holdings', 'hldgs', 'partners',
    'cl', 'pl',
    'technology', 'technologies', 'energy',
]


def preprocess(text):
    """Reduce a company name to its lower-case core tokens.

    Punctuation (anything that is neither a word character nor whitespace) is
    deleted after lower-casing, tokens found in COMPANY_SUFFIXES are dropped,
    and the surviving tokens are returned separated by single spaces.
    """
    lowered = text.lower()
    stripped_of_punctuation = re.sub(r'[^\w\s]', '', lowered)
    core_tokens = (
        token
        for token in stripped_of_punctuation.split()
        if token not in COMPANY_SUFFIXES
    )
    result = ' '.join(core_tokens)
    return result.strip()

from thefuzz import process, fuzz

# Preprocessed candidate name -> original candidate spelling
generators_preprocessed = {preprocess(candidate): candidate for candidate in generators}
generators_preprocessed

# Preprocess the observed manufacturer names in the same way
observed_generator_names = df_generator_manufacturer['generator_manufacturer']
df_generator_manufacturer['generator_name_preprocessed'] = observed_generator_names.apply(preprocess)


def _best_generator_match(name):
    """Return the best-scoring generator candidate for *name* (token_set_ratio scorer)."""
    return process.extractOne(name, generators_preprocessed.keys(), scorer=fuzz.token_set_ratio)


# Fuzzy-match every observed name against the candidates
df_generator_manufacturer['fuzzy_match'] = (
    df_generator_manufacturer['generator_name_preprocessed'].apply(_best_generator_match)
)

# Split each match result into the matched candidate and its score
df_generator_manufacturer[['fuzzy_match', 'fuzzy_match_score']] = df_generator_manufacturer['fuzzy_match'].to_list()
df_generator_manufacturer

df_generator_manufacturer['fuzzy_match_score'].value_counts()

# Attach the per-name row counts, then list the most frequent names first
df_generator_manufacturer = df_generator_manufacturer.merge(
    df_generator_manufacturer_counts, on='generator_manufacturer')
df_generator_manufacturer.sort_values(['counts', 'fuzzy_match_score'], ascending=False)


In [None]:
df_generator_manufacturer.info()

In [None]:
fuzzy_match_map = df_generator_manufacturer[df_generator_manufacturer['fuzzy_match_score'] == 100].copy()
fuzzy_match_map['standard_name'] = fuzzy_match_map['fuzzy_match'].map(generators_preprocessed)
fuzzy_match_map

In [None]:
# Merging two dataframe with updated generators names
col = "Generator_Manufacturer"
df6 = df5.merge(fuzzy_match_map[['generator_manufacturer', 'standard_name']], 
              left_on=col, right_on='generator_manufacturer',
             how='left').drop(columns=['generator_manufacturer'])
df7 = df6.rename(columns={'standard_name': f"{col}_standard_name"})

df7

In [None]:
df7.info()

In [None]:
top_generator_names = df7['Generator_Manufacturer_standard_name'].value_counts().head(50)
top_generator_names

In [None]:
# Rename Vivint to Sunrun and  Lennar solar to Sunnova due to acquisitions, Solarcity to Tesla

df7['Generator_Manufacturer_standard_name'] = df7['Generator_Manufacturer_standard_name'].replace({'Vivint': 'Sunrun', 'Lennar': 'Sunnova', 'SolarCity': 'Tesla'})


In [None]:
# Replace NaN values in 'Generator_Manufacturer_standard_name' with 'Other'
df7['Generator_Manufacturer_standard_name'].fillna('Other', inplace=True)


In [None]:
# Move "Generator_Manufacturer_standard_name" column next to "Generator_Manufacturer"
columns = df7.columns.tolist()
columns.remove("Generator_Manufacturer_standard_name")
columns.insert(columns.index("Generator_Manufacturer") + 1, "Generator_Manufacturer_standard_name")
df8 = df7[columns]


In [None]:
# Drop old column
df8 = df8.drop(columns=['Generator_Manufacturer'])

In [None]:
# Rename "Generator_Manufacturer_standard_name" into 'Generator_Manufacturer'
df8 = df8.rename(columns={'Generator_Manufacturer_standard_name': 'Generator_Manufacturer'})

In [None]:
# Consider deleting all negative values for storage and inverter size

In [None]:
# Filing out missing cities in the 'Service_City' column

# Group by 'Service_County' and find the most popular 'Service_City'
most_popular_city_by_county = df8.groupby('Service_County')['Service_City'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).reset_index()

# Merge the most popular city data back to the original dataframe
df8 = pd.merge(df8, most_popular_city_by_county, on='Service_County', how='left', suffixes=('', '_fill'))

# Fill missing values in 'Service_City' with the corresponding most popular city
df8['Service_City'] = df8['Service_City'].fillna(df8['Service_City_fill'])

# Drop the auxiliary column used for filling
df9 = df8.drop(columns=['Service_City_fill'])


In [None]:
# Filling out missing zipcode

# Group by 'Service_City' and find the most popular 'Service_Zip'
most_popular_zip_by_city = df9.groupby('Service_City')['Service_Zip'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).reset_index()

# Merge the most popular zip data back to the original dataframe
df9 = pd.merge(df9, most_popular_zip_by_city, on='Service_City', how='left', suffixes=('', '_fill'))

# Fill missing values in 'Service_Zip' with the corresponding most popular zip code
df9['Service_Zip'] = df9['Service_Zip'].fillna(df9['Service_Zip_fill'])

# Drop the auxiliary column used for filling
df10 = df9.drop(columns=['Service_Zip_fill'])


In [None]:
# Remove the parentheses from the two size column names
size_column_renames = {
    'Storage_Size_(kW_AC)': 'Storage_Size_kW_AC',
    'Inverter_Size_(kW_AC)': 'Inverter_Size_kW_AC',
}
df10 = df10.rename(columns=size_column_renames)


## Impute missing values

In [None]:
# Change the zip code to a zero-padded string of at least 5 characters.
# Only non-missing values are formatted: casting a column that still contains NaN
# straight to int raises, so missing (or non-numeric) zip codes are left as NaN.
zip_numeric = pd.to_numeric(df10['Service_Zip'], errors='coerce')
df10['Service_Zip'] = zip_numeric.map(lambda z: str(int(z)).zfill(5), na_action='ignore')

In [None]:
# lets impute the storage, inverter size, and third party own
df10['Storage_Size_kW_AC'] = df10['Storage_Size_kW_AC'].fillna(0)

mean_size = df10.Inverter_Size_kW_AC.mean()
df10['Inverter_Size_kW_AC'] = df10['Inverter_Size_kW_AC'].fillna(mean_size)

df10['Third_Party_Owned'] = df10['Third_Party_Owned'].fillna('No')

In [None]:
# add a year column
df10['Year'] = df10['App_Received_Date'].dt.year

In [None]:
# replace negative values
df10['Inverter_Size_kW_AC'] = df10['Inverter_Size_kW_AC'].apply(lambda x: 0 if x < 0 else x)
df10['Storage_Size_kW_AC'] = df10['Storage_Size_kW_AC'].apply(lambda x: 0 if x < 0 else x)

In [None]:
# Count the records per installer; installers with too few records are flagged as
# 'Other' and dropped further down.
# NOTE(review): the earlier comment here said "at least 100 records", but the check
# in check_count compares against 10 - confirm which threshold is intended.
records_per_installer = df10['Installer_Name'].value_counts()
installer_count = records_per_installer.to_dict()
df_final = df10.copy()

def check_count(installer, min_records=10):
    """Return *installer* if it has at least *min_records* records, else 'Other'.

    Record counts are read from the module-level ``installer_count`` dict.
    A name that is missing from that dict is treated as having zero records,
    so it is reported as 'Other' instead of raising KeyError.

    Args:
        installer: Installer name to check.
        min_records: Minimum number of records required to keep the name.
            Defaults to 10, the threshold that was previously hard-coded.

    Returns:
        The unchanged installer name, or the string 'Other'.
    """
    if installer_count.get(installer, 0) >= min_records:
        return installer
    return 'Other'

# Relabel installers through check_count, which returns either the name or 'Other'
relabelled_installers = df_final['Installer_Name'].map(check_count)
df_final['Installer_Name'] = relabelled_installers

# do some filtering in order to try and improve ML model performance:
# rows labelled 'Other' are removed
is_named_installer = df_final['Installer_Name'] != 'Other'
df_final = df_final.loc[is_named_installer]
df_final.shape

In [None]:
# Seperate out by utility and export to seperate csv files
PGE = df_final[df_final['Utility'] == 'PGE']
SCE = df_final[df_final['Utility'] == 'SCE']
SDGE = df_final[df_final['Utility'] == 'SDGE']

PGE.to_csv(data_dir + 'df_PGE.csv', index=False)
SCE.to_csv(data_dir + 'df_SCE.csv', index=False)
SDGE.to_csv(data_dir + 'df_SDGE.csv', index=False)

In [None]:
# save the entire file out
df_final.to_csv(data_dir + 'df_ALL-ML.csv', index=False)