## Extracting Petrochemicals data from raw download files

In [1]:
import numpy as np
import pandas as pd

In [2]:
## File paths
# Input read by pd.read_parquet below. 'RAW_DATA_PATH' looks like a placeholder
# that must be replaced with the real location before running (TODO confirm).
data_path = 'RAW_DATA_PATH'
# Prefix (no extension) for the CSV outputs. Written as a raw string so the
# backslashes stay literal instead of forming the invalid escapes "\d" and "\c".
save_path = r"..\data\classification_inputs/petrochemicals"

In [3]:
def filter_df(df:pd.DataFrame, cols:list, filters:list) -> pd.DataFrame:
    """Keep only the rows of ``df`` whose column values are in the accepted sets.

    The filters are applied one after another, so a row survives only if it
    passes every (column, accepted values) pair.

    Args:
        df: Frame to filter; it is not modified.
        cols: Names of the columns to filter on.
        filters: One collection of accepted values per entry in ``cols``,
            paired by position (surplus entries on either side are ignored).

    Returns:
        The rows of ``df`` that satisfy every filter, with all columns kept.
    """
    for col, allowed in zip(cols, filters):
        # Use a boolean Series as the mask: it is always read as a row
        # selector. A plain Python list is not - once no rows remain it is the
        # empty list, which pandas interprets as "select zero columns".
        df = df[df[col].isin(allowed)]
    return df

In [None]:
## Import and manipulate facility data
facilities = pd.read_parquet(data_path)

# Use only one target
gas_t, type_t = 'CO2e_100a', ['Feedstock', 'Organic chemicals', 'Primary chemicals', 'Other intermediates', 'Direct Utilities', 'Indirect Utilities', 'Direct Process']
used = filter_df(facilities, ['Gas', 'Type'], [[gas_t], type_t])

# Get correct columns: rows sharing the same identifying columns are summed into one
columns = ['PRODUCT', 'COUNTRY/TERRITORY', 'COMPANY', 'ROUTE', 'TECHNOLOGY','SITE', '#', 'COMPLEX', 'LICENSOR', 'START_YR', 'LATITUDE', 'LONGITUDE']
used = used.groupby(columns).sum().reset_index()

years = [str(i) for i in range(1978, 2051)]
# Explicit copy so the column assignments below write to an independent frame
used = used[columns+years].copy()

# Define start year as first year with value if unspecified + find end year
# idxmax over the transposed frame gives, per row, the label of the first year
# column (in the given column order) whose value is non-zero.
first_active_year = used[years].transpose().ne(0).idxmax()
last_active_year = used[list(reversed(years))].transpose().ne(0).idxmax()
# 'n.a.' and any value below 1 count as "start year not specified"
start_unspecified = used['START_YR'].replace('n.a.', 0).astype(float).lt(1)
used['START_YR'] = np.where(start_unspecified, first_active_year, used['START_YR']).astype(float)
used['END_YR'] = last_active_year.astype(float)

# Only take actual facilities and not country fillers
# (boolean Series mask: stays a row filter even if the frame is empty)
used = used[~used['COMPANY'].isin(['n.a.', '~ADDITIONAL'])].copy()
used[['LATITUDE', 'LONGITUDE']+years] = used[['LATITUDE', 'LONGITUDE']+years].astype(float)
used = used.drop_duplicates()

used.head()

In [5]:
def convert_to_yearly(df: pd.DataFrame, columns, years) -> pd.DataFrame:
    """Reshape a one-row-per-plant frame into one row per plant and year.

    Args:
        df: Wide frame holding the identifier columns, an 'END_YR' column and
            one column per entry in ``years``. It is not modified.
        columns: Identifier columns to carry over to every output row. Must
            include 'START_YR', which is needed to compute the plant age.
        years: Names of the per-year value columns; each name must be
            convertible to ``int``.

    Returns:
        Long frame with the identifier columns, 'END_YR', 'Year', 'Emissions'
        and 'Age' (year minus start year), restricted to rows whose year lies
        between the start year and the end year, both inclusive. Row labels
        are those produced by ``pd.melt``.
    """
    # Convert to yearly; list() lets callers pass any sequence of column names
    yearly = pd.melt(df, id_vars=list(columns)+['END_YR'], value_vars=years, var_name='Year', value_name='Emissions')

    # Get rid of emissions for years before start year
    year = yearly['Year'].astype(int)
    yearly['Age'] = year - yearly['START_YR'].astype(int)
    started = yearly['Age'] >= 0
    yearly = yearly[started]

    # Get rid of emissions for years after end year. Filtering with a mask
    # avoids writing a temporary column onto the filtered slice.
    not_ended = yearly['END_YR'].astype(int) >= year[started]
    return yearly[not_ended]

# Long-format counterpart of `used`: one row per facility and year
used_pivoted = convert_to_yearly(df=used, columns=columns, years=years)

In [6]:
# Write both layouts under the same path prefix: the wide table and its melted version
used.to_csv(f"{save_path}.csv", index=False)
used_pivoted.to_csv(f"{save_path}_melted.csv", index=False)

In [None]:
# Read the wide table back from the CSV path used for the export
data = pd.read_csv(f"{save_path}.csv")