In [220]:
# Import packages
import pandas as pd
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm

# Variable definition
# Raw strings: these Windows paths contain backslashes followed by letters
# (\l, \O, \P, \D, \E, \I). In a normal literal those are invalid escape
# sequences, which only work by accident and emit a warning on current Python.
# The resulting string values are unchanged.
lcaDataPath = r"C:/Users\lukec\OneDrive - University of Cambridge\Projects\PhD\Data\EcoInvent_B0729/xml"  # directory of dataset xml files
impactMethods = r"C:/Users\lukec\OneDrive - University of Cambridge\Projects\PhD\Data\Impact-Methods\IPCC2013.xml"  # impact method xml file

In [211]:
def to_listlist(inlist:list):
    """Return `inlist` as a list of lists.

    A flat list whose first element is a string (e.g. ['a', 'b']) is wrapped
    into [['a', 'b']]; any other input is returned unchanged. An empty list is
    returned unchanged rather than raising IndexError on `inlist[0]`.
    """
    if len(inlist) == 0:
        return inlist
    return [inlist] if isinstance(inlist[0], str) else inlist

# Read in impact factors
def read_xml_attributes(filepath:str,branches:list,attributes:list,df=False):
    """Collect attributes of elements inside an xml file into a dataframe.

    Inputs:
    filepath - path to xml file
    branches - list of successive branch choices, or a list of such lists to
               read from several locations. A branch matches a child element
               when the branch string is contained in the element's tag.
    attributes - attributes to be read from the chosen end branch; a list of
                 lists (one per entry of branches) when several paths are given
    df - optional existing dataframe; new rows are added after its rows
    Outputs:
    df - pandas dataframe with one row per matching end branch of the LAST
         path. Values read along earlier paths are carried into those rows.
    Raises:
    KeyError - a matching end branch lacks one of the requested attributes
    Requirements: pandas as pd, xml.etree.ElementTree as ET
    """
    branches, attributes = to_listlist(branches), to_listlist(attributes)
    if df is False:
        # Flatten so that nested attribute lists still give plain column names.
        df = pd.DataFrame([], columns=[name for group in attributes for name in group])
    root = ET.parse(filepath).getroot()
    attr_values = dict()
    rows = []
    last_pathnum = len(branches) - 1
    for pathnum, path in enumerate(branches):
        # Resolve every path from the document root; the walk position of the
        # previous path must not leak into this one.
        tree_loc = root
        last_depth = len(path) - 1
        for depth, branch in enumerate(path):
            matches = [child for child in tree_loc if branch in child.tag]
            if depth == last_depth:
                # End branch: read the attributes of every matching element.
                for child in matches:
                    attr_values.update({key: child.attrib[key] for key in attributes[pathnum]})
                    if pathnum == last_pathnum:
                        # Copy, because attr_values keeps being updated.
                        rows.append(dict(attr_values))
            elif matches:
                # Intermediate branch: descend into the first match only.
                tree_loc = matches[0]
            # No match on an intermediate branch: stay at the current element
            # and try the next branch from here.
    if not rows:
        return df
    # Build all new rows at once; DataFrame.append no longer exists in
    # pandas >= 2.0 and copied the whole frame for every row.
    new_rows = pd.DataFrame(rows)
    if len(df) == 0:
        # Keep the incoming column layout, adding any columns new to it.
        columns = list(df.columns) + [col for col in new_rows.columns if col not in df.columns]
        return new_rows.reindex(columns=columns)
    return pd.concat([df, new_rows], ignore_index=True)

In [212]:
# Read one row per 'exchange' element of the impact method file, then show
# the rows ordered by name.
impacts = read_xml_attributes(
    impactMethods,
    branches=['dataset', 'flowData', 'exchange'],
    attributes=['name', 'category', 'subCategory', 'meanValue'],
)
impacts.sort_values(by='name')

Unnamed: 0,name,category,subCategory,meanValue
56,"Carbon dioxide, fossil",Emission to air,unspecified,1.0
45,"Carbon dioxide, fossil",Emission to air,low population density,1.0
164,"Carbon dioxide, fossil",Emission to air,"low population density, long-term",1.0
69,"Carbon dioxide, fossil",Emission to air,lower stratosphere + upper troposphere,1.0
19,"Carbon dioxide, fossil",Emission to air,high population density,1.0
...,...,...,...,...
189,Sulfur hexafluoride,Emission to air,lower stratosphere + upper troposphere,23506.81999316
79,Sulfur hexafluoride,Emission to air,"low population density, long-term",23506.81999316
44,Sulfur hexafluoride,Emission to air,low population density,23506.81999316
3,Sulfur hexafluoride,Emission to air,unspecified,23506.81999316


In [233]:
# Load in xml data to dataframe
def calculate_co2e(filepath:str,lookup:pd.DataFrame,branches=['dataset','flowData','exchange'],attributes=['name','category','subCategory','meanValue']):
    """Sum amount x lookup factor over the exchanges of one xml file.

    Inputs:
    filepath - path to the xml file whose exchanges are read
    lookup - dataframe holding the columns named in attributes; its last
             attribute column is used as the factor
    branches - location of the exchanges within the xml
    attributes - flat list of attribute names; all but the last identify an
                 exchange, the last holds its amount
    Outputs:
    float - sum over exchanges found in lookup of amount * factor
            (0.0 when nothing matches)
    Requirements: pandas as pd, read_xml_attributes
    """
    # The default lists are only read, never mutated.
    *key_columns, value_column = attributes
    product = read_xml_attributes(filepath, branches, attributes)
    # Rename the amount so it does not clash with the lookup's factor column.
    product = product.rename(columns={value_column: "kg"})
    # Join on the identifying attributes explicitly, so an unrelated shared
    # column can never silently change the join.
    emissions = product.merge(lookup, on=key_columns, how="inner")
    amounts = pd.to_numeric(emissions["kg"])
    factors = pd.to_numeric(emissions[value_column])
    return float((amounts * factors).sum())

def import_xml_batch(directory:str,path:list,attributes:list,co2e=False,impacts=False):
    """This function imports xml data into a pandas dataframe
    inputs:
    directory - directory of xml files
    path - location of attributes within xml
    attributes - names of attributes/column names
    co2e - name of an extra column filled with calculate_co2e for each file;
           False (default) adds no such column
    impacts - lookup dataframe passed to calculate_co2e; required when co2e
              is given
    outputs:
    df - dataframe of requested attributes
    file_list - sorted names of the files in directory that were processed
    raises:
    ValueError - co2e was requested without an impacts dataframe
    Requirements: os, pandas as pd, xml.etree.ElementTree as ET, tqdm"""
    if co2e is not False and not isinstance(impacts, pd.DataFrame):
        raise ValueError("impacts must be a pandas DataFrame when co2e is requested")
    # Sorted for a reproducible row order; subdirectories are skipped because
    # they cannot be parsed as xml.
    file_list = sorted(
        entry for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    )
    attributes = to_listlist(attributes)
    attribute_columns = [j for i in attributes for j in i]
    all_columns = attribute_columns + ([co2e] if co2e is not False else [])
    frames = []
    for file in tqdm(file_list):
        filepath = os.path.join(directory, file)
        # Read each file into its own small frame and concatenate once at the
        # end, instead of growing one shared frame file by file.
        rows = read_xml_attributes(filepath, path, attributes, pd.DataFrame([], columns=attribute_columns))
        if len(rows) == 0:
            continue
        if co2e is not False:
            # Plain column assignment on this file's rows. The former
            # df[co2e].iloc[-1] chained assignment is not guaranteed to write
            # through, and hit the previous file's row when a file gave none.
            rows[co2e] = calculate_co2e(filepath, impacts)
        frames.append(rows)
    if not frames:
        return pd.DataFrame([], columns=all_columns), file_list
    df = pd.concat(frames, ignore_index=True)
    return df, file_list

# Build the process table from every file in lcaDataPath: name and
# generalComment from the 'reference' branch, location from the 'geo' branch,
# plus a 'CO2e' column computed with the impacts lookup.
processes, _ = import_xml_batch(
    directory=lcaDataPath,
    path=[
        ['dataset', 'meta', 'process', 'reference'],
        ['dataset', 'meta', 'process', 'geo'],
    ],
    attributes=[['name', 'generalComment'], ['location']],
    co2e='CO2e',
    impacts=impacts,
)

100%|██████████| 254/254 [12:06<00:00,  2.86s/it]


In [234]:
# Show only the processes whose generalComment does not contain 'market',
# ordered by name.
processes.loc[lambda frame: ~frame['generalComment'].str.contains('market')].sort_values(by='name')

Unnamed: 0,name,generalComment,location,CO2e
241,barite,This dataset has been copied from an original ...,RoW,0.031836
249,barite,This dataset refers to the production of 1 kg ...,BR,0.020889
142,bauxite,Average moisture content of bauxite is approxi...,GLO,0.012219
114,bulk lead-zinc concentrate,"The multi-output ""zinc mine operation"" process...",GLO,0.418987
224,"calcium carbonate, precipitated",For the separation and refining of rare earth ...,CN-SC,3.496398
...,...,...,...,...
51,zinc concentrate,From metal containing ore extraction from the ...,CA-QC,0.463061
134,zinc concentrate,"This dataset describes the exploitation, conce...",PE,1.714277
226,"zircon, 50% zirconium",Heavy mineral sands are a class of ore deposit...,ZA,1.319584
108,"zircon, 50% zirconium",This dataset refers to the production of 1 kg ...,RoW,0.337368


In [236]:
# Write the process table, ordered by name, to csv.
# Raw string: the path contains backslashes followed by letters (\l, \O, \P,
# \D, \E, \p), which are invalid escape sequences in a normal literal. The
# path value itself is unchanged.
processes.sort_values('name').to_csv(r"C:/Users\lukec\OneDrive - University of Cambridge\Projects\PhD\Data\EcoInvent_B0729\process_co2e/GWP100a_IPCC2013.csv")

In [None]:
# Display data

In [None]:
# Get data from all files into database

In [None]:
# TODO: Plot the results