In [2]:
%cd C:/Users\lshc3\PycharmProjects\emissions-tracking

C:\Users\lshc3\PycharmProjects\emissions-tracking


In [53]:
%%writefile functions/import_data.py

"""Import classes for each type of LCA and chemical manufacturing data used in emissions-tracking project"""

import numpy as np
import pandas as pd


class LCA(object):
    """Import EcoInvent CO2e data"""
    def __init__(self, data_path:str, lca_lists:list):
        self.data = pd.DataFrame()
        for path in lca_lists:
            filepath = data_path+"EcoInvent/"+path+"/GWP100a_IPCC2013.csv"
            try:
                raw = pd.read_csv(filepath)
            except FileNotFoundError:
                print(filepath+' not found')
                raise
            self.data = self.data.append(raw[['name', 'location', 'CO2e']][~raw['generalComment'].str.contains('market')].sort_values('name'))
        self.data_path = data_path

    def location(self, location='Global'):
        """Filter entries for EcoInvent regions"""
        loc_filepath = self.data_path+"EcoInvent/EcoInvent_locations.csv"
        loc_codes = pd.read_csv(loc_filepath)
        region_code = loc_codes['Code'][np.where(loc_codes['Name'] == location)[0]].iloc[0]
        rows = []
        for product in self.data['name'].unique():
            for code in [region_code, 'RoW', 'GLO']:
                found = self.data[(self.data['name']==product) & (self.data['location']==code)]
                if not found.index.empty:
                    rows += list(found.index)
                    break
        return self.data.loc[rows].groupby(['name','location']).mean().reset_index()


class IHSData(object):
    """Import IHS Markit chemical product and material composition data"""
    def __init__(self, data_path):
        product_file = data_path+"IHS/US/products.csv"
        material_file = data_path+"IHS/US/materials.csv"
        self.products = pd.read_csv(product_file, index_col=0).reset_index(drop=True)
        self.materials = pd.read_csv(material_file, index_col=0).reset_index(drop=True)


class ICISData(object):
    """Import ICIS chemical manufacturing data"""
    def __init__(self, filepath):
        self.countryCol = 'COUNTRY/TERRITORY'
        self.properties = ['Capacity', 'Statistic Production', 'Import', 'Export', 'Consumption']
        in_file = pd.ExcelFile(filepath)
        self.plants, self.prod, self.imps, self.exps, self.cons = [pd.DataFrame()]*5

        def append_data(prev,region,breaks,start):
            """Append data while row of ICIS data is within particular type"""
            br_end = breaks[start+1] if start+1 < len(breaks) else -1
            return prev.append(region[breaks[start]+1:br_end].dropna(subset=[self.countryCol]), ignore_index=True)

        for sheet in in_file.sheet_names[2:]:
            region = pd.read_excel(in_file, sheet_name=sheet)
            region['Region'] = [sheet]*len(region)
            breaks = region.loc[region['PRODUCT'].isin(self.properties)].index
            self.plants, self.prod, self.imps, self.exps, self.cons = [append_data(attr,region,breaks,i)
                                                                       for i, attr in enumerate([self.plants, self.prod, self.imps, self.exps, self.cons])]

Overwriting functions/import_data.py


In [90]:
%%writefile functions/test_import_data.py

"""Unit tests for import_data.py"""

import unittest
import pandas as pd
import functions.import_data as id


class LCATests(unittest.TestCase):
    def test_LCA_import(self):
        valid_data = id.LCA("C:/Users/lshc3/Documents/", ['Basic_chemicals_201','Coke_Petro_19']).data
        self.assertIsInstance(valid_data, pd.DataFrame, 'Import does not return valid pandas Dataframe')
        self.assertAlmostEqual(valid_data['CO2e'][0], 4.86288208344407, 3, msg = 'Incorrect value found on test case')
        self.assertRaises(FileNotFoundError, lambda: id.LCA("C:/Users/lshc3/Documents/", ['invalid list']))

    def test_LCA_location(self):
        filt_locs = id.LCA("C:/Users/lshc3/Documents/", ['Basic_chemicals_201','Coke_Petro_19']).location('France')
        self.assertTrue(set(filt_locs['location']).issubset(['GLO','RoW','FR']), 'Location filter returns unwanted locations')

class IHSDataTests(unittest.TestCase):
    def test_products(self):
        self.assertIsInstance(id.IHSData("C:/Users/lshc3/Documents/").products, pd.DataFrame, 'Import does not return valid pandas Dataframe')
        self.assertEqual(id.IHSData("C:/Users/lshc3/Documents/").products['Name'][0], 'ETHYLENE', 'Incorrect value found on test case')
    def test_materials(self):
        self.assertIsInstance(id.IHSData("C:/Users/lshc3/Documents/").materials, pd.DataFrame, 'Import does not return valid pandas Dataframe')
        self.assertEqual(id.IHSData("C:/Users/lshc3/Documents/").materials['Source'][0], 'CATALYST', 'Incorrect value found on test case')

# class IHSDataTests(unittest.TestCase):
#     def test_dataframes(self):
#         icis = id.ICISData("C:/ICIS_data/US_allchemicals.xlsx")
#         #for i in [icis.plants, icis.prod, icis.imps, icis.exps, icis.cons]:
#         self.assertIsInstance(icis.plants, pd.DataFrame, 'Import does not return valid pandas Dataframe')
#         self.assertIsInstance(icis.prod, pd.DataFrame, 'Import does not return valid pandas Dataframe')

if __name__ == '__main__':
    unittest.main()

Overwriting functions/test_import_data.py


In [91]:
!pytest

platform win32 -- Python 3.8.3, pytest-5.4.3, py-1.9.0, pluggy-0.13.1
rootdir: C:\Users\lshc3\PycharmProjects\emissions-tracking
collected 5 items

functions\test_import_data.py .F..                                       [ 80%]
tests\test_utilities.py .                                                [100%]

_________________________ LCATests.test_LCA_location __________________________

self = <functions.test_import_data.LCATests testMethod=test_LCA_location>

    def test_LCA_location(self):
        filt_locs = id.LCA("C:/Users/lshc3/Documents/", ['Basic_chemicals_201','Coke_Petro_19']).location('France')
        print(set(filt_locs['location']))
>       self.assertTrue(set(filt_locs['location']).issubset(['GLO','RoW','FR']), 'Location filter returns unwanted locations')
E       AssertionError: False is not true : Location filter returns unwanted locations

functions\test_import_data.py:19: AssertionError
---------------------------- Captured stdout call -----------------------------


In [96]:
%%writefile functions/preprocessing.py

"""Preprocessing functions for chemical data"""

import difflib
import pandas as pd

def format_icis(icis,year=2019):
    plants_clean = icis.plants.drop(columns=['#','DERIVATIVE','NOTE','@','@Sub','Region']+list(filter(lambda i: 'None' in str(i) or (type(i) is int and i!=year), icis.plants.columns)))
    plants_working = plants_clean[plants_clean[year] != '-'].reset_index(drop=True)
    return plants_working

def merge_chemical_data(lca, ihs, plants):
    """Add EcoInvent LCA data and IHS materials data to ICIS plant data dataframe"""
    merge = pd.merge(lca, plants, left_on=lca['name'].str.lower(), right_on=plants['PRODUCT'].str.lower(), how="left").reset_index(drop=True)
    plant_details = pd.merge(merge, ihs.materials, left_on=merge['PRODUCT'].str.lower(), right_on=ihs.materials['Product'].str.lower(), how="left")
    return plant_details

# def merge_ihs_icis(ihs, icis):
#     plants_working['ihsProcess'] = list(map(lambda x: next(iter(difflib.get_close_matches(str(x), ihs.products['Process'])),None), plants_working['PRODUCT']+' '+plants_working['ROUTE']+' '+plants_working['TECHNOLOGY']))
#     plant_products = pd.merge(plants_working, ihs, left_on=plants_working['ihsProcess'], right_on=ihs.products['Process'], how="left")
#     return plant_products


Overwriting functions/preprocessing.py


In [None]:
%%writefile functions/test_preprocessing.py

"""Unit tests for preprocessing.py"""

import unittest
import pandas as pd
import functions.preprocessing as pp


class IcisFormatTests(unittest.TestCase):
    def test_ICIS_filter(self):



class MergeDataTests(unittest.TestCase):
    def test_products(self):


if __name__ == '__main__':
    unittest.main()

In [None]:
!pytest

In [97]:
# %%writefile scripts/graph_classification.py

"""Script for running graph classification on chemical data"""

import functions.import_data as impData
import functions.preprocessing as preproc

# Import data
data_path = "C:/Users/lshc3/Documents/"
lca = impData.LCA(data_path, lca_lists = ['Basic_chemicals_201','Coke_Petro_19']).location('United States')
ihs = impData.IHSData(data_path)
icis = impData.ICISData("C:\ICIS_data/US_allchemicals.xlsx")

# Merge data
plants = preproc.format_icis(icis, 2019)
merged_data = preproc.merge_chemical_data(lca,ihs,plants)


ValueError: cannot insert key_0, already exists

In [37]:
# Trial merges
merge = pd.merge(lca, icis.plants, left_on=lca['name'].str.lower(), right_on=icis.plants['PRODUCT'].str.lower(), how="left")
merge['PRODUCT'].unique()

merge_cons = pd.merge(lca, icis.cons, left_on=lca['name'].str.lower(), right_on=icis.cons['DERIVATIVE'].str.lower(), how="left")
merge_cons['DERIVATIVE'].unique()
mergeIHS = pd.merge(merge, ihs.materials, left_on=merge['PRODUCT'].str.lower(), right_on=ihs.materials['Product'].str.lower(), how="left")

NameError: name 'lca' is not defined

In [168]:
# Associate with IHS process and merge
import difflib
plants_working['ihsProcess'] = list(map(lambda x: next(iter(difflib.get_close_matches(str(x), ihs.products['Process'])),None), plants_working['PRODUCT']+' '+plants_working['ROUTE']+' '+plants_working['TECHNOLOGY']))

plant_products = pd.merge(plants_working, ihs, left_on=plants_working['ihsProcess'], right_on=ihs.products['Process'], how="left")


# df2 = ihs.products.sort_values('Name')
# df2['NameNew'] = df2['Name'].apply(lambda x: next(iter(difflib.get_close_matches(x, plants_working['ROUTE'])),None))
# df2['ProcessNew'] = df2['Process'].apply(lambda x: next(iter(difflib.get_close_matches(x, plants_working['PRODUCT'])),None))
#.merge(df2)
plants_working

TypeError: Can only merge Series or DataFrame objects, a <class 'functions.import_data.IHSData'> was passed

In [None]:
# Sparse matrix with all materials for process frmo IHS

In [None]:
# Associate with Y vector from EcoInvent

In [145]:
import difflib
plants_working['ihsProcess'] = list(map(lambda x: next(iter(difflib.get_close_matches(x, ihs.products['Process'])),None), plants_working['ROUTE']+' '+plants_working['TECHNOLOGY']))

TypeError: 'float' object is not iterable

In [172]:
l = plants_working['PRODUCT']+' '+plants_working['ROUTE']+' '+plants_working['TECHNOLOGY']
list(map(lambda x: next(iter(difflib.get_close_matches(str(x), ihs.products['Process'])),None), plants_working['PRODUCT']+' '+plants_working['ROUTE']+' '+plants_working['TECHNOLOGY']))

[None,
 None,
 None,
 None,
 'ABS RESIN BY EMULSION POLYMERIZATION',
 'ABS RESIN BY EMULSION POLYMERIZATION',
 'ABS RESIN PRODUCTION',
 'ABS RESIN BY CONTINUOUS MASS POLYMERIZATION',
 'ACETALDEHYDE FROM ETHYLENE BY ONE-STEP OXIDATION',
 'ACETIC ACID BY LOW PRESSURE CARBONYLATION OF METHANOL',
 None,
 'AZELAIC ACID FROM OLEIC ACID',
 None,
 'ACETIC ACID BY LOW PRESSURE CARBONYLATION OF METHANOL',
 'ACETIC ACID BY LOW PRESSURE CARBONYLATION OF METHANOL',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 None,
 None,
 None,
 None,
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 None,
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'ACRYL