In [1]:
%cd C:/Users\lshc3\PycharmProjects\emissions-tracking

C:\Users\lshc3\PycharmProjects\emissions-tracking


In [2]:
%%writefile functions/import_data.py

"""Import classes for each type of LCA and chemical manufacturing data used in emissions-tracking project"""

import numpy as np
import pandas as pd


class LCA(object):
    """Import EcoInvent CO2e data.

    Reads ``<data_path>EcoInvent/<list>/GWP100a_IPCC2013.csv`` for every entry of
    ``lca_lists`` and stacks the ``name``, ``location`` and ``CO2e`` columns of all
    rows whose ``generalComment`` does not contain the text ``market``.

    Attributes:
        data: the combined entries. Each file's rows are sorted by ``name`` and
            keep the row labels they had in their own file, so labels can repeat
            across files. Empty DataFrame when ``lca_lists`` is empty.
        data_path: the ``data_path`` argument, unchanged. Paths are built by
            plain string concatenation, so it must end with a path separator.

    Raises:
        FileNotFoundError: if one of the CSV files is missing (the offending
            path is printed before the error is re-raised).
    """
    def __init__(self, data_path:str, lca_lists:list):
        frames = []
        for path in lca_lists:
            filepath = data_path+"EcoInvent/"+path+"/GWP100a_IPCC2013.csv"
            try:
                raw = pd.read_csv(filepath)
            except FileNotFoundError:
                print(filepath+' not found')
                raise
            # Rows with a missing comment are treated as "not market"; without
            # na=False the mask would contain NaN and `~` would raise.
            is_market = raw['generalComment'].astype(str).str.contains('market', na=False)
            frames.append(raw.loc[~is_market, ['name', 'location', 'CO2e']].sort_values('name'))
        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        self.data = pd.concat(frames) if frames else pd.DataFrame()
        self.data_path = data_path

    def location(self, location='Global'):
        """Filter entries for EcoInvent regions.

        The region code for ``location`` is looked up in
        ``<data_path>EcoInvent/EcoInvent_locations.csv`` (``Name`` -> ``Code``).
        For each product name the entries of the first code that has any rows
        are kept, trying the region code, then ``'RoW'``, then ``'GLO'``.
        Products without entries for any of the three codes are left out.

        Args:
            location: a value from the ``Name`` column of the locations file.

        Returns:
            DataFrame with columns ``name``, ``location`` and ``CO2e``, where
            ``CO2e`` is the mean over the kept rows of each (name, location) pair.

        Raises:
            IndexError: if ``location`` is not listed in the locations file.
        """
        loc_filepath = self.data_path+"EcoInvent/EcoInvent_locations.csv"
        loc_codes = pd.read_csv(loc_filepath)
        region_matches = loc_codes.loc[loc_codes['Name'] == location, 'Code']
        if region_matches.empty:
            raise IndexError("location '{}' not found in {}".format(location, loc_filepath))
        region_code = region_matches.iloc[0]

        # Row labels in self.data repeat across source files, so selecting by
        # label would also pull in unrelated rows. Work on uniquely labelled rows.
        entries = self.data.reset_index(drop=True)
        rows = []
        for _, product_rows in entries.groupby('name', sort=False):
            for code in (region_code, 'RoW', 'GLO'):
                found = product_rows.index[product_rows['location'] == code]
                if len(found):
                    rows.extend(found)
                    break
        return entries.loc[rows].groupby(['name','location']).mean().reset_index()


class IHSData(object):
    """Load the IHS Markit product and material composition tables.

    Both tables are read from ``<data_path>IHS/US/``. The first CSV column is
    used as the index on read and then discarded in favour of a fresh 0..n-1
    index.

    Attributes:
        products: contents of ``products.csv``.
        materials: contents of ``materials.csv``.
    """
    def __init__(self, data_path):
        ihs_dir = data_path + "IHS/US/"
        for table_name in ('products', 'materials'):
            table = pd.read_csv(ihs_dir + table_name + ".csv", index_col=0)
            setattr(self, table_name, table.reset_index(drop=True))


class ICISData(object):
    """Import ICIS chemical manufacturing data.

    Every sheet of the workbook except the first two is read. Within a sheet,
    rows whose ``PRODUCT`` cell equals one of ``self.properties`` are treated as
    section headers: the rows after the i-th header, up to the next header (or
    the end of the sheet), go to the i-th attribute of ``plants``, ``prod``,
    ``imps``, ``exps``, ``cons``. Rows without a value in the
    ``COUNTRY/TERRITORY`` column are discarded, and a ``Region`` column holding
    the sheet name is added.

    NOTE(review): sections are assigned by position, so this assumes the headers
    appear in the same order as ``self.properties`` -- confirm against the data.

    Attributes:
        countryCol: name of the country column used to drop filler rows.
        properties: the section header labels looked for in ``PRODUCT``.
        plants, prod, imps, exps, cons: one DataFrame per section, stacked over
            all sheets with a fresh 0..n-1 index.

    Raises:
        IndexError: if a sheet contains fewer than five section headers.
    """
    def __init__(self, filepath):
        self.countryCol = 'COUNTRY/TERRITORY'
        self.properties = ['Capacity', 'Statistic Production', 'Import', 'Export', 'Consumption']
        targets = ['plants', 'prod', 'imps', 'exps', 'cons']
        collected = {attr: [] for attr in targets}

        # Context manager closes the workbook handle once all sheets are read.
        with pd.ExcelFile(filepath) as in_file:
            for sheet in in_file.sheet_names[2:]:
                region = pd.read_excel(in_file, sheet_name=sheet)
                region['Region'] = sheet
                breaks = region.index[region['PRODUCT'].isin(self.properties)]
                for i, attr in enumerate(targets):
                    collected[attr].append(self._section(region, breaks, i))

        # DataFrame.append was removed in pandas 2.0; concatenate once per section.
        for attr, frames in collected.items():
            setattr(self, attr, pd.concat(frames, ignore_index=True) if frames else pd.DataFrame())

    def _section(self, region, breaks, start):
        """Return the rows between section header ``start`` and the next header."""
        # The final section runs to the end of the sheet. The previous end value
        # of -1 excluded the sheet's last row from that section.
        br_end = breaks[start+1] if start+1 < len(breaks) else len(region)
        return region.iloc[breaks[start]+1:br_end].dropna(subset=[self.countryCol])

Overwriting functions/import_data.py


In [3]:
%%writefile functions/test_import_data.py

"""Unit tests for import_data.py"""

import unittest
import pandas as pd
import functions.import_data as id


class LCATests(unittest.TestCase):
    """Checks for the EcoInvent importer; requires the EcoInvent files under DATA_PATH."""

    DATA_PATH = "C:/Users/lshc3/Documents/"
    LCA_LISTS = ['Basic_chemicals_201','Coke_Petro_19']

    def test_LCA_import(self):
        """Import yields a DataFrame with the expected reference value and fails on a missing list."""
        valid_data = id.LCA(self.DATA_PATH, self.LCA_LISTS).data
        self.assertIsInstance(valid_data, pd.DataFrame, 'Import does not return valid pandas Dataframe')
        self.assertAlmostEqual(valid_data['CO2e'][0], 4.86288208344407, places=3, msg='Incorrect value found on test case')
        with self.assertRaises(FileNotFoundError):
            id.LCA(self.DATA_PATH, ['invalid list'])

    def test_LCA_location(self):
        """Filtering for France only returns French, rest-of-world or global entries."""
        filt_locs = id.LCA(self.DATA_PATH, self.LCA_LISTS).location('France')
        allowed = {'GLO', 'RoW', 'FR'}
        self.assertTrue(set(filt_locs['location']) <= allowed, 'Location filter returns unwanted locations')

class IHSDataTests(unittest.TestCase):
    """Checks for the IHS importer; requires the IHS CSV files under DATA_PATH."""

    DATA_PATH = "C:/Users/lshc3/Documents/"

    def _load(self):
        """Build a fresh importer (one per assertion)."""
        return id.IHSData(self.DATA_PATH)

    def test_products(self):
        """The products table is a DataFrame whose first Name is ETHYLENE."""
        self.assertIsInstance(self._load().products, pd.DataFrame, 'Import does not return valid pandas Dataframe')
        first_name = self._load().products['Name'][0]
        self.assertEqual(first_name, 'ETHYLENE', 'Incorrect value found on test case')

    def test_materials(self):
        """The materials table is a DataFrame whose first Source is CATALYST."""
        self.assertIsInstance(self._load().materials, pd.DataFrame, 'Import does not return valid pandas Dataframe')
        first_source = self._load().materials['Source'][0]
        self.assertEqual(first_source, 'CATALYST', 'Incorrect value found on test case')

# class ICISDataTests(unittest.TestCase):
#     def test_dataframes(self):
#         icis = id.ICISData("C:/ICIS_data/US_allchemicals.xlsx")
#         #for i in [icis.plants, icis.prod, icis.imps, icis.exps, icis.cons]:
#         self.assertIsInstance(icis.plants, pd.DataFrame, 'Import does not return valid pandas Dataframe')
#         self.assertIsInstance(icis.prod, pd.DataFrame, 'Import does not return valid pandas Dataframe')

# Run the unittest runner when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

Overwriting functions/test_import_data.py


In [4]:
!pytest

platform win32 -- Python 3.8.3, pytest-5.4.3, py-1.9.0, pluggy-0.13.1
rootdir: C:\Users\lshc3\PycharmProjects\emissions-tracking
collected 5 items / 1 error / 4 selected

______________ ERROR collecting functions/test_preprocessing.py _______________
ImportError while importing test module 'C:\Users\lshc3\PycharmProjects\emissions-tracking\functions\test_preprocessing.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
functions\test_preprocessing.py:6: in <module>
    import preprocessing as pp
E   ModuleNotFoundError: No module named 'preprocessing'
ERROR functions/test_preprocessing.py
!!!!!!!!!!!!!!!!!!! Interrupted: 1 error during collection !!!!!!!!!!!!!!!!!!!!


In [8]:
%%writefile functions/test_preprocessing.py

"""Unit tests for preprocessing.py"""

import unittest
import pandas as pd
try:
    # pytest is launched from the project root, where the module lives in the `functions` package
    import functions.preprocessing as pp
except ModuleNotFoundError:
    # fall back to the flat import when the tests are run from inside `functions/`
    import preprocessing as pp


class IcisFormatTests(unittest.TestCase):
    """Placeholder test case; presumably intended for the ICIS formatting step (TODO confirm)."""
    def test_ICIS_filter(self):
        # TODO: no assertions yet -- this test currently always passes.
        pass



class MergeDataTests(unittest.TestCase):
    """Placeholder test case; presumably intended for the data merge step (TODO confirm)."""
    def test_products(self):
        # TODO: no assertions yet -- this test currently always passes.
        pass


# Run the unittest runner when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

Writing functions/test_preprocessing.py


In [None]:
!pytest

In [203]:
%%writefile functions/preprocessing.py

"""Preprocessing functions for chemical data"""

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz

def left_merge(df1, df2, col1, col2):
    """Left-join ``df2`` onto ``df1`` by case-insensitive text match.

    Rows are matched where the lower-cased ``df1[col1]`` equals the lower-cased
    ``df2[col2]``. Every row of ``df1`` is kept; rows without a match get
    missing values in the ``df2`` columns.

    Rows of ``df2`` whose key is missing are ignored. pandas would otherwise
    match null keys to each other and attach them to every ``df1`` row whose
    own key is missing.

    Args:
        df1: left DataFrame.
        df2: right DataFrame.
        col1: text column of ``df1`` to match on.
        col2: text column of ``df2`` to match on.

    Returns:
        The merged DataFrame with a fresh 0..n-1 index. Both original key
        columns are retained; the temporary lower-cased key is removed.
    """
    left_key = df1[col1].str.lower()
    right_key = df2[col2].str.lower()
    has_key = right_key.notna()
    merged = pd.merge(df1, df2[has_key], left_on=left_key, right_on=right_key[has_key], how="left")
    # Merging on array keys adds a 'key_0' column holding the lower-cased key.
    return merged.reset_index(drop=True).drop('key_0', axis=1)

def format_icis(icis,year=2019):
    """Reduce the ICIS plant table to its descriptive columns plus one year.

    Removes the bookkeeping columns ``#``, ``DERIVATIVE``, ``NOTE``, ``@``,
    ``@Sub`` and ``Region``, every column whose label contains ``None``, and
    every integer-labelled column other than ``year``. Rows whose ``year``
    value is ``'-'`` are then discarded.

    Args:
        icis: object exposing the plant table as ``icis.plants``.
        year: integer label of the column to keep.

    Returns:
        The filtered DataFrame with a fresh 0..n-1 index.
    """
    fixed_drops = ['#','DERIVATIVE','NOTE','@','@Sub','Region']
    extra_drops = []
    for label in icis.plants.columns:
        is_other_year = type(label) is int and label != year
        if 'None' in str(label) or is_other_year:
            extra_drops.append(label)
    trimmed = icis.plants.drop(columns=fixed_drops + extra_drops)
    has_value = trimmed[year] != '-'
    return trimmed[has_value].reset_index(drop=True)

def merge_chemical_data(lca, ihs, plants):
    """Add EcoInvent LCA data and IHS materials data to ICIS plant data dataframe.

    Args:
        lca: DataFrame with a ``name`` column, matched against ``plants['PRODUCT']``.
        ihs: object exposing ``products`` (with ``Name`` and ``Process`` columns)
            and ``materials`` (with ``Target``, ``Source``, ``Value`` and
            ``Source cost`` columns).
        plants: DataFrame with ``PRODUCT``, ``ROUTE`` and ``TECHNOLOGY`` columns.

    Returns:
        DataFrame of plants with the matched LCA entry, the best-matching IHS
        process and that process's summed material values joined on.
    """
    # LCA merge
    plants_lca = left_merge(plants, lca, 'PRODUCT', 'name')

    # IHS product merge: a product can have several IHS processes, so score each
    # candidate against the plant's "ROUTE TECHNOLOGY" text.
    plants_products = left_merge(plants_lca, ihs.products, 'name', 'Name')
    # Rows without an IHS process score 0. The earlier comparison against the
    # string 'NaN' never matched real missing values, so they were scored
    # against the text 'nan' instead.
    plants_products['process_match'] = [
        0 if pd.isna(process) else fuzz.ratio(str(route)+' '+str(technology), str(process))
        for route, technology, process in zip(plants_products['ROUTE'], plants_products['TECHNOLOGY'], plants_products['Process'])
    ]
    # NOTE(review): the first 12 columns are taken to identify a plant entry --
    # the count is hard-coded; confirm it matches the merged column layout.
    plant_cols = list(plants_products.columns[:12])
    # Sorting by score and keeping the last duplicate retains the best-scoring process per plant.
    plants_products = (plants_products
                       .sort_values(plant_cols+['process_match'])
                       .drop_duplicates(plant_cols, keep='last')
                       .reset_index(drop=True)
                       .drop('process_match', axis=1))

    # IHS materials merge: one row per Target process, one column per (measure, Source).
    # NOTE(review): unstack() leaves two-level column labels here; recent pandas
    # versions may refuse to merge frames with different column depths -- verify.
    sparse_materials = ihs.materials[['Target','Source','Value','Source cost']].groupby(['Target','Source']).sum().unstack().reset_index()
    plants_materials = left_merge(plants_products, sparse_materials, 'Process', 'Target')
    return plants_materials

Overwriting functions/preprocessing.py


In [1]:
%%writefile scripts/chemical_graph_build.py

"""Script for running graph classification on chemical data"""

import sys
from pathlib import Path

# Make the project root (the parent of this script's folder) importable no
# matter where the script is launched from; a relative '../' entry is resolved
# against the current working directory instead.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import functions.import_data as impData
import functions.preprocessing as preproc

data_path = "C:/Users/lshc3/Documents/"
out_file = "C:/Users/lshc3/Documents/Aggregated/US_2019.csv"
# Forward slashes throughout: "C:\ICIS_data" contained the invalid escape "\I".
icis_file = "C:/ICIS_data/US_allchemicals.xlsx"

# Import data
lca = impData.LCA(data_path, lca_lists = ['Basic_chemicals_201','Coke_Petro_19']).location('United States')
ihs = impData.IHSData(data_path)
icis = impData.ICISData(icis_file)

# Merge data
plants = preproc.format_icis(icis, 2019)
merged_data = preproc.merge_chemical_data(lca,ihs,plants)

# Output to csv
merged_data.to_csv(out_file, index=False)

Writing scripts/chemical_graph_build.py


FileNotFoundError: [Errno 2] No such file or directory: 'scripts/chemical_graph_build.py'