In [176]:
%cd C:/Users\lshc3\PycharmProjects\emissions-tracking

C:\Users\lshc3\PycharmProjects\emissions-tracking


In [177]:
%%writefile functions/import_data.py

"""Import classes for each type of LCA and chemical manufacturing data used in emissions-tracking project"""

import numpy as np
import pandas as pd


class LCA(object):
    """Import EcoInvent CO2e data"""
    def __init__(self, data_path:str, lca_lists:list):
        self.data = pd.DataFrame()
        for path in lca_lists:
            filepath = data_path+"EcoInvent/"+path+"/GWP100a_IPCC2013.csv"
            try:
                raw = pd.read_csv(filepath)
            except FileNotFoundError:
                print(filepath+' not found')
            self.data = self.data.append(raw[['name', 'location', 'CO2e']][~raw['generalComment'].str.contains('market')].sort_values('name'))
        self.data_path = data_path

    def location(self, location='Global'):
        """Filter entries for EcoInvent regions"""
        loc_filepath = self.data_path+"EcoInvent\EcoInvent_locations.csv"
        loc_codes = pd.read_csv(loc_filepath)
        region_code = loc_codes['Code'][np.where(loc_codes['Name'] == location)[0]].iloc[0]
        rows = []
        for product in self.data['name'].unique():
            for code in [region_code, 'RoW', 'GLO']:
                found = self.data[(self.data['name']==product) & (self.data['location']==code)]
                if not found.index.empty:
                    rows += list(found.index)
                    break
        return self.data.loc[rows].groupby(['name','location']).mean().reset_index()


class IHSData(object):
    """Import IHS Markit chemical product and material composition data"""
    def __init__(self, data_path):
        product_file = data_path+"IHS/US/products.csv"
        material_file = data_path+"IHS/US/materials.csv"
        self.products = pd.read_csv(product_file, index_col=0).reset_index(drop=True)
        self.materials = pd.read_csv(material_file, index_col=0).reset_index(drop=True)


class ICISData(object):
    """Import ICIS chemical manufacturing data"""
    def __init__(self, filepath):
        self.countryCol = 'COUNTRY/TERRITORY'
        self.properties = ['Capacity', 'Statistic Production', 'Import', 'Export', 'Consumption']
        in_file = pd.ExcelFile(filepath)
        self.plants, self.prod, self.imps, self.exps, self.cons = [pd.DataFrame()]*5

        def append_data(prev,region,breaks,start):
            """Append data while row of ICIS data is within particular type"""
            br_end = breaks[start+1] if start+1 < len(breaks) else -1
            return prev.append(region[breaks[start]+1:br_end].dropna(subset=[self.countryCol]), ignore_index=True)

        for sheet in in_file.sheet_names[2:]:
            region = pd.read_excel(in_file, sheet_name=sheet)
            region['Region'] = [sheet]*len(region)
            breaks = region.loc[region['PRODUCT'].isin(self.properties)].index
            self.plants, self.prod, self.imps, self.exps, self.cons = [append_data(attr,region,breaks,i)
                                                                       for i, attr in enumerate([self.plants, self.prod, self.imps, self.exps, self.cons])]

Writing functions/import_data.py


In [178]:
%%writefile tests/test_import_data.py

import unittest
# import sys
# sys.path.append('C:/Users\lshc3\PycharmProjects\emissions-tracking')
# from functions.importData import LCA, IHSData, ICISData
import functions.import_data as id

class LCATest(unittest.TestCase):

        #self.valid_data =

    def test_true(self):
        # print(utl.to_listlist(['listyy']))
        # self.assertEqual(utl.to_listlist(['listyy']),utl.to_listlist(['listyy']))
        #self.assertEqual(,True)
        valid_data = id.LCA("C:/Users/lshc3/Documents/", ['Test_chemicals'])
        self.assertEqual(valid_data[0]['CO2e'], 4.86288208344407, 'Incorrect value found on test case')
        #self.assertIsInstance(id.LCA("C:/Users/lshc3/Documents/", ['Test_chemicals']), pd.DataFrame(), 'Import does not return valid pandas Dataframe')
        self.assertEqual(True,False)

    # def valid_import(self):
    #     self.assertIsInstance(self.valid_data, pd.DataFrame, 'Import does not return valid pandas Dataframe')
    #     self.assertEqual(self.valid_data[0]['CO2e'], 4.86288208344407, 'Incorrect value found on test case')
    #     self.assertRaises(FileNotFoundError, lambda: ipdata.LCA("C:/Users/lshc3/Documents/", ['invalid list']),
    #                       msg='FileNotFound error not raised for invalid filepath')
    #
    # def location_filter(self):
    #     filt_locs = set(self.valid_data.location('USA')['location'])
    #     self.assertTrue(filt_locs.issubset(['GLO','RoW','USA']), 'Location filter returns unwanted locations')

# class IHSData_test(unittest.TestCase):
#     def valid_products(self):
#         self.assertEqual(True, False, 'Invalid product read')
#     def valid_materials(self):
#         self.assertEqual(True,False, )

if __name__ == '__main__':
    unittest.main()


Writing tests/test_import_data.py


In [163]:
!pytest

platform win32 -- Python 3.8.3, pytest-5.4.3, py-1.9.0, pluggy-0.13.1
rootdir: C:\Users\lshc3\PycharmProjects\emissions-tracking


Windows fatal exception: access violation

Thread 0x00000c1c (most recent call first):
  File "<fWindows fatal exception: raccess violationo

zeWindows fatal exception: access violationn



In [165]:
# %%writefile ../../scripts/graph_classification.py

import functions.import_data as impData
#data_path = "C:/Users/lukec/OneDrive - University of Cambridge/PhD/Data/"
data_path = "C:/Users/lshc3/Documents/"
lca = impData.LCA(data_path, lca_lists = ['Basic_chemicals_201','Coke_Petro_19']).location('United States')
ihs = impData.IHSData(data_path)
icis = impData.ICISData("C:\ICIS_data/US_allchemicals.xlsx")

In [170]:
icis.plants

Unnamed: 0,PRODUCT,COUNTRY/TERRITORY,STATE,COMPANY,SITE,#,ROUTE,TECHNOLOGY,LICENSOR,START_YR,...,None.58,2037,None.59,2038,None.60,2039,None.61,2040,None.62,Region
0,2-ETHYLHEXANOL,UNITED STATES,n.a.,~ADDITIONAL,~UNKNOWN USA,1.0,BUTYRALDEHYDE,OXO,n.a.,2023,...,,220,,220,,220,,220,,CountryTerritory Summary
1,2-ETHYLHEXANOL,UNITED STATES,TEXAS,BASF CORP.,FREEPORT/TX,1.0,BUTYRALDEHYDE,OXO,BASF,-,...,,160,,160,,160,,160,,CountryTerritory Summary
2,2-ETHYLHEXANOL,UNITED STATES,TEXAS,BASF CORP.,PASADENA/TX,1.0,BUTYRALDEHYDE,OXO,UN. CARBIDE/DAVY MCKEE/J.,1983,...,,-,,-,,-,,-,,CountryTerritory Summary
3,2-ETHYLHEXANOL,UNITED STATES,TEXAS,BASF CORP.,PASADENA/TX,2.0,BUTYRALDEHYDE,n.a.,n.a.,2017,...,,100,,100,,100,,100,,CountryTerritory Summary
4,2-ETHYLHEXANOL,UNITED STATES,TEXAS,DOW CHEMICAL,TEXAS CITY/TX,1.0,BUTYRALDEHYDE,OXO,UN. CARBIDE/DAVY MCKEE/J.,-,...,,55,,55,,55,,55,,CountryTerritory Summary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4412,VISBREAKING,UNITED STATES,CALIFORNIA,UNOCAL CORP.,LOS ANGELES/CA,1.0,VISBREAKING,n.a.,n.a.,-,...,,-,,-,,-,,-,,CountryTerritory Summary
4413,VISBREAKING,UNITED STATES,TEXAS,VALERO REFINING,CORPUS CHRISTI/TX,2.0,VISBREAKING,n.a.,n.a.,1982,...,,-,,-,,-,,-,,CountryTerritory Summary
4414,VISBREAKING,UNITED STATES,LOUISIANA,VALERO REFINING,GOOD HOPE/LA,1.0,VISBREAKING,n.a.,n.a.,1980,...,,-,,-,,-,,-,,CountryTerritory Summary
4415,VISBREAKING,UNITED STATES,TEXAS,VALERO REFINING,HOUSTON/TX,1.0,VISBREAKING,n.a.,n.a.,1978,...,,-,,-,,-,,-,,CountryTerritory Summary


In [166]:
# Trial merges
merge = pd.merge(lca, icis.plants, left_on=lca['name'].str.lower(), right_on=icis.plants['PRODUCT'].str.lower(), how="left")
merge['PRODUCT'].unique()

merge_cons = pd.merge(lca, icis.cons, left_on=lca['name'].str.lower(), right_on=icis.cons['DERIVATIVE'].str.lower(), how="left")
merge_cons['DERIVATIVE'].unique()
mergeIHS = pd.merge(merge, ihs.materials, left_on=merge['PRODUCT'].str.lower(), right_on=ihs.materials['Product'].str.lower(), how="left")
mergeIHS['Product'].unique()

array([nan, 'ACETALDEHYDE', 'ACRYLONITRILE', 'ADIPIC ACID', 'ANILINE',
       'BENZENE', 'BUTADIENE', 'ETHYLENE', 'ISOPROPANOL'], dtype=object)

In [167]:
# Get ICIS plants
year = 2019
plants_clean = icis.plants.drop(columns=['#','DERIVATIVE','NOTE','@','@Sub','Region']+list(filter(lambda i: 'None' in str(i) or (type(i) is int and i!=year), icis.plants.columns)))
plants_working = plants_clean[plants_clean[year] != '-'].reset_index(drop=True)
plants_working

Unnamed: 0,PRODUCT,COUNTRY/TERRITORY,STATE,COMPANY,SITE,ROUTE,TECHNOLOGY,LICENSOR,START_YR,START_MO,COMPLEX,LATITUDE,LONGITUDE,2019
0,2-ETHYLHEXANOL,UNITED STATES,TEXAS,BASF CORP.,FREEPORT/TX,BUTYRALDEHYDE,OXO,BASF,-,na,n.a.,29.003299,-95.393879,160
1,2-ETHYLHEXANOL,UNITED STATES,TEXAS,BASF CORP.,PASADENA/TX,BUTYRALDEHYDE,n.a.,n.a.,2017,na,,29.614724,-95.052865,100
2,2-ETHYLHEXANOL,UNITED STATES,TEXAS,DOW CHEMICAL,TEXAS CITY/TX,BUTYRALDEHYDE,OXO,UN. CARBIDE/DAVY MCKEE/J.,-,na,n.a.,29.378634,-94.943043,55
3,2-ETHYLHEXANOL,UNITED STATES,TEXAS,EASTMAN CHEMICAL,LONGVIEW/TX,BUTYRALDEHYDE,OXO,EASTMAN CHEMICAL,-,na,n.a.,32.437345,-94.702061,250
4,ABS,UNITED STATES,OHIO,INEOS STYROLUTION,ADDYSTON/OH,VIRGIN RESIN,POLYMERIZATION,MONSANTO,1972,na,,39.135425,-84.713303,195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2453,VINYL ACETATE M.,UNITED STATES,TEXAS,DOW CHEMICAL,TEXAS CITY/TX,ETHYLENE,ACETOXYLATION,BAYER,1976,na,*,29.378634,-94.943043,365
2454,VINYL ACETATE M.,UNITED STATES,TEXAS,KURARAY CO.,LA PORTE/TX,ETHYLENE,ACETOXYLATION,n.a.,-,na,n.a.,29.702605,-95.038387,335
2455,VINYL ACETATE M.,UNITED STATES,TEXAS,LYONDELLBASELL INDUSTRIES,LA PORTE/TX,ETHYLENE,ACETOXYLATION,QUANTUM,1970,na,n.a.,29.708512,-95.065409,385
2456,VISBREAKING,UNITED STATES,HAWAII,PAR HAWAII REFINING,KAPOLEI/HI,VISBREAKING,n.a.,n.a.,1987,na,*,21.304607,-158.097343,650


In [168]:
# Associate with IHS process and merge
import difflib
plants_working['ihsProcess'] = list(map(lambda x: next(iter(difflib.get_close_matches(str(x), ihs.products['Process'])),None), plants_working['PRODUCT']+' '+plants_working['ROUTE']+' '+plants_working['TECHNOLOGY']))

plant_products = pd.merge(plants_working, ihs, left_on=plants_working['ihsProcess'], right_on=ihs.products['Process'], how="left")


# df2 = ihs.products.sort_values('Name')
# df2['NameNew'] = df2['Name'].apply(lambda x: next(iter(difflib.get_close_matches(x, plants_working['ROUTE'])),None))
# df2['ProcessNew'] = df2['Process'].apply(lambda x: next(iter(difflib.get_close_matches(x, plants_working['PRODUCT'])),None))
#.merge(df2)
plants_working

TypeError: Can only merge Series or DataFrame objects, a <class 'functions.import_data.IHSData'> was passed

In [None]:
# Sparse matrix with all materials for process frmo IHS

In [None]:
# Associate with Y vector from EcoInvent

In [145]:
import difflib
plants_working['ihsProcess'] = list(map(lambda x: next(iter(difflib.get_close_matches(x, ihs.products['Process'])),None), plants_working['ROUTE']+' '+plants_working['TECHNOLOGY']))

TypeError: 'float' object is not iterable

In [172]:
l = plants_working['PRODUCT']+' '+plants_working['ROUTE']+' '+plants_working['TECHNOLOGY']
list(map(lambda x: next(iter(difflib.get_close_matches(str(x), ihs.products['Process'])),None), plants_working['PRODUCT']+' '+plants_working['ROUTE']+' '+plants_working['TECHNOLOGY']))

[None,
 None,
 None,
 None,
 'ABS RESIN BY EMULSION POLYMERIZATION',
 'ABS RESIN BY EMULSION POLYMERIZATION',
 'ABS RESIN PRODUCTION',
 'ABS RESIN BY CONTINUOUS MASS POLYMERIZATION',
 'ACETALDEHYDE FROM ETHYLENE BY ONE-STEP OXIDATION',
 'ACETIC ACID BY LOW PRESSURE CARBONYLATION OF METHANOL',
 None,
 'AZELAIC ACID FROM OLEIC ACID',
 None,
 'ACETIC ACID BY LOW PRESSURE CARBONYLATION OF METHANOL',
 'ACETIC ACID BY LOW PRESSURE CARBONYLATION OF METHANOL',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 None,
 None,
 None,
 None,
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 None,
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 'ACROLEIN BY PROPYLENE OXIDATION',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'ACRYL