In [1]:
import pandas as pd
# Load the master data set, then drop the final column (it only holds random notes)
df = pd.read_csv('master_data.csv')
df = df.iloc[:, :-1]
df.head()

Unnamed: 0,Client,Cradle-to-Grave Addition,Product,Product Stage,Scope,Impact Category,Step,Default Item,Emissions (kg CO2e),Plastic (kg),Water (m3),Amount of input assessed,Unit
0,Revol Snax,,Chocolate Almond,Raw Material,3,Direct Combustion,Cocoa beans,"Cocoa beans, at farm - GLO",0.4959,,0.0271,0.019,kg
1,Revol Snax,,Chocolate Almond,Raw Material,3,Direct Combustion,Cocoa beans transportation,"Cocoa beans, transportation, ship",0.0011,,0.0,120.631,kgkm
2,Revol Snax,,Revol Snax,Raw Material,3,Direct Combustion,Organic sunflower lecithin,"Soybean oil and lecithin, France, at french mi...",0.0885,,0.0007,0.001,kg
3,Revol Snax,,Chocolate Almond,Raw Material,3,Direct Combustion,Organic sunflower lecithin transportation,"Soybean oil and lecithin, transportation",0.0,,0.0,0.0,kgkm
4,Revol Snax,,Chocolate Almond,Raw Material,3,Direct Combustion,Hazelnut,"Hazelnut, processed in FR | Ambient (long) | N...",0.0349,,0.0031,0.008,kg


In [2]:
# Preprocessing
# Replace NaN with zero in each of the plastic, water, and emissions columns
for metric_column in ('Plastic (kg)', 'Water (m3)', 'Emissions (kg CO2e)'):
    df[metric_column] = df[metric_column].fillna(0)
df.head()

Unnamed: 0,Client,Cradle-to-Grave Addition,Product,Product Stage,Scope,Impact Category,Step,Default Item,Emissions (kg CO2e),Plastic (kg),Water (m3),Amount of input assessed,Unit
0,Revol Snax,,Chocolate Almond,Raw Material,3,Direct Combustion,Cocoa beans,"Cocoa beans, at farm - GLO",0.4959,0.0,0.0271,0.019,kg
1,Revol Snax,,Chocolate Almond,Raw Material,3,Direct Combustion,Cocoa beans transportation,"Cocoa beans, transportation, ship",0.0011,0.0,0.0,120.631,kgkm
2,Revol Snax,,Revol Snax,Raw Material,3,Direct Combustion,Organic sunflower lecithin,"Soybean oil and lecithin, France, at french mi...",0.0885,0.0,0.0007,0.001,kg
3,Revol Snax,,Chocolate Almond,Raw Material,3,Direct Combustion,Organic sunflower lecithin transportation,"Soybean oil and lecithin, transportation",0.0,0.0,0.0,0.0,kgkm
4,Revol Snax,,Chocolate Almond,Raw Material,3,Direct Combustion,Hazelnut,"Hazelnut, processed in FR | Ambient (long) | N...",0.0349,0.0,0.0031,0.008,kg


In [3]:
def _emission_share(part, whole):
    """Return ``part / whole`` as a built-in float, or 0.0 when ``whole`` is zero."""
    # numpy floats do not raise on division by zero; they yield NaN/inf, which
    # would then be written verbatim into the output files.
    return float(part) / float(whole) if whole else 0.0


def get_product_data(df_product):
    """Summarise one product's emissions by impact category and by step.

    Parameters
    ----------
    df_product : pandas.DataFrame
        Rows for a single product. Must contain the columns
        'Impact Category', 'Step' and 'Emissions (kg CO2e)'.

    Returns
    -------
    list of dict
        One dict per distinct impact category (in order of first appearance)
        with the keys:
        - 'category': the impact category value
        - 'percentage': category emissions / product emissions (a fraction,
          not multiplied by 100)
        - 'subcategories': one dict per row of that category with 'category'
          (the row's 'Step') and 'percentage' (row emissions / category
          emissions)

    Notes
    -----
    Shares are returned as built-in ``float`` objects so that ``str()`` of the
    result stays a plain literal (numpy scalars render as ``np.float64(...)``
    on NumPy >= 2). A zero denominator yields a share of 0.0.
    """
    emissions_col = 'Emissions (kg CO2e)'
    # Using emissions for now to compute percentage but may need to change this later
    total_emissions = df_product[emissions_col].sum()

    data = []
    for category in df_product['Impact Category'].unique():
        df_category = df_product.loc[df_product['Impact Category'] == category]
        # Computed once per category instead of once per row.
        category_emissions = df_category[emissions_col].sum()

        # Get information for subcategories (one entry per row / step)
        subcategories = [
            {
                'category': step,
                'percentage': _emission_share(step_emissions, category_emissions),
            }
            for step, step_emissions in zip(df_category['Step'], df_category[emissions_col])
        ]

        data.append({
            'category': category,
            'percentage': _emission_share(category_emissions, total_emissions),
            'subcategories': subcategories,
        })

    return data

In [4]:
def get_node_data(category_data, product_name, color):
    """Build the list of node dicts for one product.

    The first node represents the product itself (label stored under 'text');
    it is followed by one node per entry of ``category_data`` (label stored
    under 'ltext'). Every node carries the same ``color``.

    Parameters
    ----------
    category_data : iterable of dict
        Entries with at least a 'category' key.
    product_name : str
        Used as both key and label of the product node.
    color : str
        Color value copied onto every node.

    Returns
    -------
    list of dict
    """
    # node for the product
    product_node = {'key': product_name, 'text': product_name, 'color': color}

    # one node per category, in input order
    category_nodes = [
        {'key': entry['category'], 'ltext': entry['category'], 'color': color}
        for entry in category_data
    ]

    return [product_node] + category_nodes

In [5]:
def get_link_data(category_data, product_name):
    """Build one link dict per category, each going from the product to that category.

    Parameters
    ----------
    category_data : iterable of dict
        Entries with 'category' and 'percentage' keys.
    product_name : str
        Value stored under 'from' on every link.

    Returns
    -------
    list of dict
        Dicts with 'from', 'to' and 'width', where width is percentage * 100.
    """
    return [
        {
            'from': product_name,
            'to': entry['category'],
            # This means the width will be a decimal value; need to see if GoJS is okay with this
            'width': entry['percentage'] * 100,
        }
        for entry in category_data
    ]

In [6]:
def write_file(file,data):
    """Write ``data`` to ``file`` as a bracketed, one-item-per-line listing, then close it.

    The output is '[' on its own line, then ``str(item)`` followed by ',' for
    every item, then a closing ']' (no trailing newline).

    Parameters
    ----------
    file : writable text file object
        Always closed before this function returns, including when
        converting or writing an item raises.
    data : iterable
        Items to serialise with ``str()``.
    """
    try:
        file.write('[\n')
        for item in data:
            file.write(str(item) + ',\n')
        file.write(']')
    finally:
        # Close even on failure so the handle is not leaked.
        file.close()

In [7]:
def write_data(product_name,client_name,category_data,node_data,link_data):
    """Write one product's category, node and link data to text files.

    Three files (category.txt, node.txt, link.txt) are written into
    ``./<client_name>/<product_name>/``; the directories are created if they
    don't exist yet.

    Parameters
    ----------
    product_name : str
        Name of the product folder (inside the client's folder).
    client_name : str
        Name of the client folder.
    category_data, node_data, link_data : iterable
        Passed to ``write_file`` for the respective output file.
    """
    import os

    product_dir = './' + client_name + '/' + product_name
    os.makedirs(product_dir, exist_ok=True)

    outputs = (
        ('category.txt', category_data),
        ('node.txt', node_data),
        ('link.txt', link_data),
    )
    for output_name, payload in outputs:
        # Each file is opened right before it is written, and the context
        # manager guarantees the handle is released even if writing fails
        # (closing an already-closed file is a no-op).
        with open(product_dir + '/' + output_name, 'w') as handle:
            write_file(handle, payload)

In [8]:
# iterate through data for a specific client and product at a time,
# writing the category / node / link files for each (client, product) pair
import random
for client in df['Client'].unique():
    df_client = df.loc[df['Client'] == client]
    for product in df_client['Product'].unique():
        # Build the mask from df_client itself; a mask taken from the full df
        # only works through implicit index alignment.
        df_product = df_client.loc[df_client['Product'] == product]
        category_data = get_product_data(df_product)
        # hard coding random color for now; can change later
        node_data = get_node_data(category_data,product,"#9d75c2")
        link_data = get_link_data(category_data,product)
        write_data(product,client,category_data,node_data,link_data)