## Gathering Data

In [60]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Path to the input CSV, relative to the notebook's working directory.
file_path = "datasets/v1_cleaned_messied_dataset.csv"
# Read the file into the working DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

   2017.NAICS.Code                                   2017.NAICS.Title  \
0              NaN  Plastics Materials and Basic Forms and Shapes ...   
1              NaN           Translation and Interpretation Services!   
2              NaN                                    Dance Companie%   
3              NaN       Other Chemical and Fertilizer Mineral Mining   
4              NaN                                  Silver Ore Mining   

        GHG                               Unit  \
0  All GHGs  kg CO2e/2022 USD, purchaser price   
1  All GHGs  kg CO2e/2022 USD, purchaser price   
2  All GHGs  kg CO2e/2022 USD, purchaser price   
3  All GHGs  kg CO2e/2022 USD, purchaser price   
4  All GHGs  kg CO2e/2022 USD, purchaser price   

  Supply.Chain.Emission.Factors.without.Margins  \
0                                         0.144   
1                                          0.08   
2                                         0.086   
3                                         0.184   
4    

## Cleaning

In [50]:
# Count rows that exactly repeat an earlier row, report the count, then drop
# them (the first occurrence of each row is kept).
duplicate_count = df.duplicated(keep="first").sum()
print(f"Number of duplicate rows: {duplicate_count}")
df = df.drop_duplicates(keep="first")

Number of duplicate rows: 0


In [51]:
#Clean the 2017.NAICS.Code column: remove all empty values and make strings with no decimals
def clean_naics_code(value):
    """Normalize one raw NAICS code cell to a digit string without decimals.

    Parameters
    ----------
    value : object
        A raw cell value (e.g. float, int, str, or a missing value).

    Returns
    -------
    str or float
        The integer part of ``value`` as a string (``111110.0`` -> ``"111110"``),
        or ``np.nan`` when the value is missing, blank, the text ``"nan"``,
        or cannot be converted to a finite number.
    """
    try:
        # Missing values and blank / "nan" text have no code to recover.
        if pd.isna(value) or str(value).strip() in ("", "nan"):
            return np.nan
        # Go through float first so values like 111110.0 or "111110.0" parse.
        return str(int(float(value)))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text (or a NaN reaching int());
        # OverflowError: +/-infinity reaching int();
        # TypeError: objects that float() cannot accept.
        return np.nan

# Run the cleaner over every cell of the column, then peek at the first ten
# distinct results.
df["2017.NAICS.Code"] = df["2017.NAICS.Code"].map(clean_naics_code)
print(df["2017.NAICS.Code"].unique()[0:10])

[nan '111110' '111120' '111130' '111150' '111160' '111191' '111199'
 '111211' '111219']


In [52]:
# Ensure all items in 2017.NAICS.Title are clean strings: trim surrounding
# whitespace and title-case the text. Missing titles are kept as NaN; calling
# .astype(str) alone would turn them into the literal text "nan" ("Nan" after
# title-casing), hiding them from isna()/dropna().
# NOTE(review): .str.title() also capitalizes the letter after an apostrophe
# ("Men's" -> "Men'S"); confirm whether any titles are affected.
df["2017.NAICS.Title"] = (
    df["2017.NAICS.Title"]
    .astype(str)
    .str.strip()
    .str.title()
    .where(df["2017.NAICS.Title"].notna())
)

print(df["2017.NAICS.Title"].unique()[:10])

['Plastics Materials And Basic Forms And Shapes Merchant Wholesalers'
 'Translation And Interpretation Services!' 'Dance Companie%'
 'Other Chemical And Fertilizer Mineral Mining' 'Silver Ore Mining'
 'Farm Management Services'
 'Ornamental And Architectural Metal Work Manufacturing!'
 'Geophysical Surveying And Mapping Services!'
 'Bare Printed Circuit Board Manufacturing'
 'Pipeline Transportation Of Crude Oil']


In [53]:
#Clean the Reference.USEEIO.Code column. Remove any special characters and add leading zeros if it does not have 5 digits.  
def clean_reference_code(code):
    """Reduce a raw Reference.USEEIO.Code value to digits, left-padded to 5.

    Parameters
    ----------
    code : object
        Raw cell value; converted with ``str()`` before filtering.

    Returns
    -------
    str
        Only the digit characters of ``code``, zero-padded on the left to at
        least five characters (longer codes are returned unchanged). Returns
        ``""`` when the value contains no digits (e.g. NaN or ``None``).
    """
    # A whole-number float such as 4240.0 prints as "4240.0"; stripping the
    # "." would leave "42400", so drop the fractional part before filtering.
    if isinstance(code, float) and code.is_integer():
        code = int(code)
    digits_only = re.sub(r'\D', '', str(code))  # drop every non-digit character
    return digits_only.zfill(5) if digits_only else ""

# Normalize every reference code in the column, then preview the first five values.
df["Reference.USEEIO.Code"] = df["Reference.USEEIO.Code"].map(clean_reference_code)
print(df["Reference.USEEIO.Code"].head(5))

0     04240
1     54190
2    711100
3     02123
4     02122
Name: Reference.USEEIO.Code, dtype: object


In [54]:
# Coerce the two margin-related factor columns to numeric dtype; any value that
# cannot be parsed becomes NaN instead of raising.
# NOTE(review): Supply.Chain.Emission.Factors.without.Margins is not in this
# list and stays object dtype — confirm whether it is converted elsewhere.
num_cols = [
    "Margins.of.Supply.Chain.Emission.Factors",
    "Supply.Chain.Emission.Factors.with.Margins",
]

df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in num_cols}
)

print("Updated Data Types:", df.dtypes, sep="\n")

Updated Data Types:
2017.NAICS.Code                                   object
2017.NAICS.Title                                  object
GHG                                               object
Unit                                              object
Supply.Chain.Emission.Factors.without.Margins     object
Margins.of.Supply.Chain.Emission.Factors         float64
Supply.Chain.Emission.Factors.with.Margins       float64
Reference.USEEIO.Code                             object
dtype: object


In [55]:
# Show the first rows of the cleaned frame beneath a heading line.
print("Data after cleaning:", df.head(), sep="\n")

Data after cleaning:
  2017.NAICS.Code                                   2017.NAICS.Title  \
0             NaN  Plastics Materials And Basic Forms And Shapes ...   
1             NaN           Translation And Interpretation Services!   
2             NaN                                    Dance Companie%   
3             NaN       Other Chemical And Fertilizer Mineral Mining   
4             NaN                                  Silver Ore Mining   

        GHG                               Unit  \
0  All GHGs  kg CO2e/2022 USD, purchaser price   
1  All GHGs  kg CO2e/2022 USD, purchaser price   
2  All GHGs  kg CO2e/2022 USD, purchaser price   
3  All GHGs  kg CO2e/2022 USD, purchaser price   
4  All GHGs  kg CO2e/2022 USD, purchaser price   

  Supply.Chain.Emission.Factors.without.Margins  \
0                                         0.144   
1                                          0.08   
2                                         0.086   
3                                        