In [24]:
import pandas as pd


# Embodied-carbon workbook: read it as soon as its location is known.
# NOTE(review): absolute, machine-specific path -- this cell only runs where the file sits at exactly this location.
embodied_carbon_path = "/Users/nattybatty/HPC_Emissions_project/emissions-tools/raw_data/node-models-embodiedcarbon (1).xlsx"
embodied_df = pd.read_excel(embodied_carbon_path)

# Node-info workbook, loaded the same way (same caveat about the absolute path).
node_info_path = "/Users/nattybatty/HPC_Emissions_project/emissions-tools/raw_data/node-info-combined-2025-04-01.xlsx"
node_info_df = pd.read_excel(node_info_path)

# Clean the embodied-carbon sheet (loaded from the Excel workbook).
# NOTE(review): the source header is deliberately spelled "Emissinos" -- the key has to match the
# workbook header character-for-character, because rename() ignores labels it cannot find and
# the column lookup below would then raise KeyError.
embodied_df = embodied_df.rename(columns={
    "Mean Emissinos (Kg CO2e)": "embodied_carbon_kgco2e"
})

# Build the per-model lookup used by the merge: exactly one row per model.
#   * Rows with no Model are left out (groupby drops missing keys). They identify no hardware,
#     and filling them with "N/a" would make them match every node whose own Model is missing.
#   * GroupBy.first() skips missing values, so a model that is listed more than once keeps its
#     first *known* figure rather than whichever row happens to come first.
#   * sort=False keeps the models in their order of first appearance.
embodied_mean = (
    embodied_df
    .groupby("Model", sort=False, as_index=False)["embodied_carbon_kgco2e"]
    .first()
)

# Models without any figure at all keep the "N/a" placeholder used for missing values.
embodied_mean["embodied_carbon_kgco2e"] = embodied_mean["embodied_carbon_kgco2e"].fillna("N/a")

# Tidy the node-info sheet in a single chained expression.
# "Model" and "Manufacturer" already carry the names used below, so only two columns are renamed.
node_info_df = (
    node_info_df
    .rename(columns={"Name": "node_name", "CPUs": "cpu_cores"})
    .assign(
        # Missing cells become the "N/a" placeholder; no row is dropped.
        Model=lambda frame: frame["Model"].fillna("N/a"),
        Manufacturer=lambda frame: frame["Manufacturer"].fillna("N/a"),
        # "<Manufacturer>_<Model>" with every space removed from both parts.
        model_manufacturer_id=lambda frame: (
            frame["Manufacturer"].str.strip().str.replace(" ", "")
            + "_"
            + frame["Model"].str.replace(" ", "")
        ),
        # Core counts are kept as text so the column has a single type.
        # NOTE(review): the recorded output shows values such as '32.0' -- the float formatting
        # survives the conversion to str.
        cpu_cores=lambda frame: frame["cpu_cores"].fillna("N/a").astype(str),
    )
    # Only the columns needed downstream, in this order.
    [["node_name", "cpu_cores", "Model", "model_manufacturer_id"]]
)

# Left-join the embodied-carbon figure onto every node by model name (no node row is dropped).
merged_df = pd.merge(left=node_info_df, right=embodied_mean, how="left", on="Model")

# Nodes without a figure get the "N/a" placeholder; both power-draw columns are
# placeholders as well.
merged_df = merged_df.assign(
    embodied_carbon_kgco2e=merged_df["embodied_carbon_kgco2e"].fillna("N/a"),
    energy_100pct_kw="N/a",
    energy_0pct_kw="N/a",
)

# Final column order, rows ordered by node name.
final_df = merged_df.loc[
    :,
    [
        "node_name",
        "cpu_cores",
        "embodied_carbon_kgco2e",
        "energy_100pct_kw",
        "energy_0pct_kw",
        "model_manufacturer_id",
    ],
].sort_values("node_name")

final_df.head(20)



Unnamed: 0,node_name,cpu_cores,embodied_carbon_kgco2e,energy_100pct_kw,energy_0pct_kw,model_manufacturer_id
227,erc-hpc-az-eduhub-testone001,4.0,N/a,N/a,N/a,N/a_N/a
228,erc-hpc-az-eduhub-testone002,4.0,N/a,N/a,N/a,N/a_N/a
229,erc-hpc-az-eduhub-testone003,4.0,N/a,N/a,N/a,N/a_N/a
230,erc-hpc-az-eduhub-testone004,4.0,N/a,N/a,N/a,N/a_N/a
231,erc-hpc-az-xand-test001,4.0,N/a,N/a,N/a,N/a_N/a
225,erc-hpc-comp001,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645
224,erc-hpc-comp002,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645
223,erc-hpc-comp003,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645
222,erc-hpc-comp004,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645
221,erc-hpc-comp005,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645


In [25]:
# Write the node table as CSV, relative to the kernel's working directory;
# index=False leaves the row-index column out of the file.
final_df.to_csv(path_or_buf="cleaned_nodes_final.csv", index=False)


In [26]:
# Per-node lookup keyed by node name. Each value is a small dict holding that node's
# core count, embodied carbon, the two power-draw fields and the model id.
# If a node name occurs more than once, the last occurrence wins.
node_carbon = {
    name: {
        "cpu_cores": cores,
        "embodied": embodied,
        "max_power_draw": max_kw,
        "min_power_draw": min_kw,
        "model_id": model_id,
    }
    for name, cores, embodied, max_kw, min_kw, model_id in zip(
        final_df["node_name"],
        final_df["cpu_cores"],
        final_df["embodied_carbon_kgco2e"],
        final_df["energy_100pct_kw"],
        final_df["energy_0pct_kw"],
        final_df["model_manufacturer_id"],
    )
}

# Example lookup:
# node_carbon["erc-hpc-comp131"]



In [None]:
# Spot-check one node's entry in the node_carbon lookup.
example_node = "erc-hpc-comp131"
print(node_carbon[example_node])


{'cpu_cores': '32.0', 'embodied': 7730.0, 'max_power_draw': 'N/a', 'min_power_draw': 'N/a', 'model_id': 'Dell_PowerEdgeR640'}


In [None]:

import pandas as pd


# Embodied-carbon workbook: location and load kept together.
# NOTE(review): absolute, machine-specific path -- this cell only runs where the file sits at exactly this location.
embodied_carbon_path = "/Users/nattybatty/HPC_Emissions_project/emissions-tools/raw_data/node-models-embodiedcarbon (1).xlsx"
embodied_df = pd.read_excel(embodied_carbon_path)

# Node-info workbook, loaded the same way (same caveat about the absolute path).
node_info_path = "/Users/nattybatty/HPC_Emissions_project/emissions-tools/raw_data/node-info-combined-2025-04-01.xlsx"
node_info_df = pd.read_excel(node_info_path)


# === Clean Embodied Carbon Data ===
# NOTE(review): the source header is deliberately spelled "Emissinos" -- the key has to match the
# workbook header character-for-character, because rename() ignores labels it cannot find and
# the column lookup below would then raise KeyError.
embodied_df = embodied_df.rename(columns={
    "Mean Emissinos (Kg CO2e)": "embodied_carbon_kgco2e"
})

# Build the per-model lookup used by the merge: exactly one row per model, no aggregation.
#   * Rows with no Model are left out (groupby drops missing keys). They identify no hardware,
#     and filling them with "N/a" would make them match every node whose own Model is missing.
#   * GroupBy.first() skips missing values, so a model that is listed more than once keeps its
#     first *known* figure rather than whichever row happens to come first.
#   * sort=False keeps the models in their order of first appearance.
embodied_mean = (
    embodied_df
    .groupby("Model", sort=False, as_index=False)["embodied_carbon_kgco2e"]
    .first()
)

# Models without any figure at all keep the "N/a" placeholder used for missing values.
embodied_mean["embodied_carbon_kgco2e"] = embodied_mean["embodied_carbon_kgco2e"].fillna("N/a")

# === Clean Node Info ===
# One chained expression; "Model" and "Manufacturer" already carry the names used below,
# so only two columns are renamed.
node_info_df = (
    node_info_df
    .rename(columns={"Name": "node_name", "CPUs": "cpu_cores"})
    .assign(
        # Missing cells become the "N/a" placeholder; no row is dropped.
        Model=lambda frame: frame["Model"].fillna("N/a"),
        Manufacturer=lambda frame: frame["Manufacturer"].fillna("N/a"),
        # "<Manufacturer>_<Model>" with every space removed from both parts.
        model_manufacturer_id=lambda frame: (
            frame["Manufacturer"].str.strip().str.replace(" ", "")
            + "_"
            + frame["Model"].str.replace(" ", "")
        ),
        # Core counts are kept as text so the column has a single type.
        # NOTE(review): the recorded output shows values such as '32.0' -- the float formatting
        # survives the conversion to str.
        cpu_cores=lambda frame: frame["cpu_cores"].fillna("N/a").astype(str),
    )
    # Only the columns needed downstream, in this order.
    [["node_name", "cpu_cores", "Model", "model_manufacturer_id"]]
)

# Left-join the embodied-carbon figure onto every node by model name (no node row is dropped).
merged_df = pd.merge(node_info_df, embodied_mean, on="Model", how="left")

# Replace missing figures with the mean of the known ones.
# The column is coerced to numbers once and that result is used for both steps: anything that is
# not a number ("N/a", an unmatched model, or any other stray text) becomes NaN, is excluded from
# the mean, and is then filled with it. This keeps the two steps consistent and leaves the column
# purely numeric.
# The mean is taken over the merged rows, so it is weighted by how many nodes use each model.
# If no figure is known at all, the mean is NaN and the column stays NaN.
embodied_numeric = pd.to_numeric(merged_df["embodied_carbon_kgco2e"], errors="coerce")
avg_carbon = embodied_numeric.mean()
merged_df["embodied_carbon_kgco2e"] = embodied_numeric.fillna(avg_carbon)

# Power-draw figures are not available here; keep the "N/a" placeholders.
merged_df["energy_100pct_kw"] = "N/a"
merged_df["energy_0pct_kw"] = "N/a"

# Final column order, rows ordered by node name.
final_df = merged_df[[
    "node_name", "cpu_cores", "embodied_carbon_kgco2e", "energy_100pct_kw", "energy_0pct_kw", "model_manufacturer_id"
]].sort_values(by=["node_name"])

final_df.head(20)

Unnamed: 0,node_name,cpu_cores,embodied_carbon_kgco2e,energy_100pct_kw,energy_0pct_kw,model_manufacturer_id
227,erc-hpc-az-eduhub-testone001,4.0,10343.072785,N/a,N/a,N/a_N/a
228,erc-hpc-az-eduhub-testone002,4.0,10343.072785,N/a,N/a,N/a_N/a
229,erc-hpc-az-eduhub-testone003,4.0,10343.072785,N/a,N/a,N/a_N/a
230,erc-hpc-az-eduhub-testone004,4.0,10343.072785,N/a,N/a,N/a_N/a
231,erc-hpc-az-xand-test001,4.0,10343.072785,N/a,N/a,N/a_N/a
225,erc-hpc-comp001,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645
224,erc-hpc-comp002,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645
223,erc-hpc-comp003,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645
222,erc-hpc-comp004,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645
221,erc-hpc-comp005,128.0,13036.0,N/a,N/a,Lenovo_ThinkSystemSR645


In [30]:
# Write the mean-imputed node table as CSV, relative to the kernel's working directory;
# index=False leaves the row-index column out of the file.
final_df.to_csv(path_or_buf="cleaned_nodes_final_with_avg.csv", index=False)