In [None]:

import os
import pandas as pd
import numpy as np
import joblib
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import warnings

warnings.filterwarnings("ignore")

# Paths
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "data")
MODEL_DIR = os.path.join(BASE_DIR, "model")

os.makedirs(MODEL_DIR, exist_ok=True)


In [None]:

DATA_PATH = os.path.join(DATA_DIR, "lca_dataset.csv")

df = pd.read_csv(DATA_PATH)
print("✅ Dataset Loaded. Shape:", df.shape)

# Preview
df.head()


✅ Dataset Loaded. Shape: (25000, 45)


Unnamed: 0,Process Stage,Technology,Time Period,Location,Functional Unit,Raw Material Type,Raw Material Quantity (kg or unit),Energy Input Type,Energy Input Quantity (MJ),Processing Method,...,GHG_per_Material,Time_Period_Numeric,Total_Cost,Circularity_Score,Circular_Economy_Index,Recycled Content (%),Resource Efficiency (%),Extended Product Life (years),Recovery Rate (%),Reuse Potential (%)
0,Transport,Conventional,2020-2025,South America,1 kg Copper Wire,Aluminium Scrap,1000,Electricity,2289.61,Conventional,...,1.08,2023,3763.25,50.66,0.51,74.52,74.77,26.8,9.0,0.09
1,Use,Emerging,2015-2019,Asia,1 m2 Aluminium Panel,Aluminium Ore,500,Electricity,7368.72,Emerging,...,1.05,2017,2063.83,26.93,0.27,10.0,11.83,46.4,93.64,25.18
2,Manufacturing,Advanced,2020-2025,North America,1 kg Copper Wire,Aluminium Scrap,1000,Coal,1586.35,Advanced,...,1.65,2023,2613.4,99.3,0.99,81.46,82.33,101.6,66.04,73.52
3,Use,Emerging,2010-2014,North America,1 m2 Aluminium Panel,Aluminium Ore,500,Natural Gas,7448.21,Emerging,...,6.46,2023,1995.41,23.49,0.23,10.0,12.3,69.3,85.74,39.43
4,Use,Conventional,2015-2019,South America,1 kg Aluminium Sheet,Aluminium Scrap,1000,Coal,1470.09,Conventional,...,2.13,2017,2565.81,100.0,1.0,75.85,76.84,23.1,62.67,90.2


In [4]:
missing_summary = df.isnull().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)
print("Columns with Missing Values:\n")
print(missing_summary)

Columns with Missing Values:

Series([], dtype: int64)


In [5]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

print("Categorical Columns:", categorical_cols)
print("Numeric Columns:", numeric_cols)

Categorical Columns: ['Process Stage', 'Technology', 'Time Period', 'Location', 'Functional Unit', 'Raw Material Type', 'Energy Input Type', 'Processing Method', 'Transport Mode', 'Fuel Type', 'Metal Quality Grade', 'Material Scarcity Level', 'End-of-Life Treatment']
Numeric Columns: ['Raw Material Quantity (kg or unit)', 'Energy Input Quantity (MJ)', 'Transport Distance (km)', 'Material Cost (USD)', 'Processing Cost (USD)', 'Emissions to Air CO2 (kg)', 'Emissions to Air SOx (kg)', 'Emissions to Air NOx (kg)', 'Emissions to Air Particulate Matter (kg)', 'Emissions to Water Acid Mine Drainage (kg)', 'Emissions to Water Heavy Metals (kg)', 'Emissions to Water BOD (kg)', 'Greenhouse Gas Emissions (kg CO2-eq)', 'Scope 1 Emissions (kg CO2-eq)', 'Scope 2 Emissions (kg CO2-eq)', 'Scope 3 Emissions (kg CO2-eq)', 'Environmental Impact Score', 'Metal Recyclability Factor', 'Energy_per_Material', 'Total_Air_Emissions', 'Total_Water_Emissions', 'Transport_Intensity', 'GHG_per_Material', 'Time_Peri

In [6]:
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

print("✅ All categorical columns encoded.")

✅ All categorical columns encoded.


In [9]:
from xgboost import XGBRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Define the estimator for the imputer
xgb_estimator = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    tree_method="hist"  # faster and memory efficient
)

# Define the imputer
imputer = IterativeImputer(
    estimator=xgb_estimator,
    max_iter=10,
    random_state=42,
    verbose=2
)

# Fit on training data
imputer.fit(df_train)

[IterativeImputer] Completing matrix with shape (20000, 45)
[IterativeImputer] Ending imputation round 1/10, elapsed time 60.86
[IterativeImputer] Change: 0.0, scaled tolerance: 17.992720000000002 
[IterativeImputer] Early stopping criterion reached.


In [10]:
# --- Cell 8: Evaluate imputer performance on validation set ---
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

# Make a copy of validation data and artificially remove some values to test imputation accuracy
df_val_copy = df_val.copy()
mask = np.random.rand(*df_val_copy.shape) < 0.1  # randomly remove 10% of values
df_val_masked = df_val_copy.mask(mask)

# Impute the masked dataset
df_val_imputed = pd.DataFrame(imputer.transform(df_val_masked), columns=df_val.columns)

# Compare only numeric columns
numeric_cols = df_val.select_dtypes(include=[np.number]).columns

mae_scores = {}
r2_scores = {}
for col in numeric_cols:
    true_vals = df_val_copy[col].values
    pred_vals = df_val_imputed[col].values
    mae = mean_absolute_error(true_vals, pred_vals)
    r2 = r2_score(true_vals, pred_vals)
    mae_scores[col] = mae
    r2_scores[col] = r2

print("📊 Imputer Evaluation Results:")
for col in numeric_cols:
    print(f"{col:<40} | MAE: {mae_scores[col]:.4f} | R²: {r2_scores[col]:.4f}")


[IterativeImputer] Completing matrix with shape (5000, 45)
[IterativeImputer] Ending imputation round 1/1, elapsed time 0.40
📊 Imputer Evaluation Results:
Process Stage                            | MAE: 0.1106 | R²: 0.9105
Technology                               | MAE: 0.0112 | R²: 0.9739
Time Period                              | MAE: 0.0667 | R²: 0.9037
Location                                 | MAE: 0.0947 | R²: 0.9049
Functional Unit                          | MAE: 0.0129 | R²: 0.9815
Raw Material Type                        | MAE: 0.0918 | R²: 0.9245
Raw Material Quantity (kg or unit)       | MAE: 0.9363 | R²: 0.9932
Energy Input Type                        | MAE: 0.0206 | R²: 0.9766
Energy Input Quantity (MJ)               | MAE: 67.0580 | R²: 0.9894
Processing Method                        | MAE: 0.0110 | R²: 0.9740
Transport Mode                           | MAE: 0.0152 | R²: 0.9705
Transport Distance (km)                  | MAE: 10.4977 | R²: 0.9693
Fuel Type                  

In [11]:
# --- Cell 9: Save trained imputer & encoders ---
import joblib
import os

MODEL_DIR = os.path.join(BASE_DIR, "model")
os.makedirs(MODEL_DIR, exist_ok=True)

IMPUTER_PATH = os.path.join(MODEL_DIR, "xgb_imputer.pkl")
ENCODERS_PATH = os.path.join(MODEL_DIR, "label_encoders.pkl")

joblib.dump(imputer, IMPUTER_PATH)
joblib.dump(label_encoders, ENCODERS_PATH)

print(f"✅ Imputer saved at: {IMPUTER_PATH}")
print(f"✅ Encoders saved at: {ENCODERS_PATH}")


✅ Imputer saved at: c:\Users\ommah\Python_om_eng\Git\ml_Alloyance\model\xgb_imputer.pkl
✅ Encoders saved at: c:\Users\ommah\Python_om_eng\Git\ml_Alloyance\model\label_encoders.pkl


In [13]:
# --- Cell 10: Reload & quick test ---
imputer_loaded = joblib.load(IMPUTER_PATH)
encoders_loaded = joblib.load(ENCODERS_PATH)

# Sample autofilling of row data with missing values
sample_test = df_val.sample(1).copy()
sample_test.iloc[0, :5] = np.nan  # intentionally wipe out some values

print("Before Imputation:")
display(sample_test)

sample_imputed = pd.DataFrame(imputer_loaded.transform(sample_test), columns=df_val.columns)

print("After Imputation:")
display(sample_imputed)


Before Imputation:


Unnamed: 0,Process Stage,Technology,Time Period,Location,Functional Unit,Raw Material Type,Raw Material Quantity (kg or unit),Energy Input Type,Energy Input Quantity (MJ),Processing Method,...,GHG_per_Material,Time_Period_Numeric,Total_Cost,Circularity_Score,Circular_Economy_Index,Recycled Content (%),Resource Efficiency (%),Extended Product Life (years),Recovery Rate (%),Reuse Potential (%)
7930,,,,,,0,1000,2,13840.42,2,...,8.81,2023,4213.79,0.0,0.0,10.0,6.68,41.8,4.17,0.56


[IterativeImputer] Completing matrix with shape (1, 45)
[IterativeImputer] Ending imputation round 1/1, elapsed time 0.01
After Imputation:


Unnamed: 0,Process Stage,Technology,Time Period,Location,Functional Unit,Raw Material Type,Raw Material Quantity (kg or unit),Energy Input Type,Energy Input Quantity (MJ),Processing Method,...,GHG_per_Material,Time_Period_Numeric,Total_Cost,Circularity_Score,Circular_Economy_Index,Recycled Content (%),Resource Efficiency (%),Extended Product Life (years),Recovery Rate (%),Reuse Potential (%)
0,1.833018,2.015562,1.114949,1.486339,0.992329,0.0,1000.0,2.0,13840.42,2.0,...,8.81,2023.0,4213.79,0.0,0.0,10.0,6.68,41.8,4.17,0.56
