In [1]:
# Scratch cell: prints a single character (presumably a kernel smoke test — safe to delete).
print('J')

J


In [9]:
import pandas as pd
import json
import numpy as np
import os
import joblib  # <-- for saving/loading
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# --- Paths ---
# All paths are relative to the notebook's working directory.
file_path = os.path.join("..", "data", "lca_dataset.csv")
imputer_path = os.path.join("..", "model", "rf_imputer.pkl")
encoders_path = os.path.join("..", "model", "label_encoders.pkl")

# Ensure models directory exists
os.makedirs(os.path.dirname(imputer_path), exist_ok=True)

# --- Load dataset ---
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset not found at {file_path}")

df_training = pd.read_csv(file_path)

# 1. Separate categorical and numeric columns
# (dtype-based split: object columns are treated as categorical)
categorical_cols = df_training.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df_training.select_dtypes(include=[np.number]).columns.tolist()

# 2. Encode categorical columns
# df_training is overwritten in place with integer codes; one encoder is kept per column
# so that autofill_lca_data can translate user input and decode imputed codes.
# NOTE(review): astype(str) turns a missing value into the literal string "nan", which
# then becomes an ordinary class — gaps in categorical columns are therefore never
# imputed, and "nan" can be returned as a category. Confirm this is intended.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_training[col] = le.fit_transform(df_training[col].astype(str))
    label_encoders[col] = le

# --- Load or Fit Imputer ---
# Both artefacts must exist to take the cached path; otherwise everything is refitted.
# NOTE(review): on the cached path `label_encoders` is replaced by the saved encoders,
# while df_training above was encoded with freshly fitted ones. The two agree only if
# the CSV has not changed since the artefacts were saved — delete the .pkl files after
# changing the dataset.
# NOTE(security): joblib files are pickles; only load artefacts this notebook produced.
if os.path.exists(imputer_path) and os.path.exists(encoders_path):
    print("Loading saved imputer and encoders...")
    imputer = joblib.load(imputer_path)
    label_encoders = joblib.load(encoders_path)
else:
    # The message below says "subset", but the fit uses the full frame
    # (the subset variant is the commented-out line).
    print("Fitting new imputer on training subset...")
    imputer = IterativeImputer( estimator=RandomForestRegressor(n_estimators=50, random_state=42)
                               , max_iter=10, 
                               random_state=0 
    )
    imputer.fit(df_training)
    # imputer.fit(df_training.sample(500, random_state=42))  # subset for faster training
    print("Saving fitted imputer and encoders...")
    joblib.dump(imputer, imputer_path)
    joblib.dump(label_encoders, encoders_path)

print("Imputer ready.")

def autofill_lca_data(json_input):
    """Fill in the missing fields of a single LCA record with the fitted imputer.

    Relies on notebook-level state defined in the setup cell: ``df_training``
    (column order), ``categorical_cols``, ``label_encoders`` and ``imputer``.

    Parameters
    ----------
    json_input : str
        JSON text of ONE object mapping column names to values. Columns that
        are absent or ``null`` are imputed.

    Returns
    -------
    pandas.DataFrame
        One row with the columns of ``df_training``. Categorical columns are
        decoded back to labels; a code outside the encoder's range is
        reported as ``"Unknown"``.

    Raises
    ------
    ValueError
        If ``json_input`` is not valid JSON (``json.JSONDecodeError`` is a
        ``ValueError``) or does not decode to a JSON object.
    """
    user_data = json.loads(json_input)
    if not isinstance(user_data, dict):
        # A list or scalar would yield a frame without any recognised column, and
        # every field would be imputed silently while the user's data was ignored.
        raise ValueError("json_input must decode to a single JSON object")
    df_user = pd.DataFrame([user_data])

    # --- Simple rule-based defaults ---
    if "Process Stage" in df_user.columns and df_user["Process Stage"].iloc[0] == "End-of-Life":
        if "End-of-Life Treatment" not in df_user or pd.isna(df_user["End-of-Life Treatment"]).any():
            df_user["End-of-Life Treatment"] = "Recycling"

    # Align columns: same set and order as the frame the imputer was fitted on.
    df_user = df_user.reindex(columns=df_training.columns, fill_value=np.nan)

    # Encode categorical columns. Missing or unseen labels become NaN so that the
    # imputer fills them in.
    for col in categorical_cols:
        if col not in df_user.columns:
            continue
        code_of = {label: code for code, label in enumerate(label_encoders[col].classes_)}
        # The encoders were fitted on astype(str) values, so compare stringified input;
        # otherwise a numeric category supplied as a number could never match.
        df_user[col] = df_user[col].map(
            lambda value, code_of=code_of: code_of.get(str(value), np.nan)
            if pd.notna(value)
            else np.nan
        ).astype(float)

    print("Starting imputation for user data...")
    imputed_array = imputer.transform(df_user)
    print("Imputation complete.")

    df_imputed = pd.DataFrame(imputed_array, columns=df_user.columns)

    # Decode categorical columns: the imputer returns floats, so round to the nearest code.
    for col in categorical_cols:
        valid_classes = label_encoders[col].classes_
        codes = df_imputed[col].round().astype(int)
        # Check both bounds: a negative code would otherwise index from the end of
        # classes_ and silently return the wrong label.
        df_imputed[col] = codes.map(
            lambda code, valid_classes=valid_classes: valid_classes[code]
            if 0 <= code < len(valid_classes)
            else "Unknown"
        )

    return df_imputed


Fitting new imputer on training subset...
Saving fitted imputer and encoders...
Imputer ready.


In [3]:

# Example usage
user_json = '''
{
  "Process Stage": "Use",
  "Technology": "Advanced",
  "Location": "Europe",
  "Raw Material Quantity (kg or unit)": null,
  "Energy Input Quantity (MJ)": null,
  "Transport Distance (km)": 500,
  "Emissions to Air CO2 (kg)": null
}
'''

final_df = autofill_lca_data(user_json)
print("Final Autofilled DataFrame:")
print(final_df)

Starting imputation for user data...
Imputation complete.
Final Autofilled DataFrame:
  Process Stage Technology Time Period Location       Functional Unit  \
0           Use   Advanced   2015-2019   Europe  1 m2 Aluminium Panel   

  Raw Material Type  Raw Material Quantity (kg or unit) Energy Input Type  \
0   Aluminium Scrap                          999.995117       Natural Gas   

   Energy Input Quantity (MJ) Processing Method  ... GHG_per_Material  \
0                13757.681641          Advanced  ...         4.929952   

   Time_Period_Numeric   Total_Cost Circularity_Score Circular_Economy_Index  \
0           2016.92749  2165.173828            40.173               0.400002   

   Recycled Content (%)  Resource Efficiency (%)  \
0             10.052256                 11.21825   

   Extended Product Life (years)  Recovery Rate (%)  Reuse Potential (%)  
0                      27.848068          11.937099             0.611402  

[1 rows x 45 columns]


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- Load dataset ---
# Re-reads the CSV so this cell starts from raw data; this rebinds the notebook-level
# df_training / categorical_cols / numeric_cols / label_encoders set by the earlier cell.
file_path = os.path.join("..", "data", "lca_dataset.csv")
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset not found at {file_path}")

df_training = pd.read_csv(file_path)

# --- Separate categorical and numeric columns ---
# numeric_cols is computed BEFORE encoding, so it lists only the originally numeric columns.
categorical_cols = df_training.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df_training.select_dtypes(include=[np.number]).columns.tolist()

# --- Encode categorical columns ---
# Object columns are replaced in place by integer codes so the whole frame can be
# handed to the imputer.
# NOTE(review): astype(str) maps missing values to the literal string "nan", which is
# then encoded as a regular class — confirm that is acceptable for the evaluation.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_training[col] = le.fit_transform(df_training[col].astype(str))
    label_encoders[col] = le

# --- Function to evaluate imputer ---
def evaluate_imputer(df, numeric_cols, sample_frac=0.2, mask_fraction=0.2,
                     sample_size=2000, random_state=42):
    """
    Evaluates IterativeImputer performance by masking known values and imputing them.

    For every column in ``numeric_cols`` a share of the known values in a row
    sample of ``df`` is hidden, an XGBoost-backed IterativeImputer is fitted on
    the masked sample, and the imputed values are scored against the hidden ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed whole to the imputer, so every column is expected to be
        numeric already (categoricals encoded). It is not modified.
    numeric_cols : iterable of str
        Columns to evaluate.
    sample_frac : float, default 0.2
        Fraction of rows to sample. Used only when ``sample_size`` is None.
    mask_fraction : float, default 0.2
        Share of each column's known values that is hidden and re-imputed.
    sample_size : int or None, default 2000
        Number of rows to sample, capped at ``len(df)``.
    random_state : int, default 42
        Seed for both the row sample and the choice of hidden cells.

    Returns
    -------
    pandas.DataFrame
        Columns ``Column``, ``RMSE``, ``R2``, sorted by ``R2`` descending.
        Empty (with those columns) when no column could be evaluated.

    Raises
    ------
    ValueError
        If the imputer returns fewer columns than it was given.

    Notes
    -----
    ``XGBRegressor`` must already be in the notebook namespace (it is imported
    in an earlier cell, not in this one).
    """
    result_columns = ["Column", "RMSE", "R2"]
    if len(df) == 0:
        return pd.DataFrame(columns=result_columns)

    # Work on a sample for speed. It is drawn once: with a fixed seed it would be
    # the same for every column anyway.
    if sample_size is None:
        df_sample = df.sample(frac=sample_frac, random_state=random_state)
    else:
        df_sample = df.sample(n=min(sample_size, len(df)), random_state=random_state)

    rng = np.random.default_rng(random_state)
    results = []
    for col in numeric_cols:
        df_eval = df_sample.copy()
        # Integer columns cannot hold NaN; make the column float before masking.
        df_eval[col] = df_eval[col].astype(float)

        known_pos = np.flatnonzero(df_eval[col].notna().to_numpy())
        if len(known_pos) < 5:
            continue  # skip columns with too few non-null values
        n_masked = int(mask_fraction * len(known_pos))
        if n_masked < 1:
            continue  # nothing would be hidden, so there is nothing to score

        # Track the hidden cells by POSITION. The imputer returns a bare array, so
        # the sample's index labels cannot be used to look the predictions up.
        masked_pos = rng.choice(known_pos, size=n_masked, replace=False)
        col_pos = df_eval.columns.get_loc(col)
        true_values = df_eval.iloc[masked_pos, col_pos].to_numpy(copy=True)
        df_eval.iloc[masked_pos, col_pos] = np.nan

        # Fit imputer on this masked dataset
        imputer = IterativeImputer(
            estimator=XGBRegressor(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                n_jobs=-1,
                tree_method="hist",
                random_state=42
            ),
            max_iter=10,
            random_state=0
        )
        imputed_array = imputer.fit_transform(df_eval)
        if imputed_array.shape[1] != df_eval.shape[1]:
            # Positions would no longer line up with the input columns.
            raise ValueError(
                "The imputer returned a different number of columns than it was given "
                "(columns that are entirely missing are dropped); remove such columns first."
            )
        predicted_values = imputed_array[masked_pos, col_pos]

        rmse = np.sqrt(mean_squared_error(true_values, predicted_values))
        r2 = r2_score(true_values, predicted_values)
        results.append({"Column": col, "RMSE": rmse, "R2": r2})

    if not results:
        # sort_values("R2") on a column-less frame would raise KeyError.
        return pd.DataFrame(columns=result_columns)
    results_df = pd.DataFrame(results).sort_values("R2", ascending=False)
    return results_df

# --- Run evaluation ---
metrics_df = evaluate_imputer(df_training, numeric_cols, sample_frac=0.5)
print("=== Imputer Evaluation Metrics ===")
print(metrics_df)

# --- Plot true vs. imputed for best column ---
if not metrics_df.empty:
    # metrics_df is sorted by R2 descending, so the first row is the best column.
    best_col = metrics_df.iloc[0]["Column"]
    print(f"\nPlotting True vs Imputed for best-performing column: {best_col}")

    # Mask and re-impute just for plotting
    df_eval = df_training.sample(frac=0.5, random_state=42).copy()
    mask = df_eval[best_col].notna()
    # Seeded so the figure is the same on every run.
    mask_idx = np.random.default_rng(42).choice(
        df_eval[mask].index, size=int(0.2 * mask.sum()), replace=False
    )
    true_values = df_eval.loc[mask_idx, best_col]
    df_eval.loc[mask_idx, best_col] = np.nan

    # NOTE(review): the metrics above come from an XGBoost-backed imputer, while this
    # plot uses a random forest — confirm the mismatch is intended.
    imputer = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=50, random_state=42),
        max_iter=10,
        random_state=0
    )
    imputed_array = imputer.fit_transform(df_eval)
    # Keep the sample's index: mask_idx holds index LABELS, and with the default
    # RangeIndex .loc would read unrelated rows (or raise KeyError).
    df_imputed = pd.DataFrame(imputed_array, columns=df_eval.columns, index=df_eval.index)
    predicted_values = df_imputed.loc[mask_idx, best_col]

    plt.figure(figsize=(6, 6))
    plt.scatter(true_values, predicted_values, alpha=0.5)
    plt.xlabel("True Values")
    plt.ylabel("Imputed Values")
    plt.title(f"True vs. Imputed: {best_col}")
    # Dashed diagonal = perfect imputation.
    plt.plot([true_values.min(), true_values.max()],
             [true_values.min(), true_values.max()],
             color='red', linestyle='--')
    plt.show()
else:
    print("No numeric columns available for evaluation.")


In [None]:
# # For maximum speed (20-30 seconds):
# chunk_size = 10000  # Larger chunks
# max_cols_to_eval = 3  # Evaluate fewer columns

# # For better quality (2-3 minutes):
# chunk_size = 3000  # Smaller chunks
# max_iter = 5  # More iterations

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score
from concurrent.futures import ProcessPoolExecutor, as_completed
import time
import warnings
warnings.filterwarnings('ignore')

# --- Configuration for different dataset sizes ---
class ImputationConfig:
    """Imputation settings chosen from the number of rows in the dataset.

    Three tiers are distinguished by ``n_rows``: below 5000, below 15000, and
    everything else. Each tier fixes ``eval_sample_size``, ``max_iter``,
    ``n_jobs`` and the ``estimator`` given to the iterative imputer.
    """
    def __init__(self, n_rows):
        self.n_rows = n_rows

        # Each tier is (eval_sample_size, max_iter, n_jobs, estimator).
        if n_rows < 5000:
            # Small data: the evaluation sample can never exceed the data itself.
            tier = (min(500, n_rows), 5, 4, BayesianRidge())
        elif n_rows < 15000:
            tier = (300, 3, 6, BayesianRidge())
        else:
            # 15000 rows and more: fewest iterations, smallest evaluation sample.
            tier = (200, 2, 8, BayesianRidge(alpha_1=1e-06, lambda_1=1e-06))

        self.eval_sample_size, self.max_iter, self.n_jobs, self.estimator = tier

# --- Load dataset ---
file_path = os.path.join("..", "data", "lca_dataset.csv")
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset not found at {file_path}")

print("Loading dataset...")
df_training = pd.read_csv(file_path)
print(f"Dataset shape: {df_training.shape}")

# Initialize configuration based on dataset size
config = ImputationConfig(len(df_training))
print(f"Configuration: sample_size={config.eval_sample_size}, max_iter={config.max_iter}, n_jobs={config.n_jobs}")

# --- Separate categorical and numeric columns ---
# numeric_cols is taken BEFORE encoding, so it holds only the originally numeric columns.
categorical_cols = df_training.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df_training.select_dtypes(include=[np.number]).columns.tolist()

print(f"Numeric columns: {len(numeric_cols)}, Categorical columns: {len(categorical_cols)}")

# --- Encode categorical columns ---
print("Encoding categorical columns...")
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Store plain integer codes, as the earlier cells do. fit_transform returns a NumPy
    # array, and ndarray.astype('category') raises TypeError ('category' is a pandas
    # dtype, not a NumPy one), so the previous form could not run.
    df_training[col] = le.fit_transform(df_training[col].astype(str))
    label_encoders[col] = le

# --- Identify columns with missing values ---
missing_cols = [col for col in numeric_cols if df_training[col].isnull().any()]
complete_cols = [col for col in numeric_cols if not df_training[col].isnull().any()]
print(f"Columns with missing values: {len(missing_cols)}")
print(f"Complete columns: {len(complete_cols)}")

# --- Fast correlation-based feature selection ---
def select_correlated_features(df, target_col, n_features=10):
    """Select most correlated features for imputation.

    Ranks every other column of ``df`` by the absolute value of its
    correlation (``Series.corr`` default) with ``target_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``target_col`` and the candidate columns.
    target_col : str
        Column that is going to be imputed.
    n_features : int, default 10
        Maximum number of column names to return.

    Returns
    -------
    list of str
        Up to ``n_features`` column names, strongest correlation first.
        Skipped: columns whose non-null count is not above 50% of the rows,
        columns that cannot be correlated with the target, and columns whose
        correlation is NaN.
    """
    target = df[target_col]
    # Candidates need strictly more than 50% non-null values.
    min_non_null = len(df) * 0.5

    correlations = {}
    for col in df.columns:
        if col == target_col:
            continue
        candidate = df[col]
        if candidate.notna().sum() <= min_non_null:
            continue
        try:
            corr = target.corr(candidate)
            if not np.isnan(corr):
                correlations[col] = abs(corr)
        except (TypeError, ValueError):
            # Column cannot be treated as numeric (e.g. free text): not a usable
            # predictor. Anything else — including KeyboardInterrupt — propagates.
            continue

    # sorted() is stable, so ties keep their original column order.
    ranked = sorted(correlations, key=correlations.get, reverse=True)
    return ranked[:n_features]

# --- Hybrid imputation strategy ---
def hybrid_impute(df, numeric_cols, config):
    """
    Hybrid approach: SimpleImputer for initial fill, then IterativeImputer for refinement.

    Step 1 fills every gap in ``numeric_cols`` with the column median.
    Step 2 re-imputes the (at most five) columns with the highest share of
    missing values above 10%, each from its most correlated columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    numeric_cols : list of str
        Columns to impute.
    config : object
        Must provide ``estimator`` and ``max_iter`` (see ImputationConfig).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the gaps in ``numeric_cols`` filled.
    """
    print("\nStarting hybrid imputation...")
    df_imputed = df.copy()

    # Step 1: Quick initial imputation with SimpleImputer
    print("Step 1: Initial simple imputation...")
    simple_imputer = SimpleImputer(strategy='median')
    df_imputed[numeric_cols] = simple_imputer.fit_transform(df_imputed[numeric_cols])

    # Step 2: Identify columns that need refinement (high missing %), most missing
    # first so that the cap below really keeps the worst columns.
    missing_percentages = df[numeric_cols].isnull().sum() / len(df)
    cols_to_refine = (
        missing_percentages[missing_percentages > 0.1]
        .sort_values(ascending=False, kind="stable")
        .index.tolist()
    )

    if cols_to_refine:
        print(f"Step 2: Refining {len(cols_to_refine)} columns with >10% missing values...")

        # Use IterativeImputer only on columns that need it
        for col in cols_to_refine[:5]:  # Limit to top 5 most missing columns for speed
            print(f"  Refining {col} ({missing_percentages[col]:.1%} missing)...")

            # Predictors: the columns most correlated with this one, target placed last.
            selected_features = [
                feature
                for feature in select_correlated_features(df, col, n_features=10)
                if feature != col
            ]
            selected_features.append(col)

            # Predictors come from the median-filled frame, but the target must get its
            # original gaps back. Otherwise the iterative imputer sees a complete matrix
            # and hands the median fill back unchanged.
            subset_df = df_imputed[selected_features].copy()
            subset_df[col] = df[col].to_numpy()

            iterative_imputer = IterativeImputer(
                estimator=config.estimator,
                max_iter=config.max_iter,
                initial_strategy='median',
                random_state=42
            )

            imputed_subset = iterative_imputer.fit_transform(subset_df)
            # The target was appended last, so it is the last output column.
            df_imputed[col] = imputed_subset[:, -1]

    return df_imputed

# --- Optimized evaluation for large datasets ---
def evaluate_imputer_large_dataset(df, missing_cols, config, max_cols_to_eval=10):
    """
    Evaluation optimized for large datasets.

    Hides some known values of each evaluated column in a row sample, fills
    them with the column median (SimpleImputer) and scores the fill against
    the hidden values. The result is therefore a median-baseline score.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to evaluate on; it is not modified.
    missing_cols : list of str
        Candidate columns; only the first ``max_cols_to_eval`` are evaluated.
    config : object
        Must provide ``eval_sample_size`` (rows sampled, capped at ``len(df)``).
    max_cols_to_eval : int, default 10
        Upper bound on the number of evaluated columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``Column``, ``Missing %``, ``RMSE``, ``R2``, sorted by ``R2``
        descending. Empty (with those columns) when no column had at least 20
        known values in the sample.
    """
    print(f"\nEvaluating imputation quality on sample of {config.eval_sample_size} rows...")
    result_columns = ["Column", "Missing %", "RMSE", "R2"]

    # Sample once for all evaluations
    df_sample = df.sample(n=min(config.eval_sample_size, len(df)), random_state=42)

    # Limit evaluation to most important columns
    cols_to_evaluate = missing_cols[:max_cols_to_eval]

    # Seeded so that repeated runs hide the same cells.
    rng = np.random.default_rng(42)

    results = []
    for i, col in enumerate(cols_to_evaluate, 1):
        print(f"Evaluating {i}/{len(cols_to_evaluate)}: {col}")

        known_pos = np.flatnonzero(df_sample[col].notna().to_numpy())
        if len(known_pos) < 20:
            continue

        # Create test set. Cells are tracked by POSITION: the imputer returns a bare
        # array, which must not be indexed with the sample's index labels.
        df_test = df_sample.copy()
        test_size = min(50, int(0.2 * len(known_pos)))
        test_pos = rng.choice(known_pos, size=test_size, replace=False)
        col_pos = df_test.columns.get_loc(col)
        true_values = df_test.iloc[test_pos, col_pos].to_numpy(copy=True)
        df_test.iloc[test_pos, col_pos] = np.nan

        # Quick imputation for evaluation
        simple_imputer = SimpleImputer(strategy='median')
        imputed_values = simple_imputer.fit_transform(df_test[[col]])
        predicted_values = imputed_values[test_pos, 0]

        rmse = np.sqrt(mean_squared_error(true_values, predicted_values))
        r2 = r2_score(true_values, predicted_values) if len(true_values) > 1 else 0

        results.append({
            "Column": col,
            "Missing %": f"{df[col].isnull().sum() / len(df) * 100:.1f}%",
            "RMSE": rmse,
            "R2": r2
        })

    if not results:
        # sort_values("R2") on a column-less frame would raise KeyError.
        return pd.DataFrame(columns=result_columns)
    return pd.DataFrame(results).sort_values("R2", ascending=False)

# --- Chunked imputation for very large datasets ---
def chunked_impute(df, numeric_cols, chunk_size=5000):
    """
    Process dataset in chunks for memory efficiency.

    Gaps in ``numeric_cols`` are filled with the column median of the WHOLE
    frame, so the filled value does not depend on which chunk a row falls
    into, and a chunk in which a column is entirely missing is still handled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    numeric_cols : list of str
        Columns to fill. They are returned as float.
    chunk_size : int, default 5000
        Number of rows per chunk; must be positive.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the gaps filled and a fresh 0..n-1 index. A column
        with no known value at all stays missing.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    print(f"\nProcessing dataset in chunks of {chunk_size} rows...")
    cols = list(numeric_cols)
    n_rows = len(df)
    if n_rows == 0:
        # pd.concat([]) would raise; an empty frame needs no imputation.
        return df.copy().reset_index(drop=True)

    # One median per column, computed on the full frame and reused for every chunk.
    medians = df[cols].median() if cols else None
    n_chunks = (n_rows - 1) // chunk_size + 1

    df_imputed_list = []
    for i in range(n_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, n_rows)
        print(f"Processing chunk {i+1}/{n_chunks} (rows {start_idx}-{end_idx})...")

        chunk = df.iloc[start_idx:end_idx].copy()

        # Simple imputation for each chunk
        if cols:
            chunk[cols] = chunk[cols].fillna(medians).astype(float)
        df_imputed_list.append(chunk)

    return pd.concat(df_imputed_list, ignore_index=True)

# --- Main execution ---
# Picks one of three imputation strategies from the row count and times it.
# Produces the notebook-level names df_imputed and imputation_time used further down.
print("\n" + "="*60)
print("OPTIMIZED IMPUTATION FOR LARGE DATASET")
print("="*60)

start_time = time.time()

# Choose strategy based on dataset size
# NOTE(review): the cut-offs here (20000 / 10000) differ from those inside
# ImputationConfig (5000 / 15000), so `config` may come from a different tier than
# the branch taken here — confirm that is intended.
if len(df_training) > 20000:
    # The banner says "25,000+ rows", but the branch is taken from 20,001 rows upwards.
    print("\n🚀 Using FAST MODE for large dataset (25,000+ rows)")
    print("Strategy: Chunked processing + Simple imputation")
    
    # Option 1: Chunked simple imputation (fastest)
    df_imputed = chunked_impute(df_training, numeric_cols, chunk_size=5000)
    
elif len(df_training) > 10000:
    print("\n⚡ Using BALANCED MODE for medium-large dataset")
    print("Strategy: Hybrid imputation")
    
    # Option 2: Hybrid imputation
    df_imputed = hybrid_impute(df_training, numeric_cols, config)
    
else:
    print("\n🎯 Using QUALITY MODE for smaller dataset")
    print("Strategy: Full iterative imputation")
    
    # Option 3: Full iterative imputation (best quality, slower)
    # Only the numeric columns are given to the imputer; the encoded categorical
    # columns are not used as predictors in this branch.
    imputer = IterativeImputer(
        estimator=config.estimator,
        max_iter=config.max_iter,
        random_state=42
    )
    df_imputed = df_training.copy()
    df_imputed[numeric_cols] = imputer.fit_transform(df_imputed[numeric_cols])

# Wall-clock duration of the branch above (the banner prints are included).
imputation_time = time.time() - start_time
print(f"\n✅ Imputation completed in {imputation_time:.2f} seconds")

# --- Quick evaluation ---
# Scores at most the first five columns that had gaps.
# NOTE(review): evaluate_imputer_large_dataset is run on df_training and fills the hidden
# cells with a column median, so these metrics describe a median baseline rather than
# the strategy that produced df_imputed above.
if missing_cols:
    print("\nRunning quick evaluation...")
    metrics_df = evaluate_imputer_large_dataset(df_training, missing_cols, config, max_cols_to_eval=5)
    print("\n=== Top 5 Imputation Results ===")
    print(metrics_df)

# --- Memory-efficient visualization ---
# One panel per column for the first three columns with gaps: hide a few known values,
# fill them with the column median, and plot true against filled values.
print("\nGenerating visualization...")
if len(missing_cols) >= 3:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # Use very small sample for visualization. It is capped at the number of rows
    # (sample(n=...) raises when n exceeds it) and drawn once, since the seeded sample
    # is the same for every panel.
    viz_sample_size = min(100, len(df_training))
    df_viz = df_training.sample(n=viz_sample_size, random_state=42)

    for idx, ax in enumerate(axes[:3]):
        col = missing_cols[idx]
        mask = df_viz[col].notna()

        # A panel is left empty when the sample holds 10 or fewer known values.
        if mask.sum() > 10:
            # Compare simple imputation results
            df_test = df_viz.copy()
            # Seeded so the figure is reproducible from run to run.
            test_idx = mask[mask].sample(n=min(20, mask.sum()//2), random_state=42).index
            true_values = df_test.loc[test_idx, col]
            df_test.loc[test_idx, col] = np.nan

            simple_imputer = SimpleImputer(strategy='median')
            imputed = simple_imputer.fit_transform(df_test[[col]])
            # Re-attach the index so the held-out rows can be selected by label.
            predicted_values = pd.Series(imputed[:, 0], index=df_test.index).loc[test_idx]

            ax.scatter(true_values, predicted_values, alpha=0.6, s=30)
            # Dashed diagonal = perfect imputation.
            ax.plot([true_values.min(), true_values.max()],
                    [true_values.min(), true_values.max()],
                    'r--', alpha=0.5)
            ax.set_xlabel("True Values", fontsize=9)
            ax.set_ylabel("Imputed Values", fontsize=9)
            ax.set_title(f"{col[:20]}...\nMissing: {df_training[col].isnull().sum()}", fontsize=10)
            ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# --- Final statistics, optional save, and performance summary ---
print("\n" + "="*60)
print("FINAL STATISTICS")
print("="*60)

# Missing value comparison
# df_training's numeric columns are not modified by the imputation above, so they
# still show the original gaps.
print("\nMissing values BEFORE imputation:")
before_missing = df_training[numeric_cols].isnull().sum()
print(before_missing[before_missing > 0].head(10))

print("\nMissing values AFTER imputation:")
after_missing = df_imputed[numeric_cols].isnull().sum()
print(f"Total: {after_missing.sum()} (should be 0)")

# Save option
# NOTE(review): input() blocks "Restart & Run All" until someone answers; consider a
# configuration flag instead.
save_response = input("\n💾 Save imputed dataset? (y/n): ")
if save_response.lower() == 'y':
    output_path = os.path.join("..", "data", "lca_dataset_imputed_fast.csv")
    
    # Decode categorical columns
    # Turns the integer codes back into the original labels before writing the CSV.
    df_save = df_imputed.copy()
    for col in categorical_cols:
        if col in label_encoders:
            df_save[col] = label_encoders[col].inverse_transform(
                df_imputed[col].astype(int)
            )
    
    df_save.to_csv(output_path, index=False)
    print(f"✅ Saved to: {output_path}")
    print(f"   File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")

# Performance summary
# NOTE(review): the two rate lines divide by imputation_time and by the row count; a
# zero duration or an empty dataset would raise ZeroDivisionError.
print("\n" + "="*60)
print("PERFORMANCE SUMMARY")
print("="*60)
print(f"Dataset size: {len(df_training):,} rows × {len(df_training.columns)} columns")
print(f"Imputation time: {imputation_time:.2f} seconds")
print(f"Speed: {len(df_training) / imputation_time:.0f} rows/second")
print(f"Estimated time for 25,000 rows: {25000 / (len(df_training) / imputation_time):.1f} seconds")