In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle

# Load data
df = pd.read_csv('carbon_neutrality_realistic_200k.csv')

# Handle missing numeric values
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Handle categorical column
if 'Strategy' in df.columns:
    df['Strategy'] = df['Strategy'].fillna('Default')
    le = LabelEncoder()
    df['Strategy'] = le.fit_transform(df['Strategy'])
else:
    # If no strategy column, create a dummy one
    df['Strategy'] = 0
    le = LabelEncoder()
    le.fit(['Default'])

# Feature engineering: add useful ratios
df['Cost_per_Tonne'] = df['Cost (USD)'] / (df['Emissions (tonnes)'] + 1e-6)
df['log_Cost'] = np.log1p(df['Cost (USD)'])
df['log_Emission'] = np.log1p(df['Emissions (tonnes)'])

# Define features and target
X = df[['Emissions (tonnes)', 'Cost (USD)', 'Strategy', 'Cost_per_Tonne', 'log_Cost', 'log_Emission']]
y = df[['Effectiveness (tonnes neutralized)']]

# Scale continuous variables to avoid cost domination
scaler = StandardScaler()
X[['Emissions (tonnes)', 'Cost (USD)', 'Cost_per_Tonne', 'log_Cost', 'log_Emission']] = scaler.fit_transform(
    X[['Emissions (tonnes)', 'Cost (USD)', 'Cost_per_Tonne', 'log_Cost', 'log_Emission']]
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=15)
model.fit(X_train, y_train)

# Predict and post-process to enforce physical constraint
y_pred = model.predict(X_test)
y_pred = np.minimum(y_pred, X_test['Emissions (tonnes)'].abs())  # enforce realism

# Save model, scaler, and label encoder
with open('model4.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('scaler4.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

with open('label_encoder4.pkl', 'wb') as le_file:
    pickle.dump(le, le_file)

print("✅ Model trained, scaled, and saved successfully with physical constraints.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['Emissions (tonnes)', 'Cost (USD)', 'Cost_per_Tonne', 'log_Cost', 'log_Emission']] = scaler.fit_transform(
  return fit_method(estimator, *args, **kwargs)


✅ Model trained, scaled, and saved successfully with physical constraints.


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle

# Load data
df = pd.read_csv('carbon_neutrality_realistic.csv')

# Handle missing numeric values
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Handle categorical column
if 'Strategy' in df.columns:
    df['Strategy'] = df['Strategy'].fillna('Default')
    le = LabelEncoder()
    df['Strategy'] = le.fit_transform(df['Strategy'])
else:
    # If no strategy column, create a dummy one
    df['Strategy'] = 0
    le = LabelEncoder()
    le.fit(['Default'])

# Feature engineering: add useful ratios
df['Cost_per_Tonne'] = df['Cost (USD)'] / (df['Emissions (tonnes)'] + 1e-6)
df['log_Cost'] = np.log1p(df['Cost (USD)'])
df['log_Emission'] = np.log1p(df['Emissions (tonnes)'])

# Define features and target
X = df[['Emissions (tonnes)', 'Cost (USD)', 'Strategy', 'Cost_per_Tonne', 'log_Cost', 'log_Emission']]
y = df[['Effectiveness (tonnes neutralized)']]

# Scale continuous variables to avoid cost domination
scaler = StandardScaler()
X[['Emissions (tonnes)', 'Cost (USD)', 'Cost_per_Tonne', 'log_Cost', 'log_Emission']] = scaler.fit_transform(
    X[['Emissions (tonnes)', 'Cost (USD)', 'Cost_per_Tonne', 'log_Cost', 'log_Emission']]
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numeric features
cols_to_scale = ['Emissions (tonnes)', 'Cost (USD)', 'Cost_per_Tonne', 'log_Cost', 'log_Emission']
X.loc[:, cols_to_scale] = scaler.fit_transform(X[cols_to_scale])

# Train model (flatten y to 1D)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train.values.ravel())


# Predict and post-process to enforce physical constraint
y_pred = model.predict(X_test)
y_pred = np.minimum(y_pred, X_test['Emissions (tonnes)'].abs())  # enforce realism

# Save model, scaler, and label encoder
with open('model4.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('scaler4.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

with open('label_encoder4.pkl', 'wb') as le_file:
    pickle.dump(le, le_file)

print("✅ Model trained, scaled, and saved successfully with physical constraints.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['Emissions (tonnes)', 'Cost (USD)', 'Cost_per_Tonne', 'log_Cost', 'log_Emission']] = scaler.fit_transform(


KeyboardInterrupt: 