In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize
import pickle
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('carbon_neutrality_200k.csv')

In [3]:
print(df.head())

   Emissions (tonnes)    Cost (USD)            Strategy  \
0        13315.900854  3.907805e+05  Soil Sequestration   
1         7056.720462  2.880907e+05    Renewable Energy   
2        15485.951516  7.673180e+05    Renewable Energy   
3        37161.548247  1.779166e+06       Afforestation   
4         6411.487776  1.555803e+05  Direct Air Capture   

   Effectiveness (tonnes neutralized)  
0                         7820.693527  
1                         5962.250833  
2                        14922.907948  
3                        25888.230678  
4                         7121.138662  


In [4]:
# Check for NaN values
print(df.isna().sum())

Emissions (tonnes)                    0
Cost (USD)                            0
Strategy                              0
Effectiveness (tonnes neutralized)    0
dtype: int64


In [5]:
# Handle missing values in the categorical column
if 'Strategy' in df.columns:
    # Impute missing values in the categorical column with the most frequent value
    most_frequent_strategy = df['Strategy'].mode()[0]
    df['Strategy'] = df['Strategy'].fillna(most_frequent_strategy)

In [6]:
# Handle missing values in numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

In [7]:
# Re-check for NaN values to ensure they've been handled
print(df.isna().sum())

Emissions (tonnes)                    0
Cost (USD)                            0
Strategy                              0
Effectiveness (tonnes neutralized)    0
dtype: int64


In [8]:
# Encode the categorical column (if necessary for model)
if 'Strategy' in df.columns:
    # Example using Label Encoding (for simplicity)
    le = LabelEncoder()
    df['Strategy'] = le.fit_transform(df['Strategy'])

In [9]:
# Features and target variables
X = df[['Emissions (tonnes)', 'Cost (USD)', 'Strategy']]
y = df[['Effectiveness (tonnes neutralized)']]

In [10]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)


In [12]:
# Train the model
model.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [13]:
# Predict on the test set
y_pred = model.predict(X_test)

In [14]:
# Define a function to predict effectiveness based on emissions and different strategies
def predict_effectiveness(emissions, cost, strategies):
    results = {}
    for strategy in strategies:
        # Encode strategy
        strategy_encoded = le.transform([strategy])[0]
        
        # Prepare input data
        new_input_df = pd.DataFrame({
            'Emissions (tonnes)': [emissions],
            'Cost (USD)': [cost],
            'Strategy': [strategy_encoded]
        })
        
        # Predict effectiveness
        predicted_effectiveness = model.predict(new_input_df)
        results[strategy] = predicted_effectiveness[0]
    
    return results

In [15]:
# Define a function to predict effectiveness and suggest the best strategy
def suggest_best_strategy(emissions, cost):
    strategies = le.classes_  # Retrieve original strategy names
    
    # Prepare input data for all strategies
    input_data = pd.DataFrame({
        'Emissions (tonnes)': [emissions] * len(strategies),
        'Cost (USD)': [cost] * len(strategies),
        'Strategy': le.transform(strategies)
    })
    
    # Predict effectiveness
    predicted_effectiveness = model.predict(input_data)
    
    # Find the best strategy
    best_index = np.argmax(predicted_effectiveness)
    best_strategy = strategies[best_index]
    best_effectiveness = predicted_effectiveness[best_index]
    
    # Return the best strategy and its predicted effectiveness
    return best_strategy, best_effectiveness

In [16]:
# Example usage
emissions = 889
cost = 65667


In [17]:
best_strategy, best_effectiveness = suggest_best_strategy(emissions, cost)

In [18]:
predictions = predict_effectiveness(emissions, cost, strategies)


NameError: name 'strategies' is not defined

In [19]:
# Print the results
print(f'For {emissions} tonnes of emissions and ${cost} cost:')
print(f'The best strategy to follow is {best_strategy}')
print(f'Predicted Effectiveness: {best_effectiveness:.2f} tonnes neutralized')

For 889 tonnes of emissions and $65667 cost:
The best strategy to follow is Direct Air Capture
Predicted Effectiveness: 1164.71 tonnes neutralized


In [21]:
# Save the model and label encoder
with open('model3.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('label_encoder3.pkl', 'wb') as le_file:
    pickle.dump(le, le_file)