In [1]:
import pandas as pd
import numpy as np

# Build a synthetic 200k-row dataset with emissions, cost, strategy and
# effectiveness columns, write it to CSV and print a preview.

# --- Configuration -----------------------------------------------------------
# Every tunable value lives here so the generator below has no magic numbers.
SEED = 42
n = 200_000                          # number of rows to generate
OUTPUT_CSV = "carbon_neutrality_200k.csv"

EMISSIONS_LOG_MEAN = 9               # `mean` passed to the lognormal draw
EMISSIONS_LOG_SIGMA = 1              # `sigma` passed to the lognormal draw
COST_FACTOR_RANGE = (10, 50)         # cost = emissions * uniform(low, high)
COST_EFFECT = 0.01                   # weight of cost in the effectiveness formula
NOISE_SIGMA = 500                    # std-dev of the additive Gaussian noise

# Multiplier applied to emissions for each strategy in the effectiveness formula.
strategy_effect = {
    "Afforestation": 0.2,
    "Renewable Energy": 0.5,
    "Carbon Capture": 0.7,
    "Direct Air Capture": 0.9,
    "Soil Sequestration": 0.3,
}

# Derived from the dict so the two can never disagree. The key order is the
# order np.random.choice samples from, so reordering the dict changes the data.
strategies = list(strategy_effect)

# --- Random draws ------------------------------------------------------------
# The legacy global seed is kept on purpose: switching generators would change
# every value in the saved dataset. All draws below consume the same global
# stream, so their order (emissions, cost factor, strategy, noise) must not change.
np.random.seed(SEED)

# Emissions in tonnes: lognormal, i.e. right-skewed with a long upper tail.
emissions = np.random.lognormal(mean=EMISSIONS_LOG_MEAN, sigma=EMISSIONS_LOG_SIGMA, size=n)

# Cost in USD: emissions scaled by a random per-row factor.
cost = emissions * np.random.uniform(*COST_FACTOR_RANGE, size=n)

# Strategy assigned uniformly at random to each row.
strategy = np.random.choice(strategies, size=n)

# Additive noise for the effectiveness formula (last draw from the stream).
noise = np.random.normal(0, NOISE_SIGMA, size=n)

# --- Effectiveness -----------------------------------------------------------
# Vectorized lookup of each row's multiplier (replaces a 200k-iteration Python
# loop). Unlike dict indexing, `map` yields NaN for an unknown key instead of
# raising, hence the explicit check.
strategy_multiplier = pd.Series(strategy).map(strategy_effect).to_numpy(dtype=float)
assert not np.isnan(strategy_multiplier).any(), "strategy without an effect multiplier"

# Same left-to-right arithmetic as the original formula, clamped at zero
# because the noise term can push small values negative.
effectiveness = np.maximum(
    emissions * strategy_multiplier + cost * COST_EFFECT + noise,
    0,
)

# --- Assemble, validate, save ------------------------------------------------
df_200k = pd.DataFrame({
    "Emissions (tonnes)": emissions,
    "Cost (USD)": cost,
    "Strategy": strategy,
    "Effectiveness (tonnes neutralized)": effectiveness,
})

# Cheap invariants: fail loudly here rather than in downstream analysis.
assert len(df_200k) == n
assert not df_200k.isna().any().any()
assert (df_200k["Effectiveness (tonnes neutralized)"] >= 0).all()

df_200k.to_csv(OUTPUT_CSV, index=False)

print(f"Dataset created with {n // 1000}k rows and saved as {OUTPUT_CSV}")
print(df_200k.head())


Dataset created with 200k rows and saved as carbon_neutrality_200k.csv
   Emissions (tonnes)    Cost (USD)            Strategy  \
0        13315.900854  3.907805e+05  Soil Sequestration   
1         7056.720462  2.880907e+05    Renewable Energy   
2        15485.951516  7.673180e+05    Renewable Energy   
3        37161.548247  1.779166e+06       Afforestation   
4         6411.487776  1.555803e+05  Direct Air Capture   

   Effectiveness (tonnes neutralized)  
0                         7820.693527  
1                         5962.250833  
2                        14922.907948  
3                        25888.230678  
4                         7121.138662  
