In [None]:
import pandas as pd
import numpy as np

In [None]:
## Data Cleaning Script for CO₂ Emissions Dataset

# Columns a row must have (as usable values) to be kept in the cleaned dataset.
REQUIRED_COLS = ("population", "co2", "co2_per_capita", "year")


def clean_co2_frame(df, required_cols=REQUIRED_COLS):
    """Return a cleaned copy of a CO₂ emissions DataFrame.

    Steps, in order:
      1. Drop rows whose ``iso_code`` is missing.
      2. Coerce every column whose name contains "co2" (case-insensitive)
         to numeric; unparseable and infinite values become NaN.
      3. Drop rows with a missing value in any of ``required_cols``.
      4. Fill the remaining NaN in the CO₂ columns with 0 and cast them
         to int64.
      5. Reset the index.

    The input frame is not modified.

    Args:
        df: Raw dataset; must contain ``iso_code`` and every column named
            in ``required_cols``.
        required_cols: Column names that must be non-missing for a row to
            be kept.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.

    Raises:
        KeyError: If ``iso_code`` or one of ``required_cols`` is absent.
    """
    # Copy so the column assignments below act on an independent frame
    # (avoids SettingWithCopyWarning and never touches the caller's data).
    df = df[df["iso_code"].notna()].copy()

    co2_cols = [col for col in df.columns if "co2" in col.lower()]

    # Coerce BEFORE dropping rows: a critical value that is unparseable text
    # must count as missing, not slip through and later be written out as 0.
    for col in co2_cols:
        numeric = pd.to_numeric(df[col], errors="coerce")
        # +/-inf cannot be cast to an integer dtype; treat it as missing too.
        df[col] = numeric.replace([np.inf, -np.inf], np.nan)

    df = df.dropna(subset=list(required_cols))

    # NOTE(review): the integer cast truncates toward zero, so fractional
    # columns (e.g. co2_per_capita = 4.7) lose their decimals, and NaN in
    # non-required CO₂ columns becomes 0. Kept because integer output is the
    # stated goal of this script; confirm downstream users really want that.
    df = df.fillna({col: 0 for col in co2_cols})
    # Explicit int64: plain `int` is platform-dependent on older NumPy builds.
    df = df.astype({col: "int64" for col in co2_cols})

    ## Optional - Reset index after cleaning
    return df.reset_index(drop=True)


# The guard is true when this cell runs in a notebook or as a script, so the
# workflow below still executes; it only keeps the file I/O from firing if
# the code is imported (e.g. by a test).
if __name__ == "__main__":
    df = pd.read_csv("../data/raw_co2_emissions.csv")  # Load your dataset, change name to your file location

    df = clean_co2_frame(df)

    ## Preview cleaned dataset
    # A bare `df.head()` that is not the last expression of a cell displays
    # nothing, so print it explicitly.
    print(df.head())

    ## Export cleaned dataset to a new CSV file

    output_path = "../data/raw_co2_emissions_cleaned.csv"  ## Change this to your desired output path
    df.to_csv(output_path, index=False)

    print(f"Cleaned dataset exported to: {output_path}")