In [1]:
import pandas as pd
import numpy as np
# Assemble the raw table one column at a time; np.nan stands in for every
# value that is missing from the dataset.
data = {}
data["energy source"] = ["solar", "wind", "hydropower", "geothermal", "biomass", "nuclear"]
data["energy consumption (mwh)"] = [1200, np.nan, 2900, np.nan, 2500, 3200]
data["cost (million$)"] = [200, 400, np.nan, 150, 250, np.nan]

energy_df = pd.DataFrame(data=data)

# Print the untouched frame (header line, then the table) before any cleaning.
print("original energy data with missing values:", energy_df, sep="\n")
    

original energy data with missing values:
  energy source  energy consumption (mwh)  cost (million$)
0         solar                    1200.0            200.0
1          wind                       NaN            400.0
2    hydropower                    2900.0              NaN
3    geothermal                       NaN            150.0
4       biomass                    2500.0            250.0
5       nuclear                    3200.0              NaN


In [2]:
# Listwise deletion: keep only the rows in which no column is missing.
# dropna returns a new frame, so energy_df is left as it was.
cleaned_df = energy_df.dropna(axis="index", how="any")
print("\ndata after removing rows with missing values:", cleaned_df, sep="\n")


data after removing rows with missing values:
  energy source  energy consumption (mwh)  cost (million$)
0         solar                    1200.0            200.0
4       biomass                    2500.0            250.0


In [3]:
# Count the missing entries in each column of energy_df; the bare name on the
# last line makes the notebook display the resulting Series.
missing_values = energy_df.isna().sum(axis=0)
missing_values

energy source               0
energy consumption (mwh)    2
cost (million$)             2
dtype: int64

In [4]:
# Forward fill: every missing entry takes the last valid value above it in the
# same column. DataFrame.ffill() is used instead of fillna(method="ffill")
# because the `method` argument of fillna is deprecated since pandas 2.1.
# The result is a new frame; energy_df itself is not modified here.
forward_filled_df = energy_df.ffill()
print("\ndata after forward filling:")
print(forward_filled_df)


data after forward filling:
  energy source  energy consumption (mwh)  cost (million$)
0         solar                    1200.0            200.0
1          wind                    1200.0            400.0
2    hydropower                    2900.0            400.0
3    geothermal                    2900.0            150.0
4       biomass                    2500.0            250.0
5       nuclear                    3200.0            250.0


In [5]:
# Mean imputation: fill each numeric column's missing entries with the mean of
# that column's observed values (Series.mean skips NaN by default).
#
# The filled column is assigned back to energy_df rather than calling
# fillna(..., inplace=True) on energy_df[column]: that form is a chained
# in-place update, which emits a FutureWarning on pandas 2.2 and leaves
# energy_df unchanged once Copy-on-Write is active (pandas 3.0).
# energy_df is deliberately updated under its existing name because later
# cells read the imputed values from it.
for column in ("energy consumption (mwh)", "cost (million$)"):
    energy_df[column] = energy_df[column].fillna(energy_df[column].mean())
print("\ndata after imputing missing values with mean:")
print(energy_df)


data after imputing missing values with mean:
  energy source  energy consumption (mwh)  cost (million$)
0         solar                    1200.0            200.0
1          wind                    2450.0            400.0
2    hydropower                    2900.0            250.0
3    geothermal                    2450.0            150.0
4       biomass                    2500.0            250.0
5       nuclear                    3200.0            250.0


In [6]:
# Re-count missing values on energy_df, the frame that was just mean-imputed.
# Counting on cleaned_df (the dropna() result) would be zero by construction
# and would not verify the imputation. Expected result: 0 for every column.
missing_values = energy_df.isnull().sum()
missing_values

energy source               0
energy consumption (mwh)    0
cost (million$)             0
dtype: int64

In [7]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [8]:
# One-hot encode the "energy source" column: it is replaced by one indicator
# column per distinct value, named "energy source_<value>"; the numeric
# columns pass through unchanged.
# dtype is pinned to uint8 (the pandas < 2.0 default) so the indicators are
# 0/1 integers on every pandas version; pandas >= 2.0 would otherwise
# produce True/False columns.
energy_encoded_df = pd.get_dummies(energy_df, columns=["energy source"], dtype="uint8")
print("\ndata after one-hot encoding categorial variables:")
print(energy_encoded_df)


data after one-hot encoding categorial variables:
   energy consumption (mwh)  cost (million$)  energy source_biomass  \
0                    1200.0            200.0                      0   
1                    2450.0            400.0                      0   
2                    2900.0            250.0                      0   
3                    2450.0            150.0                      0   
4                    2500.0            250.0                      1   
5                    3200.0            250.0                      0   

   energy source_geothermal  energy source_hydropower  energy source_nuclear  \
0                         0                         0                      0   
1                         0                         0                      0   
2                         0                         1                      0   
3                         1                         0                      0   
4                         0                        