In [None]:
import pandas as pd
import numpy as np
import rasterio
from rasterio.plot import show
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------
# 1. FAOSTAT Crop Yield Data
# ------------------------

In [None]:
# Read the raw FAOSTAT export (CSV path is relative to the working directory)
faostat_df = pd.read_csv(filepath_or_buffer="1_FAOSTAT_data_en_5-13-2025.csv")

In [None]:
# Inspect the dataset: schema/dtypes summary followed by the first rows.
print("FAOSTAT Initial Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
faostat_df.info()
print(faostat_df.head())

In [None]:
# Keep only the six columns used downstream; a missing column raises KeyError
faostat_df = faostat_df.loc[:, ['Area', 'Item', 'Element', 'Year', 'Unit', 'Value']]

In [None]:
# Restrict to Ethiopian records for the three elements of interest
elements = ['Yield', 'Area harvested', 'Production']
faostat_df = faostat_df.loc[
    faostat_df['Area'].eq('Ethiopia') & faostat_df['Element'].isin(elements)
]

In [None]:
# Report the per-column count of missing entries
print("FAOSTAT Missing Values:")
print(faostat_df.isna().sum())

In [None]:
# Keep only the rows whose 'Value' entry is present
faostat_df = faostat_df.loc[faostat_df['Value'].notna()]

In [None]:
# Reshape long -> wide: one row per (Area, Item, Year), one column per Element.
# If a key has several rows for the same Element, the first Value is kept.
faostat_pivoted = pd.pivot_table(
    faostat_df,
    values='Value',
    index=['Area', 'Item', 'Year'],
    columns='Element',
    aggfunc='first',
).reset_index()

In [None]:
# Rename columns for clarity.
# The rename is label-based rather than a positional `columns = [...]`
# assignment, so it does not depend on the pivot emitting exactly three element
# columns in alphabetical order: a missing element no longer raises a length
# mismatch, and no column can be silently mislabelled. 'Area', 'Year',
# 'Production' and 'Yield' already carry their final names.
faostat_pivoted = (
    faostat_pivoted
    .rename(columns={'Item': 'Crop', 'Area harvested': 'Area_Harvested'})
    # Drop the leftover 'Element' label on the column axis, as the positional
    # assignment used to do implicitly.
    .rename_axis(columns=None)
)

In [None]:
# Show each distinct (Element, Unit) pair so the units can be checked by eye
# (expected: Yield in hg/ha, Area in ha, Production in tonnes)
print("FAOSTAT Units:")
print(faostat_df.drop_duplicates(subset=['Element', 'Unit'])[['Element', 'Unit']])

In [None]:
# Save cleaned FAOSTAT dataset.
# The write was previously commented out while the message below still claimed
# success; it is re-enabled so the confirmation is only printed after the file
# has actually been written (index=False omits the row index from the CSV).
faostat_pivoted.to_csv("FAOSTAT_cleaned.csv", index=False)
print("FAOSTAT Cleaned Dataset Saved as 'FAOSTAT_cleaned.csv'")