# In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Preprocess the raw crop-yield table into a standardized Kenya/Maize dataset:
# load -> filter -> impute numeric NaNs with column means -> add the
# rainfall/temperature ratio -> standardize selected features -> save to CSV.

# Load raw data
df = pd.read_csv('../data/raw/yield_df.csv')

# Filter data for Kenya and Maize.
# Take an explicit copy: the boolean-mask selection is a slice of the raw
# frame, and the column assignments below would otherwise be chained
# assignments on that slice (pandas' SettingWithCopyWarning).
df = df[(df['Area'] == 'Kenya') & (df['Item'] == 'Maize')].copy()

# Fail early with a clear message instead of letting the scaler reject an
# empty frame further down.
if df.empty:
    raise ValueError(
        "No rows with Area == 'Kenya' and Item == 'Maize' in "
        "'../data/raw/yield_df.csv'"
    )

# Handle missing values for numeric columns only (each NaN is replaced by the
# mean of its own column, computed on the filtered rows).
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Feature engineering: ratio of annual rainfall to average temperature.
# Computed after imputation so missing inputs do not propagate as NaN.
df['rainfall_to_temp_ratio'] = df['average_rain_fall_mm_per_year'] / df['avg_temp']

# Normalize numerical features. The column list is named once so the columns
# that are read and the columns that are written back cannot drift apart.
feature_columns = [
    'average_rain_fall_mm_per_year',
    'pesticides_tonnes',
    'avg_temp',
    'rainfall_to_temp_ratio',
]
scaler = StandardScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# Save processed data (without the pandas index column)
df.to_csv('../data/processed/maize_yield_kenya_processed.csv', index=False)