# In [None]:
# maize_production_analysis.ipynb

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load dataset
# The path is relative, so the CSV must sit in the current working directory.
data = pd.read_csv('faostat_maize_rwanda_2020_2023.csv')

# Preview data
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Data Cleaning

# Report the number of missing values per column.
print(data.isnull().sum())

# Alternative when only a few rows are affected: drop them with data.dropna().

# Fill missing values in each measure with that column's mean.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on data[col]: the in-place form operates on a
# column selection (chained assignment), which triggers a FutureWarning on
# pandas 2.x and leaves `data` unchanged under Copy-on-Write.
# NOTE(review): the mean is taken over all rows, and 'Production' is later
# used as the model target — confirm that imputing it this way is intended.
for col in ['Area Harvested', 'Yield', 'Production']:
    if data[col].isnull().any():
        data[col] = data[col].fillna(data[col].mean())

# Parse Year with the 4-digit '%Y' format so it becomes a datetime column
# (later code reads the year back through the .dt accessor).
data['Year'] = pd.to_datetime(data['Year'], format='%Y')

# Exploratory Data Analysis

# Summary statistics of the cleaned frame.
print(data.describe())

# One histogram (with KDE overlay) per measure, laid out side by side
# in a single 1x3 figure.
measure_columns = ['Area Harvested', 'Yield', 'Production']
plt.figure(figsize=(12, 4))
for panel_index, measure in enumerate(measure_columns, start=1):
    plt.subplot(1, 3, panel_index)
    sns.histplot(data[measure], kde=True)
    plt.title(f'{measure} Distribution')
plt.tight_layout()
plt.show()

# Pairwise correlation between the three measures, drawn as an
# annotated heatmap.
correlation_matrix = data[measure_columns].corr()
plt.figure(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Feature Correlation')
plt.show()

# Feature Engineering

# Read the calendar year back out of the datetime 'Year' column as a number.
data['Year_num'] = data['Year'].dt.year

# Additional engineered features can be added here.

# Prepare data for modeling: predictor matrix X and target vector y.
# NOTE(review): if 'Production' is derived from 'Area Harvested' and 'Yield'
# in this dataset, the model is mostly re-learning that relationship — confirm.
feature_columns = ['Year_num', 'Area Harvested', 'Yield']
target_column = 'Production'
X = data[feature_columns]
y = data[target_column]

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Building: Random Forest Regressor
# 100 trees with a fixed seed; fit() returns the fitted estimator itself,
# so construction and training are chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Model Evaluation: absolute error, explained variance, and root of the
# mean squared error, all measured on the test set.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Report each metric rounded to two decimals.
print("Model Performance:")
for metric_label, metric_value in (("RMSE", rmse), ("R² Score", r2), ("MAE", mae)):
    print(f"{metric_label}: {metric_value:.2f}")

# Feature Importance
# Pair each predictor column with the importance score the fitted forest
# assigned to it.
feature_names = X.columns
importances = model.feature_importances_

# Horizontal bar chart: one bar per predictor, bar length = importance.
plt.figure(figsize=(6, 4))
importance_axes = sns.barplot(x=importances, y=feature_names)
importance_axes.set_title('Feature Importance')
plt.show()

# Save the cleaned dataset (optional); the row index is not written out.
data.to_csv('cleaned_maize_data.csv', index=False)
