In [1]:
import pandas as pd

# Load the full dataset from CSV into a DataFrame.
df = pd.read_csv('MachineLearinningDataSet.csv')

# Randomly select 50% of the DataFrame's rows; the fixed random_state makes the
# sample reproducible across runs.
# NOTE(review): subset_df is not referenced by the later cell visible here, which
# reads 'sampled_data_FS.csv' instead — presumably the sample was written out
# elsewhere; TODO confirm.
subset_df = df.sample(frac=0.5, random_state=42)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Load the dataset used for the feature-set evaluation below.
# Note: this rebinds `df`, replacing the frame read from
# 'MachineLearinningDataSet.csv' in the previous cell.
df = pd.read_csv('sampled_data_FS.csv')

# Candidate feature sets: each list holds the column names that are selected
# from `df` when the corresponding set is evaluated.
# NOTE(review): the list names suggest filter / wrapper / hybrid feature-selection
# methods produced them — the selection step itself is not shown here.
filter_features = [
    'Surface area (sq. km)',
    'Agricultural land (sq. km)',
    'Land area (sq. km)',
    'Forest area (sq. km)',
    'Agricultural methane emissions (thousand metric tons of CO2 equivalent)',
    'Rural population',
    'Arable land (hectares)',
    'Agriculture, forestry, and fishing, value added (current US$)',
    'Agricultural nitrous oxide emissions (thousand metric tons of CO2 equivalent)',
    'Arable land (% of land area)',
    'Average precipitation in depth (mm per year)',
    'Agricultural land (% of land area)',
    'Land under cereal production (hectares)',
]

wrapper_features = [
    'Access to electricity, rural (% of rural population)',
    'Agricultural raw materials exports (% of merchandise exports)',
    'Agricultural raw materials imports (% of merchandise imports)',
    'Agriculture, forestry, and fishing, value added (% of GDP)',
    'Arable land (% of land area)',
    'Arable land (hectares per person)',
    'Employment in agriculture (% of total employment) (modeled ILO estimate)',
    'Employment in agriculture, female (% of female employment) (modeled ILO estimate)',
    'Employment in agriculture, male (% of male employment) (modeled ILO estimate)',
    'Permanent cropland (% of land area)',
]

hybrid_features = [
    'Permanent cropland (% of land area)',
    'Employment in agriculture (% of total employment) (modeled ILO estimate)',
    'Agricultural raw materials imports (% of merchandise imports)',
    'Access to electricity, rural (% of rural population)',
    'Agricultural raw materials exports (% of merchandise exports)',
    'Arable land (% of land area)',
    'Employment in agriculture, male (% of male employment) (modeled ILO estimate)',
    'Rural population growth (annual %)',
    'Arable land (hectares per person)',
    'Rural population (% of total population)',
]

# Name of the column used as the regression target.
target = 'Cereal production (metric tons)'

def evaluate_rf(X, y, test_size=0.2, n_estimators=100, random_state=42):
    """Fit a random forest on a train split and score it on the held-out split.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix, passed to ``train_test_split``.
    y : array-like or Series
        Target values aligned with ``X``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` as the held-out fraction.
    n_estimators : int, default 100
        Number of trees in the ``RandomForestRegressor``.
    random_state : int, default 42
        Seed used for both the split and the forest.

    Returns
    -------
    tuple
        ``(rmse, r2, mae, smape)`` computed on the test split; ``smape`` is
        expressed in percent.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # SMAPE on plain float arrays so the result does not depend on whether the
    # caller passed pandas or numpy objects.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(actual - predicted)
    scale = np.abs(actual) + np.abs(predicted)
    # Where actual and prediction are both 0 the term is defined as 0 rather
    # than 0/0 = NaN, so a perfect zero prediction cannot distort the mean.
    terms = np.divide(2.0 * abs_error, scale, out=np.zeros_like(scale), where=scale != 0)
    smape = np.mean(terms) * 100
    return rmse, r2, mae, smape

# Score every candidate feature set by calling evaluate_rf with its defaults.
feature_sets = {
    'Filter': filter_features,
    'Wrapper': wrapper_features,
    'Hybrid': hybrid_features,
}

results = {}
for name, features in feature_sets.items():
    X, y = df[features], df[target]
    rmse, r2, mae, smape = evaluate_rf(X, y)
    results[name] = dict(RMSE=rmse, R2=r2, MAE=mae, SMAPE=smape)

# Print the metrics for each feature set, in the order they were evaluated.
for method, metrics in results.items():
    print(f"{method} Method:")
    print(f"  MAE: {metrics['MAE']:.4f}")
    print(f"  SMAPE: {metrics['SMAPE']:.4f}%")
    print(f"  RMSE: {metrics['RMSE']:.4f}")
    print(f"  R2: {metrics['R2']:.4f}")
    print()


Filter Method:
  MAE: 798523.0720
  SMAPE: 10.4627%
  RMSE: 2928187.1590
  R2: 0.9589

Wrapper Method:
  MAE: 1431224.9817
  SMAPE: 28.7225%
  RMSE: 4052795.5255
  R2: 0.9214

Hybrid Method:
  MAE: 1517885.7894
  SMAPE: 34.6370%
  RMSE: 4419371.6298
  R2: 0.9065

