# Random Forest

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

def find_best_params(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('rf', RandomForestRegressor(random_state=42))
    ])

    param_grid = {
        'rf__n_estimators': [200, 300, 400, 500],
        'rf__max_depth': [None, 20, 30, 40],
        'rf__min_samples_split': [2, 5, 10],
        'rf__min_samples_leaf': [1, 2, 4],
        'rf__max_features': ['sqrt', 'log2']
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("RF - Best Parameters:", grid_search.best_params_)

    return grid_search.best_params_

if __name__ == "__main__":
    df = pd.read_csv("MachineLearinningDataSet.csv")
    target_column = 'Cereal production (metric tons)'
    
    variables = ['Agriculture, forestry, and fishing, value added (current US$)', 
                 'Agricultural land (sq. km)', 
                 'Forest area (sq. km)', 
                 'Rural population', 
                 'Agricultural land (% of land area)', 
                 'Arable land (% of land area)', 
                 'Agricultural methane emissions (thousand metric tons of CO2 equivalent)', 
                 'Land under cereal production (hectares)', 
                 'Average precipitation in depth (mm per year)']

    X = df[variables].copy() 
    y = df[target_column]

    # Feature Creation using columns from the original DataFrame
    X['arable_land_ratio'] = df['Arable land (hectares)'] / df['Land area (sq. km)']
    X['forest_ratio'] = df['Forest area (sq. km)'] / df['Land area (sq. km)']
    X['agricultural_land_ratio'] = df['Agricultural land (sq. km)'] / df['Land area (sq. km)']
    X['rural_population_density'] = df['Rural population'] / df['Land area (sq. km)']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Find best parameters for RF model
    best_params = find_best_params(X_train, y_train)

RF - Best Parameters: {'rf__max_depth': 30, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 500}


# Best Parameter, Cross Validation - RF

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import joblib

def smpe(y_true, y_pred):
    return 100 * np.mean(np.abs((y_true - y_pred) / ((y_true + y_pred) / 2)))

def train_rf_cv(X, y, n_splits=5):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = {'mae': [], 'rmse': [], 'r2': [], 'smpe': []}
    pipelines = []

    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        pipeline = Pipeline([
            ('scaler', MinMaxScaler()),
            ('rf', RandomForestRegressor(
                n_estimators=500, 
                max_depth=30, 
                min_samples_split=2, 
                min_samples_leaf=1, 
                max_features='sqrt', 
                random_state=42))])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        smpe_score = smpe(y_test, y_pred)

        cv_scores['mae'].append(mae)
        cv_scores['rmse'].append(rmse)
        cv_scores['r2'].append(r2)
        cv_scores['smpe'].append(smpe_score)
        pipelines.append(pipeline)

    print("Cross-validation results:")
    print("Mean MAE:", np.mean(cv_scores['mae']))
    print("Mean SMPE:", np.mean(cv_scores['smpe']))
    print("Mean RMSE:", np.mean(cv_scores['rmse']))
    print("Mean R2:", np.mean(cv_scores['r2']))


    final_pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('rf', RandomForestRegressor(
            n_estimators=500,
            max_depth=30,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features='sqrt',
            random_state=42
        ))
    ])

    final_pipeline.fit(X, y)
    return final_pipeline


if __name__ == "__main__":
    df = pd.read_csv("MachineLearinningDataSet.csv")
    target_column = 'Cereal production (metric tons)'

    variables = ['Agriculture, forestry, and fishing, value added (current US$)', 
                 'Agricultural land (sq. km)', 
                 'Forest area (sq. km)', 
                 'Rural population', 
                 'Agricultural land (% of land area)', 
                 'Arable land (% of land area)', 
                 'Agricultural methane emissions (thousand metric tons of CO2 equivalent)', 
                 'Land under cereal production (hectares)', 
                 'Average precipitation in depth (mm per year)']

    X = df[variables].copy()
    y = df[target_column]
    
    X['arable_land_ratio'] = df['Arable land (hectares)'] / df['Land area (sq. km)']
    X['forest_ratio'] = df['Forest area (sq. km)'] / df['Land area (sq. km)']
    X['agricultural_land_ratio'] = df['Agricultural land (sq. km)'] / df['Land area (sq. km)']
    X['rural_population_density'] = df['Rural population'] / df['Land area (sq. km)']

    X = X.values
    y = y.values

    rf_model = train_rf_cv(X, y, n_splits=5)
    
    joblib.dump(rf_model, 'random_forest_model.pkl')

Cross-validation results:
Mean MAE: 551689.7434524591
Mean SMPE: 20.35047759262318
Mean RMSE: 2107075.5321499975
Mean R2: 0.9667551183576917


In [2]:
print(f"Number of features: {rf_model.named_steps['rf'].n_features_in_}")

Number of features: 13


# Artificial Neural Network 

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from kerastuner import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

def build_model(hp):
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=hp.Int('units_1', min_value=32, max_value=512, step=32),
                                 activation='relu',
                                 input_shape=[X_train_scaled.shape[1]]))
    
    for i in range(hp.Int('num_layers', 1, 5)):
        model.add(keras.layers.Dense(units=hp.Int(f'units_{i+2}', min_value=32, max_value=512, step=32),
                                     activation='relu'))
    
    model.add(keras.layers.Dense(1))
    
    model.compile(
        optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='mse',
        metrics=['mae']
    )
    return model

def find_best_params(X_train_scaled, y_train_scaled, max_trials=10, epochs=100):
    tuner = RandomSearch(
        build_model,
        objective='val_loss',
        max_trials=max_trials,
        executions_per_trial=1,
        directory='ann_tuning',
        project_name='cereal_production'
    )
    
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
    
    tuner.search(X_train_scaled, y_train_scaled,
                 epochs=epochs,
                 validation_split=0.2,
                 callbacks=[early_stop])
    
    best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
    
    print("Best Hyperparameters:", best_hyperparameters.values)
    
    return best_hyperparameters

def load_and_preprocess_data(filepath):
    df = pd.read_csv(filepath)
    target_column = 'Cereal production (metric tons)'
    
    variables = ['Agriculture, forestry, and fishing, value added (current US$)', 
                 'Agricultural land (sq. km)', 
                 'Forest area (sq. km)', 
                 'Rural population', 
                 'Agricultural land (% of land area)', 
                 'Arable land (% of land area)', 
                 'Agricultural methane emissions (thousand metric tons of CO2 equivalent)', 
                 'Land under cereal production (hectares)', 
                 'Average precipitation in depth (mm per year)']
    
    X = df[variables].copy()
    y = df[target_column]
    
    # Feature Creation using columns from the original DataFrame
    X['arable_land_ratio'] = df['Arable land (hectares)'] / df['Land area (sq. km)']
    X['forest_ratio'] = df['Forest area (sq. km)'] / df['Land area (sq. km)']
    X['agricultural_land_ratio'] = df['Agricultural land (sq. km)'] / df['Land area (sq. km)']
    X['rural_population_density'] = df['Rural population'] / df['Land area (sq. km)']
    
    return X, y

def split_and_scale_data(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Apply MinMaxScaler to features
    X_scaler = MinMaxScaler()
    X_train_scaled = X_scaler.fit_transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    
    # Apply MinMaxScaler to target variable
    y_scaler = MinMaxScaler()
    y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
    
    return X_train_scaled, X_test_scaled, y_train_scaled, y_test, y_scaler

def main():
    # Load and preprocess data
    X, y = load_and_preprocess_data("MachineLearinningDataSet.csv")
    
    # Split and scale data
    X_train_scaled, X_test_scaled, y_train_scaled, y_test, y_scaler = split_and_scale_data(X, y)
    
    # Find best hyperparameters for ANN model
    best_hyperparameters = find_best_params(X_train_scaled, y_train_scaled)

if __name__ == "__main__":
    main()

Reloading Tuner from ann_tuning\cereal_production\tuner0.json
Best Hyperparameters: {'units_1': 480, 'num_layers': 4, 'units_2': 64, 'learning_rate': 0.0001, 'units_3': 320, 'units_4': 128, 'units_5': 128}


  from kerastuner import RandomSearch


# Best Parameter, Cross Validation - ANN

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow import keras
import matplotlib.pyplot as plt


def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def build_model(input_shape):
    model = keras.Sequential([
        keras.layers.Dense(480, activation='relu', input_shape=[input_shape]),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(320, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
                  loss='mse', metrics=['mae'])
    return model

def train_ann_cv(X, y, n_splits=5, epochs=100):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = {'mae': [], 'smape': [], 'rmse': [], 'r2': []}
    
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        X_scaler, y_scaler = MinMaxScaler(), MinMaxScaler()
        X_train_scaled = X_scaler.fit_transform(X_train)
        X_test_scaled = X_scaler.transform(X_test)
        y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1))
        
        model = build_model(X_train_scaled.shape[1])
        early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
        
        model.fit(X_train_scaled, y_train_scaled, epochs=epochs,
                  validation_split=0.2, callbacks=[early_stop], verbose=0)
        
        y_pred = y_scaler.inverse_transform(model.predict(X_test_scaled)).flatten()
        
        mae = mean_absolute_error(y_test, y_pred)
        smape_score = smape(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        
        cv_scores['mae'].append(mae)
        cv_scores['smape'].append(smape_score)
        cv_scores['rmse'].append(rmse)
        cv_scores['r2'].append(r2)
    
    for metric, scores in cv_scores.items():
        print(f"Mean {metric.upper()}: {np.mean(scores)}")
    
    final_model = build_model(X.shape[1])
    X_scaler, y_scaler = MinMaxScaler(), MinMaxScaler()
    final_model.fit(X_scaler.fit_transform(X), y_scaler.fit_transform(y.reshape(-1, 1)),
                    epochs=epochs, validation_split=0.2, callbacks=[early_stop], verbose=0)
    
    return final_model, X_scaler, y_scaler


if __name__ == "__main__":
    df = pd.read_csv("MachineLearinningDataSet.csv")
    target_column = 'Cereal production (metric tons)'
    
    filter_features = ['Agricultural land (sq. km)', 'Land under cereal production (hectares)', 'Rural population', 
                       'Average precipitation in depth (mm per year)', 'Agricultural nitrous oxide emissions (thousand metric tons of CO2 equivalent)', 
                       'Arable land (% of land area)', 'Forest area (sq. km)', 'Surface area (sq. km)', 'Land area (sq. km)', 
                       'Agricultural methane emissions (thousand metric tons of CO2 equivalent)', 'Agriculture, forestry, and fishing, value added (current US$)', 
                       'Arable land (hectares)']
    
    X = df[filter_features].copy()
    y = df[target_column]
    
    # Feature engineering using original DataFrame
    X['arable_land_ratio'] = df['Arable land (hectares)'] / df['Land area (sq. km)']
    X['forest_ratio'] = df['Forest area (sq. km)'] / df['Land area (sq. km)']
    X['agricultural_land_ratio'] = df['Agricultural land (sq. km)'] / df['Land area (sq. km)']
    X['rural_population_density'] = df['Rural population'] / df['Land area (sq. km)']
    
    # Convert to numpy arrays
    X = X.values
    y = y.values
    
    ann_model, X_scaler, y_scaler = train_ann_cv(X, y, n_splits=5, epochs=100)
    

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Mean MAE: 1487108.90177726
Mean SMAPE: 88.40906372114145
Mean RMSE: 3226804.190204205
Mean R2: 0.9255573960443254


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
