In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import NotFittedError
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Custom transformer for Label Encoding
class LabelEncoderPipelineFriendly(BaseEstimator, TransformerMixin):
    """Fit and apply one ``LabelEncoder`` per column of a DataFrame.

    ``LabelEncoder`` works on a single 1-D array, so this wrapper keeps a
    separate encoder for every column it is fitted on and exposes the usual
    ``fit``/``transform`` pair so it can be used as a transformer step.

    Attributes
    ----------
    label_encoders : dict
        Maps column name -> fitted ``LabelEncoder``. Rebuilt from scratch on
        every call to ``fit``.

    Notes
    -----
    The input must provide ``.columns`` and ``.copy()`` (i.e. behave like a
    pandas DataFrame). A value that was not present during ``fit`` makes
    ``LabelEncoder.transform`` raise ``ValueError``; that error is propagated
    unchanged.
    """

    def __init__(self):
        # Column name -> fitted LabelEncoder; repopulated by each fit() call.
        self.label_encoders = {}

    def fit(self, X, y=None):
        """Fit a fresh ``LabelEncoder`` on every column of ``X``.

        Parameters
        ----------
        X : DataFrame
            Columns to encode.
        y : ignored
            Present only for estimator API compatibility.

        Returns
        -------
        self
        """
        # Rebuild the mapping instead of updating it in place: encoders left
        # over from an earlier fit on different columns would otherwise make
        # transform() look up columns that are no longer part of the input.
        self.label_encoders = {col: LabelEncoder().fit(X[col]) for col in X.columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted column replaced by its integer codes.

        Parameters
        ----------
        X : DataFrame
            Must contain every column seen during ``fit``.

        Returns
        -------
        DataFrame
            Copy of ``X``; the caller's frame is not modified.
        """
        X_copy = X.copy()
        for col, le in self.label_encoders.items():
            X_copy[col] = le.transform(X_copy[col])
        return X_copy

# Custom transformer for Autoencoder feature extraction
class AutoencoderTransformer(BaseEstimator, TransformerMixin):
    """Pass features through an MLP that was trained to reproduce its own input.

    ``fit`` trains an ``MLPRegressor`` with ``X`` as both the input and the
    target. ``transform`` returns that network's prediction for ``X``, i.e.
    the reconstruction of the input. It does not return the hidden-layer
    activations, so the output has one column per input feature.

    Parameters
    ----------
    hidden_layer_sizes : tuple, default=(5,)
    activation : str, default='relu'
    solver : str, default='adam'
    max_iter : int, default=1000
    random_state : int, default=42
        All five are forwarded unchanged to ``MLPRegressor``.

    Attributes
    ----------
    autoencoder : MLPRegressor or None
        The fitted network; ``None`` until ``fit`` has been called.
    """

    def __init__(self, hidden_layer_sizes=(5,), activation='relu', solver='adam', max_iter=1000, random_state=42):
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.solver = solver
        self.max_iter = max_iter
        self.random_state = random_state
        self.autoencoder = None

    def fit(self, X, y=None):
        """Train a new ``MLPRegressor`` to map ``X`` onto itself.

        ``y`` is ignored; it exists only for estimator API compatibility.
        Returns ``self``.
        """
        self.autoencoder = MLPRegressor(hidden_layer_sizes=self.hidden_layer_sizes, activation=self.activation,
                                        solver=self.solver, max_iter=self.max_iter, random_state=self.random_state)
        self.autoencoder.fit(X, X)
        return self

    def transform(self, X):
        """Return the trained network's reconstruction of ``X`` as a 2-D array.

        Raises
        ------
        NotFittedError
            If called before ``fit``. ``NotFittedError`` derives from both
            ``ValueError`` and ``AttributeError``, so code that caught the
            previous ``AttributeError`` keeps working.
        """
        if self.autoencoder is None:
            raise NotFittedError(
                "This AutoencoderTransformer instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        reconstruction = np.asarray(self.autoencoder.predict(X))
        # With a single input feature the network has a single output, and
        # MLPRegressor.predict then returns a flat 1-D array. Downstream
        # pipeline steps need (n_samples, n_features), so restore the column axis.
        if reconstruction.ndim == 1:
            reconstruction = reconstruction.reshape(-1, 1)
        return reconstruction

In [3]:
# Read the dataset from its CSV file into a DataFrame
DATA_PATH = "synthetic_veg_crop_data (1).csv"
data = pd.read_csv(DATA_PATH)

In [4]:
# Pop 'Yield' and re-append it so the target is always the last column
move = data.pop('Yield')
data['Yield'] = move

# Every column left of the final one is a feature (X); the final one is the target (y)
target_position = data.shape[1] - 1
X = data.iloc[:, :target_position]
y = data.iloc[:, target_position]

In [5]:
# Partition the feature columns by dtype: object -> categorical, everything else -> numeric
categorical_columns = X.select_dtypes(include='object').columns
numeric_columns = X.select_dtypes(exclude='object').columns

# Route each column group through its own transformer:
# label-encode the categorical columns, standardise the numeric ones
preprocessor = ColumnTransformer([
    ('cat', LabelEncoderPipelineFriendly(), categorical_columns),
    ('num', StandardScaler(), numeric_columns),
])

# Chain preprocessing -> autoencoder -> random forest into a single estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('autoencoder', AutoencoderTransformer()),
    ('regressor', RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)),
])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit every pipeline stage on the training rows only
pipeline.fit(X_train, y_train)

In [7]:
# Run the held-out features through the fitted pipeline to get predictions
y_pred = pipeline.predict(X=X_test)

In [8]:
# Score the held-out predictions with three regression metrics
mse, mae, r2 = (
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report one metric per line
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R-squared: {r2}',
    sep='\n',
)

Mean Squared Error: 1.2519057383896512
Mean Absolute Error: 0.6174548966741473
R-squared: 0.9945911035627988


In [9]:
# Persist the fitted pipeline to disk.
# NOTE: the pipeline contains the custom transformer classes defined in this
# notebook; pickle stores classes by reference, so whoever loads the file
# needs the same class definitions available under the same module name.
joblib.dump(value=pipeline, filename='hybrid_model.pkl')

['hybrid_model.pkl']