# Random Forest Regression

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Read the crop dataset from the CSV file into a DataFrame
csv_path = "synthetic_veg_crop_data (1).csv"
data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows to check the columns and their values
data.head(n=5)

Unnamed: 0,S. no.,Name,Soil_Type,Fertility,Photoperiod,N-P-K Ratio,Temperature,Rainfall,pH,Light_Hours,Light_Intensity,Rh,Yield,Season
0,1,Tomatoes,"Fertile, well-drained loam or sandy loam soil",High,Short Day Period,10:10:10,26.483577,555.640013,6.723995,6.161744,459.558388,55.174597,59.082461,Summer
1,1,Tomatoes,"Fertile, well-drained loam or sandy loam soil",High,Short Day Period,10:10:10,24.658866,548.369437,6.630615,7.341362,403.734739,58.475277,61.836617,Summer
2,1,Tomatoes,"Fertile, well-drained loam or sandy loam soil",High,Short Day Period,10:10:10,23.036639,548.656752,6.228059,6.155745,567.426957,53.527221,62.014527,Spring
3,1,Tomatoes,"Fertile, well-drained loam or sandy loam soil",High,Short Day Period,10:10:10,25.187162,588.83248,5.832263,6.274787,485.063743,59.937303,60.940745,Fall
4,1,Tomatoes,"Fertile, well-drained loam or sandy loam soil",High,Short Day Period,10:10:10,25.678596,504.97938,5.836229,6.185052,637.59025,56.769625,64.556501,Spring


In [5]:
# Relocate the target: take 'Yield' out of the frame, then re-insert it
# after the last remaining column so it ends up at the far right.
move = data.pop('Yield')
data.insert(len(data.columns), 'Yield', move)

In [6]:
# Split data into features (X) and target (y).
# Columns are selected by name rather than by position, so this cell does not
# depend on 'Yield' already having been moved to the end of the frame.
# 'Unnamed: 0' appears to be the row index that was written into the CSV (it
# counts 0, 1, 2, ... in the data.head() output); it identifies a row rather
# than describing the crop or its environment, so it is kept out of the features.
# errors='ignore' keeps the cell working if that column is absent.
X = data.drop(columns=['Yield', 'Unnamed: 0'], errors='ignore')
y = data['Yield']

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer for Label Encoding
class LabelEncoderPipelineFriendly(BaseEstimator, TransformerMixin):
    """Label-encode every column of a DataFrame with its own ``LabelEncoder``.

    ``fit`` learns one encoder per column of the frame it receives and
    ``transform`` replaces each of those columns with its integer codes, so the
    class can be used as a step inside a ``Pipeline`` / ``ColumnTransformer``.

    NOTE(review): scikit-learn documents ``LabelEncoder`` as a target (``y``)
    encoder; ``OrdinalEncoder`` is the feature-oriented equivalent. The
    per-column ``LabelEncoder`` approach is kept here so the transformer's
    attributes stay as they were.

    Attributes:
        label_encoders: dict mapping column name -> fitted ``LabelEncoder``.
    """

    def __init__(self):
        # Empty until fit() is called; fit() rebuilds it from scratch.
        self.label_encoders = {}

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per column of ``X``.

        Args:
            X: DataFrame whose columns are all to be label-encoded
               (``X.columns`` is read, so a plain array is not supported).
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            self, to allow call chaining.
        """
        # Build a fresh mapping on every fit. Updating the existing dict would
        # keep encoders for columns from a previous fit, and transform() would
        # then fail with a KeyError on columns the new data does not have.
        self.label_encoders = {
            col: LabelEncoder().fit(X[col]) for col in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted column replaced by its codes.

        Args:
            X: DataFrame containing (at least) the columns seen during ``fit``.

        Returns:
            A new DataFrame; ``X`` itself is not modified.

        Raises:
            ValueError: if a column contains a label that was not seen during
                ``fit``; the message names the offending column.
        """
        X_copy = X.copy()
        for col, le in self.label_encoders.items():
            try:
                X_copy[col] = le.transform(X_copy[col])
            except ValueError as exc:
                # Same exception type as before, but say which column failed.
                raise ValueError(f"Column {col!r}: {exc}") from exc
        return X_copy

In [8]:
# Column groups: object-dtype columns are treated as categorical,
# every other column as numeric.
categorical_columns = X.select_dtypes(include=['object']).columns
numeric_columns = X.select_dtypes(exclude=['object']).columns

# Categorical branch: a one-step pipeline wrapping the custom label encoder
categorical_transformer = Pipeline(steps=[
    ('label_encoder', LabelEncoderPipelineFriendly()),
])

# Route each column group to its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns),
        ('num', StandardScaler(), numeric_columns),
    ]
)

# Full model: preprocessing followed by a random forest regressor
random_forest = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', random_forest),
])

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit on the training portion, then predict the held-out rows
# (Pipeline.fit returns the fitted pipeline, so the two calls can be chained)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [10]:
# Score the held-out predictions with three regression metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Report the metrics, one per line
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R-squared: {r2}',
    sep='\n',
)

Mean Squared Error: 0.7686847921383441
Mean Absolute Error: 0.5343487330757495
R-squared: 0.9966788742107086


In [11]:
import joblib

# Persist the fitted pipeline to disk; the call stays the cell's last
# expression so its return value is still displayed
model_path = 'randomforest.pkl'
joblib.dump(pipeline, model_path)

['randomforest.pkl']