In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import numpy as np

# Step 2: Load the dataset (replace with your dataset)
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tea project/tea_cultivation_sri_lanka_dataset.csv')

# Step 3: Preprocess the data
# Convert 'Soil type' into numerical format using one-hot encoding
df = pd.get_dummies(df, columns=['Soil type'], drop_first=True)

# Label encode 'Month' column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Month'] = le.fit_transform(df['Month'])

# Step 4: Separate the features (X) and the target variable (y)
X = df.drop(columns=['Yield (kg/ha)'])
y = df['Yield (kg/ha)']

# Step 5: Feature scaling using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 6: Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 7: Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Step 8: Train the Random Forest Regressor model with tuned parameters
rf_model = RandomForestRegressor(**best_params, random_state=42)
rf_model.fit(X_train, y_train)

# Step 9: Make predictions
y_pred = rf_model.predict(X_test)

# Step 10: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Step 11: Calculate Adjusted R-squared
n = len(y_test)
p = X_test.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Step 12: Calculate Mean Absolute Percentage Error (MAPE) for accuracy
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Output results
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")
print(f"Adjusted R-squared: {adjusted_r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")

# Optional: Feature importance
importances = rf_model.feature_importances_
feature_names = df.drop(columns=['Yield (kg/ha)']).columns

# Display feature importance
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)

# Step 13: Cross-validation (Optional for better evaluation)
cv_scores = cross_val_score(rf_model, X_scaled, y, cv=5, scoring='r2')
print(f"Cross-validated R-squared scores: {cv_scores}")
print(f"Mean cross-validated R-squared: {np.mean(cv_scores)}")


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Mean Squared Error (MSE): 21723.3335989924
Mean Absolute Error (MAE): 127.51013978588186
R-squared: -0.018505854133513022
Adjusted R-squared: -0.10804483032107459
Mean Absolute Percentage Error (MAPE): 8.136856473159535%
                Feature  Importance
1          Humidity (%)    0.219780
0               Soil pH    0.187074
4                 Month    0.174690
2     Temperature (Â°C)    0.154579
3        Sunlight Hours    0.095002
5     Plant Age (years)    0.086366
6       Soil type_Loamy    0.050180
7  Soil type_Sandy Loam    0.032329
Cross-validated R-squared scores: [-0.19873779  0.00792888 -0.07903433  0.01081214 -0.04750267]
Mean cross-validated R-squared: -0.061306753628776266


In [None]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Step 2: Load the dataset (replace with your dataset)
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tea project/tea_cultivation_sri_lanka_dataset.csv')

# Step 3: Preprocess the data
# Convert 'Soil type' into numerical format using one-hot encoding
df = pd.get_dummies(df, columns=['Soil type'], drop_first=True)

# Label encode 'Month' column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Month'] = le.fit_transform(df['Month'])

# Step 4: Clustering using K-Means (Unsupervised Learning)
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust number of clusters if needed
df['Cluster'] = kmeans.fit_predict(df.drop(columns=['Yield (kg/ha)']))

# Step 5: Separate the features (X) and the target variable (y)
X = df.drop(columns=['Yield (kg/ha)'])
y = df['Yield (kg/ha)']

# Step 6: Feature scaling using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 7: Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 8: Train the Gradient Boosting Regressor model (Supervised Learning)
gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
gbr.fit(X_train, y_train)

# Step 9: Make predictions
y_pred = gbr.predict(X_test)

# Step 10: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Adjusted R-squared calculation
n = len(y_test)
p = X_test.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Mean Absolute Percentage Error (MAPE)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Output results
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")
print(f"Adjusted R-squared: {adjusted_r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")

# Optional: Cross-validation
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(gbr, X_scaled, y, cv=5, scoring='r2')
print(f"Cross-validated R-squared scores: {cv_scores}")
print(f"Mean cross-validated R-squared: {np.mean(cv_scores)}")


Mean Squared Error (MSE): 29334.238559087076
Mean Absolute Error (MAE): 148.48444212120341
R-squared: -0.37534571122937654
Adjusted R-squared: -0.5128802823523144
Mean Absolute Percentage Error (MAPE): 9.537879887029131%
Cross-validated R-squared scores: [-0.70494759 -0.44430271 -0.46429116 -0.17312398 -0.24056213]
Mean cross-validated R-squared: -0.40544551397362394


In [None]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cluster import KMeans
import numpy as np

# Step 2: Load the dataset (replace with your dataset)
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tea project/tea_cultivation_sri_lanka_dataset.csv')

# Step 3: Preprocess the data
# Convert 'Soil type' into numerical format using one-hot encoding
df = pd.get_dummies(df, columns=['Soil type'], drop_first=True)

# Label encode 'Month' column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Month'] = le.fit_transform(df['Month'])

# Step 4: Clustering using K-Means (Unsupervised Learning)
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust number of clusters if needed
df['Cluster'] = kmeans.fit_predict(df.drop(columns=['Yield (kg/ha)']))

# Step 5: Separate the features (X) and the target variable (y)
X = df.drop(columns=['Yield (kg/ha)'])
y = df['Yield (kg/ha)']

# Step 6: Feature scaling using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 7: Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 8: Train the Gradient Boosting Regressor model (Supervised Learning)
gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
gbr.fit(X_train, y_train)

# Step 9: Make predictions
y_pred = gbr.predict(X_test)

# Step 10: Calculate accuracy percentage based on MAPE
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
accuracy = 100 - mape

# Output only the accuracy percentage
print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 90.46%


In [None]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cluster import KMeans
import numpy as np
import joblib  # For saving the model
from google.colab import files  # To download the file in Google Colab

# Step 2: Load the dataset (replace with your dataset)
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tea project/tea_cultivation_sri_lanka_dataset.csv')

# Step 3: Preprocess the data
# Convert 'Soil type' into numerical format using one-hot encoding
df = pd.get_dummies(df, columns=['Soil type'], drop_first=True)

# Label encode 'Month' column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Month'] = le.fit_transform(df['Month'])

# Step 4: Clustering using K-Means (Unsupervised Learning)
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust number of clusters if needed
df['Cluster'] = kmeans.fit_predict(df.drop(columns=['Yield (kg/ha)']))

# Step 5: Separate the features (X) and the target variable (y)
X = df.drop(columns=['Yield (kg/ha)'])
y = df['Yield (kg/ha)']

# Step 6: Feature scaling using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 7: Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 8: Train the Gradient Boosting Regressor model (Supervised Learning)
gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
gbr.fit(X_train, y_train)

# Step 9: Make predictions
y_pred = gbr.predict(X_test)

# Step 10: Calculate accuracy percentage based on MAPE
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
accuracy = 100 - mape

# Output only the accuracy percentage
print(f"Accuracy: {accuracy:.2f}%")

# Step 11: Save the trained model to a file
model_filename = 'gbr_tea_yield_model.pkl'
joblib.dump(gbr, model_filename)

# Step 12: Download the model file
files.download(model_filename)


Accuracy: 90.46%


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cluster import KMeans
import numpy as np
import joblib
from google.colab import files

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tea project/tea_cultivation_sri_lanka_dataset.csv')

df = pd.get_dummies(df, columns=['Soil type'], drop_first=True)

le = LabelEncoder()
df['Month'] = le.fit_transform(df['Month'])

kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(df.drop(columns=['Yield (kg/ha)']))

X = df.drop(columns=['Yield (kg/ha)'])
y = df['Yield (kg/ha)']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)

mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
accuracy = 100 - mape

print(f"Accuracy: {accuracy:.2f}%")

model_filename = 'gbr_tea_yield_model.pkl'
label_encoder_filename = 'label_encoder_month.pkl'
scaler_filename = 'scaler.pkl'

joblib.dump(gbr, model_filename)

joblib.dump(le, label_encoder_filename)

joblib.dump(scaler, scaler_filename)

files.download(model_filename)
files.download(label_encoder_filename)
files.download(scaler_filename)


Accuracy: 90.46%


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cluster import KMeans
import numpy as np
import joblib
from google.colab import files

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tea project/tea_cultivation_sri_lanka_dataset.csv')

df = pd.get_dummies(df, columns=['Soil type'], drop_first=True)

le = LabelEncoder()
df['Month'] = le.fit_transform(df['Month'])

kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(df.drop(columns=['Yield (kg/ha)']))

X = df.drop(columns=['Yield (kg/ha)'])
y = df['Yield (kg/ha)']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
gbr.fit(X_train, y_train)

joblib.dump(gbr, 'gbr_tea_yield_model.pkl')

joblib.dump(kmeans, 'kmeans_model.pkl')

joblib.dump(le, 'label_encoder_month.pkl')

joblib.dump(scaler, 'scaler.pkl')

files.download('gbr_tea_yield_model.pkl')
files.download('kmeans_model.pkl')
files.download('label_encoder_month.pkl')
files.download('scaler.pkl')

def predict_tea_yield(soil_type, soil_pH, humidity, temperature, sunlight_hours, month, plant_age):
    user_data = pd.DataFrame({
        'Soil pH': [soil_pH],
        'Humidity (%)': [humidity],
        'Temperature (Â°C)': [temperature],
        'Sunlight Hours': [sunlight_hours],
        'Month': [month],
        'Plant Age (years)': [plant_age],
        'Soil type_Loamy': [1 if soil_type == 'Loamy' else 0],
        'Soil type_Sandy Loam': [1 if soil_type == 'Sandy Loam' else 0]
    })

    user_data['Month'] = le.transform(user_data['Month'])

    cluster_features = ['Soil pH', 'Humidity (%)', 'Temperature (Â°C)', 'Sunlight Hours', 'Month', 'Plant Age (years)',
                        'Soil type_Loamy', 'Soil type_Sandy Loam']
    user_data['Cluster'] = kmeans.predict(user_data[cluster_features])

    user_data_scaled = scaler.transform(user_data)

    predicted_yield = gbr_model.predict(user_data_scaled)

    return predicted_yield[0]

gbr_model = joblib.load('/content/drive/MyDrive/Colab Notebooks/tea project/gbr_tea_yield_model.pkl')
le = joblib.load('/content/drive/MyDrive/Colab Notebooks/tea project/label_encoder_month.pkl')
scaler = joblib.load('/content/drive/MyDrive/Colab Notebooks/tea project/scaler.pkl')
kmeans = joblib.load('/content/drive/MyDrive/Colab Notebooks/tea project/kmeans_model.pkl')

soil_type_input = 'Loamy'
soil_pH_input = 6.5
humidity_input = 75
temperature_input = 23
sunlight_hours_input = 5
month_input = 'August'
plant_age_input = 4

predicted_yield = predict_tea_yield(soil_type_input, soil_pH_input, humidity_input, temperature_input, sunlight_hours_input, month_input, plant_age_input)

print(f"Predicted Tea Yield (kg/ha): {predicted_yield:.2f}")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Predicted Tea Yield (kg/ha): 1567.10


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cluster import KMeans
import numpy as np
import joblib

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/files/tea/tea_cultivation_sri_lanka_dataset.csv')

df = pd.get_dummies(df, columns=['Soil type'], drop_first=True)

le = LabelEncoder()
df['Month'] = le.fit_transform(df['Month'])

kmeans = KMeans(n_clusters=3, random_state=42)
df['Cluster'] = kmeans.fit_predict(df.drop(columns=['Yield (kg/ha)']))

X = df.drop(columns=['Yield (kg/ha)'])
y = df['Yield (kg/ha)']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
gbr.fit(X_train, y_train)

joblib.dump(gbr, 'gbr_tea_yield_model.pkl')
joblib.dump(kmeans, 'kmeans_model.pkl')
joblib.dump(le, 'label_encoder_month.pkl')
joblib.dump(scaler, 'scaler.pkl')

def predict_tea_yield(soil_type, soil_pH, humidity, temperature, sunlight_hours, month, plant_age):

    user_data = pd.DataFrame({
        'Soil pH': [soil_pH],
        'Humidity (%)': [humidity],
        'Temperature (Â°C)': [temperature],
        'Sunlight Hours': [sunlight_hours],
        'Month': [month],
        'Plant Age (years)': [plant_age],
        'Soil type_Loamy': [1 if soil_type == 'Loamy' else 0],
        'Soil type_Sandy Loam': [1 if soil_type == 'Sandy Loam' else 0]
    })

    user_data['Month'] = le.transform(user_data['Month'])

    cluster_features = ['Soil pH', 'Humidity (%)', 'Temperature (Â°C)', 'Sunlight Hours', 'Month', 'Plant Age (years)',
                        'Soil type_Loamy', 'Soil type_Sandy Loam']
    user_data['Cluster'] = kmeans.predict(user_data[cluster_features])

    user_data_scaled = scaler.transform(user_data)

    predicted_yield = gbr_model.predict(user_data_scaled)

    return predicted_yield[0]

gbr_model = joblib.load('/content/drive/MyDrive/Colab Notebooks/files/tea/gbr_tea_yield_model.pkl')
le = joblib.load('/content/drive/MyDrive/Colab Notebooks/files/tea/label_encoder_month.pkl')
scaler = joblib.load('/content/drive/MyDrive/Colab Notebooks/files/tea/scaler.pkl')
kmeans = joblib.load('/content/drive/MyDrive/Colab Notebooks/files/tea/kmeans_model.pkl')

print("Enter the following details to predict tea yield:")

soil_type_input = input("Soil Type (Loamy/Sandy Loam): ")
soil_pH_input = float(input("Soil pH (e.g., 6.5): "))
humidity_input = float(input("Humidity (%) (e.g., 75): "))
temperature_input = float(input("Temperature (°C) (e.g., 23): "))
sunlight_hours_input = float(input("Sunlight Hours (e.g., 5): "))
month_input = input("Month (e.g., August): ")
plant_age_input = float(input("Plant Age (years) (e.g., 4): "))

predicted_yield = predict_tea_yield(soil_type_input, soil_pH_input, humidity_input, temperature_input, sunlight_hours_input, month_input, plant_age_input)

print(f"\nPredicted Tea Yield (kg/ha): {predicted_yield:.2f}")
