In [2]:
import pandas as pd

# Load the uploaded dataset
file_path = r'C:\Users\Ashish\Desktop\Analysis_project\cleaned_dataset_1.xlsx'
data = pd.read_excel(file_path)



In [4]:
# Ensure 'Headwater_status_encoded' is properly added to the dataset
data['Headwater_status_encoded'] = data['Headwater_status'].map({'Yes': 1, 'No': 0})

# Check if the encoding was successful
print(data[['Headwater_status', 'Headwater_status_encoded']].head())

# Preparing data for modeling
X = data[['EOT', 'Headwater_status_encoded']]  # Features
y_biotic = data['Biotic_Index']  # Target for Biotic Index
y_simpson = data['Simpson_Diversity_Index']  # Target for Simpson Index

# Splitting data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train_biotic, y_test_biotic = train_test_split(
    X, y_biotic, test_size=0.2, random_state=42
)

X_train, X_test, y_train_simpson, y_test_simpson = train_test_split(
    X, y_simpson, test_size=0.2, random_state=42
)

# Verifying split
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Biotic Index training labels shape:", y_train_biotic.shape)
print("Simpson Index training labels shape:", y_train_simpson.shape)


  Headwater_status  Headwater_status_encoded
0               No                       0.0
1               No                       0.0
2               No                       0.0
3               No                       0.0
4               No                       0.0
Training features shape: (244, 2)
Testing features shape: (61, 2)
Biotic Index training labels shape: (244,)
Simpson Index training labels shape: (244,)


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

# Initialize Random Forest models
rf_biotic = RandomForestRegressor(random_state=42)
rf_simpson = RandomForestRegressor(random_state=42)

# Train models
rf_biotic.fit(X_train, y_train_biotic)
rf_simpson.fit(X_train, y_train_simpson)

# Predict on the test set
y_pred_biotic = rf_biotic.predict(X_test)
y_pred_simpson = rf_simpson.predict(X_test)

# Evaluate models
biotic_rmse = mean_squared_error(y_test_biotic, y_pred_biotic, squared=False)
biotic_r2 = r2_score(y_test_biotic, y_pred_biotic)

simpson_rmse = mean_squared_error(y_test_simpson, y_pred_simpson, squared=False)
simpson_r2 = r2_score(y_test_simpson, y_pred_simpson)

print("Biotic Index RMSE:", biotic_rmse, "R2:", biotic_r2)
print("Simpson Diversity Index RMSE:", simpson_rmse, "R2:", simpson_r2)


ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Plot predictions vs actual values
plt.figure(figsize=(12, 6))

# Biotic Index
plt.subplot(1, 2, 1)
plt.scatter(y_test_biotic, y_pred_biotic, alpha=0.7, edgecolor='k')
plt.plot([y_test_biotic.min(), y_test_biotic.max()], [y_test_biotic.min(), y_test_biotic.max()], 'r--')
plt.title('Biotic Index: Actual vs Predicted')
plt.xlabel('Actual Biotic Index')
plt.ylabel('Predicted Biotic Index')

# Simpson Diversity Index
plt.subplot(1, 2, 2)
plt.scatter(y_test_simpson, y_pred_simpson, alpha=0.7, edgecolor='k')
plt.plot([y_test_simpson.min(), y_test_simpson.max()], [y_test_simpson.min(), y_test_simpson.max()], 'r--')
plt.title('Simpson Index: Actual vs Predicted')
plt.xlabel('Actual Simpson Index')
plt.ylabel('Predicted Simpson Index')

plt.tight_layout()
plt.show()

# Time-series-like visualization (if you have a temporal variable)
# If you don't have a time variable, these plots will be sorted by prediction index
plt.figure(figsize=(12, 6))

# Biotic Index over prediction index
plt.subplot(1, 2, 1)
plt.plot(np.arange(len(y_test_biotic)), y_test_biotic.values, label='Actual', marker='o')
plt.plot(np.arange(len(y_pred_biotic)), y_pred_biotic, label='Predicted', marker='x')
plt.title('Biotic Index Predictions Over Time')
plt.xlabel('Index')
plt.ylabel('Biotic Index')
plt.legend()

# Simpson Diversity Index over prediction index
plt.subplot(1, 2, 2)
plt.plot(np.arange(len(y_test_simpson)), y_test_simpson.values, label='Actual', marker='o')
plt.plot(np.arange(len(y_pred_simpson)), y_pred_simpson, label='Predicted', marker='x')
plt.title('Simpson Index Predictions Over Time')
plt.xlabel('Index')
plt.ylabel('Simpson Index')
plt.legend()

plt.tight_layout()
plt.show()
