In [1]:
import pandas as pd

# Read the combined field-form cover workbook into a DataFrame.
cover_df = pd.read_excel('combined_sorted_fieldform_cover.xlsx')

# Report the number of null entries in each column.
print("\nMissing values in cover data:", cover_df.isnull().sum(), sep="\n")


Missing values in cover data:
plot_name                 0
date                      0
fieldform                 0
lat                       0
long                      0
transect                  0
sampling_point            0
functional_group       3600
presence                 14
g                        13
ng                       35
g%                        3
ng%                    3620
source_file               0
presence.1           283200
unnamed: 13          286799
functional groups    283200
unnamed: 14          286799
dtype: int64


In [2]:
# Feature columns used downstream; rows missing any of them (or the g% target)
# are discarded at the end of the chain below.
X_cols = ['functional_group', 'presence', 'g', 'ng', 'sampling_point', 'transect']

cover_df = (
    cover_df
    # First pass: rows that lack the core measurements.
    .dropna(subset=['functional_group', 'presence', 'g', 'ng'])
    # Extra columns that are not needed; errors='ignore' tolerates them being absent.
    .drop(columns=['presence.1', 'unnamed: 13', 'functional groups', 'unnamed: 14'], errors='ignore')
    # Turn the "NotApp" placeholder into a missing value so the next dropna sees it.
    .replace("NotApp", pd.NA)
    # Second pass: anything still missing in the features or the target.
    .dropna(subset=X_cols + ['g%'])
)

print("\nMissing values in cover data after cleaning:", cover_df.isnull().sum(), sep="\n")


Missing values in cover data after cleaning:
plot_name           0
date                0
fieldform           0
lat                 0
long                0
transect            0
sampling_point      0
functional_group    0
presence            0
g                   0
ng                  0
g%                  0
ng%                 8
source_file         0
dtype: int64


In [3]:
from sklearn.model_selection import train_test_split

# The target must be present for a row to be usable.
has_target = cover_df['g%'].notna()
cover_df = cover_df[has_target]

# Target (y) and predictors (X).
y = cover_df['g%']
X = cover_df[[
    'functional_group',
    'presence',
    'g',
    'ng',
    'sampling_point',
    'transect',
]]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

# Preprocessing: one-hot encode the categorical columns and pass the numeric
# columns through unchanged.
# NOTE(review): 'g' and 'ng' are used to predict 'g%'. If 'g%' is computed from
# those two columns, this is target leakage and the reported test error is
# over-optimistic -- confirm how 'g%' is derived.
categorical_features = ['functional_group', 'transect']
numeric_features = ['presence', 'g', 'ng', 'sampling_point']

preprocessor = ColumnTransformer([
    # handle_unknown='ignore': a category not seen during fit is encoded as
    # all zeros instead of raising at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', 'passthrough', numeric_features)
])

# Pipeline: preprocessing followed by a seeded random-forest regressor.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Fit on the training split only.
model.fit(X_train, y_train)

# Score on the held-out split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse:.2f}")

Test MSE: 0.01


In [5]:
# Print actual vs. predicted target values for the first few test samples.
# The count is capped at the size of the test split so that a split with fewer
# than 10 rows does not raise IndexError.
n_preview = min(10, len(y_test))
for i in range(n_preview):
    actual = round(y_test.iloc[i], 2)
    predicted = round(y_pred[i], 2)
    print(f"Sample {i+1}: Actual = {actual}, Predicted = {predicted}")


Sample 1: Actual = 0.25, Predicted = 0.3
Sample 2: Actual = 0, Predicted = 0.0
Sample 3: Actual = 0.2, Predicted = 0.28
Sample 4: Actual = 0.2, Predicted = 0.22
Sample 5: Actual = 0.03, Predicted = 0.13
Sample 6: Actual = 0.05, Predicted = 0.07
Sample 7: Actual = 0, Predicted = 0.2
Sample 8: Actual = 0, Predicted = 0.22
Sample 9: Actual = 0, Predicted = 0.0
Sample 10: Actual = 0, Predicted = 0.06


In [6]:
import joblib
# Serialize the fitted pipeline (preprocessor + regressor) to model.pkl.
joblib.dump(value=model, filename='model.pkl')

['model.pkl']

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Cast these four columns to int before refitting.
for int_column in ('presence', 'g', 'ng', 'sampling_point'):
    cover_df[int_column] = cover_df[int_column].astype(int)

# Predictor columns and regression target.
target = 'g%'
features = [
    'functional_group',
    'presence',
    'g',
    'ng',
    'sampling_point',
    'transect',
]

X = cover_df[features]
y = cover_df[target]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One-hot encode the two categorical columns; all remaining columns pass
# through unchanged (remainder='passthrough').
categorical_columns = ['functional_group', 'transect']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='passthrough',
)

# Preprocessing followed by a seeded random-forest regressor.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Fit on the training split.
model.fit(X_train, y_train)

# Write the refitted pipeline to model.pkl (replaces any existing file of that name).
joblib.dump(model, 'model.pkl')
print("✅ Model retrained and saved successfully.")

✅ Model retrained and saved successfully.
