In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer

In [2]:
# Set seed for reproducibility
np.random.seed(42)

# Step 1: Simulate features with different sample sizes
sample_sizes = {
    'soil_moisture': 1200,
    'rainfall': 950,
    'temperature': 1000,
    'ndvi': 850
}

max_len = max(sample_sizes.values())
# Create features with NaN padding
features = {}
for name, size in sample_sizes.items():
    data = np.random.rand(size)
    if size < max_len:
        data = np.concatenate([data, [np.nan] * (max_len - size)])
    features[name] = data

df = pd.DataFrame(features)



We define a dictionary of synthetic "features" and their sample sizes.

This simulates how much data we have for each environmental variable.

    soil_moisture: 1200 samples → complete

    rainfall: 950 samples → incomplete

    temperature: 1000 samples → complete

    ndvi: 850 samples → incomplete
    max_len = max(sample_sizes.values())

    Determine the maximum length (1200) so we can pad all shorter features with NaN up to this size.
    For each feature:

    Generate size number of random values between 0 and 1 using np.random.rand(size).

    If the feature's sample size is less than max_len:

        Append NaN values until its length becomes equal to the maximum.

        This simulates missing data.

    Add the padded array to the features dictionary.

In [3]:
# Step 2: Mark features with <1000 valid entries as missing
missing_features = [col for col in df.columns if df[col].count() < 1000]
print("Features with missing data:", missing_features)


Features with missing data: ['rainfall', 'ndvi']


In [4]:
# Step 3: Impute missing values (e.g., with mean)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(df)

imputer = SimpleImputer(strategy='mean')

    We create an imputer object that will replace missing values using the mean of each column (feature).

    You can also change the strategy to:

        'median' → for skewed data.

        'most_frequent' → for categorical data.

        'constant' → fill with a specific number.
        X = imputer.fit_transform(df)

    We apply the imputer to our DataFrame df, which contains missing values.

    fit_transform() does two things:

        Fit: Calculates the mean of each column (ignoring NaNs).

        Transform: Replaces all NaN values in that column with the computed mean.

In [5]:
# Step 4: Simulate a target variable (y) based on available features
features_imputed = pd.DataFrame(X, columns=df.columns)

score = (
    0.3 * features_imputed['soil_moisture'] +
    0.2 * features_imputed['rainfall'] -
    0.4 * np.abs(features_imputed['temperature'] - 25) +
    50 * features_imputed['ndvi']
)
# Use percentiles for balanced classes
bins = np.percentile(score, [33, 66])

y = np.digitize(score, bins=[50, 100])  # 0: low, 1: medium, 2: high

y = np.digitize(score, bins=[50, 100])

    This line converts the continuous score into discrete classes (labels) for classification.

    How np.digitize works:

        bins=[50, 100] defines the thresholds.

        Values <= 50 get class 0 (low yield class).

        Values > 50 and <= 100 get class 1 (medium yield class).

        Values > 100 get class 2 (high yield class).

In [6]:
# Check balance
print("Target class distribution:", np.unique(y, return_counts=True))


Target class distribution: (array([0], dtype=int64), array([1200], dtype=int64))


In [7]:
# Step 5: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [8]:
# Step 6: Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [9]:
# Step 7: Optimize KNN with cross-validation
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, cv=5)
grid.fit(X_train, y_train)


GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [3, 5, 7],
                         'weights': ['uniform', 'distance']})

In [10]:
# Step 8: Evaluate the model
best_knn = grid.best_estimator_
y_pred = best_knn.predict(X_test)


In [13]:
print("\nBest Parameters:", grid.best_params_)
print("Best Cross-Validation Score:", grid.best_score_)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Best Parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best Cross-Validation Score: 1.0
Test Accuracy: 1.0

Confusion Matrix:
 [[240]]



Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       240

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240



In [17]:
# Step 9: Predict new unseen crop data (optional)
test_data = pd.DataFrame({
    'soil_moisture': np.random.rand(10),
    'rainfall': np.random.rand(10),
    'temperature': np.random.rand(10) * 40,
    'ndvi': np.random.rand(10)
})

test_data_imputed = imputer.transform(test_data)
test_data_scaled = scaler.transform(test_data_imputed)
y_pred_missing = best_knn.predict(test_data_scaled)
test_data['predicted_crop_class'] = y_pred_missing

print("\nPredictions for New Data:\n", test_data)



Predictions for New Data:
    soil_moisture  rainfall  temperature      ndvi  predicted_crop_class
0       0.924073  0.425998    18.852331  0.725287                     0
1       0.820324  0.967743    18.900618  0.360394                     0
2       0.561864  0.657765    15.128981  0.087071                     0
3       0.671406  0.564395    20.691739  0.202149                     0
4       0.926308  0.932957    16.524296  0.545490                     0
5       0.619168  0.586512     7.973820  0.207057                     0
6       0.824807  0.526083    16.494659  0.543973                     0
7       0.688811  0.759311     2.891078  0.231599                     0
8       0.551936  0.712666    23.031435  0.848924                     0
9       0.241838  0.193160    34.146182  0.397271                     0
