In [1]:
import pandas as pd

# Path to the crop dataset workbook inside the Kaggle input directory.
DATA = '/kaggle/input/crop-predication-dataset-based-on-nashik-region/Crop_Predication_dataset.xlsx'

# Read the workbook into a DataFrame and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column that looks like a
# saved row counter rather than a measurement; keep it out of any feature set.
df = pd.read_excel(io=DATA)
df.head(n=5)

Unnamed: 0,ph,EC,CaCO3,OC,N,P,K,label
0,5.6,0.33,6.0,0.16,125.0,37.0,200.0,bajra
1,7.5,0.82,7.0,0.15,132.0,35.0,169.0,bajra
2,5.6,0.212,3.0,0.19,145.0,38.0,122.0,bajra
3,6.9,0.469,9.0,0.14,124.0,38.0,134.0,bajra
4,7.3,0.649,6.0,0.18,125.0,35.0,162.0,bajra


Are our classes balanced?

In [2]:
# Count the rows per crop, then transpose so the tally reads as one wide row.
label_counts = df['label'].value_counts()
label_counts.to_frame().T

label,grapes,jowar,bajra,cabbage,maize,potato,sugarcane,wheat,pomegranate,soybean,cotton,tomato,rice,onion,cauliflower
count,116,102,101,101,101,101,101,101,100,85,84,55,51,34,6


Eleven of our classes have roughly 100 instances each (give or take 16), while four (tomato, rice, onion, and cauliflower) are poorly represented. We may be in for a challenge.

First let's use dimensionality reduction to see how our data clusters.

In [3]:
from sklearn.manifold import TSNE

# Embed the soil measurements only. Dropping just 'label' would also feed the
# 'Unnamed: 0' row counter to t-SNE; the rows appear to be grouped by crop
# (the first five are all bajra), so that counter would leak the label into
# the embedding and, being far larger in magnitude than e.g. pH, dominate the
# pairwise distances.
tsne_features = ['ph', 'EC', 'CaCO3', 'OC', 'N', 'P', 'K']

# Fixed seed so the 2-D layout is reproducible across runs.
tsne = TSNE(random_state=2025)
tsne_df = pd.DataFrame(data=tsne.fit_transform(X=df[tsne_features]), columns=['tx', 'ty'])

# Carry the crop label along (positionally) so the plot can colour by class.
tsne_df['label'] = df['label'].tolist()

In [4]:
from plotly import colors
from plotly import express

# Scatter the 2-D t-SNE coordinates, one colour per crop. A title and readable
# axis/legend labels are set so the figure stands on its own when skimmed.
express.scatter(
    data_frame=tsne_df,
    x='tx',
    y='ty',
    color='label',
    color_discrete_sequence=colors.qualitative.Light24,
    title='t-SNE embedding colored by crop label',
    labels={'tx': 't-SNE 1', 'ty': 't-SNE 2', 'label': 'crop'},
)

What do we see? We see that TSNE will cluster a handful of classes well, a couple are intertwined, and the others are kind of a mess. We should probably have moderate expectations regarding classifier accuracy. Let's build a model.

In [5]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


TARGET = 'label'
COLUMNS = ['ph', 'EC', 'CaCO3', 'OC', 'N', 'P', 'K',]

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits, which matters for the rare crops.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    test_size=0.2,
    random_state=2024,
    stratify=df[TARGET],
)

qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

# Predict once and reuse the result for every metric instead of re-running
# the model for each call.
y_pred = qda.predict(X=X_test)

# zero_division=0 silences the undefined-metric warning for classes that
# receive no predictions and scores them as 0 instead.
print('accuracy: {:5.4f} f1: {:5.4f}'.format(
    accuracy_score(y_true=y_test, y_pred=y_pred),
    f1_score(average='weighted', y_true=y_test, y_pred=y_pred, zero_division=0),
))
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=0))

accuracy: 0.8992 f1: 0.8967
              precision    recall  f1-score   support

       bajra       1.00      1.00      1.00        20
     cabbage       1.00      1.00      1.00        20
 cauliflower       0.00      0.00      0.00         1
      cotton       1.00      1.00      1.00        17
      grapes       0.76      0.67      0.71        24
       jowar       0.91      0.95      0.93        21
       maize       1.00      1.00      1.00        20
       onion       0.60      0.43      0.50         7
 pomegranate       1.00      1.00      1.00        20
      potato       1.00      1.00      1.00        20
        rice       1.00      0.80      0.89        10
     soybean       0.57      0.76      0.65        17
   sugarcane       1.00      1.00      1.00        20
      tomato       0.60      0.55      0.57        11
       wheat       0.91      1.00      0.95        20

    accuracy                           0.90       248
   macro avg       0.82      0.81      0.81       24


Variables are collinear



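Not surprisingly, QDA warns that some variables are collinear. QDA estimates a separate covariance matrix for each class, and with only five cauliflower rows in the training split against seven features, that class's covariance matrix cannot be full rank, so the warning is expected.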