In [1]:
# Cell 1: report the working directory, look for dataset files (.csv/.xlsx)
# in it, and load the first match (in sorted order) into `data`.
import os, pandas as pd
print("Working dir:", os.getcwd())
# os.listdir() returns entries in arbitrary order, so sort the matches to make
# the choice of files[0] reproducible when several dataset files are present.
files = sorted(f for f in os.listdir() if f.lower().endswith(('.csv','.xlsx')))
print("Found dataset files:", files)
if not files:
    # Nothing to load: stop here with an explicit error rather than failing later.
    raise FileNotFoundError("Koi .csv/.xlsx file current folder me nahi mili. Dataset usi folder me rakho.")
fname = files[0]
print("Loading:", fname)
# Pick the pandas reader from the file extension (case-insensitive).
if fname.lower().endswith('.csv'):
    data = pd.read_csv(fname)
else:
    data = pd.read_excel(fname)
print("Shape:", data.shape)
# Last expression: preview of the first rows as the cell output.
data.head()


Working dir: C:\Users\mdtab\Downloads\Crop
Found dataset files: ['Crop_recommendation.csv']
Loading: Crop_recommendation.csv
Shape: (2200, 8)


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [2]:
# Cell 2: trim surrounding whitespace from every column name
# (no lowercasing happens in this cell), then inspect the result
data.columns = [name.strip() for name in data.columns]
print("Columns:", list(data.columns))
# dtype of each column, shown as the cell output
data.dtypes


Columns: ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label']


N                int64
P                int64
K                int64
temperature    float64
humidity       float64
ph             float64
rainfall       float64
label           object
dtype: object

In [3]:
# Cell 3: choose the label column. Columns whose lowercased name is one of the
# common target names are candidates; if there are none, use the last column.
possible = [
    c for c in data.columns
    if c.lower() in ('label','crop','target','class','crops')
]
if possible:
    label_col = possible[0]
else:
    label_col = data.columns[-1]
print("Using label column:", label_col)
# Show up to the first 20 distinct label values as the cell output.
data[label_col].unique()[:20]


Using label column: label


array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton'], dtype=object)

In [4]:
# Cell 4: normalise column names in two passes so they match the expected
# feature names.
# Pass 1: strip whitespace and lowercase every column name.
data.rename(columns={col: col.strip().lower() for col in data.columns}, inplace=True)
# Pass 2: expand short names to their full form; keys that are not present
# in the frame are simply ignored by rename.
data.rename(
    columns={
        'n': 'nitrogen',
        'p': 'phosphorus',
        'k': 'potassium',
        'temp': 'temperature',
        'humid': 'humidity',
    },
    inplace=True,
)
print("Final columns:", list(data.columns))


Final columns: ['nitrogen', 'phosphorus', 'potassium', 'temperature', 'humidity', 'ph', 'rainfall', 'label']


In [5]:
# Cell 5: split the frame into the feature matrix X and the target series y
# Normalise the label name the same way the column names were normalised
# (stripped and lowercased) so it still matches a column.
label_col = label_col.strip().lower()
y = data[label_col]
X = data.drop(columns=label_col)
print("Features used:", list(X.columns))
print("Number of examples:", X.shape[0])


Features used: ['nitrogen', 'phosphorus', 'potassium', 'temperature', 'humidity', 'ph', 'rainfall']
Number of examples: 2200


In [6]:
# Cell 6: turn crop names into integer codes
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the label series, then encode that same series.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
# classes[i] is the crop name that was encoded as integer i.
classes = le.classes_
print("Classes:", classes)


Classes: ['apple' 'banana' 'blackgram' 'chickpea' 'coconut' 'coffee' 'cotton'
 'grapes' 'jute' 'kidneybeans' 'lentil' 'maize' 'mango' 'mothbeans'
 'mungbean' 'muskmelon' 'orange' 'papaya' 'pigeonpeas' 'pomegranate'
 'rice' 'watermelon']


In [7]:
# Cell 7: hold out 20% of the rows as a test set, stratified on the encoded
# labels
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    stratify=y_enc,
    random_state=42,  # fixed seed so the split is repeatable
)
print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (1760, 7) Test: (440, 7)


In [8]:
# Cell 8: fit a decision tree and a random forest on the training split and
# report each model's accuracy on the test split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# --- Decision tree (fixed seed for repeatable results) ---
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree accuracy:", round(acc_dt*100,2), "%")

# --- Random forest with 100 trees (same seed) ---
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest accuracy:", round(acc_rf*100,2), "%")


Decision Tree accuracy: 97.95 %
Random Forest accuracy: 99.55 %


In [9]:
# Cell 9: detailed metrics (per-class report + confusion matrix) for the
# better of the two trained models
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Pick the model with the higher test accuracy; the random forest wins ties.
best = rf if acc_rf >= acc_dt else dt
model_name = "RandomForest" if best is rf else "DecisionTree"
print("Best model chosen:", model_name)

y_pred_best = best.predict(X_test)

# The targets were produced by LabelEncoder, so the encoded labels are
# 0 .. len(classes)-1. Pass that full list explicitly: without it sklearn
# infers the labels from y_test/y_pred_best, and if any class were absent
# from both, the report's target_names and the len(classes)-sized DataFrame
# below would no longer line up with the result.
all_labels = list(range(len(classes)))

print("\nClassification report:")
print(classification_report(y_test, y_pred_best, labels=all_labels, target_names=classes))

# Rows are true classes, columns are predicted classes, both in `classes` order.
cm = confusion_matrix(y_test, y_pred_best, labels=all_labels)
cm_df = pd.DataFrame(cm, index=classes, columns=classes)
cm_df


Best model chosen: RandomForest

Classification report:
              precision    recall  f1-score   support

       apple       1.00      1.00      1.00        20
      banana       1.00      1.00      1.00        20
   blackgram       1.00      0.95      0.97        20
    chickpea       1.00      1.00      1.00        20
     coconut       1.00      1.00      1.00        20
      coffee       1.00      1.00      1.00        20
      cotton       1.00      1.00      1.00        20
      grapes       1.00      1.00      1.00        20
        jute       0.95      1.00      0.98        20
 kidneybeans       1.00      1.00      1.00        20
      lentil       1.00      1.00      1.00        20
       maize       0.95      1.00      0.98        20
       mango       1.00      1.00      1.00        20
   mothbeans       1.00      1.00      1.00        20
    mungbean       1.00      1.00      1.00        20
   muskmelon       1.00      1.00      1.00        20
      orange       1.00  

Unnamed: 0,apple,banana,blackgram,chickpea,coconut,coffee,cotton,grapes,jute,kidneybeans,...,mango,mothbeans,mungbean,muskmelon,orange,papaya,pigeonpeas,pomegranate,rice,watermelon
apple,20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
banana,0,20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
blackgram,0,0,19,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chickpea,0,0,0,20,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
coconut,0,0,0,0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
coffee,0,0,0,0,0,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cotton,0,0,0,0,0,0,20,0,0,0,...,0,0,0,0,0,0,0,0,0,0
grapes,0,0,0,0,0,0,0,20,0,0,...,0,0,0,0,0,0,0,0,0,0
jute,0,0,0,0,0,0,0,0,20,0,...,0,0,0,0,0,0,0,0,0,0
kidneybeans,0,0,0,0,0,0,0,0,0,20,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Cell 10: persist the chosen model together with what is needed to use it
# later: the label encoder and the ordered list of feature column names
import pickle
artifact = {
    'model': best,
    'label_encoder': le,
    'feature_columns': X.columns.tolist(),
}
with open("trained_model.pkl", "wb") as f:
    pickle.dump(artifact, f)
print("Saved trained_model.pkl")


Saved trained_model.pkl


In [11]:
# Cell 11: predict the crop for a single hand-written input
# (edit the numbers to try other conditions)
import pandas as pd
sample = pd.DataFrame([{
    # keys must match the feature names in X.columns printed earlier
    'nitrogen': 90,
    'phosphorus': 42,
    'potassium': 43,
    'temperature': 25,
    'humidity': 12,
    'ph': 6.5,
    'rainfall': 200
}])
# Validate the keys before reordering: reindex() would silently create an
# all-NaN column for any feature that is missing or misspelled in the dict
# above, so fail early with a clear message instead.
missing_features = [col for col in X.columns if col not in sample.columns]
if missing_features:
    raise KeyError(f"Sample is missing feature column(s): {missing_features}")
# reorder to the exact feature columns (and order) the model was trained on
sample = sample.reindex(columns=X.columns)
pred_num = best.predict(sample)[0]
# Map the predicted integer code back to the crop name.
pred_crop = le.inverse_transform([pred_num])[0]
print("Predicted crop for sample:", pred_crop)


Predicted crop for sample: coffee
