In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [2]:
# Load the preprocessed crops dataset from the Kaggle input directory
# and print a per-column summary (non-null counts and dtypes).
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/preprocessed-700-crops/preprocessed_700.csv'
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23800 entries, 0 to 23799
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             23800 non-null  object 
 1   Fertility        23800 non-null  object 
 2   Photoperiod      23800 non-null  object 
 3   N-P-K Ratio      23800 non-null  object 
 4   Temperature      23800 non-null  float64
 5   Rainfall         23800 non-null  float64
 6   pH               23800 non-null  float64
 7   Light_Hours      23800 non-null  float64
 8   Light_Intensity  23800 non-null  float64
 9   Rh               23800 non-null  float64
 10  Nitrogen         23800 non-null  float64
 11  Phosphorus       23800 non-null  float64
 12  Potassium        23800 non-null  float64
 13  Yield            23800 non-null  float64
 14  Category_pH      23800 non-null  object 
 15  Soil_Type        23800 non-null  object 
 16  Season           23800 non-null  object 
dtypes: float64(1

In [3]:
# Remove the numeric pH column; the categorical Category_pH column stays in the frame.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
df = df.drop(columns=["pH"], errors="ignore")
df.head(5)

Unnamed: 0,Name,Fertility,Photoperiod,N-P-K Ratio,Temperature,Rainfall,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Yield,Category_pH,Soil_Type,Season
0,Apple,High,Day Neutral,10:10:10,21.063204,1932.402709,12.716549,860.189066,92.677579,89.266502,40.330099,180.63574,12.847482,low_acidic,Sandy Loam,Fall
1,Apple,High,Day Neutral,10:10:10,19.511305,1589.295994,13.54456,797.66076,92.293923,92.80815,37.131922,179.042979,13.894292,neutral,Sandy Loam,Fall
2,Apple,High,Day Neutral,10:10:10,23.045662,1269.789133,12.330668,910.861369,91.798926,84.24859,38.693498,163.604138,13.372204,low_acidic,Sandy Loam,Fall
3,Apple,High,Short Day Period,10:10:10,17.986016,1944.180144,12.96534,922.725203,92.74271,84.780429,43.950592,173.881606,11.801568,neutral,Sandy Loam,Spring
4,Apple,High,Day Neutral,10:10:10,23.775354,1790.352815,12.895817,821.411003,90.98153,91.197126,45.56447,174.324935,10.660521,neutral,Sandy Loam,Fall


In [4]:
# Drop the Yield column as it's not relevant for crop selection.
# Reassigning with errors="ignore" (rather than inplace=True) makes the cell
# idempotent, so re-running it does not fail once the column has been removed.
df = df.drop(columns=["Yield"], errors="ignore")

In [5]:
# Pull the crop name out as the prediction target, kept as a
# single-column DataFrame rather than a Series.
name_column = df['Name']
df_target = name_column.to_frame()
df_target

Unnamed: 0,Name
0,Apple
1,Apple
2,Apple
3,Apple
4,Apple
...,...
23795,Green Peas
23796,Green Peas
23797,Green Peas
23798,Green Peas


In [6]:
# Separate the target from the predictors: 'Name' now lives in df_target,
# so remove it from the feature frame.
# Reassigning with errors="ignore" (rather than inplace=True) keeps the cell
# re-runnable without a KeyError once 'Name' has already been dropped.
df = df.drop(columns=["Name"], errors="ignore")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23800 entries, 0 to 23799
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Fertility        23800 non-null  object 
 1   Photoperiod      23800 non-null  object 
 2   N-P-K Ratio      23800 non-null  object 
 3   Temperature      23800 non-null  float64
 4   Rainfall         23800 non-null  float64
 5   Light_Hours      23800 non-null  float64
 6   Light_Intensity  23800 non-null  float64
 7   Rh               23800 non-null  float64
 8   Nitrogen         23800 non-null  float64
 9   Phosphorus       23800 non-null  float64
 10  Potassium        23800 non-null  float64
 11  Category_pH      23800 non-null  object 
 12  Soil_Type        23800 non-null  object 
 13  Season           23800 non-null  object 
dtypes: float64(8), object(6)
memory usage: 2.5+ MB


## Split the dataset into train and test sets

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_target,
    test_size=0.20,
    random_state=42,
)

# Row counts of the training and test partitions.
(len(X_train), len(X_test))

(19040, 4760)

In [8]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,Fertility,Photoperiod,N-P-K Ratio,Temperature,Rainfall,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Category_pH,Soil_Type,Season
10448,Moderate,Long Day Period,10:10:10,20.663673,999.537984,12.419979,352.962624,91.087684,113.139649,252.944092,234.112895,acidic,Loam,Summer
3829,Moderate,Short Day Period,10:10:10,18.21641,830.790702,13.224105,476.467441,92.934905,180.425725,116.302324,236.02131,low_acidic,Loam,Spring
5259,High,Short Day Period,10:10:05,29.786036,1772.225842,13.074305,217.927973,91.784408,94.258877,53.769354,131.2263,low_acidic,Loam,Summer
6480,High,Day Neutral,10:10:10,28.797606,1117.929456,12.637688,731.395266,92.73492,201.386599,150.492864,52.203201,neutral,Sandy Loam,Winter
4199,Moderate,Short Day Period,10:10:10,17.179926,704.313825,12.731825,500.769106,92.880961,159.190886,119.149946,240.222999,low_acidic,Loam,Spring


In [9]:
# "Cardinality" means the number of unique values in a column.
# Object-dtype columns are only treated as categorical when their cardinality
# is below this threshold (convenient but arbitrary).
MAX_CARDINALITY = 10

# Select categorical columns with relatively low cardinality.
categorical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype == "object"
    and X_train[cname].nunique() < MAX_CARDINALITY
]

# Select numerical columns (int64 / float64 dtypes only).
numerical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype in ['int64', 'float64']
]

# Keep selected columns only. Any column matching neither rule above
# (e.g. an object column at or above the threshold) is dropped from both splits.
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_test = X_test[my_cols].copy()

## Training a DecisionTreeClassifier model for crop selection

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Seed the classifier so the fitted tree, and therefore the reported scores,
# are reproducible across runs. scikit-learn permutes features randomly at each
# split, so an unseeded tree can differ between executions. The seed matches
# the one used for the train/test split.
model = DecisionTreeClassifier(random_state=42)

# Create a Pipeline
# Preprocessing for numerical data.
# strategy='constant' with the default fill_value replaces missing numeric
# values with 0 (per the scikit-learn SimpleImputer documentation).
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: fill missing values with the most
# frequent category, then one-hot encode. handle_unknown='ignore' means a
# category not seen during fit does not raise at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Bundle preprocessing and modeling code in a pipeline so the same
# transformations are applied at fit and predict time.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# y_train is a single-column DataFrame; np.ravel flattens it to the 1-D
# array of labels that the classifier expects.
my_pipeline.fit(X_train, np.ravel(y_train))

In [11]:
# Score the fitted pipeline on the held-out test split.
my_pipeline.score(X=X_test, y=y_test)

0.9978991596638656

In [12]:
# Predict crop names for the test split, then print the per-class
# precision / recall / f1 report.
y_pred = my_pipeline.predict(X_test)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)



                precision    recall  f1-score   support

         Apple       1.00      1.00      1.00       139
       Arugula       1.00      0.99      1.00       128
     Asparagus       1.00      1.00      1.00       147
          Beet       0.99      1.00      1.00       140
     Blueberry       1.00      1.00      1.00       159
      Broccoli       1.00      0.99      1.00       130
       Cabbage       0.99      1.00      0.99       135
  Cauliflowers       0.99      0.98      0.98       136
         Chard       1.00      1.00      1.00       156
      Cherries       1.00      1.00      1.00       134
Chilli Peppers       1.00      0.99      0.99       149
         Cress       1.00      1.00      1.00       136
     Cucumbers       0.99      1.00      1.00       152
     Eggplants       1.00      0.99      1.00       139
        Endive       1.00      1.00      1.00       138
          Figs       1.00      1.00      1.00       148
        Grapes       1.00      1.00      1.00  

In [13]:
# Overall accuracy of the predictions on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9978991596638656

## Saving the pipeline object to a pickle file for later use

In [15]:
# Serialize the fitted pipeline (preprocessing + model) to disk so it can
# be reloaded later without retraining.
with open('cropselection.pkl', mode='wb') as pkl_file:
    pickle.dump(my_pipeline, pkl_file)