In [None]:
# Core libraries for the exploratory analysis
import os  # operating-system helpers (e.g. changing the working directory)
import pandas as pd  # reading and manipulating tabular data
import numpy as np  # numerical helpers such as mean and standard deviation

# Let pandas print up to 100 columns / 100 rows before truncating a display
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

import warnings

# Silence FutureWarning and UserWarning messages for the rest of the session
for _category in (FutureWarning, UserWarning):
    warnings.simplefilter(action='ignore', category=_category)

In [None]:
# Load the dataset to start the EDA.
# The default is the Colab upload location used originally; set the
# FERTILIZER_CSV environment variable to run the notebook from another
# machine without editing this cell.
DATA_PATH = os.environ.get('FERTILIZER_CSV', '/content/Fertilizer_Prediction.csv')

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first 10 rows of the dataset
display(df.head(n=10))

Unnamed: 0,pH,N,P,K,Fertilizer
0,6.59,4.0,2.0,2.5,DAP and MOP
1,6.08,4.0,4.0,4.0,Good NPK
2,5.01,4.0,4.0,2.0,MOP
3,5.55,2.0,2.1,3.5,Urea and DAP
4,6.18,2.0,3.9,2.1,Urea and MOP
5,6.46,2.5,4.0,3.0,Urea
6,5.84,2.5,4.0,2.0,Urea and MOP
7,6.98,4.2,2.3,3.2,DAP
8,6.0,2.0,3.9,2.1,Urea and MOP
9,5.95,2.9,3.5,3.9,Urea


In [None]:
# List the column labels of the loaded DataFrame
df.columns

Index(['pH', 'N', 'P', 'K', 'Fertilizer'], dtype='object')

In [None]:
# Number of rows and columns, printed as a (rows, columns) tuple
print(df.shape)

# Column dtypes, non-null counts and memory usage.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
df.info()

(200, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pH          200 non-null    float64
 1   N           200 non-null    float64
 2   P           200 non-null    float64
 3   K           200 non-null    float64
 4   Fertilizer  200 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.9+ KB


None

In [None]:
# Count the missing values in each column
df.isna().sum()

pH            0
N             0
P             0
K             0
Fertilizer    0
dtype: int64

In [None]:
# Summary statistics of the numerical columns, one row per column
display(df.describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pH,200.0,6.2113,0.622228,5.01,5.76,6.125,6.44,7.87
N,200.0,3.1155,0.910193,2.0,2.2,2.5,4.0,4.2
P,200.0,3.1305,0.929213,2.0,2.075,3.9,4.0,4.0
K,200.0,2.956,0.721364,1.9,2.1,3.0,3.5,4.0


In [None]:
# Summary of the categorical (object-typed) columns: count, unique, top, freq
display(df.describe(include=['object']))

Unnamed: 0,Fertilizer
count,200
unique,7
top,Urea and MOP
freq,42


In [None]:
from sklearn.preprocessing import MinMaxScaler # to normalize the feature columns
from sklearn.preprocessing import LabelEncoder # to encode the object (string) variable as integers
from sklearn.model_selection import train_test_split # to split data into training and test sets

In [None]:
# Feature variables: every column except the fertilizer label
X = df.drop(columns='Fertilizer')
# Target variable, kept as a single-column DataFrame (hence the (200, 1) shape)
y = df.loc[:, ['Fertilizer']]
print('The shape of feature set, X is ' , X.shape)
print('The shape of target, y is ' , y.shape)

The shape of feature set, X is  (200, 4)
The shape of target, y is  (200, 1)


In [None]:
# Label encoding: replace the fertilizer names in df with integer codes.
# Note: y was taken from df before this cell, so it still holds the original
# string labels (the classification report further down shows the names).
le = LabelEncoder()
df['Fertilizer'] = le.fit(df['Fertilizer']).transform(df['Fertilizer'])

In [None]:
# First five rows after encoding: Fertilizer now shows integer codes
display(df.head(5))


Unnamed: 0,pH,N,P,K,Fertilizer
0,6.59,4.0,2.0,2.5,1
1,6.08,4.0,4.0,4.0,2
2,5.01,4.0,4.0,2.0,3
3,5.55,2.0,2.1,3.5,5
4,6.18,2.0,3.9,2.1,6


In [None]:
# Normalize the feature (X) columns to the [0, 1] range.
# The scaler is fitted ONCE on all feature columns. MinMaxScaler works per
# column, so the scaled values equal those of a column-by-column fit, but the
# fitted `scaler` now remembers the min/max of every feature and can be reused
# to transform new samples. (Re-fitting the same object inside a loop left it
# holding only the last column's statistics.)
# NOTE: the scaler sees the full dataset here, i.e. before the train/test
# split performed later in the notebook.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Seeded so the preview shows the same rows on every run
display(X.sample(10, random_state=42))

Unnamed: 0,pH,N,P,K
14,0.440559,0.227273,1.0,0.047619
70,0.255245,0.909091,1.0,1.0
2,0.0,0.909091,1.0,0.047619
172,0.332168,0.909091,1.0,1.0
196,0.737762,0.0,0.95,0.095238
75,0.828671,0.0,1.0,0.095238
151,0.398601,0.227273,0.0,0.52381
128,0.36014,0.0,1.0,0.095238
161,0.444056,0.909091,1.0,0.047619
8,0.346154,0.0,0.95,0.095238


In [None]:
# Hold out 30% of the rows as the test set. Stratifying on y keeps the class
# proportions alike in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Report the dimensions of every split
print('Shape of X_train is', X_train.shape)
print('Shape of X_test is', X_test.shape)
print('Shape of y_train is', y_train.shape)
print('Shape of y_test is',  y_test.shape)

Shape of X_train is (140, 4)
Shape of X_test is (60, 4)
Shape of y_train is (140, 1)
Shape of y_test is (60, 1)


In [None]:
# Importing libraries for classification and performance evaluation
from sklearn.neighbors import KNeighborsClassifier #to build a k-nearest-neighbors model
from sklearn.ensemble import GradientBoostingClassifier #to build a gradient boosting model
from sklearn.ensemble import RandomForestClassifier #to build a random forest model
from sklearn.tree import DecisionTreeClassifier #to build a classification tree
from sklearn.tree import plot_tree # to draw a classification tree
from sklearn.model_selection import GridSearchCV # to select the best hyperparameters

from sklearn.metrics import accuracy_score, classification_report # to calculate the accuracy of the model
from sklearn.metrics import classification_report #to calculate precision, recall, f1-score (duplicate: already imported on the line above)
# Disabled import, kept for reference.
# NOTE(review): plot_confusion_matrix appears to have been removed from recent
# scikit-learn releases in favour of ConfusionMatrixDisplay - confirm before re-enabling.
#from sklearn.metrics import plot_confusion_matrix # to draw confusion_matrix


In [None]:
# Random Forest model; the fixed seed keeps the fitted forest reproducible
model_RF = RandomForestClassifier(random_state=42)

# Train the model on the training split.
# y_train is a single-column DataFrame, so it is flattened to the 1-D label
# array scikit-learn expects. Passing the column vector as-is only works via
# an implicit conversion that raises a DataConversionWarning, which this
# notebook hides because UserWarning is silenced in the first cell.
model_RF = model_RF.fit(X_train, np.ravel(y_train))

# Predict the fertilizer label for every row of the test split
y_pred_RF = model_RF.predict(X_test)

In [None]:
# Overall accuracy: share of test rows whose predicted label is correct
print('Accuracy: ', accuracy_score(y_true=y_test, y_pred=y_pred_RF))

# Per-class precision, recall, f1-score and support
print(classification_report(y_true=y_test, y_pred=y_pred_RF))

Accuracy:  0.9666666666666667
              precision    recall  f1-score   support

         DAP       1.00      1.00      1.00         9
 DAP and MOP       1.00      0.60      0.75         5
    Good NPK       0.83      1.00      0.91        10
         MOP       1.00      1.00      1.00         5
       Urea        1.00      1.00      1.00         6
Urea and DAP       1.00      1.00      1.00        12
Urea and MOP       1.00      1.00      1.00        13

    accuracy                           0.97        60
   macro avg       0.98      0.94      0.95        60
weighted avg       0.97      0.97      0.96        60



In [None]:
# One-line summary: model name followed by its test accuracy to 4 decimals
print("""Random Forest\t\t\t {:.4f}""".format(accuracy_score(y_true=y_test, y_pred=y_pred_RF)))


Random Forest			 0.9667


In [None]:
df # Display the dataset again; the Fertilizer column now holds the label-encoded integers


Unnamed: 0,pH,N,P,K,Fertilizer
0,6.59,4.0,2.0,2.5,1
1,6.08,4.0,4.0,4.0,2
2,5.01,4.0,4.0,2.0,3
3,5.55,2.0,2.1,3.5,5
4,6.18,2.0,3.9,2.1,6
...,...,...,...,...,...
195,5.85,4.0,2.0,3.0,0
196,7.12,2.0,3.9,2.1,6
197,5.58,2.0,2.1,3.5,5
198,5.54,4.2,2.3,3.2,0


In [None]:
# Predict a fertilizer for one new sample with the trained Random Forest model
# inputs: pH, N, P, K
# values = (current - goal)
model = DecisionTreeClassifier()  # not used by the prediction below; kept in case later cells rely on it
# Array form of the feature matrix. np.asarray (unlike X.values) also accepts
# an array, so re-running this cell no longer fails after the first conversion.
X = np.asarray(X)
data = np.array([[(13-6.5), (0-110), (45-30), (23-110)]])

# model_RF was trained on min-max scaled features, so the new sample must go
# through the same scaling before prediction; feeding raw values puts it on a
# different scale than anything the model has seen. The scaler is fitted here
# on the raw feature columns of df (only the Fertilizer column of df was
# modified above), which reproduces the per-column min/max used for training.
feature_cols = df.columns.drop('Fertilizer')
input_scaler = MinMaxScaler().fit(df[feature_cols])

# Keep the column names so the sample matches the named features used in fit()
sample = pd.DataFrame(data, columns=feature_cols)
sample_scaled = pd.DataFrame(input_scaler.transform(sample), columns=feature_cols)

# NOTE(review): these differences lie far outside the ranges of the training
# data (e.g. N is between 2.0 and 4.2 there), so the model is extrapolating -
# confirm the intended units of the inputs.
prediction = model_RF.predict(sample_scaled)
print(prediction)

['Urea and MOP']
