In [None]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
import sklearn.metrics as mt
from sklearn import metrics
from sklearn import tree
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Read dataset.csv (path is relative to the current working directory) into a DataFrame.
data = pd.read_csv('dataset.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(620, 12)

In [None]:
# Preview the first five rows.
data.head()

Unnamed: 0,N,P,K,ph,EC,S,Cu,Fe,Mn,Zn,B,label
0,143,69,217,5.9,0.58,0.23,10.2,116.35,59.96,54.85,21.29,pomegranate
1,170,36,216,5.9,0.15,0.28,15.69,114.2,56.87,31.28,28.62,pomegranate
2,158,66,219,6.8,0.34,0.2,15.29,65.87,51.81,57.12,27.59,pomegranate
3,133,45,207,6.4,0.94,0.21,8.48,103.1,43.81,68.5,47.29,pomegranate
4,132,48,218,6.7,0.54,0.19,5.59,63.4,56.4,46.71,31.04,pomegranate


In [None]:
# Distinct values of the target column, in order of first appearance.
data['label'].unique()

array(['pomegranate', 'mango', 'grapes', 'mulberry', 'ragi', 'potato'],
      dtype=object)

In [None]:
# Print per-column non-null counts and dtypes, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620 entries, 0 to 619
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   N       620 non-null    int64  
 1   P       620 non-null    int64  
 2   K       620 non-null    int64  
 3   ph      620 non-null    float64
 4   EC      620 non-null    float64
 5   S       620 non-null    float64
 6   Cu      620 non-null    float64
 7   Fe      620 non-null    float64
 8   Mn      620 non-null    float64
 9   Zn      620 non-null    float64
 10  B       620 non-null    float64
 11  label   620 non-null    object 
dtypes: float64(8), int64(3), object(1)
memory usage: 58.2+ KB


In [None]:
# Column names as loaded, before any renaming.
data.columns

Index(['N', 'P', 'K', 'ph', 'EC', 'S', 'Cu', 'Fe', 'Mn', 'Zn', 'B', 'label'], dtype='object')

In [None]:

# Rename the 'ph' column to 'pH'; rebinding `data` keeps the cell free of
# in-place mutation while producing the same frame.
data = data.rename(columns={'ph': 'pH'})
data.columns

Index(['N', 'P', 'K', 'pH', 'EC', 'S', 'Cu', 'Fe', 'Mn', 'Zn', 'B', 'label'], dtype='object')

In [None]:
# Dtype of every column.
data.dtypes

N          int64
P          int64
K          int64
pH       float64
EC       float64
S        float64
Cu       float64
Fe       float64
Mn       float64
Zn       float64
B        float64
label     object
dtype: object

In [None]:
# Count of missing values in each column.
data.isnull().sum()

N        0
P        0
K        0
pH       0
EC       0
S        0
Cu       0
Fe       0
Mn       0
Zn       0
B        0
label    0
dtype: int64

In [None]:
# Number of rows per target label (class-balance check).
data['label'].value_counts()

pomegranate    104
mango          104
grapes         104
mulberry       104
ragi           104
potato         100
Name: label, dtype: int64

In [None]:
# Every column except the last one (`label`) is treated as a feature.
all_columns = data.columns[:-1]

# Histograms for all features but the final one, placed on a 4x3 grid.
plt.figure(figsize=(15,13))
for slot, column in enumerate(all_columns[:-1], start=1):
    plt.subplot(4,3,slot)
    sns.histplot(data[column])
plt.show()

# The final feature is drawn on its own, default-sized figure.
sns.histplot(data[all_columns[-1]])
plt.show()

In [None]:
# One bar chart per feature, grouped by crop label (bar height is seaborn's
# default aggregate of the feature within each label).
for column in all_columns:
    fig, ax = plt.subplots(figsize=(19,7))
    sns.barplot(data=data, x="label", y=column, ax=ax)
    plt.xticks(rotation=90)  # acts on the current axes, i.e. `ax`
    ax.set_title(f"{column} vs Crop Type")
    plt.show()


In [None]:
# Pairwise scatter plots of every numeric column, coloured by crop label.
# sns.pairplot is a figure-level function: it creates its own figure, so the
# former plt.figure(figsize=(100,80)) call was not used by the plot and only
# produced an additional, empty 100x80-inch canvas. Use pairplot's own
# `height` / `aspect` arguments if the panels need resizing.
sns.pairplot(data, hue = "label")
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# `label` holds strings, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 (earlier versions silently dropped them), so select the
# numeric columns explicitly; the resulting matrix is the same on old versions.
corr_matrix = data.select_dtypes(include="number").corr()
plt.figure(figsize = (20,15))
sns.heatmap(corr_matrix, center = 0, annot = True)
plt.show()

In [None]:
# Feature matrix: every non-label column.
X = data.loc[:, all_columns]

# Target: crop names mapped to integer codes by a fitted LabelEncoder.
label_encoder = LabelEncoder().fit(data["label"])
y = label_encoder.transform(data["label"])
print(X.shape, y.shape)

(620, 11) (620,)


In [None]:
# Map each integer class code to its crop name.
# Built from the encoder's fitted `classes_` (index == encoded value) instead
# of a hard-coded range(6), so it stays correct if the number of crops changes.
label_dict = dict(enumerate(label_encoder.classes_))
label_dict

{0: 'grapes',
 1: 'mango',
 2: 'mulberry',
 3: 'pomegranate',
 4: 'potato',
 5: 'ragi'}

## Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y,
    test_size=0.2,
    random_state=0,
)
# Uncomment to inspect the split sizes:
# print(f"Train Data: {X_train.shape}, {y_train.shape}")
# print(f"Test Data: {X_test.shape}, {y_test.shape}")

In [None]:
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix
import pickle
from sklearn.pipeline import make_pipeline

In [None]:
# Empty result containers (presumably for comparing models later on).
# NOTE(review): in the visible part of the notebook only `acc_test` is ever
# appended to (by the logistic-regression cell); confirm whether the other
# lists, and the remaining models' scores, are meant to be recorded as well.
acc = []
acc_test = []
model = []
f1scores = []

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit a logistic-regression classifier.
lr_pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=2),
).fit(X_train, y_train)

# Evaluate on the held-out split and store the percentage in acc_test.
predictions = lr_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
acc_test.append(accuracy*100)
print(f"Accuracy: {accuracy*100}%")


Accuracy: 94.35483870967742%


### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# make_pipeline, StandardScaler and accuracy_score are already imported by
# earlier cells, so only the estimator itself is imported here.
dt_pipeline = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier(random_state=2),
).fit(X_train, y_train)

# Accuracy on the held-out split.
predictions = dt_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy*100}%")


Accuracy: 93.54838709677419%


### Neural Network -- MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

# make_pipeline, StandardScaler and accuracy_score are already imported by
# earlier cells, so only the estimator itself is imported here.
ann_pipeline = make_pipeline(
    StandardScaler(),
    MLPClassifier(random_state=2),
).fit(X_train, y_train)

# Accuracy on the held-out split.
predictions = ann_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy*100}%")


Accuracy: 94.35483870967742%




### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Unlike the pipeline-based cells, scaling is done by hand here: the scaler is
# fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# NOTE(review): the recorded accuracy (~42%) is far below every other model in
# this notebook. The classifier runs with library defaults -- presumably the
# default weak learner is too shallow for six classes; confirm and tune.
ada_boost = AdaBoostClassifier(random_state=2).fit(X_train_scaled, y_train)

# Accuracy on the held-out split.
predictions = ada_boost.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy*100}%")



Accuracy: 41.935483870967744%


### XGBoost

In [None]:
from xgboost import XGBClassifier

# make_pipeline, StandardScaler and accuracy_score are already imported by
# earlier cells, so only the estimator itself is imported here.
xgb_pipeline = make_pipeline(
    StandardScaler(),
    XGBClassifier(random_state=2),
).fit(X_train, y_train)

# Accuracy on the held-out split.
predictions = xgb_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy*100}%")


Accuracy: 94.35483870967742%


### Light Gradient Boosting Machine (LightGBM)

In [None]:
from lightgbm import LGBMClassifier

# make_pipeline, StandardScaler and accuracy_score are already imported by
# earlier cells, so only the estimator itself is imported here.
lgbm_pipeline = make_pipeline(
    StandardScaler(),
    LGBMClassifier(random_state=2),
).fit(X_train, y_train)

# Accuracy on the held-out split.
predictions = lgbm_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy on Test Data: {accuracy*100}%")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1445
[LightGBM] [Info] Number of data points in the train set: 496, number of used features: 11
[LightGBM] [Info] Start training from score -1.752229
[LightGBM] [Info] Start training from score -1.740668
[LightGBM] [Info] Start training from score -1.812127
[LightGBM] [Info] Start training from score -1.875843
[LightGBM] [Info] Start training from score -1.787735
[LightGBM] [Info] Start training from score -1.787735
Accuracy on Test Data: 93.54838709677419%
