In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,KFold,train_test_split,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix,classification_report

In [2]:
df=pd.read_csv('CropYeildDataset.csv')

In [3]:
df.head()

Unnamed: 0,Moisture,rainfall,Average Humidity,Mean Temp,max Temp,Min temp,alkaline,sandy,Yield
0,12.801685,0.01236,57,62,71,52,0,1,2
1,12.851654,0.004172,57,58,73,43,0,1,0
2,12.776774,0.0,56,58,69,46,0,0,4
3,12.942001,0.031747,62,57,70,43,0,1,0
4,12.984652,0.0,65,56,70,42,0,0,1


In [4]:
X=df.drop('Yield',axis=1)
y=df.Yield

In [5]:
X.isnull().sum()

Moisture            0
rainfall            0
Average Humidity    0
Mean Temp           0
max Temp            0
Min temp            0
alkaline            0
sandy               0
dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
sc=StandardScaler()
X=sc.fit_transform(X)

In [12]:
knn=KNeighborsClassifier()
param={'n_neighbors':np.arange(1,51),'weights':['uniform','distance']}
GS=GridSearchCV(knn,param,cv=3,scoring='roc_auc')
GS.fit(X,y)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]),
                         'weights': ['uniform', 'distance']},
             scoring='roc_auc')

In [13]:
GS.best_estimator_

KNeighborsClassifier(n_neighbors=1)

In [18]:
lr=LogisticRegression()
knn=GS.best_estimator_
dtc=DecisionTreeClassifier(criterion='entropy',random_state=0)
dtc_reg=DecisionTreeClassifier(criterion='entropy',random_state=0,max_depth=5)
rf=RandomForestClassifier(n_estimators=9,random_state=0)
gb=GradientBoostingClassifier()
models=[]
models.append(('knn',knn))
models.append(('lr',lr))
models.append(('dtc',dtc))
models.append(('dtc_reg',dtc_reg))
models.append(('rf',rf))
models.append(('gb',gb))

In [15]:
from sklearn.model_selection import cross_val_score,KFold

In [19]:
results=[]
names=[]
for name, model in models:
    kfold=KFold(shuffle=True,n_splits=3,random_state=0)
    cv_results=cross_val_score(model,X,y,cv=kfold,scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    #print(cv_results)
    print("%s:  %f (%f)" %(name,np.mean(cv_results),np.var(cv_results,ddof=1)))

knn:  0.743023 (0.000002)
lr:  0.702288 (0.000239)
dtc:  0.926072 (0.000112)
dtc_reg:  0.860198 (0.000068)
rf:  0.934119 (0.000054)
gb:  0.952977 (0.000117)


In [20]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
model=GradientBoostingClassifier()
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.964824120603015

In [23]:
y_predict=model.predict(x_test)

In [24]:
confusion_matrix(y_test,y_predict)

array([[349,  21,   6,   0,   0],
       [  0, 220,   0,   1,   1],
       [  0,   0, 197,   0,   0],
       [  1,   0,   2,  15,   0],
       [  0,  10,   0,   0, 371]], dtype=int64)

In [25]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96       376
           1       0.88      0.99      0.93       222
           2       0.96      1.00      0.98       197
           3       0.94      0.83      0.88        18
           4       1.00      0.97      0.99       381

    accuracy                           0.96      1194
   macro avg       0.95      0.95      0.95      1194
weighted avg       0.97      0.96      0.97      1194

