### determine the outcome of the harvest season
<h5> whether the crop would be healthy (alive),damaged by pesticides
or damaged by other reasons.</h5>


In [5]:
# import the necessary libraries 
import numpy as np
import pandas as pd


In [6]:
# read in the training data
df = pd.read_csv('farm_train.csv')
df.head()

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season,Crop_Damage
0,F00000001,188,1,0,1,0,0.0,0,1,0
1,F00000003,209,1,0,1,0,0.0,0,2,1
2,F00000004,257,1,0,1,0,0.0,0,2,1
3,F00000005,257,1,1,1,0,0.0,0,2,1
4,F00000006,342,1,0,1,0,0.0,0,2,1


In [7]:
# determine the data types 
df.dtypes

ID                          object
Estimated_Insects_Count      int64
Crop_Type                    int64
Soil_Type                    int64
Pesticide_Use_Category       int64
Number_Doses_Week            int64
Number_Weeks_Used          float64
Number_Weeks_Quit            int64
Season                       int64
Crop_Damage                  int64
dtype: object

In [8]:
# check if there is null values
df.isnull().sum()

ID                            0
Estimated_Insects_Count       0
Crop_Type                     0
Soil_Type                     0
Pesticide_Use_Category        0
Number_Doses_Week             0
Number_Weeks_Used          9000
Number_Weeks_Quit             0
Season                        0
Crop_Damage                   0
dtype: int64

In [9]:
# number of weeks used has 9000 missing values. lets see what is the
# total number 
df.shape 

(88858, 10)

#### Total number is greater than double of 9000. So we cannot drop it

In [10]:
# let fill the missing value with mean
# calculate the mean
avg_wks = df["Number_Weeks_Used"].astype("float").mean(axis=0)
print("Average of Number of week used:", avg_wks)

Average of Number of week used: 28.62397004683313


In [11]:
# fill in the the empty space with avg_wks
df["Number_Weeks_Used"].replace(np.nan, avg_wks, inplace=True)

In [12]:
# let see if the change was affected 
df.isnull().sum()

ID                         0
Estimated_Insects_Count    0
Crop_Type                  0
Soil_Type                  0
Pesticide_Use_Category     0
Number_Doses_Week          0
Number_Weeks_Used          0
Number_Weeks_Quit          0
Season                     0
Crop_Damage                0
dtype: int64

### Let do the same with testing data

In [13]:
df2 = pd.read_csv('farm_test.csv')
df2.head()

Unnamed: 0,ID,Estimated_Insects_Count,Crop_Type,Soil_Type,Pesticide_Use_Category,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Season
0,F00000002,188,1,1,1,0,,0,2
1,F00000007,410,1,1,1,0,0.0,0,2
2,F00000011,626,1,0,1,0,0.0,0,2
3,F00000013,731,1,0,1,0,0.0,0,2
4,F00000014,789,0,0,1,0,0.0,0,1


In [14]:
df2.isnull().sum()

ID                            0
Estimated_Insects_Count       0
Crop_Type                     0
Soil_Type                     0
Pesticide_Use_Category        0
Number_Doses_Week             0
Number_Weeks_Used          5893
Number_Weeks_Quit             0
Season                        0
dtype: int64

In [15]:
# let fill the missing value with mode
avg_wk = df2["Number_Weeks_Used"].astype("float").mean(axis=0)
print("Average of Number of week used:", avg_wk)

Average of Number of week used: 28.70509388396952


In [16]:
df2["Number_Weeks_Used"].replace(np.nan, avg_wk, inplace=True)

In [17]:
df2.isnull().sum()

ID                         0
Estimated_Insects_Count    0
Crop_Type                  0
Soil_Type                  0
Pesticide_Use_Category     0
Number_Doses_Week          0
Number_Weeks_Used          0
Number_Weeks_Quit          0
Season                     0
dtype: int64

In [19]:
# lets see the shape of both 
df.shape, df2.shape 

((88858, 10), (59310, 9))

In [20]:
# we will not use  the ID column in our training and testing. 
# so,lets drop them
xtrain = df.drop(['ID','Crop_Damage'], axis =1)
xtest = df2.drop(['ID'], axis = 1)
xtrain.shape, xtest.shape

((88858, 9), (59310, 8))

In [22]:
# lets separate our target variable from independent variables 
y = df['Crop_Damage']
y.shape

(88858,)

In [24]:
# lets get our independent variables by dropping the target variable 
#xtrain = df.drop(['Crop_Damage','ID'], axis = 1) 
# xtrain.shape 

(88858, 8)

In [26]:
# let see all the shapes
xtrain.shape, y.shape, xtest.shape 

((88858, 8), (88858,), (59310, 8))

### Let build our models using ensemble, methods 

In [27]:
# let start with logistics regression 
from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import confusion_matrix
LR = LogisticRegression(C=0.01, solver='liblinear').fit(xtrain,y)
LR

LogisticRegression(C=0.01, solver='liblinear')

In [28]:
# leta make our prediction 
yhat = LR.predict(xtest)
yhat[0:6]

array([0, 0, 0, 0, 0, 0])

In [34]:
# ***DONT USE ACCURACY SCORE ON LOGISTIC REGRESSION***
#from sklearn import metrics
#print("Train set Accuracy: ", metrics.accuracy_score(y, LR.predict(xtrain)))
#print("Test set Accuracy: ", metrics.accuracy_score(xtest, yhat))

In [26]:
from sklearn import svm
clf = svm.SVC(kernel='rbf')
clf.fit(xtrain, y) 

SVC()

In [28]:
from sklearn.metrics import accuracy_score


In [29]:
yhat2 = clf.predict(xtest)
yhat2

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
#from sklearn import metrics
#print("Train set Accuracy: ", metrics.accuracy_score(y, clf.predict(xtrain)))
#print("Test set Accuracy: ", metrics.accuracy_score(xtest, yhat2))

In [30]:
y = (yhat + yhat2)/2
y

array([0., 0., 0., ..., 0., 0., 0.])

In [39]:
from sklearn.neighbors import KNeighborsClassifier

In [42]:
xtrain.shape, y.shape

((88858, 8), (59310,))

In [41]:
k = 4
#Train Model and Predict  
neigh = KNeighborsClassifier(n_neighbors = k)
neigh.fit(xtrain,y)
neigh

ValueError: Found input variables with inconsistent numbers of samples: [88858, 59310]

In [None]:
yhat = neigh.predict(X_test)
yhat[0:5]

In [31]:
sub = pd.DataFrame({'ID': df2.ID, 'Crop_Damage':y})
sub.to_csv('sub1.csv',index=False)  