<a href="https://colab.research.google.com/github/pantprakhar04/smart_crop_prediction/blob/master/ML-Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!git config --global user.email "prakhar.sag22@outlook.com"
!git config --global user.name  "pantprakhar04"

In [0]:
import warnings
warnings.filterwarnings("ignore")

# **DATASET DESCRIPTION**

This data set provides an integrated collection of 

1.   ground-based meteorological, radiometric, and vegetation measurements,
2.   flux-based estimates of gross primary production (GPP), and
3.   numerous vegetation indices derived from satellite imagery

for three eddy covariance flux tower locations near Lincoln, Nebraska, USA. 

[LINK](https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=1217)

In [2]:
#importing dataset from local drive

from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [0]:
#importing libraries

import pickle
import flask
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from IPython.display import display

In [0]:
#storing dataset in the form of a Pandas dataframe

data = pd.read_csv('train.csv')

In [0]:
#separate data into target variable and feature set

y_all = data['Crop']
X_all = data.drop(['Year', 'Date', 'Field', 'Field Type', 'Crop', 'DOY'], 1)

In [0]:
#data standardization

from sklearn.preprocessing import scale

cols = data.columns.drop(['Year', 'Date', 'Field', 'Field Type', 'Crop', 'DOY'], 1)
for col in cols:
  X_all[col] = scale(X_all[col])

In [0]:
#shuffle and split dataset into training and testing set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, train_size=0.8, random_state=2, stratify=y_all)

In [0]:
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print ("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    print ("Made predictions in {:.4f} seconds.".format(end - start))
    
    return f1_score(target, y_pred, pos_label='H', average=None), sum(target == y_pred) / float(len(y_pred))
  
def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print ("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print (f1, acc)
    print ("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1.mean() , acc))
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print ("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1.mean(), acc))

In [9]:
#to measure training time
from time import time

#build some models
clf_A = LogisticRegression(random_state=42)
clf_B = SVC(random_state=912, kernel='rbf')
clf_C = xgb.XGBClassifier(seed=82)
clf_D = RandomForestClassifier(random_state=80, n_jobs=-1)

train_predict(clf_A, X_train, y_train, X_test, y_test)
print ('')
train_predict(clf_B, X_train, y_train, X_test, y_test)
print ('')
train_predict(clf_D, X_train, y_train, X_test, y_test)
print ('')
train_predict(clf_C, X_train, y_train, X_test, y_test)
print ('')

Training a LogisticRegression using a training set size of 398. . .
Trained model in 0.0463 seconds
Made predictions in 0.0054 seconds.
[0.91060291 0.5        0.92015209] 0.8869346733668342
F1 score and accuracy score for training set: 0.7769 , 0.8869.
Made predictions in 0.0011 seconds.
F1 score and accuracy score for test set: 0.8344 , 0.9100.

Training a SVC using a training set size of 398. . .
Trained model in 0.0206 seconds
Made predictions in 0.0149 seconds.
[0.87346939 0.         0.87732342] 0.8341708542713567
F1 score and accuracy score for training set: 0.5836 , 0.8342.
Made predictions in 0.0043 seconds.
F1 score and accuracy score for test set: 0.6066 , 0.8600.

Training a RandomForestClassifier using a training set size of 398. . .
Trained model in 0.1105 seconds
Made predictions in 0.1029 seconds.
[0.99563319 0.97297297 1.        ] 0.9949748743718593
F1 score and accuracy score for training set: 0.9895 , 0.9950.
Made predictions in 0.1029 seconds.
F1 score and accuracy sc

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [0]:
#tuning some parameters

parameters = { 'learning_rate' : [0.1],
               'n_estimators' : [40],
               'max_depth': [3],
               'min_child_weight': [3],
               'gamma':[0.4],
               'subsample' : [0.8],
               'colsample_bytree' : [0.8],
               'scale_pos_weight' : [1],
               'reg_alpha':[1e-5]
             }

In [0]:
#Initialize the classifier
clf = xgb.XGBClassifier(seed=2)

#Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score, average='micro')

#Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

In [0]:
#Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train,y_train)

In [14]:
# Get the estimator
clf = grid_obj.best_estimator_
print (clf)

# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print ("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1.max() , acc))
    
f1, acc = predict_labels(clf, X_test, y_test)
print ("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1.max() , acc))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=3, missing=None,
       n_estimators=40, n_jobs=1, nthread=None, objective='multi:softprob',
       random_state=0, reg_alpha=1e-05, reg_lambda=1, scale_pos_weight=1,
       seed=2, silent=True, subsample=0.8)
Made predictions in 0.0087 seconds.
F1 score and accuracy score for training set: 0.9631 , 0.9497.
Made predictions in 0.0031 seconds.
F1 score and accuracy score for test set: 0.9231 , 0.8800.


In [0]:
#serializing our model to a file called model.pkl
pickle.dump(clf, open("model.pkl","wb"))