In [1]:
import json
import os
import pandas as pd
import seaborn as sns
import warnings

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from statsmodels.tsa.arima.model import ARIMA
from xgboost import XGBClassifier

from typing import Set, Tuple, Any, Dict

In [2]:
warnings.filterwarnings('ignore')

In [3]:
sns.set_theme()
sns.set_palette('Paired')

In [4]:
def prepare_data_for_tarinig(data:pd.DataFrame,
    dictrict: str, order: Set[int], indices:Dict[str, Any]
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    
    # Train ARIMAs.
    data_prepared = {}
    for gfid in set(data[dictrict]['merged']['gfid']):
        d = data[dictrict]['merged'].loc[data[dictrict]['merged']['gfid'] ==
                                         gfid, 'ndvi'].copy()
        arima = ARIMA(d, order=order).fit()
        data_prepared[gfid] = arima.params.values
    data_prepared = pd.DataFrame(data_prepared).T
    data_prepared = data_prepared.sort_index()

    # Split for train and test
    target = data[dictrict]['points'][['wheat', 'gfid']]
    target = target.sort_values('gfid')
    
    train = indices[dictrict]['train']
    test = indices[dictrict]['test']
    
    X_train = data_prepared.loc[train]
    X_test = data_prepared.loc[test]
    
    y_train = target[target['gfid'].isin(train)]
    y_test = target[target['gfid'].isin(test)]

    #X_train, X_test, y_train, y_test = train_test_split(
    #    data_prepared,
    #    target['wheat'],
    #    test_size=0.2,
    #    shuffle=True,
    #    stratify=target['wheat'])
    
    return X_train, X_test, y_train, y_test

In [5]:
MODELS = [
    LogisticRegression(),
    LinearSVC(random_state=42),
    RandomForestClassifier(random_state=42),
    CatBoostClassifier(verbose=False, iterations=100, depth=3, random_state=42,
                       l2_leaf_reg=7),
    LGBMClassifier(random_state=42),
    XGBClassifier(eval_metric='logloss', random_state=42)
]

In [6]:
MODEL_NAMES = ['LogReg', 'SVM', 'RandomForest', 'CatBoost', 'LGBM', 'XGB']

In [7]:
def train_models(X_train, X_test, y_train, y_test, results:Dict[str, Any], district) -> Dict[str, Any]:
    results_ = results.copy()
    
    # Train each model.
    for model, model_name in zip(MODELS, MODEL_NAMES):
        res = {}
        #model = model_.copy()

        model.fit(X_train, y_train)
        
        predict_train = model.predict(X_train)
        predict_test = model.predict(X_test)
        # Save results_ for each model.
        res['report_train'] = classification_report(y_train, predict_train)
        res['report_test'] = classification_report(y_test, predict_test)
        res['roc_auc_train'] = roc_auc_score(y_train, predict_train)
        res['roc_auc_test'] = roc_auc_score(y_test, predict_test)
        res['accuracy_train'] = accuracy_score(y_train, predict_train)
        res['accuracy_test'] = accuracy_score(y_test, predict_test)
        results_[district][model_name] = res

    return results_

In [8]:
districts = ['Dewas', 'Kaithal', 'Karnal']

In [9]:
# Prepare results dict
results_all = dict(zip(districts, [dict(zip(MODEL_NAMES, [{}]*len(MODEL_NAMES)))]*3))

In [10]:
results_all

{'Dewas': {'LogReg': {},
  'SVM': {},
  'RandomForest': {},
  'CatBoost': {},
  'LGBM': {},
  'XGB': {}},
 'Kaithal': {'LogReg': {},
  'SVM': {},
  'RandomForest': {},
  'CatBoost': {},
  'LGBM': {},
  'XGB': {}},
 'Karnal': {'LogReg': {},
  'SVM': {},
  'RandomForest': {},
  'CatBoost': {},
  'LGBM': {},
  'XGB': {}}}

In [11]:
# Get the same train and test indices as for Deep Learning
indices = dict(zip(districts, [{}]*3))

for district_ in districts:
    with open(f'{district_}.json') as json_file:
        indices[district_] = json.load(json_file)

In [12]:
os.listdir('data')    

['Dewas_NDVI.csv',
 'Dewas_points.csv',
 'Kaithal_NDVI.csv',
 'Kaithal_points.csv',
 'Karnal_NDVI.csv',
 'Karnal_points.csv']

In [13]:
# Read all datasets into 1 dict
data = dict(zip(districts, [{}, {}, {}]))
for district in districts:
    points = pd.read_csv(f'data/{district}_points.csv')
    ndvi = pd.read_csv(f'data/{district}_NDVI.csv')
    data[district]['points'] = points
    data[district]['NDVI'] = ndvi
    data[district]['merged'] = points.merge(ndvi, left_on='gfid', right_on='gfid')

In [14]:
data['Dewas']['merged'].head()

Unnamed: 0,gfid,state,district,village,lon,lat,wheat,datenum,date,ndvi
0,72001,Madhya Pradesh,Dewas,VijayaganjMandi,75.96199,23.218479,0,0,2020-10-20,0.186
1,72001,Madhya Pradesh,Dewas,VijayaganjMandi,75.96199,23.218479,0,1,2020-10-21,0.184
2,72001,Madhya Pradesh,Dewas,VijayaganjMandi,75.96199,23.218479,0,2,2020-10-22,0.183
3,72001,Madhya Pradesh,Dewas,VijayaganjMandi,75.96199,23.218479,0,3,2020-10-23,0.182
4,72001,Madhya Pradesh,Dewas,VijayaganjMandi,75.96199,23.218479,0,4,2020-10-24,0.181


# Dewas

In [15]:
X_train, X_test, y_train, y_test = prepare_data_for_tarinig(data, 'Dewas', (5,1,5), indices)

In [19]:
results_all = train_models(X_train, X_test, y_train['wheat'], y_test['wheat'], results_all, 'Dewas')

In [20]:
for model in results_all['Dewas'].keys():
    print(model, '\n')
    for metric in results_all['Dewas'][model].keys():
        print(metric)
        print(results_all['Dewas'][model][metric], '\n')

LogReg 

report_train
              precision    recall  f1-score   support

           0       0.57      0.68      0.62       157
           1       0.55      0.43      0.49       143

    accuracy                           0.56       300
   macro avg       0.56      0.56      0.55       300
weighted avg       0.56      0.56      0.56       300
 

report_test
              precision    recall  f1-score   support

           0       0.58      0.62      0.60        40
           1       0.53      0.49      0.51        35

    accuracy                           0.56        75
   macro avg       0.56      0.56      0.55        75
weighted avg       0.56      0.56      0.56        75
 

roc_auc_train
0.5575475479934079 

roc_auc_test
0.5553571428571429 

accuracy_train
0.5633333333333334 

accuracy_test
0.56 

SVM 

report_train
              precision    recall  f1-score   support

           0       0.56      0.67      0.61       157
           1       0.54      0.43      0.48       143


# Kaithal

In [22]:
X_train, X_test, y_train, y_test = prepare_data_for_tarinig(data, 'Kaithal', (1,1,5), indices)

In [23]:
results_all = train_models(X_train, X_test, y_train['wheat'], y_test['wheat'], results_all, 'Kaithal')

In [24]:
for model in results_all['Kaithal'].keys():
    print(model, '\n')
    for metric in results_all['Kaithal'][model].keys():
        print(metric)
        print(results_all['Kaithal'][model][metric], '\n')

LogReg 

report_train
              precision    recall  f1-score   support

           0       0.65      0.37      0.47        70
           1       0.60      0.82      0.69        80

    accuracy                           0.61       150
   macro avg       0.62      0.60      0.58       150
weighted avg       0.62      0.61      0.59       150
 

report_test
              precision    recall  f1-score   support

           0       0.27      0.17      0.21        18
           1       0.52      0.67      0.58        24

    accuracy                           0.45        42
   macro avg       0.39      0.42      0.39        42
weighted avg       0.41      0.45      0.42        42
 

roc_auc_train
0.5982142857142857 

roc_auc_test
0.41666666666666663 

accuracy_train
0.6133333333333333 

accuracy_test
0.4523809523809524 

SVM 

report_train
              precision    recall  f1-score   support

           0       0.67      0.46      0.54        70
           1       0.63      0.80      

# Karnal

In [25]:
X_train, X_test, y_train, y_test = prepare_data_for_tarinig(data, 'Karnal', (1,0,5), indices)

In [26]:
results_all = train_models(X_train, X_test, y_train['wheat'], y_test['wheat'], results_all, 'Karnal')

In [27]:
for model in results_all['Karnal'].keys():
    print(model, '\n')
    for metric in results_all['Karnal'][model].keys():
        print(metric)
        print(results_all['Karnal'][model][metric], '\n')

LogReg 

report_train
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        88
           1       0.58      1.00      0.73       122

    accuracy                           0.58       210
   macro avg       0.29      0.50      0.37       210
weighted avg       0.34      0.58      0.43       210
 

report_test
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.66      1.00      0.79        38

    accuracy                           0.66        58
   macro avg       0.33      0.50      0.40        58
weighted avg       0.43      0.66      0.52        58
 

roc_auc_train
0.5 

roc_auc_test
0.5 

accuracy_train
0.580952380952381 

accuracy_test
0.6551724137931034 

SVM 

report_train
              precision    recall  f1-score   support

           0       0.50      0.11      0.19        88
           1       0.59      0.92      0.72       122

    accuracy    