In [19]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix,plot_confusion_matrix
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from explore import all_crop_codes
import math

In [20]:
df = pd.read_csv("resources/training_data/final_features.csv",index_col=0)

In [21]:
band_names = ["B06", "B12"] + ["NDVI", "NDMI", "NDGI", "ANIR", "NDRE1", "NDRE2", "NDRE5"] + ["ratio", "VV", "VH"]
tstep_labels = ["t" + str(4 * index) for index in range(0, 6)]
all_bands = [band + "_" + stat for band in band_names for stat in ["p10", "p50", "p90", "sd"] + tstep_labels]

In [29]:
df["ids"].unique()

array([910091109120,         1200,         1510,         1910,
               4351,         5100,         1110,         8100],
      dtype=int64)

In [22]:
df["y"] = df["ids"].apply(lambda num: all_crop_codes[num // 10 ** (int(math.log(num, 10)) - 4 + 1)])
X = df[all_bands]
y = df["y"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [23]:
param_grid = {'learning_rate': [0.07],#[0.03, 0.1],
        'depth': [4,10,20],#[4, 6, 10] ### 10 beter dan 4 (~0.02 accuracy verschil), 20 kan niet
        'l2_leaf_reg': [4,10,20],#[1, 3, 5,], ### 4 beter dan 10 (~0.02 accuracy verschil), 20 nog slechter 
        'iterations': [150]}#, 100, 150]}
cb = CatBoostClassifier()
grid_search = GridSearchCV(estimator = cb, param_grid = param_grid, cv = 3, n_jobs = -1)

In [24]:
grid_search.fit(X_train, y_train)
grid_search.best_params_

        nan        nan        nan]


0:	learn: 1.9724374	total: 2.75s	remaining: 6m 49s
1:	learn: 1.8700904	total: 5.42s	remaining: 6m 41s
2:	learn: 1.7742679	total: 8.04s	remaining: 6m 33s
3:	learn: 1.6950379	total: 10.7s	remaining: 6m 28s
4:	learn: 1.6300055	total: 13.2s	remaining: 6m 23s
5:	learn: 1.5682136	total: 15.9s	remaining: 6m 21s
6:	learn: 1.5205639	total: 18.5s	remaining: 6m 17s
7:	learn: 1.4671221	total: 21.1s	remaining: 6m 14s
8:	learn: 1.4239101	total: 23.7s	remaining: 6m 11s
9:	learn: 1.3869271	total: 26.3s	remaining: 6m 8s
10:	learn: 1.3457934	total: 28.9s	remaining: 6m 5s
11:	learn: 1.3160661	total: 31.5s	remaining: 6m 2s
12:	learn: 1.2869072	total: 34.2s	remaining: 6m
13:	learn: 1.2583854	total: 36.8s	remaining: 5m 57s
14:	learn: 1.2271466	total: 39.3s	remaining: 5m 53s
15:	learn: 1.2010176	total: 41.9s	remaining: 5m 51s
16:	learn: 1.1752045	total: 44.6s	remaining: 5m 48s
17:	learn: 1.1513124	total: 47.1s	remaining: 5m 45s
18:	learn: 1.1269891	total: 49.7s	remaining: 5m 42s
19:	learn: 1.1058199	total: 5

{'depth': 10, 'iterations': 150, 'l2_leaf_reg': 4, 'learning_rate': 0.07}

In [25]:
gs_results = pd.DataFrame(grid_search.cv_results_).sort_values(by=["mean_test_score"],axis=0,ascending=False)
print(gs_results.head())

   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_depth  \
3    2827.030459      9.079513         1.373186        0.782502          10   
4    2834.905129      2.291468         0.659074        0.406151          10   
0      43.716978      1.786057         0.360187        0.132006           4   
1      38.894648      2.826189         0.134575        0.067611           4   
5    1982.432080   1096.855543         0.533187        0.651618          10   

  param_iterations param_l2_leaf_reg param_learning_rate  \
3              150                 4                0.07   
4              150                10                0.07   
0              150                 4                0.07   
1              150                10                0.07   
5              150                20                0.07   

                                              params  split0_test_score  \
3  {'depth': 10, 'iterations': 150, 'l2_leaf_reg'...           0.735103   
4  {'depth': 1

In [28]:
gs_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_iterations,param_l2_leaf_reg,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
3,2827.030459,9.079513,1.373186,0.782502,10,150,4,0.07,"{'depth': 10, 'iterations': 150, 'l2_leaf_reg'...",0.735103,0.748998,0.752505,0.745535,0.007515,1
4,2834.905129,2.291468,0.659074,0.406151,10,150,10,0.07,"{'depth': 10, 'iterations': 150, 'l2_leaf_reg'...",0.718077,0.730962,0.736473,0.728504,0.007709,2
0,43.716978,1.786057,0.360187,0.132006,4,150,4,0.07,"{'depth': 4, 'iterations': 150, 'l2_leaf_reg':...",0.716575,0.733467,0.729459,0.7265,0.007207,3
1,38.894648,2.826189,0.134575,0.067611,4,150,10,0.07,"{'depth': 4, 'iterations': 150, 'l2_leaf_reg':...",0.709564,0.721944,0.726954,0.719487,0.007309,4
5,1982.43208,1096.855543,0.533187,0.651618,10,150,20,0.07,"{'depth': 10, 'iterations': 150, 'l2_leaf_reg'...",0.705558,0.719439,0.721443,0.71548,0.007063,5
2,97.029415,77.968264,0.536097,0.394589,4,150,20,0.07,"{'depth': 4, 'iterations': 150, 'l2_leaf_reg':...",0.70656,0.716433,0.71493,0.712641,0.004343,6
6,0.290054,0.104142,0.0,0.0,20,150,4,0.07,"{'depth': 20, 'iterations': 150, 'l2_leaf_reg'...",,,,,,7
7,0.398036,0.21977,0.0,0.0,20,150,10,0.07,"{'depth': 20, 'iterations': 150, 'l2_leaf_reg'...",,,,,,8
8,0.250971,0.098238,0.0,0.0,20,150,20,0.07,"{'depth': 20, 'iterations': 150, 'l2_leaf_reg'...",,,,,,9


In [33]:
# y_pred = grid_search.predict(X_test)
# print(y_pred[0:10])

## Winter barley (1510) = Winter cereal
## Winter cereal (1910) = Winter cereal
## Winter wheat (1110) = Winter cereal

y_pred[y_pred=="Winter barley"] = "Winter cereal"
y_pred[y_pred=="Winter wheat"] = "Winter cereal"
y_test[y_test=="Winter barley"] = "Winter cereal"
y_test[y_test=="Winter wheat"] = "Winter cereal"


In [45]:
print("Accuracy on test set: "+str(accuracy_score(y_test,y_pred))[0:5])
prec, rec, fscore, sup = precision_recall_fscore_support(y_test,y_pred)
# plot_confusion_matrix(grid_search,X_test, y_test)
# plt.show()

Accuracy on test set: 0.816
