# Experimento 03

* Incluimos todas las variables en el modelo
* Calculamos los índices: NDVI1, NDVI2, SAVI
* Elminamos cultivos= 10 y 20
* Calculamos las tasas de diferencia entre 2 meses consecutivos.

En este ejercicio vamos a crear una serie de variables sobre el dataset original. Luego vamos a experimentar con cross validation y grid search para tunear hiperparámetros. Finalmente, reproducimos el dataset en el de predict para generar el archivo de entrega.

1. Generar una variable de NDVI por fecha
2. Filtrar el dataset para quedarnos con las bandas NDVI para cada fecha
3. Separar el dataset de entrenamiento en 3 partes:
  + 3.1 Train
  + 3.2 Validación
  + 3.3 Test
4. Usando los datasets de los puntos 3.1 y 3.2, ajustar una búsqueda por grid search para un Random Forest utilizando cross validation con 5 folds
5. Medir el mejor modelo del punto 4 en el dataset 3.3
6. Reproducir la creación del dataset para `dataset_predict.csv`
7. Generar el archivo `clases.csv` con el formato solicitado

In [146]:
import pandas as pd

In [147]:
df = pd.read_csv('dataset_train.csv')

# 0. Quitamos cultivos 10 y 20

In [148]:
df = df[df.cultivo<10]

## 1. Funciones para generar una variable de NDVI, SAVI por fecha

In [149]:
def indice_normalizado(banda_1, banda_2):
    num = banda_1-banda_2
    den = banda_1+banda_2
    return(num/den)

def indice_SAVI(banda_4, banda_8):
    L=0.5
    num = banda_8-banda_4
    den = banda_8+banda_4+L
    return (num/den*(1.0+L))

# MSAVI=(NIR-Red)/(NIR+Red+L)*(1+L) 
# MSAVI = (2 * Band 4 + 1 – sqrt ((2 * Band 4 + 1)^2 – 8 * (Band 4 – Band 3))) / 2
def indice_MSAVI(banda_3, banda_4):
    return (2 * banda_4 + 1 -  ((2 * banda_4 + 1)**2 - 8 * (banda_4 - banda_3)))**(1/2) / 2

In [150]:
# Creamos los índices NDVI1 para cada mes
df['NDVI_2020-10-01'] = indice_normalizado(df['B8_2020-10-01'], df['B4_2020-10-01'])
df['NDVI_2020-11-01'] = indice_normalizado(df['B8_2020-11-01'], df['B4_2020-11-01'])
df['NDVI_2020-12-01'] = indice_normalizado(df['B8_2020-12-01'], df['B4_2020-12-01'])
df['NDVI_2021-01-01'] = indice_normalizado(df['B8_2021-01-01'], df['B4_2021-01-01'])
df['NDVI_2021-02-20'] = indice_normalizado(df['B8_2021-02-20'], df['B4_2021-02-20'])
df['NDVI_2021-03-17'] = indice_normalizado(df['B8_2021-03-17'], df['B4_2021-03-17'])

# Creamos los índices NDVI2 para cada mes
df['NDVI2_2020-10-01'] = indice_normalizado(df['B8A_2020-10-01'], df['B4_2020-10-01'])
df['NDVI2_2020-11-01'] = indice_normalizado(df['B8A_2020-11-01'], df['B4_2020-11-01'])
df['NDVI2_2020-12-01'] = indice_normalizado(df['B8A_2020-12-01'], df['B4_2020-12-01'])
df['NDVI2_2021-01-01'] = indice_normalizado(df['B8A_2021-01-01'], df['B4_2021-01-01'])
df['NDVI2_2021-02-20'] = indice_normalizado(df['B8A_2021-02-20'], df['B4_2021-02-20'])
df['NDVI2_2021-03-17'] = indice_normalizado(df['B8A_2021-03-17'], df['B4_2021-03-17'])

# Creamos los índices SAVI
df['SAVI_2020-10-01'] = indice_SAVI(df['B4_2020-10-01'], df['B8_2020-10-01'])
df['SAVI_2020-11-01'] = indice_SAVI(df['B4_2020-11-01'], df['B8_2020-11-01'])
df['SAVI_2020-12-01'] = indice_SAVI(df['B4_2020-12-01'], df['B8_2020-12-01'])
df['SAVI_2021-01-01'] = indice_SAVI(df['B4_2021-01-01'], df['B8_2021-01-01'])
df['SAVI_2021-02-20'] = indice_SAVI(df['B4_2021-02-20'], df['B8_2021-02-20'])
df['SAVI_2021-03-17'] = indice_SAVI(df['B4_2021-03-17'], df['B8_2021-03-17'])

# Creamos los índices MSAVI
#df['MSAVI_2020-10-01'] = indice_MSAVI(df['B3_2020-10-01'], df['B4_2020-10-01'])
#df['MSAVI_2020-11-01'] = indice_MSAVI(df['B3_2020-11-01'], df['B4_2020-11-01'])
#df['MSAVI_2020-12-01'] = indice_MSAVI(df['B3_2020-12-01'], df['B4_2020-12-01'])
#df['MSAVI_2021-01-01'] = indice_MSAVI(df['B3_2021-01-01'], df['B4_2021-01-01'])
#df['MSAVI_2021-02-20'] = indice_MSAVI(df['B3_2021-02-20'], df['B4_2021-02-20'])
#df['MSAVI_2021-03-17'] = indice_MSAVI(df['B3_2021-03-17'], df['B4_2021-03-17'])

df.head()

Unnamed: 0,id,cultivo,B2_2020-10-01,B3_2020-10-01,B4_2020-10-01,B8_2020-10-01,B8A_2020-10-01,B11_2020-10-01,B12_2020-10-01,B2_2020-11-01,...,NDVI2_2020-12-01,NDVI2_2021-01-01,NDVI2_2021-02-20,NDVI2_2021-03-17,SAVI_2020-10-01,SAVI_2020-11-01,SAVI_2020-12-01,SAVI_2021-01-01,SAVI_2021-02-20,SAVI_2021-03-17
0,0,1,0.1009,0.1416,0.196,0.2728,0.2809,0.4176,0.3401,0.1122,...,0.207093,0.83772,0.933302,0.623182,0.11891,0.112931,0.135356,0.615502,0.773583,0.416925
1,1,1,0.1032,0.141,0.1974,0.2732,0.2819,0.4179,0.3413,0.1132,...,0.20697,0.827558,0.933307,0.624167,0.117144,0.112691,0.136554,0.612,0.767937,0.415345
2,2,1,0.1052,0.1422,0.1976,0.272,0.2813,0.4185,0.3415,0.1124,...,0.201994,0.807046,0.935169,0.63014,0.115099,0.115385,0.136201,0.581708,0.776534,0.416335
3,3,1,0.1001,0.1366,0.187,0.2624,0.2779,0.4171,0.3374,0.1118,...,0.210557,0.830824,0.933365,0.639012,0.119128,0.110486,0.135846,0.616155,0.770663,0.433227
4,4,1,0.098,0.1356,0.193,0.2662,0.2782,0.4164,0.3374,0.1086,...,0.21423,0.812194,0.930044,0.628649,0.11447,0.112315,0.139791,0.58685,0.772824,0.428888


## 1.1 Tasa de crecimiento

In [151]:
# [(Periodo n/Periodo base) - 1] 
def tasa(per_n, per_base):
    return ((per_n/per_base)-1)

    
    

In [152]:
df['TASA1-B2-11-01'] = tasa(df['B2_2020-11-01'], df['B2_2020-10-01'])
df['TASA1-B2-12-01'] = tasa(df['B2_2020-12-01'], df['B2_2020-11-01'])
df['TASA1-B2-01-01'] = tasa(df['B2_2021-01-01'], df['B2_2020-12-01'])
df['TASA1-B2-02-20'] = tasa(df['B2_2021-02-20'], df['B2_2021-01-01'])
df['TASA1-B2-03-17'] = tasa(df['B2_2021-03-17'], df['B2_2021-02-20'])

df['TASA1-B3-11-01'] = tasa(df['B3_2020-11-01'], df['B3_2020-10-01'])
df['TASA1-B3-12-01'] = tasa(df['B3_2020-12-01'], df['B3_2020-11-01'])
df['TASA1-B3-01-01'] = tasa(df['B3_2021-01-01'], df['B3_2020-12-01'])
df['TASA1-B3-02-20'] = tasa(df['B3_2021-02-20'], df['B3_2021-01-01'])
df['TASA1-B3-03-17'] = tasa(df['B3_2021-03-17'], df['B3_2021-02-20'])

df['TASA1-B4-11-01'] = tasa(df['B4_2020-11-01'], df['B4_2020-10-01'])
df['TASA1-B4-12-01'] = tasa(df['B4_2020-12-01'], df['B4_2020-11-01'])
df['TASA1-B4-01-01'] = tasa(df['B4_2021-01-01'], df['B4_2020-12-01'])
df['TASA1-B4-02-20'] = tasa(df['B4_2021-02-20'], df['B4_2021-01-01'])
df['TASA1-B4-03-17'] = tasa(df['B4_2021-03-17'], df['B4_2021-02-20'])

df['TASA1-B8-11-01'] = tasa(df['B8_2020-11-01'], df['B8_2020-10-01'])
df['TASA1-B8-12-01'] = tasa(df['B8_2020-12-01'], df['B8_2020-11-01'])
df['TASA1-B8-01-01'] = tasa(df['B8_2021-01-01'], df['B8_2020-12-01'])
df['TASA1-B8-02-20'] = tasa(df['B8_2021-02-20'], df['B8_2021-01-01'])
df['TASA1-B8-03-17'] = tasa(df['B8_2021-03-17'], df['B8_2021-02-20'])

df['TASA1-B8A-11-01'] = tasa(df['B8A_2020-11-01'], df['B8A_2020-10-01'])
df['TASA1-B8A-12-01'] = tasa(df['B8A_2020-12-01'], df['B8A_2020-11-01'])
df['TASA1-B8A-01-01'] = tasa(df['B8A_2021-01-01'], df['B8A_2020-12-01'])
df['TASA1-B8A-02-20'] = tasa(df['B8A_2021-02-20'], df['B8A_2021-01-01'])
df['TASA1-B8A-03-17'] = tasa(df['B8A_2021-03-17'], df['B8A_2021-02-20'])

df['TASA1-B11-11-01'] = tasa(df['B11_2020-11-01'], df['B11_2020-10-01'])
df['TASA1-B11-12-01'] = tasa(df['B11_2020-12-01'], df['B11_2020-11-01'])
df['TASA1-B11-01-01'] = tasa(df['B11_2021-01-01'], df['B11_2020-12-01'])
df['TASA1-B11-02-20'] = tasa(df['B11_2021-02-20'], df['B11_2021-01-01'])
df['TASA1-B11-03-17'] = tasa(df['B11_2021-03-17'], df['B11_2021-02-20'])

df['TASA1-B12-11-01'] = tasa(df['B12_2020-11-01'], df['B12_2020-10-01'])
df['TASA1-B12-12-01'] = tasa(df['B12_2020-12-01'], df['B12_2020-11-01'])
df['TASA1-B12-01-01'] = tasa(df['B12_2021-01-01'], df['B12_2020-12-01'])
df['TASA1-B12-02-20'] = tasa(df['B12_2021-02-20'], df['B12_2021-01-01'])
df['TASA1-B12-03-17'] = tasa(df['B12_2021-03-17'], df['B12_2021-02-20'])

df.describe()

Unnamed: 0,id,cultivo,B2_2020-10-01,B3_2020-10-01,B4_2020-10-01,B8_2020-10-01,B8A_2020-10-01,B11_2020-10-01,B12_2020-10-01,B2_2020-11-01,...,TASA1-B11-11-01,TASA1-B11-12-01,TASA1-B11-01-01,TASA1-B11-02-20,TASA1-B11-03-17,TASA1-B12-11-01,TASA1-B12-12-01,TASA1-B12-01-01,TASA1-B12-02-20,TASA1-B12-03-17
count,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,...,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0
mean,5610.290774,2.364086,0.072882,0.095148,0.11819,0.253254,0.261156,0.305753,0.240738,0.071528,...,0.023641,0.266877,-0.239305,-0.090642,0.029679,0.059785,0.282004,-0.328617,-0.036677,0.193148
std,3219.372896,1.166675,0.025654,0.02811,0.0501,0.047344,0.046216,0.08311,0.089762,0.02642,...,0.182934,0.630268,0.181579,0.34039,0.198702,0.279856,0.769273,0.268796,0.654361,0.407614
min,0.0,1.0,0.02165,0.0371,0.0203,0.159,0.1688,0.1198,0.0555,0.0211,...,-0.916458,-0.502723,-0.645075,-0.617812,-0.485315,-0.902374,-0.673805,-0.81782,-0.795064,-0.608206
25%,2813.25,1.0,0.0472,0.0683,0.06645,0.221,0.2299,0.2325,0.147925,0.0474,...,-0.076985,-0.18129,-0.364421,-0.365405,-0.091544,-0.1017,-0.301574,-0.568546,-0.552184,-0.068668
50%,5671.5,2.0,0.0783,0.100725,0.136,0.2432,0.25205,0.3276,0.2906,0.0716,...,0.013177,0.017216,-0.23112,-0.164155,-0.0091,0.052325,0.007662,-0.313349,-0.244827,0.102025
75%,8427.75,3.0,0.0953,0.1182,0.158475,0.2764,0.2845,0.376187,0.3127,0.0944,...,0.085825,0.740149,-0.128038,0.156016,0.110903,0.134253,0.83566,-0.148525,0.357718,0.366593
max,11153.0,5.0,0.1588,0.1848,0.239,0.5176,0.5007,0.4598,0.3865,0.1446,...,0.958696,6.627907,0.377362,1.419456,1.094096,1.386279,6.083333,0.636975,3.132108,2.370371


## 2. No Filtrar el dataset para quedarnos con las bandas NDVI para cada fecha

In [153]:
#df_filter = df.filter(regex=r'(NDVI|id|cultivo)')
#print(df_filter.columns)
#df_filter.head()

# No filtramos, usamos todas las variables
df_filter = df

## 3. Separar el dataset de entrenamiento en 3 partes

In [154]:
from sklearn.model_selection import train_test_split

In [155]:
# Let's say we want to split the data in 80:10:10 for train:valid:test dataset
train_size=0.8

X = df_filter.drop(['id', 'cultivo'], axis=1).copy()
y = df_filter['cultivo']

# In the first step we will split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8)

# Now since we want the valid and test size to be equal (10% each of overall data). 
# we have to define valid_size=0.5 (that is 50% of remaining data)
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

(6798, 95)
(6798,)
(850, 95)
(850,)
(850, 95)
(850,)


(None, None)

## 4. Usando los datasets de los puntos 2.1 y 2.2, ajustar una búsqueda por grid search para un Random Forest utilizando cross validation con 5 folds

In [156]:
grid = { 
    'n_estimators': [2,4,8],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [2,4,6],
    'criterion' :['gini', 'entropy'],
    'random_state' : [18]
}

In [157]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf_cv = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid, cv= 5, verbose=2, refit=True)
rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18 
[CV]  criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18 
[CV]  criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18 
[CV]  criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s



[CV]  criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18 
[CV]  criterion=gini, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18, total=   0.0s
[CV] criterion=gini, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18 
[CV]  criterion=gini, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18 
[CV]  criterion=gini, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18 
[CV]  criterion=gini, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18 
[CV]  criterion=gini, max_depth=2, max_features=sqrt, n_estimato

[CV]  criterion=gini, max_depth=4, max_features=log2, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=4, max_features=log2, n_estimators=2, random_state=18 
[CV]  criterion=gini, max_depth=4, max_features=log2, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=4, max_features=log2, n_estimators=4, random_state=18 
[CV]  criterion=gini, max_depth=4, max_features=log2, n_estimators=4, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=4, max_features=log2, n_estimators=4, random_state=18 
[CV]  criterion=gini, max_depth=4, max_features=log2, n_estimators=4, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=4, max_features=log2, n_estimators=4, random_state=18 
[CV]  criterion=gini, max_depth=4, max_features=log2, n_estimators=4, random_state=18, total=   0.1s
[CV] criterion=gini, max_depth=4, max_features=log2, n_estimators=4, random_state=18 
[CV]  criterion=gini, max_depth=4, max_features=log2, n_estimator

[CV]  criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18 
[CV]  criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18 
[CV]  criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18 
[CV]  criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18, total=   0.1s
[CV] criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18 
[CV]  criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18, total=   0.1s
[CV] criterion=entropy, max_depth=2, max_features=sqrt, n_estimators=4, random_state=18 
[CV]  criterion=entropy, max_depth=

[CV]  criterion=entropy, max_depth=4, max_features=log2, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=entropy, max_depth=4, max_features=log2, n_estimators=2, random_state=18 
[CV]  criterion=entropy, max_depth=4, max_features=log2, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=entropy, max_depth=4, max_features=log2, n_estimators=2, random_state=18 
[CV]  criterion=entropy, max_depth=4, max_features=log2, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=entropy, max_depth=4, max_features=log2, n_estimators=2, random_state=18 
[CV]  criterion=entropy, max_depth=4, max_features=log2, n_estimators=2, random_state=18, total=   0.1s
[CV] criterion=entropy, max_depth=4, max_features=log2, n_estimators=4, random_state=18 
[CV]  criterion=entropy, max_depth=4, max_features=log2, n_estimators=4, random_state=18, total=   0.2s
[CV] criterion=entropy, max_depth=4, max_features=log2, n_estimators=4, random_state=18 
[CV]  criterion=entropy, max_depth=

[CV]  criterion=entropy, max_depth=6, max_features=log2, n_estimators=8, random_state=18, total=   0.4s


[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   37.1s finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 4, 6],
                         'max_features': ['sqrt', 'log2'],
                         'n_estimators': [2, 4, 8], 'random_state': [18]},
             verbose=2)

In [158]:
rf_cv.best_params_

{'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 'sqrt',
 'n_estimators': 8,
 'random_state': 18}

In [159]:
from sklearn.metrics import confusion_matrix
y_test_pred = rf_cv.predict(X_test)
confusion_matrix(y_test, y_test_pred)

array([[240,   1,   0,   1,   0],
       [  0, 190,   0,   0,   0],
       [  0,   3, 283,   0,   0],
       [  2,   1,   3,  60,   0],
       [  0,   0,   2,   0,  64]], dtype=int64)

In [160]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test_pred)

0.9847058823529412

## 6. Reproducir la creación del dataset para dataset_predict.csv

In [161]:
df_test = pd.read_csv('dataset_predict.csv')

# Creamos los índices NDVI1 para cada mes
df_test['NDVI_2020-10-01'] = indice_normalizado(df_test['B8_2020-10-01'], df_test['B4_2020-10-01'])
df_test['NDVI_2020-11-01'] = indice_normalizado(df_test['B8_2020-11-01'], df_test['B4_2020-11-01'])
df_test['NDVI_2020-12-01'] = indice_normalizado(df_test['B8_2020-12-01'], df_test['B4_2020-12-01'])
df_test['NDVI_2021-01-01'] = indice_normalizado(df_test['B8_2021-01-01'], df_test['B4_2021-01-01'])
df_test['NDVI_2021-02-20'] = indice_normalizado(df_test['B8_2021-02-20'], df_test['B4_2021-02-20'])
df_test['NDVI_2021-03-17'] = indice_normalizado(df_test['B8_2021-03-17'], df_test['B4_2021-03-17'])

# Creamos los índices NDVI2 para cada mes
df_test['NDVI2_2020-10-01'] = indice_normalizado(df_test['B8A_2020-10-01'], df_test['B4_2020-10-01'])
df_test['NDVI2_2020-11-01'] = indice_normalizado(df_test['B8A_2020-11-01'], df_test['B4_2020-11-01'])
df_test['NDVI2_2020-12-01'] = indice_normalizado(df_test['B8A_2020-12-01'], df_test['B4_2020-12-01'])
df_test['NDVI2_2021-01-01'] = indice_normalizado(df_test['B8A_2021-01-01'], df_test['B4_2021-01-01'])
df_test['NDVI2_2021-02-20'] = indice_normalizado(df_test['B8A_2021-02-20'], df_test['B4_2021-02-20'])
df_test['NDVI2_2021-03-17'] = indice_normalizado(df_test['B8A_2021-03-17'], df_test['B4_2021-03-17'])

# Creamos los índices SAVI
df_test['SAVI_2020-10-01'] = indice_SAVI(df_test['B4_2020-10-01'], df_test['B8_2020-10-01'])
df_test['SAVI_2020-11-01'] = indice_SAVI(df_test['B4_2020-11-01'], df_test['B8_2020-11-01'])
df_test['SAVI_2020-12-01'] = indice_SAVI(df_test['B4_2020-12-01'], df_test['B8_2020-12-01'])
df_test['SAVI_2021-01-01'] = indice_SAVI(df_test['B4_2021-01-01'], df_test['B8_2021-01-01'])
df_test['SAVI_2021-02-20'] = indice_SAVI(df_test['B4_2021-02-20'], df_test['B8_2021-02-20'])
df_test['SAVI_2021-03-17'] = indice_SAVI(df_test['B4_2021-03-17'], df_test['B8_2021-03-17'])

# Creamos los índices MSAVI
#df_test['MSAVI_2020-10-01'] = indice_MSAVI(df_test['B3_2020-10-01'], df_test['B4_2020-10-01'])
#df_test['MSAVI_2020-11-01'] = indice_MSAVI(df_test['B3_2020-11-01'], df_test['B4_2020-11-01'])
#df_test['MSAVI_2020-12-01'] = indice_MSAVI(df_test['B3_2020-12-01'], df_test['B4_2020-12-01'])
#df_test['MSAVI_2021-01-01'] = indice_MSAVI(df_test['B3_2021-01-01'], df_test['B4_2021-01-01'])
#df_test['MSAVI_2021-02-20'] = indice_MSAVI(df_test['B3_2021-02-20'], df_test['B4_2021-02-20'])
#df_test['MSAVI_2021-03-17'] = indice_MSAVI(df_test['B3_2021-03-17'], df_test['B4_2021-03-17'])

In [162]:
df_test['TASA1-B2-11-01'] = tasa(df_test['B2_2020-11-01'], df_test['B2_2020-10-01'])
df_test['TASA1-B2-12-01'] = tasa(df_test['B2_2020-12-01'], df_test['B2_2020-11-01'])
df_test['TASA1-B2-01-01'] = tasa(df_test['B2_2021-01-01'], df_test['B2_2020-12-01'])
df_test['TASA1-B2-02-20'] = tasa(df_test['B2_2021-02-20'], df_test['B2_2021-01-01'])
df_test['TASA1-B2-03-17'] = tasa(df_test['B2_2021-03-17'], df_test['B2_2021-02-20'])

df_test['TASA1-B3-11-01'] = tasa(df_test['B3_2020-11-01'], df_test['B3_2020-10-01'])
df_test['TASA1-B3-12-01'] = tasa(df_test['B3_2020-12-01'], df_test['B3_2020-11-01'])
df_test['TASA1-B3-01-01'] = tasa(df_test['B3_2021-01-01'], df_test['B3_2020-12-01'])
df_test['TASA1-B3-02-20'] = tasa(df_test['B3_2021-02-20'], df_test['B3_2021-01-01'])
df_test['TASA1-B3-03-17'] = tasa(df_test['B3_2021-03-17'], df_test['B3_2021-02-20'])

df_test['TASA1-B4-11-01'] = tasa(df_test['B4_2020-11-01'], df_test['B4_2020-10-01'])
df_test['TASA1-B4-12-01'] = tasa(df_test['B4_2020-12-01'], df_test['B4_2020-11-01'])
df_test['TASA1-B4-01-01'] = tasa(df_test['B4_2021-01-01'], df_test['B4_2020-12-01'])
df_test['TASA1-B4-02-20'] = tasa(df_test['B4_2021-02-20'], df_test['B4_2021-01-01'])
df_test['TASA1-B4-03-17'] = tasa(df_test['B4_2021-03-17'], df_test['B4_2021-02-20'])

df_test['TASA1-B8-11-01'] = tasa(df_test['B8_2020-11-01'], df_test['B8_2020-10-01'])
df_test['TASA1-B8-12-01'] = tasa(df_test['B8_2020-12-01'], df_test['B8_2020-11-01'])
df_test['TASA1-B8-01-01'] = tasa(df_test['B8_2021-01-01'], df_test['B8_2020-12-01'])
df_test['TASA1-B8-02-20'] = tasa(df_test['B8_2021-02-20'], df_test['B8_2021-01-01'])
df_test['TASA1-B8-03-17'] = tasa(df_test['B8_2021-03-17'], df_test['B8_2021-02-20'])

df_test['TASA1-B8A-11-01'] = tasa(df_test['B8A_2020-11-01'], df_test['B8A_2020-10-01'])
df_test['TASA1-B8A-12-01'] = tasa(df_test['B8A_2020-12-01'], df_test['B8A_2020-11-01'])
df_test['TASA1-B8A-01-01'] = tasa(df_test['B8A_2021-01-01'], df_test['B8A_2020-12-01'])
df_test['TASA1-B8A-02-20'] = tasa(df_test['B8A_2021-02-20'], df_test['B8A_2021-01-01'])
df_test['TASA1-B8A-03-17'] = tasa(df_test['B8A_2021-03-17'], df_test['B8A_2021-02-20'])

df_test['TASA1-B11-11-01'] = tasa(df_test['B11_2020-11-01'], df_test['B11_2020-10-01'])
df_test['TASA1-B11-12-01'] = tasa(df_test['B11_2020-12-01'], df_test['B11_2020-11-01'])
df_test['TASA1-B11-01-01'] = tasa(df_test['B11_2021-01-01'], df_test['B11_2020-12-01'])
df_test['TASA1-B11-02-20'] = tasa(df_test['B11_2021-02-20'], df_test['B11_2021-01-01'])
df_test['TASA1-B11-03-17'] = tasa(df_test['B11_2021-03-17'], df_test['B11_2021-02-20'])

df_test['TASA1-B12-11-01'] = tasa(df_test['B12_2020-11-01'], df_test['B12_2020-10-01'])
df_test['TASA1-B12-12-01'] = tasa(df_test['B12_2020-12-01'], df_test['B12_2020-11-01'])
df_test['TASA1-B12-01-01'] = tasa(df_test['B12_2021-01-01'], df_test['B12_2020-12-01'])
df_test['TASA1-B12-02-20'] = tasa(df_test['B12_2021-02-20'], df_test['B12_2021-01-01'])
df_test['TASA1-B12-03-17'] = tasa(df_test['B12_2021-03-17'], df_test['B12_2021-02-20'])


df.describe()

Unnamed: 0,id,cultivo,B2_2020-10-01,B3_2020-10-01,B4_2020-10-01,B8_2020-10-01,B8A_2020-10-01,B11_2020-10-01,B12_2020-10-01,B2_2020-11-01,...,TASA1-B11-11-01,TASA1-B11-12-01,TASA1-B11-01-01,TASA1-B11-02-20,TASA1-B11-03-17,TASA1-B12-11-01,TASA1-B12-12-01,TASA1-B12-01-01,TASA1-B12-02-20,TASA1-B12-03-17
count,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,...,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0,8498.0
mean,5610.290774,2.364086,0.072882,0.095148,0.11819,0.253254,0.261156,0.305753,0.240738,0.071528,...,0.023641,0.266877,-0.239305,-0.090642,0.029679,0.059785,0.282004,-0.328617,-0.036677,0.193148
std,3219.372896,1.166675,0.025654,0.02811,0.0501,0.047344,0.046216,0.08311,0.089762,0.02642,...,0.182934,0.630268,0.181579,0.34039,0.198702,0.279856,0.769273,0.268796,0.654361,0.407614
min,0.0,1.0,0.02165,0.0371,0.0203,0.159,0.1688,0.1198,0.0555,0.0211,...,-0.916458,-0.502723,-0.645075,-0.617812,-0.485315,-0.902374,-0.673805,-0.81782,-0.795064,-0.608206
25%,2813.25,1.0,0.0472,0.0683,0.06645,0.221,0.2299,0.2325,0.147925,0.0474,...,-0.076985,-0.18129,-0.364421,-0.365405,-0.091544,-0.1017,-0.301574,-0.568546,-0.552184,-0.068668
50%,5671.5,2.0,0.0783,0.100725,0.136,0.2432,0.25205,0.3276,0.2906,0.0716,...,0.013177,0.017216,-0.23112,-0.164155,-0.0091,0.052325,0.007662,-0.313349,-0.244827,0.102025
75%,8427.75,3.0,0.0953,0.1182,0.158475,0.2764,0.2845,0.376187,0.3127,0.0944,...,0.085825,0.740149,-0.128038,0.156016,0.110903,0.134253,0.83566,-0.148525,0.357718,0.366593
max,11153.0,5.0,0.1588,0.1848,0.239,0.5176,0.5007,0.4598,0.3865,0.1446,...,0.958696,6.627907,0.377362,1.419456,1.094096,1.386279,6.083333,0.636975,3.132108,2.370371


In [163]:
#df_test = df_test.filter(regex=r'(NDVI|id)')
#print(df_test.columns)
#df_test.head()

In [164]:
X_pred = df_test.drop(['id'], axis=1)
df_test['cultivo'] = rf_cv.predict(X_pred)
df_test.head()

Unnamed: 0,id,B2_2020-10-01,B3_2020-10-01,B4_2020-10-01,B8_2020-10-01,B8A_2020-10-01,B11_2020-10-01,B12_2020-10-01,B2_2020-11-01,B3_2020-11-01,...,TASA1-B11-12-01,TASA1-B11-01-01,TASA1-B11-02-20,TASA1-B11-03-17,TASA1-B12-11-01,TASA1-B12-12-01,TASA1-B12-01-01,TASA1-B12-02-20,TASA1-B12-03-17,cultivo
0,381,0.048,0.0672,0.0714,0.2906,0.3037,0.2367,0.1566,0.065,0.0844,...,0.632599,-0.293228,-0.281857,-0.070839,0.246488,0.689549,-0.436628,-0.472282,0.0,1
1,382,0.048,0.0675,0.0739,0.2876,0.2991,0.2413,0.1592,0.0624,0.0833,...,0.633872,-0.296484,-0.284548,-0.070548,0.238065,0.689498,-0.442342,-0.468767,-0.008616,1
2,383,0.0469,0.0689,0.0713,0.2938,0.3025,0.2413,0.1592,0.0641,0.0845,...,0.625655,-0.285944,-0.297358,-0.064191,0.248744,0.667002,-0.427882,-0.481013,-0.005589,1
3,384,0.0436,0.0695,0.0664,0.2872,0.301,0.2365,0.1542,0.063,0.0832,...,0.645198,-0.30461,-0.281812,-0.045686,0.262646,0.680021,-0.444512,-0.463676,-0.001539,1
4,385,0.0456,0.0679,0.07,0.2802,0.2921,0.2451,0.1624,0.0649,0.0844,...,0.613755,-0.313061,-0.270288,-0.061581,0.253695,0.616896,-0.45322,-0.463611,0.008286,1


## 7. Generar el archivo clases.csv con el formato solicitado

In [165]:
df_test[['id', 'cultivo']].to_csv('03_clases.csv', index=False)