# Training RandomForest and TSAI Models

**Author:** Ivan Zvonkov 

**Last updated:** March 25, 2023

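**Description:** This notebook loads crop-mask data from the `cropharvest-private` bucket and demonstrates training two models on it: a scikit-learn Random Forest classifier and a tsai LSTM. Both are evaluated on the Togo validation and testing subsets.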

In [14]:
import ast

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm

## Loading data

In [38]:
# Crop-mask table stored in the `cropharvest-private` bucket
# (the file name suggests a 2023-03 snapshot).
CROP_MASK_CSV = "gs://cropharvest-private/crop-mask/2023-03.csv"
df = pd.read_csv(CROP_MASK_CSV)

In [39]:
# Register `progress_apply` on pandas objects so the slow parsing step shows a progress bar.
tqdm.pandas()


def parse_eo_data(raw):
    """Turn one `eo_data` cell into a NumPy array.

    The CSV stores each cell as the text of a Python literal (presumably a
    nested list of numbers). `ast.literal_eval` is used instead of `eval`
    because it only accepts literals, so a tampered or corrupted cell cannot
    execute arbitrary code in the kernel.

    Values that are not strings are returned unchanged, which makes this cell
    safe to re-run after `eo_data` has already been parsed.
    """
    if not isinstance(raw, str):
        return raw
    return np.array(ast.literal_eval(raw))


df["eo_data"] = df["eo_data"].progress_apply(parse_eo_data)

# Length of the first axis of every parsed array; used later to drop short series.
df["timesteps"] = df["eo_data"].apply(lambda x: x.shape[0])

100%|███████████████████████████████████| 98845/98845 [01:14<00:00, 1333.89it/s]


## Splitting data

In [40]:
# Togo rows provide both held-out subsets and the latitude band used for training.
is_togo = df["name"] == "Togo"
val_df = df[is_togo & (df["subset"] == "validation")]
test_df = df[is_togo & (df["subset"] == "testing")]

# Filter out validation and test data
is_held_out = df.index.isin(val_df.index) | df.index.isin(test_df.index)

# Keep only data within Togo latitudes (both bounds inclusive)
togo_lat = df.loc[is_togo, "lat"]
in_togo_lat_band = (df["lat"] >= togo_lat.min()) & (df["lat"] <= togo_lat.max())

# Keep only data that goes at least from February to February
has_enough_timesteps = df["timesteps"] > 14

train_df = df[~is_held_out & in_togo_lat_band & has_enough_timesteps]

## Random Forest Classifier

In [None]:
def generate_X_y(df, start=2, end=14):
    """Build flat feature vectors and labels for scikit-learn estimators.

    Args:
        df: DataFrame with an `eo_data` column holding one NumPy array per row
            and an `is_crop` label column.
        start: First index (inclusive) of the window kept along the first axis
            of each `eo_data` array.
        end: Last index (exclusive) of that window. The defaults keep indices
            2 through 13.

    Returns:
        A tuple `(X, y)`: `X` is a list with one flattened 1-D array per row,
        `y` is the list of `is_crop` values in the same row order.
    """
    X = [eo[start:end].flatten() for eo in df["eo_data"]]
    y = df["is_crop"].to_list()
    return X, y

# Same feature/label extraction for each of the three subsets.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    generate_X_y(subset_df) for subset_df in (train_df, val_df, test_df)
)

In [7]:
# Fixed `random_state` so the fitted forest is repeatable between runs.
model = RandomForestClassifier(random_state=0)
model.fit(X=X_train, y=y_train)

In [8]:
# F1 of the Random Forest on the Togo validation subset.
y_pred = model.predict(X_val)
f1_score(y_true=y_val, y_pred=y_pred)

0.8528301886792453

In [9]:
# F1 of the Random Forest on the Togo testing subset.
y_pred = model.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.7747747747747749

## LSTM

In [134]:
!pip install tsai -q

In [83]:
from tsai.all import TSClassifier, TSClassification, TSStandardize, LSTM

In [94]:
def generate_X_y(train_df, val_df, test_df, start=2, end=14):
    """Stack the three subsets into single arrays plus index splits.

    NOTE: this reuses the name of the single-DataFrame `generate_X_y` defined
    in the Random Forest section and replaces it once this cell has run, so
    the Random Forest cells must be executed before this one.

    Args:
        train_df, val_df, test_df: DataFrames with an `eo_data` column holding
            one NumPy array per row and an `is_crop` label column.
        start: First index (inclusive) of the window kept along the first axis
            of each `eo_data` array.
        end: Last index (exclusive) of that window. The defaults keep indices
            2 through 13.

    Returns:
        A tuple `(X, y, splits)`: `X` stacks the windowed (unflattened) arrays
        of all rows in train, val, test order; `y` is the matching array of
        `is_crop` values; `splits` is a list of three lists of row positions
        into `X`/`y`, one per subset in the same order.
    """
    frames = (train_df, val_df, test_df)
    df = pd.concat(frames)
    X = np.array([eo[start:end] for eo in df["eo_data"]])
    y = np.array(df["is_crop"].to_list())

    # Rows were concatenated in order, so each subset occupies one contiguous run of positions.
    splits = []
    offset = 0
    for frame in frames:
        splits.append(list(range(offset, offset + len(frame))))
        offset += len(frame)
    return X, y, splits

# Stacked inputs, labels and train/val/test index lists for the tsai classifier.
X, y, splits = generate_X_y(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
)

In [124]:
def f1(outputs, target):
    """F1 score of the arg-max class predictions against `target`.

    Args:
        outputs: Per-class scores with classes on the last axis; the predicted
            class is the arg-max over that axis.
        target: True class indices, one per sample.

    Returns:
        The value of `sklearn.metrics.f1_score(target, predictions)`.

    NOTE(review): when passed as a metric function this is presumably evaluated
    per batch and averaged, which is not the same as F1 over the whole
    validation set. Confirm against the fastai/tsai metric handling.
    """
    preds = outputs.argmax(dim=-1)
    # scikit-learn converts its inputs to NumPy, which fails for tensors that live
    # on a GPU, so detach them and move them to host memory first.
    if hasattr(preds, "cpu"):
        preds = preds.detach().cpu()
    if hasattr(target, "cpu"):
        target = target.detach().cpu()
    return f1_score(target, preds)

# Two-slot transform list: nothing in the first slot, `TSClassification()` in the second
# (presumably inputs first, labels second; confirm against the tsai docs).
tfms = [None, TSClassification()]
batch_tfms = TSStandardize(by_sample=True)

clf = TSClassifier(
    X,
    y,
    splits=splits,
    arch=LSTM,
    tfms=tfms,
    batch_tfms=batch_tfms,
    metrics=f1,
)

# 10 epochs with 1e-2 passed as the second positional argument (the learning rate).
clf.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,f1,time
0,0.546857,1.018826,0.0,00:01
1,0.475215,0.589971,0.456728,00:01
2,0.43546,0.604516,0.511409,00:01
3,0.414392,0.505568,0.576826,00:01
4,0.386951,0.515112,0.604861,00:01
5,0.361196,0.474702,0.56618,00:01
6,0.348561,0.464301,0.56117,00:01
7,0.325294,0.429301,0.612307,00:01
8,0.303064,0.421301,0.605343,00:01
9,0.288346,0.421249,0.600905,00:01


In [125]:
# Evaluate on the last split, which `generate_X_y` builds from `test_df`.
test_idx = splits[-1]
probas, target, preds = clf.get_X_preds(X[test_idx], y[test_idx])

# `preds` is compared with the string "True" to obtain 0/1 predictions for scikit-learn.
lstm_test_pred = (preds == "True").astype(int)
f1_score(target, lstm_test_pred)

0.7423580786026202