In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold

In [2]:
class BuffelLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(BuffelLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True)
        self.hidden2out = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        output = self.hidden2out(lstm_out[:, -1, :])
        return output

In [7]:
input_dim = 16
lr = 1e-4
batch_size = 2
hidden_dim = 128
num_epochs = 100

## Prepare Data

In [8]:
## load data
train_data_path = '../buffelgrass-onetime-train.csv'
test_data_path = '../buffelgrass-onetime-test.csv'
train_feature_path = '../buffelgrass-onetime-train.npy'
test_feature_path = '../buffelgrass-onetime-test.npy'
variable_path = '../buffelgrass-onetime-variables.npy'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
train_features = np.load(train_feature_path, allow_pickle=True).astype('float')
test_features = np.load(test_feature_path, allow_pickle=True).astype('float')

## labels
train_labels = torch.FloatTensor(train_data.Abundance_Binary.values)
test_labels = torch.from_numpy(test_data.Abundance_Binary.values)

## precipitation feature
train_features = torch.FloatTensor(train_features)
test_features = torch.FloatTensor(test_features)

## normalization
mean, std = torch.mean(train_features), torch.std(train_features)
train_features = ((train_features-mean)/std)
test_features = (test_features-mean)/std

## torch datasets
test_dataset = TensorDataset(test_features, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [9]:
kf = KFold(n_splits=5)
test_acc = []
test_fp = []
test_fn = []
for i, (train_index, val_index) in enumerate(kf.split(train_features)):
    
    ## split
    print('---------------------')
    print(f'Split: {i+1}...')
    val_features = train_features[val_index]
    val_labels = train_labels[val_index]
    train_sub_features = train_features[train_index]
    train_sub_labels = train_labels[train_index]
        
    ## torch datasets
    train_dataset = TensorDataset(train_sub_features, train_sub_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = TensorDataset(val_features, val_labels)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    ## load model
    model = BuffelLSTM(input_dim, hidden_dim)
    pos_weight = torch.tensor([(1 / torch.cat([train_labels, test_labels]).float().mean()) + 0.1]) 
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    ## training
    highest_acc = 0.0
    for epoch in tqdm(range(num_epochs)):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output.squeeze_(-1), target)
            loss.backward()
            optimizer.step()
    
        ## evaluation
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for data, target in val_loader:
                output = model(data)
                preds = (output.squeeze_(-1) > 0.5).float() 
                all_preds.extend(preds.numpy())
                all_labels.extend(target.numpy())
        tn, fp, fn, tp = confusion_matrix(all_preds, all_labels).ravel()
        acc = 100 * (tn + tp) / len(all_labels)       
        if acc > highest_acc:
            highest_acc = acc
            torch.save(model.state_dict(), "highest_accuracy_model.pth")
            print(f"New highest accuracy: {highest_acc}%. Model saved.")
    
    # load highest performing model
    best_model = BuffelLSTM(input_dim, hidden_dim)
    best_model.load_state_dict(torch.load("highest_accuracy_model.pth"))    
    best_model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data, target in test_loader:
            output = best_model(data)
            preds = (output.squeeze_(-1) > 0.5).float() 
            all_preds.extend(preds.numpy())
            all_labels.extend(target.numpy())
    
    tn, fp, fn, tp = confusion_matrix(all_preds, all_labels).ravel()
    acc = 100 * (tn + tp) / len(all_labels)
    fp_rate = 100 * fp / len(all_labels)
    fn_rate = 100 * fn / len(all_labels)
    
    print(f'Accuracy: {acc}%')
    print(f'FP: {fp_rate}%')
    print(f'FN: {fn_rate}%')
    test_acc.append(acc)
    test_fp.append(fp_rate)
    test_fn.append(fn_rate)    

---------------------
Split: 1...


  1%|          | 1/100 [00:00<01:26,  1.15it/s]

New highest accuracy: 48.38709677419355%. Model saved.


  3%|▎         | 3/100 [00:02<01:26,  1.12it/s]

New highest accuracy: 54.83870967741935%. Model saved.


 19%|█▉        | 19/100 [00:16<01:12,  1.12it/s]

New highest accuracy: 58.064516129032256%. Model saved.


 20%|██        | 20/100 [00:17<01:10,  1.14it/s]

New highest accuracy: 70.96774193548387%. Model saved.


100%|██████████| 100/100 [01:29<00:00,  1.12it/s]


Accuracy: 74.35897435897436%
FP: 7.6923076923076925%
FN: 17.94871794871795%
---------------------
Split: 2...


  1%|          | 1/100 [00:00<01:25,  1.16it/s]

New highest accuracy: 64.51612903225806%. Model saved.


100%|██████████| 100/100 [01:28<00:00,  1.13it/s]


Accuracy: 51.282051282051285%
FP: 48.717948717948715%
FN: 0.0%
---------------------
Split: 3...


  1%|          | 1/100 [00:00<01:22,  1.19it/s]

New highest accuracy: 51.61290322580645%. Model saved.


 24%|██▍       | 24/100 [00:20<01:04,  1.18it/s]

New highest accuracy: 67.74193548387096%. Model saved.


100%|██████████| 100/100 [01:25<00:00,  1.18it/s]


Accuracy: 74.35897435897436%
FP: 5.128205128205129%
FN: 20.512820512820515%
---------------------
Split: 4...


  1%|          | 1/100 [00:00<01:26,  1.14it/s]

New highest accuracy: 54.83870967741935%. Model saved.


 23%|██▎       | 23/100 [00:21<01:08,  1.13it/s]

New highest accuracy: 67.74193548387096%. Model saved.


 36%|███▌      | 36/100 [00:32<00:53,  1.19it/s]

New highest accuracy: 70.96774193548387%. Model saved.


 41%|████      | 41/100 [00:36<00:49,  1.19it/s]

New highest accuracy: 74.19354838709677%. Model saved.


 99%|█████████▉| 99/100 [01:29<00:00,  1.09it/s]

New highest accuracy: 77.41935483870968%. Model saved.


100%|██████████| 100/100 [01:30<00:00,  1.11it/s]


Accuracy: 66.66666666666667%
FP: 25.641025641025642%
FN: 7.6923076923076925%
---------------------
Split: 5...


  1%|          | 1/100 [00:00<01:36,  1.02it/s]

New highest accuracy: 38.70967741935484%. Model saved.


  3%|▎         | 3/100 [00:02<01:36,  1.00it/s]

New highest accuracy: 61.29032258064516%. Model saved.


  4%|▍         | 4/100 [00:03<01:34,  1.02it/s]

New highest accuracy: 64.51612903225806%. Model saved.


 30%|███       | 30/100 [00:28<01:04,  1.08it/s]

New highest accuracy: 70.96774193548387%. Model saved.


100%|██████████| 100/100 [01:36<00:00,  1.04it/s]

Accuracy: 74.35897435897436%
FP: 5.128205128205129%
FN: 20.512820512820515%





In [13]:
np.mean(test_acc), np.std(test_acc)

(68.2051282051282, 8.970695222838925)

In [14]:
np.mean(test_fp), np.std(test_fp)

(18.46153846153846, 16.961882470298693)

In [15]:
np.mean(test_fn), np.std(test_fn)

(13.333333333333334, 8.17301407718422)