In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, time
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import random
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

In [19]:
df = pd.read_excel('../data/GNP_Aerial_counting_1969_2022.xlsx')

In [20]:
empty_cols = ['MALE', 'CALVES'] #columns that are empty
zero_cols = ['LINE2002', 'LINE2012', 'COLLAR', 'CONSERVANC', 'SANCTUARY'] #columns that are > 80% just 0s
drop_cols = ['NOTES'] # other columns to drop
df.drop(columns=empty_cols, inplace=True)
df.drop(columns=zero_cols, inplace=True)
df.drop(columns=drop_cols, inplace=True)

In [21]:
df['TIME'] = df['TIME'].apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second if pd.notna(x) else x)
df['TIME'] = df['TIME'].fillna(0)

In [22]:
df['TYPE'] = df['TYPE'].map({'Fixed-wing': 0, 'Helicopter': 1})

In [14]:
#zero_count = (df['COUNT_DAY'] == 0).sum()
#print(zero_count / df.shape[0])

In [23]:
df['DATE'] = df['DATE'].apply(lambda t: t.day if isinstance(t, datetime) else np.nan)

def process_date(val):
    if pd.isna(val):
        return np.nan
    elif isinstance(val, str):
        return float(val.split('/')[1])
    elif isinstance(val, datetime):
        return val.day
    else:
        return float(val)

df['DATE'] = df['DATE'].apply(process_date)

In [24]:
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}

df['MONTH'] = df['MONTH'].map(month_mapping)
df['MONTH'] = pd.to_numeric(df['MONTH'], errors='coerce')

In [25]:
df['lat_lag1'] = df.groupby('SPECIES')['LATITUDE'].shift(1)
df['lon_lag1'] = df.groupby('SPECIES')['LONGITUDE'].shift(1)
df['lat_lag2'] = df.groupby('SPECIES')['LATITUDE'].shift(2)
df['lon_lag2'] = df.groupby('SPECIES')['LONGITUDE'].shift(2)
df['count_lag1'] = df.groupby('SPECIES')['COUNT_DAY'].shift(1)
df['count_lag2'] = df.groupby('SPECIES')['COUNT_DAY'].shift(2)

In [26]:
df['SPECIES'] = df['SPECIES'].str.lower()
df['STRATUM'] = df['STRATUM'].str.lower()
df = pd.get_dummies(df, columns=['SPECIES', 'STRATUM'])

In [10]:
'''
correlation_matrix = df.corr()
plt.figure(figsize=(25, 25)) 
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix Heatmap")
plt.show()
'''

'\ncorrelation_matrix = df.corr()\nplt.figure(figsize=(25, 25)) \nsns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")\nplt.title("Correlation Matrix Heatmap")\nplt.show()\n'

In [10]:
df.to_csv('GNB1969-2022.csv', index=False)

### Non-DL Model

#### Run Cross-Val

In [27]:
# Get Train Test Split
train_df = df[df['COUNT'] != 2022]
test_df = df[df['COUNT'] == 2022]

#fillna with mean
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())

X_train = train_df.drop(columns=['ID', 'LATITUDE', 'LONGITUDE', 'COUNT_DAY'])
y_train = train_df[['COUNT_DAY', 'LATITUDE', 'LONGITUDE']]
X_test = test_df.drop(columns=['ID', 'LATITUDE', 'LONGITUDE', 'COUNT_DAY'])
y_test = test_df[['COUNT_DAY', 'LATITUDE', 'LONGITUDE']]

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Train on Full Dataset (Excluding 2022 for testing)

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler



# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.FloatTensor(y_train.values)
X_test_tensor = torch.FloatTensor(X_test_scaled)
y_test_tensor = torch.FloatTensor(y_test.values)

# Reshape input to be [samples, time steps, features]
X_train_tensor = X_train_tensor.unsqueeze(1)
X_test_tensor = X_test_tensor.unsqueeze(1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define the LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Instantiate the model
input_size = X_train.shape[1]
hidden_size = 64
num_layers = 2
output_size = 3  # Number of target variables

model = LSTMModel(input_size, hidden_size, num_layers, output_size)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Training loop
num_epochs = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()
with torch.no_grad():
    X_test_tensor = X_test_tensor.to(device)
    predictions = model(X_test_tensor)
    test_loss = criterion(predictions, y_test_tensor.to(device))
    print(f'Test Loss: {test_loss.item():.4f}')

# Convert predictions back to numpy for further analysis if needed
predictions = predictions.cpu().numpy()

Epoch [10/100], Loss: 0.0006
Epoch [20/100], Loss: 0.0018
Epoch [30/100], Loss: 0.0014
Epoch [40/100], Loss: 0.0007
Epoch [50/100], Loss: 0.0003
Epoch [60/100], Loss: 0.0015
Epoch [70/100], Loss: 0.0007
Epoch [80/100], Loss: 0.0004
Epoch [90/100], Loss: 0.0003
Epoch [100/100], Loss: 0.0003
Test Loss: 0.0197


In [35]:
with torch.no_grad():
    actual = y_test_tensor.numpy()
    y_preds = model(X_test_tensor)
    #y_preds[:, 0] = torch.round(y_preds[:, 0])
    predicted = y_preds.detach().numpy()
    
    mse = mean_squared_error(actual, predicted)
    print(f"Mean Squared Error: {mse}")

    mae = mean_absolute_error(actual, predicted)
    print(f"Mean Absolute Error: {mae}")    

    r2 = r2_score(actual, predicted)
    print(f"R² Score: {r2}")

Mean Squared Error: 0.01973147690296173
Mean Absolute Error: 0.07780704647302628
R² Score: 0.9287698268890381
