<a href="https://colab.research.google.com/github/timoniko/ML_Pattern_Classification/blob/main/BirdTrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from google.colab import drive
from sklearn.model_selection import KFold
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
import torch.nn.functional as F
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
# Mount Google Drive into the Colab filesystem; the CSV read later in the
# notebook lives under this mount point.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Location of the feature table on the mounted Drive; the last column is the
# string label column 'target' that the next cell encodes.
BIRD_CSV_PATH = '/content/drive/MyDrive/bird.csv'
dataset = pd.read_csv(BIRD_CSV_PATH)

In [None]:
# Integer class id for every string label found in the 'target' column.
targets = {'other': 0,
            'comcuc': 1,
            'cowpig1': 2,
            'eucdov': 3,
            'eueowl1': 4,
            'grswoo': 5,
            'tawowl1': 6}

encoded_target = dataset['target'].map(targets)

# Series.map yields NaN for any value that is not a key of the dict. That
# happens for unexpected labels and also when this cell is re-run on a column
# that is already integer-encoded, so fail loudly instead of silently
# propagating NaN class ids into resampling and training.
unmapped_labels = dataset.loc[encoded_target.isna(), 'target'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "'target' contains values that are not keys of `targets` "
        f"(was this cell already run?): {list(unmapped_labels)}"
    )

dataset['target'] = encoded_target

In [None]:
# Convert the labelled frame to a plain array and keep an independent copy of
# the raw rows (not resampled, not scaled) for the per-class spot checks at
# the end of the notebook.
dataset = np.array(dataset)

dataset_t = dataset.copy()


# Last column is the encoded class id, everything before it is a feature.
X = dataset[:, 0:-1]
y = dataset[:, -1]

# Seed for the stochastic resamplers so that a re-run reproduces the same data.
SEED = 42
# Target number of rows for each of the classes 1..6.
SAMPLES_PER_BIRD = 15000

# Class 0 keeps its original size; every other class is set to SAMPLES_PER_BIRD.
# Both samplers use the same targets, so they are built from one definition.
class_counts = {0: len(np.where(y == 0)[0])}
class_counts.update({label: SAMPLES_PER_BIRD for label in range(1, 7)})

sampling_strategy_over = dict(class_counts)
sampling_strategy_under = dict(class_counts)

over = SMOTE(sampling_strategy=sampling_strategy_over, random_state=SEED)
under = RandomUnderSampler(sampling_strategy=sampling_strategy_under, random_state=SEED)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X, y = pipeline.fit_resample(X, y)

# Keep the fitted scaler: raw rows (e.g. from `dataset_t`) must go through
# `scaler.transform` before they are fed to a network trained on `X`.
# NOTE(review): resampling and scaling are fitted on the full data before the
# K-fold split, so held-out folds are not fully independent of the training
# folds - the cross-validation scores are likely optimistic.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Number of rows labelled as class 0 after resampling.
int(np.count_nonzero(y == 0))


59750

In [None]:
class BirdDataset(torch.utils.data.Dataset):
    """In-memory dataset pairing feature rows with their targets.

    Both inputs are copied into ``float32`` tensors at construction time;
    indexing returns a ``(features, target)`` tuple.
    """

    def __init__(self, X, y):
        # Targets are stored as float32 as well; consumers cast as needed.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

In [None]:
# Run on the GPU when the runtime exposes one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
class BirdNet(nn.Module):
    """Fully connected classifier: input projection, hidden blocks, output head.

    Args:
        input_size: Number of input features.
        hidden_size: Width of the projection and of every hidden block.
        num_classes: Number of output logits.
        num_layers: Number of hidden blocks.
        dropout_prob: Dropout probability used inside every hidden block.
    """

    def __init__(self, input_size, hidden_size, num_classes, num_layers, dropout_prob):
        super().__init__()
        # Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout.
        # The blocks are created and registered before fc1 / fc_final, which
        # fixes both the state_dict key order and the parameter init order.
        hidden_blocks = [
            nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_prob),
            )
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(hidden_blocks)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc_final = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class logits for the batch ``x``."""
        hidden = self.fc1(x)
        for block in self.layers:
            hidden = block(hidden)
        return self.fc_final(hidden)

In [None]:
# Model and loss configuration shared by the cells below.
n_classes = len(np.unique(y))

# Number of hidden blocks in BirdNet.
n_layers = 2

# Reference network: the hidden width is twice the number of input features.
# Uses `n_layers` rather than a second hard-coded literal so the two cannot
# drift apart.
bird_net = BirdNet(X.shape[1], X.shape[1] * 2, n_classes, n_layers, 0.3).to(device)

loss_fn = nn.CrossEntropyLoss()

In [None]:
def train_network(k_folds, num_epochs, n_layers, input_f):
    """Train and evaluate a fresh ``BirdNet`` on every fold of a shuffled K-fold split.

    Reads the module-level names ``dataset`` (an indexable dataset yielding
    ``(features, target)`` pairs), ``n_classes`` and ``device``. For each fold
    a new network is trained with Adam (lr=0.01, batch size 64), its weights
    are written to ``./model-fold-{fold}_rs.pth`` and accuracy / weighted F1
    are measured on the held-out indices.

    Args:
        k_folds: Number of folds passed to ``KFold``.
        num_epochs: Number of training epochs per fold.
        n_layers: Number of hidden blocks given to ``BirdNet``.
        input_f: Number of input features; the hidden width is ``input_f * 2``.

    Returns:
        Tuple ``(acc, f1)``: mean accuracy and mean weighted F1 over all
        folds, both expressed in percent.
    """
    kfold = KFold(n_splits=k_folds, shuffle=True)
    f1_scores = []
    accuracies = []

    for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
        print(f'FOLD {fold}')
        print('--------------------------------')

        # Both loaders index the same dataset; the samplers restrict each one
        # to the indices of its side of the fold.
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
        test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

        trainloader = torch.utils.data.DataLoader(
            dataset, batch_size=64, sampler=train_subsampler)
        testloader = torch.utils.data.DataLoader(
            dataset, batch_size=64, sampler=test_subsampler)

        # A new network and optimizer per fold so no weights leak across folds.
        network = BirdNet(input_f, input_f * 2, n_classes, n_layers, 0.3).to(device)
        optimizer = torch.optim.Adam(network.parameters(), lr=0.01)
        loss_fn = torch.nn.CrossEntropyLoss()

        network.train()
        for epoch in range(0, num_epochs):
            print(f'Starting epoch {epoch+1}')
            current_loss = 0.0

            for i, data in enumerate(trainloader, 0):
                inputs = data[0].to(device, dtype=torch.float32)
                # CrossEntropyLoss expects integer class indices.
                labels = data[1].to(device, dtype=torch.long)

                optimizer.zero_grad()
                outputs = network(inputs)
                loss = loss_fn(outputs, labels)
                loss.backward()
                optimizer.step()

                # Report the running mean loss every 50 mini-batches.
                current_loss += loss.item()
                if i % 50 == 49:
                    print('Loss after mini-batch %5d: %.3f' % (i + 1, current_loss / 50))
                    current_loss = 0.0

        print('Training process has finished. Saving trained model.')
        save_path = f'./model-fold-{fold}_rs.pth'
        torch.save(network.state_dict(), save_path)

        # Evaluation for this fold.
        # Switch to inference mode first: otherwise Dropout keeps zeroing
        # activations and BatchNorm normalises with the statistics of each
        # test batch, which distorts the reported metrics.
        network.eval()
        correct, total = 0, 0
        y_true, y_pred = [], []

        with torch.no_grad():
            for data in testloader:
                inputs = data[0].to(device, dtype=torch.float32)
                labels = data[1].to(device, dtype=torch.long)
                outputs = network(inputs)
                _, predicted = torch.max(outputs, 1)

                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                y_true.extend(labels.cpu().numpy().tolist())
                y_pred.extend(predicted.cpu().numpy().tolist())

        accuracy = 100.0 * correct / total
        f1 = f1_score(y_true, y_pred, average='weighted') * 100

        print(f'Accuracy for fold {fold}: {accuracy} %')
        print(f'F1 score for fold {fold}: {f1}')
        print('--------------------------------')

        accuracies.append(accuracy)
        f1_scores.append(f1)

    print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
    print('--------------------------------')

    for fold_idx, fold_accuracy in enumerate(accuracies):
        print(f'Fold {fold_idx}: {fold_accuracy} %')

    acc = sum(accuracies) / len(accuracies)
    f1 = sum(f1_scores) / len(f1_scores)

    print(f'Average Accuracy: {acc} %')
    print(f'Average F1 score: {f1} %')

    return acc, f1


In [None]:
# Wrap the feature matrix and labels so they can be served by a DataLoader.
bird_data = BirdDataset(X, y)

# 70/30 random partition that is immediately re-joined.
# NOTE(review): the concatenation contains every sample of `bird_data`, so the
# split only permutes the order before K-fold; confirm whether a real hold-out
# set was intended.
split_fractions = [0.7, 0.3]
trainset, testset = random_split(bird_data, split_fractions)
dataset = ConcatDataset([trainset, testset])

n_features = X.shape[1]
acc, f1 = train_network(k_folds=16, num_epochs=50, n_layers=2, input_f=n_features)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Loss after mini-batch   400: 0.094
Loss after mini-batch   450: 0.098
Loss after mini-batch   500: 0.106
Loss after mini-batch   550: 0.085
Loss after mini-batch   600: 0.089
Loss after mini-batch   650: 0.114
Loss after mini-batch   700: 0.134
Loss after mini-batch   750: 0.099
Loss after mini-batch   800: 0.120
Loss after mini-batch   850: 0.108
Loss after mini-batch   900: 0.094
Loss after mini-batch   950: 0.119
Loss after mini-batch  1000: 0.094
Loss after mini-batch  1050: 0.102
Loss after mini-batch  1100: 0.112
Loss after mini-batch  1150: 0.096
Loss after mini-batch  1200: 0.108
Loss after mini-batch  1250: 0.105
Loss after mini-batch  1300: 0.091
Loss after mini-batch  1350: 0.097
Loss after mini-batch  1400: 0.116
Loss after mini-batch  1450: 0.117
Loss after mini-batch  1500: 0.111
Loss after mini-batch  1550: 0.103
Loss after mini-batch  1600: 0.077
Loss after mini-batch  1650: 0.095
Loss after mini-batch  17

In [None]:
# Rebuild one independent network per fold checkpoint.
models = []
for i in range(16):
    # map_location lets checkpoints written on a GPU runtime load on CPU too.
    state_dict = torch.load(f"model-fold-{i}_rs.pth", map_location=device)
    # A fresh instance per fold: reusing `bird_net` would make every list entry
    # the very same object, i.e. 16 aliases that all hold the last checkpoint.
    # The constructor arguments mirror those used for `bird_net`.
    model = BirdNet(X.shape[1], X.shape[1] * 2, n_classes, n_layers, 0.3).to(device)
    model.load_state_dict(state_dict)
    # Inference mode: disables Dropout and makes BatchNorm use running stats.
    model.eval()
    models.append(model)

class BirdNetEnsemble(nn.Module):
    """Hard-voting ensemble over several classifiers that output class logits.

    Args:
        models: Non-empty sequence of ``nn.Module`` instances. Each one must
            map a batch ``x`` to logits of shape ``(batch_size, num_classes)``.

    Raises:
        ValueError: If ``models`` is empty.
    """

    def __init__(self, models):
        super(BirdNetEnsemble, self).__init__()
        models = list(models)
        if not models:
            raise ValueError('BirdNetEnsemble needs at least one model')
        # ModuleList registers the members as submodules, so state_dict(),
        # .to(), .train() and .eval() reach them; a plain Python list would
        # leave the ensemble with an empty state_dict.
        self.models = nn.ModuleList(models)

    def forward(self, x):
        """Return the majority-voted class index for every row of ``x``.

        Each member votes with the argmax of its own logits; the most frequent
        vote per sample wins (ties are resolved by ``torch.mode``).

        Returns:
            Integer tensor of shape ``(batch_size,)``.
        """
        # Shape: (num_models, batch_size) - one class vote per model and sample.
        votes = torch.stack([model(x).argmax(dim=1) for model in self.models])
        # The mode is taken over discrete votes; taking it over raw logits
        # would compare unrelated float values instead of counting votes.
        pred_class = torch.mode(votes, dim=0).values
        return pred_class

# Combine the per-fold networks into one ensemble and write the ensemble's
# state_dict to disk.
bird_torch_ensembled = BirdNetEnsemble(models)
ensemble_weights = bird_torch_ensembled.state_dict()
torch.save(ensemble_weights, 'BirdTorchEnsembled')

In [None]:
def test_on_samples(bird, sample_idx):
    """Predict the class of one raw row of a given class with the ensemble.

    Reads the module-level names ``dataset_t``, ``bird_torch_ensembled`` and
    ``device``.

    Args:
        bird: Class id used to filter the rows of ``dataset_t`` (last column).
        sample_idx: Position of the row within the filtered rows.

    Returns:
        The ensemble output for the selected row, fed as a batch of size one.

    Raises:
        IndexError: If the class has no row at position ``sample_idx``.
    """
    dataset_i = dataset_t[dataset_t[:, -1] == bird]
    # NOTE(review): `dataset_t` holds the raw rows, while the networks were
    # trained on features passed through StandardScaler - the same scaling
    # should presumably be applied here; confirm.
    sample = torch.tensor(dataset_i[sample_idx][0:-1], dtype=torch.float32)
    sample = sample.unsqueeze(0)

    bird_torch_ensembled.eval()
    # Put every member in inference mode explicitly: when the ensemble keeps
    # its members in a plain list, eval() on the wrapper does not reach them,
    # and BatchNorm1d in training mode rejects a single-sample batch.
    for member in bird_torch_ensembled.models:
        member.eval()

    with torch.no_grad():
        pred = bird_torch_ensembled(sample.to(device))
    return pred



# Print the ensemble prediction for the first 30 raw rows of each class id,
# with a separator line after every class.
for bird in range(7):
    for sample_idx in range(30):
        print(test_on_samples(bird, sample_idx))
    print('----------------------')

