In [66]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader,TensorDataset
import matplotlib.pyplot as plt
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import KFold, train_test_split
import pandas as pd 
import functions as func
import importlib
importlib.reload(func)



<module 'functions' from 'c:\\Applied AI\\Applied_AI\\Applied AI project\\functions.py'>

In [86]:
class ClassificationNN(nn.Module):
    def __init__(self,num_features,num_targets):
        super(ClassificationNN, self).__init__()

        self.num_featurs = num_features
        self.num_targets = num_targets
        
        self.fc1 = nn.Linear(num_features,128)
        self.fc2 = nn.Linear(128,128)

        if num_targets == 2:
            self.output = nn.Linear(128,1)
        else:
            self.output = nn.Linear(128,num_targets)

        self.dropout = nn.Dropout(0.2)

    def forward(self,x):
        x = F.relu(self.fc1(x))

        x = self.dropout(x)

        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.output(x)
            
        if self.num_targets == 2:
            x = torch.sigmoid(x)
        #No need if num_targets is not binary (cross entropy loss applied from the outside)
        
        return x 







In [87]:

def train_sv_classification_model(dataset,epochs,n_splits):

    data,target,num_targets,num_features = func.read_data(dataset)

    model = ClassificationNN(num_features, num_targets=num_targets)
    learning_rate=0.001

    model.train()
    running_loss=0

    X_train_val, X_test, y_train_val, y_test = train_test_split(data.drop(target,axis=1), data[target], test_size=0.33, random_state=42)

    skf = KFold(n_splits=n_splits)

    fold = 0
    fold_accuracies = []

    for train_index, val_index in skf.split(X_train_val,y_train_val):
        X_train,X_val = func.convert_df_to_tensor([X_train_val.iloc[train_index],X_train_val.iloc[val_index]])
        y_train,y_val = func.convert_df_to_tensor([y_train_val.iloc[train_index],y_train_val.iloc[val_index]])

        train_loader = func.convert_to_dataloader(X_train,y_train)
        val_loader = func.convert_to_dataloader(X_val,y_val)

        
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        
        if num_targets == 2:
            criterion = nn.BCELoss() 
        else:
            criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        for epoch in range(1,epochs+1):
            model.train()
            running_loss = 0.0
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                
                optimizer.zero_grad()
                outputs = model(batch_X)
                if num_targets == 2:
                    loss = criterion(outputs, batch_y.unsqueeze(1).float())
                else:
                    loss = criterion(outputs, batch_y.long())


                loss.backward()
                optimizer.step()
                
                running_loss += loss.item()
            
            avg_loss = running_loss / len(train_loader)
            
            if epoch % 10 == 0 or epoch == 1:
                print(f'Epoch [{epoch}/{epochs}], Loss: {avg_loss:.4f}')            
        
        #Test model

        model.eval()
        correct = 0
        total =0 

        with torch.no_grad():
            for batch_x,batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model(batch_X)

                if num_targets == 2:
                    
                    predicted = (outputs > 0.5).float()
                    correct += (predicted.squeeze(1) == batch_y.float()).sum().item()
                else:
                    
                    _, predicted = torch.max(outputs, dim=1)
                    correct += (predicted == batch_y.long()).sum().item()
                total += batch_y.size(0)
        
        accuracy = 100 * correct / total
        print(f'Fold {fold} Accuracy: {accuracy:.2f}% for dataset {dataset}')
        fold_accuracies.append(accuracy)
        fold += 1

    return model




        





In [88]:

def evaluate_sv_classification_model(dataset,model):

    data,target,num_targets,num_features = func.read_data(dataset)
    
    X_train_val, X_test, y_train_val, y_test = train_test_split(data.drop(target,axis=1), data[target], test_size=0.33, random_state=42)

    X_test = func.convert_df_to_tensor(X_test)
    y_test = func.convert_df_to_tensor(y_test)

    eval_loader = func.convert_to_dataloader(X_test,y_test)
        
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
                   
    #Test model
    model.eval()
    correct = 0
    total =0 

    with torch.no_grad():
        for batch_x,batch_y in eval_loader:
            batch_X, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_X)

            if num_targets == 2:
                
                predicted = (outputs > 0.5).float()
                correct += (predicted.squeeze(1) == batch_y.float()).sum().item()
            else:
                _, predicted = torch.max(outputs, dim=1)
                correct += (predicted == batch_y.long()).sum().item()
            total += batch_y.size(0)
    
    accuracy = 100 * correct / total
    print(f'Accuracy: {accuracy:.2f}% for dataset {dataset}')
    
    return accuracy




        





In [None]:
datasets = {'ecoli':[30,5],'dry_bean':[30,5],'seeds':[30,5],'cover_type':[10,2],'ozone':[30,5]}
model_dict = {}
for dataset_name,param_list in datasets.items:
    model_dict[dataset_name] = train_sv_classification_model(dataset_name,*param_list)
    

Epoch [1/2], Loss: 1.8557
Fold 0 Accuracy: 38.61% for dataset ecoli
Epoch [1/2], Loss: 0.6323
Fold 1 Accuracy: 62.10% for dataset ecoli
Epoch [1/2], Loss: 100.7625
Fold 0 Accuracy: 25.47% for dataset dry_bean
Epoch [1/2], Loss: 1.9452
Fold 1 Accuracy: 26.96% for dataset dry_bean
Epoch [1/2], Loss: 1.2243
Fold 0 Accuracy: 32.86% for dataset seeds
Epoch [1/2], Loss: 0.8748
Fold 1 Accuracy: 34.29% for dataset seeds
Epoch [1/2], Loss: 1.2732
Fold 0 Accuracy: 48.75% for dataset cover_type
Epoch [1/2], Loss: 1.2102
Fold 1 Accuracy: 48.76% for dataset cover_type
Epoch [1/2], Loss: 6.3347
Fold 0 Accuracy: 94.01% for dataset ozone
Epoch [1/2], Loss: 5.9871
Fold 1 Accuracy: 93.85% for dataset ozone


In [91]:
accuracy_dict = {}
for dataset,model in model_dict.items():
    accuracy_dict[dataset] = evaluate_sv_classification_model(dataset,model)
    

Accuracy: 86.10% for dataset ecoli
Accuracy: 25.73% for dataset dry_bean
Accuracy: 74.29% for dataset seeds
Accuracy: 48.76% for dataset cover_type
Accuracy: 91.31% for dataset ozone


Index(['name', 'landmass', 'zone', 'area', 'population', 'language',
       'religion', 'bars', 'stripes', 'colours', 'red', 'green', 'blue',
       'gold', 'white', 'black', 'orange', 'mainhue', 'circles', 'crosses',
       'saltires', 'quarters', 'sunstars', 'crescent', 'triangle', 'icon',
       'animate', 'text', 'topleft', 'botright', 'black', 'blue', 'gold',
       'green', 'orange', 'red', 'white', 'black', 'blue', 'brown', 'gold',
       'green', 'orange', 'red', 'white'],
      dtype='object')

In [None]:
data = pd.read_csv("data/seeds/seeds_dataset.csv")
data

Unnamed: 0,0,1,2,3,4,5,6,7
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3
