# Load data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! tar -zxvf drive/MyDrive/dataset.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
dataset/test/4672.png
dataset/test/3221.png
dataset/test/2956.png
dataset/test/2552.png
dataset/test/4837.png
dataset/test/1306.png
dataset/test/835.png
dataset/test/1858.png
dataset/test/2669.png
dataset/test/4675.png
dataset/test/1366.png
dataset/test/3322.png
dataset/test/659.png
dataset/test/4589.png
dataset/test/818.png
dataset/test/1045.png
dataset/test/2880.png
dataset/test/4133.png
dataset/test/50.png
dataset/test/1839.png
dataset/test/5064.png
dataset/test/3288.png
dataset/test/2366.png
dataset/test/4763.png
dataset/test/2437.png
dataset/test/2510.png
dataset/test/296.png
dataset/test/558.png
dataset/test/1352.png
dataset/test/193.png
dataset/test/5154.png
dataset/test/138.png
dataset/test/543.png
dataset/test/5183.png
dataset/test/2252.png
dataset/test/1579.png
dataset/test/5107.png
dataset/test/1793.png
dataset/test/1095.png
dataset/test/5320.png
dataset/test/1962.png
dataset/test/5010.png
dataset/test/2631.png

In [3]:
import os
import math
from pathlib import Path
import random

trainPath = Path('dataset/train')
validationPath = Path('dataset/validate')
os.mkdir(validationPath)
for foldeName in [f for f in os.listdir(trainPath)]:
    folder = trainPath / foldeName
    files = [f for f in os.listdir(folder)  if os.path.isfile(folder / f)]
    nFiles = len(files)
    selectedFiles = math.ceil(nFiles * 0.20)
    validationFiles = random.sample(files, selectedFiles)
    os.mkdir(validationPath / foldeName)
    for validationFile in validationFiles:
        os.rename(trainPath / foldeName / validationFile, validationPath / foldeName / validationFile)
    print(f"{foldeName} done")

Edge-on without Bulge done
Edge-on with Bulge done
Disturbed done
Barred Spiral done
Unbarred Tight Spiral done
Round Smooth done
Unbarred Loose Spiral done
Cigar Shaped Smooth done
Merging done
In-between Round Smooth done


# Main


## Imports

In [11]:
# %% imports
from typing import Any, Dict, List, Tuple

import gc

import torch
from torch.functional import Tensor
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, RandomCrop, ToTensor, ColorJitter
from torchvision.datasets import ImageFolder

import matplotlib.pyplot as plt
from tqdm import tqdm
import concurrent.futures
print("Done")

Done


## Classifiction Metrics

In [5]:
class ClassificationMetrics:
    def __init__(self, num_classes=10):
        self.num_classes = num_classes
        self.C = torch.zeros(num_classes, num_classes).to('cuda')
    def add(self, yp, yt):
        with torch.no_grad():  # We require no computation graph
            self.C += (yt*self.C.shape[1]+yp).bincount(
                minlength=self.C.numel()).view(self.C.shape).float()
    def clear(self):
        self.C.zero_()
    def acc(self):
        return self.C.diag().sum().item()/self.C.sum()
    def mAcc(self):
        return (self.C.diag()/self.C.sum(-1)).mean().item()
    def mIoU(self):
        return (self.C.diag()/(self.C.sum(0)+self.C.sum(1)-self.C.diag())).mean().item()
    def confusion_matrix(self):
        return self.C
print("Done")

Done


## Load dataset

In [22]:
from torch.utils.data.sampler import WeightedRandomSampler
from torchvision.transforms import Compose, RandomCrop, ToTensor, RandomVerticalFlip, RandomHorizontalFlip, Normalize, ToPILImage, RandomRotation, Resize
from torchvision.datasets import ImageFolder

def load_dataset(path: str, trainset:bool=True) -> ImageFolder:
    transformations = []
    if trainset:
        transformations = [
            Resize(128),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ]
    else:
        transformations = [
            Resize(128),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ]
    transform = Compose(transformations) # remember to normalize the data
    dataset = ImageFolder(path, transform=transform)
    return dataset

def load_images(dataset: ImageFolder, batch_size=128, trainset:bool=True, sampleWeights:List[int] = []) -> DataLoader:
    sampler = WeightedRandomSampler(sampleWeights, num_samples=len(sampleWeights), replacement=True) if trainset else None
    return DataLoader(  dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=True,
                        drop_last=True,
                        sampler=sampler)

datasetTrain = load_dataset('./dataset/train')
datasetValidate = load_dataset('./dataset/validate')
print("Done")

Done


In [19]:
def getSampleWeights(dataset: ImageFolder) -> List[int]:
    classWeights = [1144, 187, 605, 1048, 796, 1135, 1037, 1480, 1472, 1024]
    sampleWeights = [0] * len(dataset)
    for idx, (_, label) in enumerate(tqdm(dataset)):
        cw = classWeights[label]
        sampleWeights[idx] = cw
    return sampleWeights
sampleWeights = getSampleWeights(datasetTrain)

100%|██████████| 9928/9928 [01:00<00:00, 164.79it/s]


In [24]:
loaderTrain = load_images(datasetTrain, len(datasetTrain), trainset=True, sampleWeights=sampleWeights)
loaderValidate = load_images(datasetValidate, 256, trainset=False)
print("Done")

Done


In [16]:
i, l = next(iter(loaderTrain))

In [29]:
i = None
l = None
gc.collect()

339

In [30]:
def parseImageLabels(imageLables: Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tensor]:
    return (imageLables[0].reshape(imageLables[0].size(0), -1), imageLables[1])
images, labels = parseImageLabels(next(iter(loaderTrain)))

## Functions

In [31]:
def train(models: List[Tuple[Any, str]], images: Tensor, labels: Tensor, max_workers: int = 8):
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            list(executor.map(lambda m: m[0].fit(images, labels), models))

def predictModel(model, images: Tensor, name: str, pLabels: Dict[str, Tensor]):
    predictedLabels = torch.from_numpy(model.predict(images))
    pLabels[name] = torch.cat((pLabels[name], predictedLabels), 0)

def predict(models: List[Tuple[Any, str]], max_workers: int = 4) -> Tuple[Tensor, Dict[str, Tensor]]:
    gtLabels = torch.tensor([]).int()
    pLabels: Dict[str, Tensor] = {}
    for _, name in models:
        pLabels[name] = torch.tensor([]).int()
    for i, l in tqdm(loaderValidate):
        (images, labels) = parseImageLabels((i, l))
        gtLabels = torch.cat((gtLabels, labels), 0)
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            list(
                executor.map(lambda m: predictModel(m[0], images, m[1], pLabels)
                , models)
            )
    return (gtLabels, pLabels)

def evaluate(gtLabels: torch.Tensor, pLabels: torch.Tensor, num_classes=10) -> Dict[str, float]:
    C=(gtLabels*num_classes+pLabels).bincount(minlength=num_classes**2).view(num_classes,num_classes).float()
    return {
        'Acc': C.diag().sum().item() / gtLabels.shape[0],
        'mAcc': (C.diag()/C.sum(-1)).mean().item(),
        'mIoU': (C.diag()/(C.sum(0)+C.sum(1)-C.diag())).mean().item()
    }
def evaluateAll(gtLabels: Tensor, pLabels: Dict[str, Tensor]) -> Dict[str, Dict[str, float]]:
    res: Dict[str, Dict[str, float]] = {}
    for k in pLabels:
        res[k] = evaluate(gtLabels, pLabels[k])
    return res

def printEvaluation(res: Dict[str, Dict[str, float]]):
    for k in res:
        print(f"\n{k:15} Acc: {res[k]['Acc']:10} mAcc: {res[k]['mAcc']:10} mIoU: {res[k]['mIoU']:10}")

print("Done")

Done


## Models

In [32]:
# %% Create models
from sklearn import tree, ensemble, neighbors, svm

models: List[Tuple[Any, str]] = []
# Decision Tree
models += [
    (tree.DecisionTreeClassifier(min_samples_leaf=nr, criterion=criterion), f'dTree {criterion}{nr}')
    for nr in [1, 5]
    for criterion in ["gini", "entropy"]
]
# Forest
models += [
    (ensemble.RandomForestClassifier(n_estimators=nr, criterion=criterion), f'Forest {criterion}{nr}')
    for nr in [1, 5, 10, 20, 50, 100]
    for criterion in ["gini", "entropy"]
]
# KNN
models += [
    (neighbors.KNeighborsClassifier(n_jobs=-1), 'KNN'),
]
# SVM
models += [
    (svm.SVC(kernel=kernel, C=C), f'SVM {kernel}{C}')
    for kernel in ['rbf', 'linear', 'poly'] 
    for C in [0.1, 1, 10]
]
print("Model created")

Model created


In [33]:
print(models[:2])

[(DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), 'dTree gini1'), (DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'), 'dTree entropy1')]


# Training

In [None]:
# %% Train
print("Training ...")
train(models[:2], images, labels)
print("Trained")

Training ...


In [None]:
# %% Predict
print("Predicting ...")
gtLabels, pLabels = predict(models)
res = evaluateAll(gtLabels, pLabels)
print("Predicted")



  0%|          | 0/9 [00:00<?, ?it/s][A[A

Predicting ...




 11%|█         | 1/9 [01:51<14:51, 111.49s/it][A[A

 22%|██▏       | 2/9 [03:36<12:47, 109.60s/it][A[A

 33%|███▎      | 3/9 [05:21<10:49, 108.24s/it][A[A

 44%|████▍     | 4/9 [07:07<08:57, 107.53s/it][A[A

 56%|█████▌    | 5/9 [08:54<07:08, 107.22s/it][A[A

 67%|██████▋   | 6/9 [10:38<05:19, 106.49s/it][A[A

 78%|███████▊  | 7/9 [12:23<03:32, 106.01s/it][A[A

 89%|████████▉ | 8/9 [14:08<01:45, 105.73s/it][A[A

100%|██████████| 9/9 [15:54<00:00, 106.05s/it]

Predicted





In [None]:
import pprint
pprint.pprint(res)

{'Forest entropy1': {'Acc': 0.14539930555555555,
                     'mAcc': 0.13320490717887878,
                     'mIoU': 0.06952429562807083},
 'Forest entropy10': {'Acc': 0.13758680555555555,
                      'mAcc': 0.11695041507482529,
                      'mIoU': 0.06311294436454773},
 'Forest entropy100': {'Acc': 0.14973958333333334,
                       'mAcc': 0.12832364439964294,
                       'mIoU': 0.06751248985528946},
 'Forest entropy20': {'Acc': 0.1427951388888889,
                      'mAcc': 0.12200425565242767,
                      'mIoU': 0.0651373341679573},
 'Forest entropy5': {'Acc': 0.13541666666666666,
                     'mAcc': 0.12256348133087158,
                     'mIoU': 0.0654793381690979},
 'Forest entropy50': {'Acc': 0.11805555555555555,
                      'mAcc': 0.10201016813516617,
                      'mIoU': 0.05431988090276718},
 'Forest gini1': {'Acc': 0.1384548611111111,
                  'mAcc': 0.120350494980812