## Env

In [1]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [2]:
!unzip -qq "/content/drive/MyDrive/MAGISTERKA/datasets/fog-combined.zip" -d "/content/datasets/"

In [3]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.5.1.post0-py3-none-any.whl.metadata (39 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.7.2-py3-none-any.whl.metadata (21 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.5.1.post0-py3-none-any.whl.metadata (20 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata

## Setup

In [17]:
from pathlib import Path
from time import time

import torch
from torchvision.models import get_weight
from torchvision.transforms import v2

import numpy as np
import pandas as pd
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.base import ClassifierMixin
from typing import Any, TypeVar
from collections import namedtuple
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

In [18]:
_T = TypeVar('_T', bound=ClassifierMixin)

def evaluate_model(model: _T, X: Any, y: Any):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='macro')
    precision = precision_score(y, y_pred, average='macro')
    recall = recall_score(y, y_pred, average='macro')
    confusion = confusion_matrix(y, y_pred)
    return namedtuple('Evaluation', ['accuracy', 'f1', 'precision', 'recall', 'confusion'])(accuracy, f1, precision, recall, confusion)

def train_model(
        model_cls: _T,
        model_kwargs: dict[str, Any],
        train_df: pd.DataFrame,
        val_df: pd.DataFrame
    ) -> _T:
    model = model_cls(**model_kwargs)
    model.fit(np.stack(train_df['features'].values), train_df['class'])
    train_metrics = evaluate_model(model, np.stack(train_df['features'].values), train_df['class'])
    val_metrics = evaluate_model(model, np.stack(val_df['features'].values), val_df['class'])
    print(f"Model - {model_cls.__name__}")
    print("\tTrain metrics:")
    print(f"\t\tAccuracy: {train_metrics.accuracy:.4f}")
    print(f"\t\tF1: {train_metrics.f1:.4f}")
    print(f"\t\tPrecision: {train_metrics.precision:.4f}")
    print(f"\t\tRecall: {train_metrics.recall:.4f}")
    print("\tValidation metrics:")
    print(f"\t\tAccuracy: {val_metrics.accuracy:.4f}")
    print(f"\t\tF1: {val_metrics.f1:.4f}")
    print(f"\t\tPrecision: {val_metrics.precision:.4f}")
    print(f"\t\tRecall: {val_metrics.recall:.4f}")
    return model

In [6]:
base_transform = v2.Compose([
    v2.Resize((256, 256), v2.InterpolationMode.BILINEAR),
    v2.CenterCrop((224, 224)),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
])
transforms = {
    "grayscale": v2.Compose([
        base_transform,
        v2.Grayscale(num_output_channels=3)
    ]),
    "color": v2.Compose([
        base_transform,
    ]),
}

In [7]:
BASE_PATH = Path("/content")
DRIVE_PATH = Path("/content/drive/MyDrive/MAGISTERKA")

In [8]:
import sys
sys.path.insert(0, str(DRIVE_PATH / 'src'))
from cnn_model import CNNClassifier, get_dataloader

In [9]:
DATASET_PATHS = {
    'fog-detection': BASE_PATH / 'datasets/fog-detection-dataset-prepared',
    'fog-or-smog': BASE_PATH / 'datasets/fog-or-smog-detection-dataset-prepared',
    'foggy-cityscapes': BASE_PATH / 'datasets/foggy-cityscapes-image-dataset-prepared',
    'combined': BASE_PATH / 'datasets/fog-combined',
}

In [10]:
DATASET_NORMALIZATION = {
    'fog-detection': {'mean': [0.4850, 0.5044, 0.4878], 'std': [0.2631, 0.2524, 0.2793]},
    'fog-or-smog': {'mean': [0.5411, 0.5339, 0.5088], 'std': [0.2353, 0.2157, 0.2289]},
    'foggy-cityscapes': {'mean': [0.4169, 0.4507, 0.4173], 'std': [0.1860, 0.1835, 0.1820]},
    'combined': {'mean': [0.5017, 0.5087, 0.4826], 'std': [0.2259, 0.2118, 0.2227]}
}

In [11]:
def get_dataset_stats(dataset_path: Path):
  dataset = get_dataloader(dataset_path, transforms["color"])
  # Calculate mean color and std
  mean = torch.zeros(3)
  std = torch.zeros(3)

  for images, _ in dataset:
    mean += images.mean(dim=(0, 2, 3))
    std += images.std(dim=(0, 2, 3))

  mean /= len(dataset)
  std /= len(dataset)

  return mean, std


In [12]:
def dataloader_to_df(dataloader):
  X = []
  y = []
  for images, labels in dataloader:
    for image, label in zip(images, labels):
      X.append(image.numpy().reshape(-1))
      y.append(label.numpy().item())
  df = pd.DataFrame({'features': X, 'class': y})
  return df

In [13]:
train_df = dataloader_to_df(
    get_dataloader(DATASET_PATHS['combined'] / 'train', transforms["color"])
)
val_df = dataloader_to_df(
    get_dataloader(DATASET_PATHS['combined'] / 'val', transforms["color"])
)
test_df = dataloader_to_df(
    get_dataloader(DATASET_PATHS['combined'] / 'test', transforms["color"])
)

In [23]:
def _get_formatted_metric(values: list[float]):
  avg = sum(values) / len(values)
  std = (sum([(v - avg) ** 2 for v in values]) / len(values)) ** 0.5
  return f"{avg:.4f} ± {std:.4f}"

def run_model(
    model_cls: _T,
    model_kwargs: dict[str, Any],
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    test_df: pd.DataFrame,
    repeat: int = 5,
    transform: str = "color",
    normalize: bool = False
):


  # train_dataloaders=get_dataloader(
  #   path=DATASET_PATHS[dataset] / "train",
  #   transform=_transform,
  #   batch_size=32,
  #   shuffle=True,
  #   num_workers=2,
  # )
  # val_dataloaders=get_dataloader(
  #   path=DATASET_PATHS[dataset] / "val",
  #   transform=_transform,
  #   batch_size=32,
  #   shuffle=False,
  #   num_workers=2,
  # )

  # test_dataloader = {
  #     dataset_name: get_dataloader(path=path / "test", transform=_transform)
  #     for dataset_name, path in DATASET_PATHS.items()
  # }

  final_res = {
      'combined': {
          'accuracy': [],
          'f1': [],
          'precision': [],
          'recall': [],
      }
  }
  training_times = []
  for i in range(repeat):

    _start = time()
    model = train_model(
        model_cls=model_cls,
        model_kwargs=model_kwargs,
        train_df=train_df,
        val_df=val_df
    )
    _end = time()
    training_times.append(_end - _start)
    results = evaluate_model(
        model=model,
        X=np.stack(test_df['features'].values),
        y=test_df['class']
    )
    final_res['combined']['accuracy'].append(results.accuracy)
    final_res['combined']['f1'].append(results.f1)
    final_res['combined']['precision'].append(results.precision)
    final_res['combined']['recall'].append(results.recall)
  print(final_res)
  latex_table = [
    [
      dataset,
      # *[round(m, 4) for m in metrics.values()]
      # Add average value and std
      *[_get_formatted_metric(m) for m in metrics.values()]
    ] for dataset, metrics in final_res.items()
  ]
  latex_table_str = ""
  for line in latex_table:
    latex_table_str += " & ".join([str(l) for l in line]) + " \\\\\n"
  return latex_table_str


## Not Deep models

In [15]:
len(train_df)

2701

In [16]:
# stratified on class subsample
subsampled_train_df = pd.concat([train_df[train_df['class'] == 0].sample(200), train_df[train_df['class'] == 1].sample(200)])

In [24]:
run_model(
    model_cls=LinearSVC,
    model_kwargs={},
    train_df=subsampled_train_df,
    val_df=val_df,
    test_df=test_df,
    repeat=1
)



Model - LinearSVC
	Train metrics:
		Accuracy: 1.0000
		F1: 1.0000
		Precision: 1.0000
		Recall: 1.0000
	Validation metrics:
		Accuracy: 0.7532
		F1: 0.7395
		Precision: 0.7923
		Recall: 0.7438
{'combined': {'accuracy': [0.7174479166666666], 'f1': [0.6948491512296231], 'precision': [0.7700496710967915], 'recall': [0.7051871395930802]}}


'combined & 0.7174 ± 0.0000 & 0.6948 ± 0.0000 & 0.7700 ± 0.0000 & 0.7052 ± 0.0000 \\\\\n'

In [25]:
run_model(
    model_cls=RandomForestClassifier,
    model_kwargs={},
    train_df=subsampled_train_df,
    val_df=val_df,
    test_df=test_df,
    repeat=1
)

Model - RandomForestClassifier
	Train metrics:
		Accuracy: 1.0000
		F1: 1.0000
		Precision: 1.0000
		Recall: 1.0000
	Validation metrics:
		Accuracy: 0.8499
		F1: 0.8497
		Precision: 0.8496
		Recall: 0.8504
{'combined': {'accuracy': [0.85546875], 'f1': [0.8554390939167682], 'precision': [0.8562748091603054], 'recall': [0.857047655314982]}}


'combined & 0.8555 ± 0.0000 & 0.8554 ± 0.0000 & 0.8563 ± 0.0000 & 0.8570 ± 0.0000 \\\\\n'

In [26]:
run_model(
    model_cls=XGBClassifier,
    model_kwargs={},
    train_df=subsampled_train_df,
    val_df=val_df,
    test_df=test_df,
    repeat=1
)

Model - XGBClassifier
	Train metrics:
		Accuracy: 1.0000
		F1: 1.0000
		Precision: 1.0000
		Recall: 1.0000
	Validation metrics:
		Accuracy: 0.8677
		F1: 0.8672
		Precision: 0.8678
		Recall: 0.8669
{'combined': {'accuracy': [0.85546875], 'f1': [0.854970831865995], 'precision': [0.8552791985343695], 'recall': [0.8547356109237297]}}


'combined & 0.8555 ± 0.0000 & 0.8550 ± 0.0000 & 0.8553 ± 0.0000 & 0.8547 ± 0.0000 \\\\\n'