In [1]:
%load_ext autoreload
%autoreload 2

import pytorch_lightning as pl
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader, Dataset
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import os
import torch
import torch.nn.functional as F
import random
import numpy as np
import torch.nn as nn
from PIL import Image
import timm
from tqdm import tqdm
import shutil
from datetime import datetime
from pytz import timezone 
# import warnings
# warnings.filterwarnings("ignore")


from skew_correction.helper import *
from skew_correction.data import plot_random_images
from skew_correction.model import total_params, print_metrics_on_epoch_end, get_acc

root_dir = "/".join( os.getcwd().split("/")[:-1])
data_dir = os.path.join(root_dir, "data")
root_dir

device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


## prepare data (only run once, DO NOT RUN ANYMORE )

In [2]:
## split files from one folder into 2 tran-test

def split_files(src_dir, dest_dir, train_size):
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    
    all_files = os.listdir(src_dir)
    random.shuffle(all_files)
    
    total_files = len(all_files)
    train_count = int(total_files * train_size)
    
    train_files = all_files[:train_count]
    test_files = all_files[train_count:]
    
    train_dir = os.path.join(dest_dir, "train_")
    test_dir = os.path.join(dest_dir, "test_")
    
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)
    
    move_files(src_dir, train_dir, train_files)
    move_files(src_dir, test_dir, test_files)
    return train_files, test_files

def move_files(src_dir, dest_dir, files):
    for file in files:
        src_path = os.path.join(src_dir, file)
        dest_path = os.path.join(dest_dir, file)
        shutil.move(src_path, dest_path)


# Example usage
src_directory = os.path.join(data_dir, "original/train")
dest_directory = os.path.join(data_dir, "original/")
train_ratio = 0.8  # 80% for training, 20% for testing

## train_files, test_files = split_files(src_directory, dest_directory, train_ratio)


In [3]:
def prepare_data(src_dir, dest_dir, save_csv=True, multiple=2):  
    """
    this function takes input path of a dir whoch contains 0 degree images and rotates them to a random angle between 
    -180 and +180 and stores them in the output dir. also makes a train.csv containing file name and angles.
    """
    
    os.makedirs(dest_dir, exist_ok=True)

    save_dict = {
        'filepath': [],
        'angle': []
    }
    

    org_img_paths = get_images_in_dir(src_dir, return_path=True)
    print(f"There are {len(org_img_paths)} images in src_folder. Preparing rotated images. \
        \nmultiple={multiple}. hence there will be {multiple*len(org_img_paths)} images")
    
    for num in range(multiple):
        for img_path in tqdm(org_img_paths):
            img = read_raw_image(img_path, mode='L')
            
            # select random angle and rotate
            angles = np.arange(-180, 180)
            angle = random.choice(angles)
            img = img.rotate(angle, expand=True)
            
            # save rotated img in dest folder
            img_name, ext = img_path.split('/')[-1].split('.')
            save_filename = f"{img_name}_{angle}.{ext}"
            dest_path = os.path.join(dest_dir, save_filename)       
            img.save(dest_path)
            
            if os.path.exists(dest_path):
                save_dict["filepath"].append(dest_path)
                save_dict["angle"].append(angle)
    
    if save_csv==True:
        pd.DataFrame(save_dict).to_csv(os.path.join(dest_dir, "data.csv"), index=None)

    return 1

In [4]:
# for folder in ['train', 'test']:

#     src_dir = os.path.join(root_dir, f"data/original/{folder}/")
#     dest_dir = os.path.join(root_dir, f"data/rotated/{folder}/")
    
#     if folder=="train": 
#         multiple=2
#     else:
#         multiple=1
#     prepare_data(src_dir, dest_dir, save_csv=True, multiple=multiple)

## load dataloader

In [42]:
## define dataloader

from torch.utils.data import DataLoader, Dataset

train_transform=transforms.Compose([
    transforms.Resize((400, 400)),
    transforms.ToTensor(),
    transforms.GaussianBlur(3),
    # transforms.ColorJitter(0.5),
    # transforms.RandomAutocontrast(0.5),
    transforms.Normalize((0.5), (0.5))
    # transforms.RandomInvert(0.5),
    # transforms.RandomSolarize(0.,0.5)

])

test_transform = transforms.Compose([
    transforms.Resize((400,400)),
    transforms.ToTensor()
])

class SkewDataset(Dataset):
    def __init__(self, csv_path, split="test"):
        super().__init__()
        self.df = pd.read_csv(csv_path)
        self.filepaths = self.df["filepath"]
        self.labels = self.df["angle"]
        self.split = split

    def __len__(self):
        return len(self.filepaths)
    
    def __getitem__(self, idx):
        img = read_raw_image(self.filepaths[idx])
        label = self.labels[idx]

        if self.split=="train":
            img = train_transform(img)
        else:
            img = test_transform(img)

        return img, torch.tensor(label, dtype=torch.float)

In [43]:

train_dataset = SkewDataset("/home/deepam_minda_farmart_co/fmt/skew_correction/data/rotated/train/data.csv", split="train")
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False, num_workers=2)

test_dataset = SkewDataset("/home/deepam_minda_farmart_co/fmt/skew_correction/data/rotated/test/data.csv", split="test")
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=2)
# sample = dataset.__getitem__(5)
# sample[0].shape, sample[1]
# batch = next(iter(train_loader))

In [44]:
# plot_random_images(train_dataset)
# tensor2pil(train_dataset.__getitem__(1)[0])

In [45]:
plot_random_images(test_dataset)

Epoch 11:   8%|▊         | 2/25 [04:05<46:57, 122.50s/it, loss=1.04e+04, v_num=0]


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f56df37b8b0>
Traceback (most recent call last):
  File "/opt/conda/envs/skew/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
    self._shutdown_workers()
  File "/opt/conda/envs/skew/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1474, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/opt/conda/envs/skew/lib/python3.9/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/opt/conda/envs/skew/lib/python3.9/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/opt/conda/envs/skew/lib/python3.9/multiprocessing/connection.py", line 936, in wait
    ready = selector.select(timeout)
  File "/opt/conda/envs/skew/lib/python3.9/selectors.py", line 416, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 




## define MODEL

In [31]:
## define model class

class ConvNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3)
        self.batchnorm1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=256, kernel_size=3)
        self.batchnorm2 = nn.BatchNorm2d(256)
        self.pool2 = nn.MaxPool2d(kernel_size=2)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3)
        self.batchnorm3 = nn.BatchNorm2d(512)
        self.pool3 = nn.MaxPool2d(kernel_size=2)
        self.adaptive_pool = nn.AdaptiveMaxPool2d((2,2))
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(2048, 256)
        self.linear2 = nn.Linear(256, 32)
        self.linear3 = nn.Linear(32, 1)
        

    def forward(self, x):
        x = self.relu(self.batchnorm1(self.conv1(x)))
        x = self.pool1(x)
        x = self.relu(self.batchnorm2(self.conv2(x)))
        x = self.pool2(x)
        x = self.relu(self.batchnorm3(self.conv3(x)))
        x = self.pool3(x)
        x = self.adaptive_pool(x)
        x = self.flatten(x)
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        return x


In [33]:
## test the model
from torchsummary import summary

model = ConvNet()
model = model.to(device)
summary(model, (1, 224, 224))

dummy_input = torch.ones(3,1,224,224, device=device)
output = model(dummy_input)
# total_params(model)*4/(1024*1024)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 222, 222]             640
       BatchNorm2d-2         [-1, 64, 222, 222]             128
              ReLU-3         [-1, 64, 222, 222]               0
         MaxPool2d-4         [-1, 64, 111, 111]               0
            Conv2d-5        [-1, 256, 109, 109]         147,712
       BatchNorm2d-6        [-1, 256, 109, 109]             512
              ReLU-7        [-1, 256, 109, 109]               0
         MaxPool2d-8          [-1, 256, 54, 54]               0
            Conv2d-9          [-1, 512, 52, 52]       1,180,160
      BatchNorm2d-10          [-1, 512, 52, 52]           1,024
             ReLU-11          [-1, 512, 52, 52]               0
        MaxPool2d-12          [-1, 512, 26, 26]               0
AdaptiveMaxPool2d-13            [-1, 512, 2, 2]               0
          Flatten-14                 [-

## Train 

In [34]:

class ModelModule(pl.LightningModule):
    def __init__(self, model, loss_fn, lr):
        super().__init__()
        self.model = model
        self.loss_fn = loss_fn
        self.lr = lr

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        # print(f"input shape {x.size()}, output shape {y.size()}")
        y_hat = self.model(x)
        # print(f"output {y_hat.detach().cpu()}, label {y}")
        loss = self.loss_fn(y_hat, y)
        self.log('train_loss', loss)
        # acc = get_acc(y_hat, y)
        # self.log('train_acc', acc, on_epoch=True, on_step=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        loss = self.loss_fn(y_hat, y)
        self.log('val_loss', loss)
        # acc = get_acc(y_hat, y)
        # self.log('val_acc', acc, on_epoch=True, on_step=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        loss = self.loss_fn(y_hat, y)
        self.log('test_loss', loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
        # return {"optimizer": optimizer, "lr_scheduler": scheduler}
        return optimizer

    def on_validation_epoch_end(self):
        metrics = self.trainer.callback_metrics
        print(f'==========> Epoch {self.current_epoch}')
        print_metrics_on_epoch_end(metrics, ['train_loss', 'val_loss'])


In [35]:
# dummy forward pass

#  for batch in train_loader:
#     x, y = batch
#     x, y = x.to(device), y.to(device)
#     y_hat = model(x).reshape(-1)
#     break

In [39]:
# load pl_module
loss_fn = nn.MSELoss()
lr = 1e-4
model = ConvNet().to(device)
pl_model = ModelModule(model, loss_fn, lr)

In [40]:
# define trainer

verbose=False

epochs=100
model_string = "Convnet_custom"
current_date = datetime.now(timezone('Asia/Kolkata')).strftime('%Y-%m-%d')

tb_logger = TensorBoardLogger(
    save_dir=os.path.join(root_dir, 'logs'), 
    name=f"{current_date}-less_aug_&_normalized-{model_string}-{round(total_params(model)/1000000,2)}m_params-{len(train_loader)*train_loader.batch_size}samples-lr{lr}-bs{train_loader.batch_size}"
)

checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(root_dir, 'checkpoints'),
    save_top_k=1,
    monitor="val_loss",
    mode="min",
    filename="{epoch:02d}-{val_loss:.2f}-{val_acc:.2f}-{model_string}"
)
early_stop_callback = EarlyStopping(
    monitor="val_loss", min_delta=0.00, patience=5, verbose=True, mode="min"
)

trainer = pl.Trainer(
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',
        max_epochs=epochs, 
        logger=tb_logger, 
        log_every_n_steps=1, 
        limit_train_batches=1, 
        limit_val_batches=None,
        enable_checkpointing=False,
        # callbacks=[checkpoint_callback, early_stop_callback],
    )


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1)` was configured so 1 batch per epoch will be used.


In [41]:
trainer.fit(pl_model, train_loader, test_loader)

Missing logger folder: /home/deepam_minda_farmart_co/fmt/skew_correction/logs/2023-08-19-less_aug_&_normalized-Convnet_custom-1.86m_params-1536samples-lr0.0001-bs32
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type    | Params
------------------------------------
0 | model   | ConvNet | 1.9 M 
1 | loss_fn | MSELoss | 0     
------------------------------------
1.9 M     Trainable params
0         Non-trainable params
1.9 M     Total params
7.452     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

{'val_loss': 10817.09}
{
    "train_loss": 10396.99,
    "val_loss": 11131.89
}
{
    "train_loss": 10381.66,
    "val_loss": 11131.74
}
{
    "train_loss": 10371.86,
    "val_loss": 11131.59
}
{
    "train_loss": 10363.25,
    "val_loss": 11131.41
}
{
    "train_loss": 10360.29,
    "val_loss": 11131.21
}
{
    "train_loss": 10358.14,
    "val_loss": 11131.0
}
{
    "train_loss": 10359.82,
    "val_loss": 11130.81
}
{
    "train_loss": 10359.54,
    "val_loss": 11130.64
}
{
    "train_loss": 10361.53,
    "val_loss": 11130.48
}
{
    "train_loss": 10357.27,
    "val_loss": 11130.31
}
{
    "train_loss": 10356.91,
    "val_loss": 11130.11
}
{
    "train_loss": 10356.62,
    "val_loss": 11129.85
}
{
    "train_loss": 10356.67,
    "val_loss": 11129.53
}
{
    "train_loss": 10357.32,
    "val_loss": 11129.12
}
Epoch 14:   8%|▊         | 2/25 [01:49<20:54, 54.56s/it, loss=1.04e+04, v_num=0] 

# more things to try - 
- [ ] normalize image
- [ ] batchnorm
- [ ] more augmentations
- [ ] cleaning data
- ~~different lr (e-4, e-5 )~~

!!shuffle off for overfitting