# Imports

In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Making dataset from data

#### Define PyTorch dataset class

In [34]:
class ChestXRayDataset(Dataset):
    """Image-classification dataset driven by a CSV index of chest X-ray files.

    The CSV is expected to provide two columns:

    * ``jpg``  - image path relative to ``<root_dir>/files``; leading ``/``
      characters are stripped before the path is joined.
    * ``type`` - class name of the image.

    Rows whose ``jpg`` cell is missing or does not end in ``.jpg`` are skipped.

    Attributes:
        root_dir: Directory containing the ``files`` sub-directory.
        transform: Optional callable applied to each PIL image on access.
        label_map: Class name -> integer label, numbered in order of first
            appearance in the CSV (built from all rows, including skipped ones).
        image_paths: Full path of every retained image.
        labels: Integer label of every retained image, aligned with
            ``image_paths``.
    """

    def __init__(self, root_dir, csv_file, transform=None):
        """Read ``csv_file`` and index every usable ``.jpg`` row.

        Args:
            root_dir: Directory containing the ``files`` sub-directory.
            csv_file: Path of the CSV index with ``jpg`` and ``type`` columns.
            transform: Optional callable applied to each image in
                ``__getitem__``.
        """
        self.root_dir = root_dir
        self.transform = transform

        # Load the CSV file into a DataFrame
        df = pd.read_csv(csv_file)

        # Create a mapping of unique diseases to integer labels
        self.label_map = {disease: idx for idx, disease in enumerate(df['type'].unique())}

        # Parallel lists: image_paths[i] belongs to labels[i]
        self.image_paths = []
        self.labels = []

        for rel_path, disease in zip(df['jpg'], df['type']):
            # An empty CSV cell is read as float NaN, so check the type before
            # calling str methods on it.
            if not isinstance(rel_path, str) or not rel_path.endswith('.jpg'):
                continue

            # Strip leading '/' so os.path.join does not treat the entry as an
            # absolute path; entries without a leading '/' are kept intact.
            self.image_paths.append(os.path.join(root_dir, 'files', rel_path.lstrip('/')))
            self.labels.append(self.label_map[disease])

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Position of the sample in ``image_paths`` / ``labels``.

        Returns:
            ``(image, label)`` where ``image`` is an RGB PIL image, or the
            output of ``transform`` when one was given, and ``label`` is the
            integer class label.
        """
        img_path = self.image_paths[idx]
        label = self.labels[idx]

        # Image.open is lazy and keeps the file open; convert() forces the
        # pixel data to load and returns an independent copy, so the handle
        # can be closed right away. Converting to RGB also gives every sample
        # three channels regardless of how the JPEG was stored.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        return image, label

#### Define data transformation

In [27]:
# Preprocessing pipeline applied to each image before it is fed to a model.
data_transform = transforms.Compose(
    [
        # Fixed 224x224 input size, as used by models like ResNet.
        transforms.Resize(size=(224, 224)),
        # PIL image -> tensor.
        transforms.ToTensor(),
        # Per-channel standardisation with the ImageNet mean / std.
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


#### Instantiate dataset class

In [37]:
# Single definition of the data root so the image folder and the CSV index
# cannot drift apart. Adjust this one line when running on another machine.
DATA_DIR = 'C:/Users/Michal/Desktop/AI_ML/Chest_X_ray_Kaggle/data'

dataset = ChestXRayDataset(
    root_dir=DATA_DIR,
    csv_file=f'{DATA_DIR}/xray_chest.csv',
    # Without a transform the dataset returns PIL images, which the
    # DataLoader's default collate function cannot stack into a batch.
    transform=data_transform,
)

print(len(dataset))



97


#### Splitting dataset into train and test sets

In [38]:
# Fixed seed so the train/test membership is identical on every
# Restart & Run All; an unseeded random_split reshuffles on each run.
SPLIT_SEED = 42

# 80 % of the samples go to training, the remainder to testing.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

#### Create data loaders

In [44]:
# Number of samples per mini-batch, shared by both loaders.
batch_size = 8

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches keep a fixed order.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Visualizing data