# **Data Preprocessing Image Dataset**

This file takes the image dataset for lung cancer and creates several modified versions of the dataset to be used for model training and experimenting.

The modifications are chosen based on the analysis, the needs of the different models, and experimental considerations.

## Necessary Imports

In [3]:
# Python 
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn
import re
import sklearn

# Compare the version numerically: as plain strings "0.9" >= "0.20" is True,
# so a string comparison would let too-old releases pass the check.
# Only the leading "major.minor" digits are parsed, so suffixes such as
# "rc1" or ".dev0" do not break the conversion to int.
_sk_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sk_version >= (0, 20)

# Pandas
import pandas as pd

import random

# Common imports
import numpy as np
import os
import cv2  

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

## Loading in original dataset

In [4]:
ROOT_PATH = os.path.join("..", "datasets", "3_image", "untouched")

# Define subfolders and their corresponding labels
folders_with_labels = {
    "Benign cases": "benign",
    "Malignant cases": "malignant",
    "Normal cases": "normal",
    "Test cases": "test",
}

# Lower-case file extensions that are treated as images
IMAGE_EXTENSIONS = (".jpg", ".png")

# Parallel lists: image_data[i] is the pixel array whose label is labels[i]
image_data = []
labels = []

# Iterate through each folder and load images with labels
for folder, label in folders_with_labels.items():
    # "Test cases" is looked up under a parent directory of the same name;
    # every other folder is looked up under "Training cases".
    parent_folder = "Test cases" if folder == "Test cases" else "Training cases"
    folder_path = os.path.join(ROOT_PATH, parent_folder, folder)

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the resulting DataFrame reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive match so files such as "scan.JPG" are not dropped
        if not filename.lower().endswith(IMAGE_EXTENSIONS):
            continue

        # Load the image in grayscale (adjust if needed)
        img_path = os.path.join(folder_path, filename)
        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread signals an unreadable/corrupt file by returning None
        # instead of raising; skip it so no None ends up in the "image" column.
        if image is None:
            print("Warning: could not read image, skipping: " + img_path)
            continue

        # Store the image and its label
        image_data.append(image)
        labels.append(label)

# Convert to a DataFrame for easy manipulation
images_lung_cancer = pd.DataFrame({
    "image": image_data,
    "label": labels,
})

# Display the first few rows to verify
print(images_lung_cancer.head())

                                               image   label
0  [[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, ...  benign
1  [[37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, ...  benign
2  [[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, ...  benign
3  [[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, ...  benign
4  [[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, ...  benign


## Modifying dataset and saving as a file