# Code Snippet EDA

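This notebook contains the exploratory data analysis (EDA) of the image dataset: the class distribution, the image file formats, the image sizes, and a few example images per class.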



In [None]:
# Uncomment to install the third-party dependencies into the kernel's environment:
#%pip install seaborn
#%pip install filetype

In [None]:
from IPython.display import Image as DisplayImage, display
from PIL import Image


import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import filetype
import os
import random


# Use seaborn's "whitegrid" style for the figures in this notebook.
sns.set_style('whitegrid')

## Import Dataset

In [None]:
# Root folder of the dataset; each sub-directory is treated as one class.
# Set the HYDROPONICS_DATASET_DIR environment variable to point the notebook at
# another copy of the data. The original author's path is kept as the fallback.
dataset_dir = os.environ.get(
    'HYDROPONICS_DATASET_DIR',
    '/Users/lukasiwan/NeueFische/Repositories/Hydroponics/data/original_dataset',
)

# Class names are the names of the sub-directories. os.listdir returns entries in
# arbitrary order, so they are sorted to keep class indices, plots and legends
# stable between runs. Hidden directories (names starting with '.') are not classes.
class_names = sorted(
    name for name in os.listdir(dataset_dir)
    if not name.startswith('.') and os.path.isdir(os.path.join(dataset_dir, name))
)

## Class distribution

In [None]:
# Count the files in each class folder.
# os.scandir is used rather than glob because `glob` is not imported in this
# notebook (calling it raised NameError on a fresh kernel), while `os` is.
# Dot-files such as .DS_Store are skipped, which is also what a '*' glob
# pattern does.
class_counts = {}
for class_name in class_names:
    class_dir = os.path.join(dataset_dir, class_name)
    with os.scandir(class_dir) as entries:
        class_counts[class_name] = sum(
            1 for entry in entries
            if entry.is_file() and not entry.name.startswith('.')
        )

# One row per class, sorted in descending order of count.
class_data = (
    pd.DataFrame({'Class': list(class_counts.keys()), 'Count': list(class_counts.values())})
    .sort_values(by='Count', ascending=False)
)

# Horizontal bar chart: one bar per class, bar length = number of files counted.
sns.set_style("whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))
bar_color = (89/255, 145/255, 145/255)
sns.barplot(data=class_data, x='Count', y='Class', color=bar_color, ax=ax)
ax.set_xlabel('Count', fontsize=14)
ax.set_ylabel('Class', fontsize=14)
ax.set_title('Class Distribution', fontsize=16)

# Font size of the tick labels on the count axis
plt.setp(ax.get_xticklabels(), fontsize=10)

fig.tight_layout()
plt.show()

## Data format

In [None]:
# Tally of detected file types, keyed by the lower-cased extension reported by `filetype`
format_count = {}

for class_name in class_names:
    class_folder = os.path.join(dataset_dir, class_name)

    for image_file in os.listdir(class_folder):
        file_path = os.path.join(class_folder, image_file)

        # Only regular files are inspected; sub-directories are ignored
        if not os.path.isfile(file_path):
            continue

        # filetype.guess gives None for a file it cannot identify; such files are left out of the tally
        file_info = filetype.guess(file_path)
        if file_info is None:
            continue

        file_extension = file_info.extension.lower()
        format_count[file_extension] = format_count.get(file_extension, 0) + 1

# Parallel lists of the formats and their counts, in the order the formats were first seen
formats = list(format_count)
counts = list(format_count.values())


In [None]:
# Order the formats from most to least frequent, then line their counts up with them
sorted_formats = sorted(format_count, key=format_count.get, reverse=True)
sorted_counts = [format_count[fmt] for fmt in sorted_formats]


In [None]:
# Bar chart of the number of files found per image format
sns.set_style("whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))
bar_plot = sns.barplot(x=sorted_formats, y=sorted_counts, ax=ax)
ax.set_xlabel("Image Formats")
ax.set_ylabel("Count")
ax.set_title("Image Formats in the Dataset")

# Write each count centred just above the top of its bar
for position, value in enumerate(sorted_counts):
    ax.text(position, value, str(value), ha='center', va='bottom')

# Drop the top and right frame lines
sns.despine(top=True, right=True)

plt.show()

## Image size

In [None]:
# Collect the pixel dimensions of every image together with the index of its class.
widths = []
heights = []
colors = []         # index into class_names; used as the hue of the size scatter plot
skipped_files = []  # paths that Pillow could not open as images

for i, class_name in enumerate(class_names):
    class_folder = os.path.join(dataset_dir, class_name)

    for file_name in os.listdir(class_folder):
        file_path = os.path.join(class_folder, file_name)

        # Skip hidden files (e.g. .DS_Store) and anything that is not a regular file,
        # so a stray sub-directory does not crash Image.open.
        if file_name.startswith('.') or not os.path.isfile(file_path):
            continue

        # A file Pillow cannot open raises an OSError subclass; record it and
        # carry on instead of aborting the whole scan.
        try:
            with Image.open(file_path) as img:
                width, height = img.size
        except OSError:
            skipped_files.append(file_path)
            continue

        # Keep the three lists aligned: one entry per successfully opened image
        widths.append(width)
        heights.append(height)
        colors.append(i)

if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) that could not be opened as images.")

In [None]:
# Scatter plot of image width against height, coloured by class
fig, ax = plt.subplots(figsize=(10, 6))
scatterplot = sns.scatterplot(x=widths, y=heights, hue=colors, palette='colorblind', ax=ax)
ax.set_xlabel("Width")
ax.set_ylabel("Height")
ax.set_title("Image Sizes")

# Remove top and right spines
sns.despine(top=True, right=True)

# Replace the numeric hue labels with class names.
# The legend only has entries for class indices that actually occur in `colors`
# (seaborn lists numeric hue levels in ascending order), so the labels are built
# from those indices. Passing the full class_names list would shift every label
# after a class that contributed no images.
handles, labels = scatterplot.get_legend_handles_labels()
legend_labels = [class_names[class_index] for class_index in sorted(set(colors))]
scatterplot.legend(handles, legend_labels, title="Data Class", bbox_to_anchor=(1, 1), loc='upper left')

plt.show()

## Image examples

In [None]:
# Number of randomly chosen images to display from each class
num_examples_per_class = 2  # Update with the desired number

for class_name in class_names:
    class_folder = os.path.join(dataset_dir, class_name)

    # Candidates are regular, non-hidden files; hidden files such as .DS_Store
    # are not images and must not be sampled for display.
    image_files = [
        file for file in os.listdir(class_folder)
        if not file.startswith('.') and os.path.isfile(os.path.join(class_folder, file))
    ]

    # random.sample raises ValueError when asked for more items than the list
    # holds, so cap the request at the number of files the class actually has.
    sample_size = min(num_examples_per_class, len(image_files))
    random_images = random.sample(image_files, sample_size)

    print(f"Random Image Examples from {class_name} Class:")
    for image_file in random_images:
        image_path = os.path.join(class_folder, image_file)
        display(DisplayImage(filename=image_path))