In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import cv2 as cv
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Preprocessing settings for the image-loading step.
MAX_IMAGES_PER_AGE = 110  # keep at most this many images for each age label
IMAGE_SIZE = (150, 150)   # target size handed to cv2.resize
# NOTE(review): 25 is carried over unchanged from the original code. If the
# intent was to scale 8-bit pixels into [0, 1] this should probably be 255 --
# confirm before changing, since it alters every value in XData.
PIXEL_DIVISOR = 25


def read_and_preprocess(path):
    """Read one image with OpenCV, rescale its pixel values and resize it.

    Parameters
    ----------
    path : str
        Location of the image file.

    Returns
    -------
    numpy.ndarray or None
        The image divided by ``PIXEL_DIVISOR`` and resized to ``IMAGE_SIZE``,
        or ``None`` when ``cv2.imread`` could not read the file.
    """
    img = cv2.imread(path)
    if img is None:
        return None
    img = img / PIXEL_DIVISOR
    return cv2.resize(img, IMAGE_SIZE)


def load_age_images(root, max_per_age=MAX_IMAGES_PER_AGE,
                    read_image=read_and_preprocess, progress_every=100):
    """Walk ``root`` and collect up to ``max_per_age`` images per age label.

    The age label is the part of the file name before the first underscore.
    It is kept as a string, exactly as it appears in the file name.

    Unlike the previous try/except version, the first file seen for an age is
    loaded rather than skipped, and a file that cannot be read is ignored
    without resetting that age's counter or leaving a label without an image.

    Parameters
    ----------
    root : str
        Directory that is walked recursively with ``os.walk``.
    max_per_age : int
        Maximum number of images kept for any single age label.
    read_image : callable
        Called with the file path; returns the preprocessed image, or
        ``None`` to signal that the file should be skipped.
    progress_every : int
        Print the per-directory file index every this many files; a falsy
        value disables the progress output.

    Returns
    -------
    (list, list, dict)
        The images, their age labels (same order, same length), and a dict
        mapping each age label to the number of images kept for it.
    """
    images = []
    labels = []
    counts = {}

    for dirname, _, filenames in os.walk(root):
        for i, filename in enumerate(filenames):
            age = filename.split("_")[0]

            if counts.get(age, 0) < max_per_age:
                img = read_image(os.path.join(dirname, filename))
                # Only record the label once the image is known to be usable,
                # so images and labels can never drift out of step.
                if img is not None:
                    images.append(img)
                    labels.append(age)
                    counts[age] = counts.get(age, 0) + 1

            if progress_every and i % progress_every == 0:
                print(i, end=" ")

    return images, labels, counts


XData, yData, age_list = load_age_images('dataset')

In [None]:
# Exploratory data analysis of the collected age labels.

# Descriptive statistics: the labels in yData are strings, so cast them to int first.
age_values = list(map(int, yData))
age_series = pd.Series(age_values)
age_summary = age_series.describe()

# Visualization: histogram of the ages with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(age_values, kde=True, ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

# TODO: find more data to make all 300 (75+)

In [None]:
# Show the describe() statistics computed from the age labels.
age_summary

In [None]:
# Peek at the first few age values.
age_series.head()

In [None]:
# Number of age labels held in the series.
age_series.shape

In [None]:
# Print the pandas info() summary for the age series.
age_series.info()

In [None]:
# Count how many distinct ages are present.
age_series.nunique()

**Histogram Equalization**

In [None]:
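# Contrast Limited Adaptive Histogram Equalization (CLAHE)
# Optional: uncomment the loop below to apply a CLAHE contrast filter.
# Note that it reads each image under 'dataset - Copy' as grayscale and
# overwrites the file in place with the equalized result.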

# for dirname, _, filenames in os.walk('dataset - Copy'):
#    for filename in filenames:        
#        img = cv2.imread(os.path.join(dirname, filename), cv.IMREAD_GRAYSCALE)
#        assert img is not None, "file could not be read, check with os.path.exists()"
#        # create a CLAHE object (Arguments are optional).
#        clahe = cv.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
#        cl1 = clahe.apply(img)
#        cv.imwrite(os.path.join(dirname, filename), cl1)

In [None]:
# Function to calculate normalized histogram and CDF for a given age value
def calculate_histogram_and_cdf(age_value, dataset_dir='dataset', read_image=None):
    """Histogram and CDF of the pixel-wise mean image for one age label.

    Every file under ``dataset_dir`` whose name starts with ``<age_value>_``
    is read, the images are averaged pixel by pixel, and a 256-bin histogram
    over [0, 256) of that average image is computed.

    Parameters
    ----------
    age_value : str or int
        Age label to select; compared as a string against the part of each
        file name before the first underscore.
    dataset_dir : str, optional
        Directory walked recursively with ``os.walk``. Defaults to
        ``'dataset'``.
    read_image : callable, optional
        Called with a file path and expected to return a 2-D array-like, or
        ``None`` when the file cannot be read. Defaults to reading the file
        in grayscale with ``cv2.imread``.

    Returns
    -------
    bins : numpy.ndarray
        The 257 bin edges returned by ``np.histogram``.
    normalized_hist : numpy.ndarray
        Histogram counts divided by the largest count (peak equals 1).
    cdf_normalized : numpy.ndarray
        Cumulative sum of the counts divided by its final value.

    Raises
    ------
    ValueError
        If no readable image matches ``age_value``, or if the matching images
        do not all contain the same number of pixels.
    """
    if read_image is None:
        read_image = lambda path: cv2.imread(path, cv2.IMREAD_GRAYSCALE)

    target_age = str(age_value)
    pixel_sum = None  # sized from the first matching image, not hard-coded
    count = 0

    for dirname, _, filenames in os.walk(dataset_dir):
        for filename in filenames:
            if filename.split("_")[0] != target_age:
                continue

            img = read_image(os.path.join(dirname, filename))
            if img is None:
                # Unreadable file: skip it instead of failing with an obscure TypeError.
                continue

            pixels = np.asarray(img, dtype=np.float64).ravel()
            if pixel_sum is None:
                pixel_sum = np.zeros_like(pixels)
            elif pixels.size != pixel_sum.size:
                raise ValueError(
                    f"Image {filename!r} has {pixels.size} pixels, expected {pixel_sum.size}; "
                    "all images for one age must have the same size"
                )
            pixel_sum += pixels
            count += 1

    if count == 0:
        # Dividing by zero here would silently yield NaN-filled results.
        raise ValueError(f"No readable images found for age {target_age!r} in {dataset_dir!r}")

    average = pixel_sum / count

    # Calculate histogram of the 'average' array
    hist, bins = np.histogram(average, bins=256, range=(0, 256))

    # Normalize the histogram's y-axis values to the range [0, 1]
    normalized_hist = hist / np.max(hist)

    # Calculate cumulative distribution function (CDF)
    cdf = hist.cumsum()
    cdf_normalized = cdf / cdf.max()

    return bins, normalized_hist, cdf_normalized

# Histogram and CDF of the average image for the two ages being compared.
bins_age_1, normalized_hist_age_1, cdf_normalized_age_1 = calculate_histogram_and_cdf('1')
bins_age_100, normalized_hist_age_100, cdf_normalized_age_100 = calculate_histogram_and_cdf('100')

# What to draw: (values, colour, legend label) for the CDF lines and
# (bin edges, heights, colour, legend label) for the histogram bars.
cdf_curves = [
    (cdf_normalized_age_1, 'b', 'Age 1 CDF'),
    (cdf_normalized_age_100, 'r', 'Age 100 CDF'),
]
histograms = [
    (bins_age_1, normalized_hist_age_1, 'b', 'Age 1 Histogram'),
    (bins_age_100, normalized_hist_age_100, 'r', 'Age 100 Histogram'),
]

# Draw both CDFs first, then both histograms, all on the same axes.
fig, ax = plt.subplots(figsize=(10, 6))
for curve, colour, legend_label in cdf_curves:
    ax.plot(curve, color=colour, label=legend_label)
for edges, heights, colour, legend_label in histograms:
    bar_width = edges[1] - edges[0]
    ax.bar(edges[:-1], heights, width=bar_width, color=colour, alpha=0.5, label=legend_label)

ax.set_xlim([0, 256])
ax.set_ylim([0, 1])
ax.legend()
ax.set_xlabel('Pixel Intensity')
ax.set_ylabel('Normalized Frequency / CDF')
ax.set_title('Comparison of Normalized Histograms and CDFs for Age 1 and Age 100')
plt.show()