In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no later cell visible in this notebook reads from /content/drive
# (the archives are fetched through PyDrive instead) - confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading the data

In [None]:
!pip install PyDrive



In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os
import tarfile

# Authenticate PyDrive with the credentials of the signed-in Colab user.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, which the first cell imported from google.colab;
# from here on `drive` is the PyDrive client, not the Colab mount helper.
drive = GoogleDrive(gauth)

# Folder ID from the shared Google Drive folder
folder_id = '1HROmgviy4jUUUaCdvvrQ8PcqtNg2jn3G'
file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList()

# Download every .tar.gz file in the folder and unpack it into /content/extracted.
for file in file_list:
    if file['title'].endswith('.tar.gz'):
        print(f'Downloading {file["title"]}')
        file.GetContentFile(file['title'])

        # The context manager closes the archive even when extraction raises,
        # so a failed extraction no longer leaks the open file handle.
        # NOTE(review): extractall trusts the member paths stored in the archive;
        # only run this against archives you control.
        with tarfile.open(file['title']) as tar:
            tar.extractall('/content/extracted')

        print(f'Extracted {file["title"]}')



Downloading part3.tar.gz
Extracted part3.tar.gz
Downloading part2.tar.gz
Extracted part2.tar.gz
Downloading part1.tar.gz
Extracted part1.tar.gz


# Data Test

In [None]:
import cv2
import matplotlib.pyplot as plt

file_path = '/content/extracted/part3/100_1_0_20170119212053665.jpg'
image = cv2.imread(file_path)

# cv2.imread does not raise on a missing or undecodable file; it returns None.
# Fail here with a clear message instead of a cryptic cvtColor error below.
if image is None:
    raise FileNotFoundError(f"Could not read image: {file_path}")

# OpenCV loads pixels as BGR, matplotlib expects RGB.
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Display the image
plt.imshow(image_rgb)
plt.axis('off')  # Hide axes
plt.show()

# Print the shape of the image
print(f"Shape: {image.shape}")


# Data Cleaning

In [None]:
import os

def delete_non_image_files(directories, allowed_extensions=('.jpg', '.jpeg', '.png', '.bmp', '.tiff')):
    """Delete every file whose name does not end in an allowed image extension.

    Args:
        directories: Iterable of directory paths to scan. Only the top level of
            each directory is inspected; there is no recursion.
        allowed_extensions: Lower-case filename suffixes to keep. The filename is
            lower-cased before the comparison. A single string or any iterable
            of strings is accepted.

    Side effects:
        Removes the non-matching files from disk and prints one
        "Deleted: <path>" line per removed file. Sub-directories are left
        in place.
    """
    # str.endswith only accepts a str or a tuple of str, so normalise lists/sets.
    if not isinstance(allowed_extensions, str):
        allowed_extensions = tuple(allowed_extensions)

    for directory in directories:
        for filename in os.listdir(directory):
            if filename.lower().endswith(allowed_extensions):
                continue

            file_path = os.path.join(directory, filename)

            # os.remove cannot delete a directory and would abort the whole
            # clean-up; skip real directories (a symlink can still be removed).
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                continue

            os.remove(file_path)
            print(f"Deleted: {file_path}")

# The three folders created by unpacking the downloaded archives.
directories = [
    '/content/extracted/part1',
    '/content/extracted/part2',
    '/content/extracted/part3',
]

# Remove everything that does not carry an image extension.
delete_non_image_files(directories=directories)

Deleted: /content/extracted/part3/44_1_4_20170116235150272.pg
Deleted: /content/extracted/part3/.DS_Store
Deleted: /content/extracted/part3/55_0_0_20170116232725357jpg


In [None]:
import os
import re

def delete_invalid_images(directories):
    """Delete files whose names are not `<digits>_<digits>_<digits>_<digits>.<ext>`.

    Args:
        directories: Iterable of directory paths to scan. Only the top level of
            each directory is inspected; there is no recursion.

    Side effects:
        Removes the non-matching files from disk and prints one
        "Deleted: <path>" line per removed file. Sub-directories are left
        in place.
    """
    # Regular expression pattern to match the desired format
    pattern = re.compile(r'^\d+_\d+_\d+_\d+\.\w+$')

    for directory in directories:
        for filename in os.listdir(directory):
            # fullmatch requires the whole name to match; `$` on its own would
            # also accept a name that ends in a newline character.
            if pattern.fullmatch(filename):
                continue

            file_path = os.path.join(directory, filename)

            # os.remove cannot delete a directory and would abort the whole
            # clean-up; skip real directories (a symlink can still be removed).
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                continue

            os.remove(file_path)
            print(f"Deleted: {file_path}")

# The three folders created by unpacking the downloaded archives.
directories = [
    '/content/extracted/part1',
    '/content/extracted/part2',
    '/content/extracted/part3',
]

# Remove files whose names do not follow the expected four-number pattern.
delete_invalid_images(directories=directories)

Deleted: /content/extracted/part1/61_1_20170109142408075.jpg
Deleted: /content/extracted/part1/61_3_20170109150557335.jpg
Deleted: /content/extracted/part2/39_1_20170116174525125.jpg
Deleted: /content/extracted/part2/53__0_20170116184028385.jpg
Deleted: /content/extracted/part3/24_0_1_20170116220224657 .jpg


In [None]:
import cv2
import os

def resize_images_in_directories(directories, target_size=(224, 224)):
    """Resize every supported image in the given directories, overwriting it in place.

    Args:
        directories: Iterable of directory paths to scan. Only the top level of
            each directory is inspected; there is no recursion.
        target_size: (width, height) handed to cv2.resize.

    Side effects:
        Rewrites each image file that is not already at the target size.
        Prints a message for files without a supported extension, for images
        that could not be read, and for images that could not be written back.
    """
    supported_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')
    target_width, target_height = target_size

    for directory in directories:
        for filename in os.listdir(directory):
            if not filename.lower().endswith(supported_extensions):
                print(f"Skipping non-image file: {filename}")
                continue

            file_path = os.path.join(directory, filename)

            # cv2.imread returns None when the file cannot be decoded.
            image = cv2.imread(file_path)
            if image is None:
                print(f"Warning: Failed to load {file_path}, skipping.")
                continue

            # image.shape starts with (height, width). Images that already have
            # the target size are left untouched so that re-running this cell
            # does not re-encode (and, for lossy formats, degrade) them again.
            height, width = image.shape[:2]
            if (width, height) == (target_width, target_height):
                continue

            resized_image = cv2.resize(image, (target_width, target_height))

            # cv2.imwrite reports failure through its boolean return value.
            if not cv2.imwrite(file_path, resized_image):
                print(f"Warning: Failed to write {file_path}.")

# The three folders created by unpacking the downloaded archives.
directories = [
    '/content/extracted/part1',
    '/content/extracted/part2',
    '/content/extracted/part3',
]

# Resize every image in place using the function's default target size.
resize_images_in_directories(directories=directories)

In [None]:
import os

def count_files_in_directories(directories):
    """Return a dict mapping each directory path to its number of entries.

    The count is the length of os.listdir(directory), so sub-directories are
    counted as entries too.
    """
    return {path: len(os.listdir(path)) for path in directories}

# The three folders created by unpacking the downloaded archives.
directories = [
    '/content/extracted/part1',
    '/content/extracted/part2',
    '/content/extracted/part3',
]

# Count the entries left in each folder after cleaning.
file_counts = count_files_in_directories(directories=directories)

# Report one line per folder.
for directory, count in file_counts.items():
    print(f"Number of files in {directory}: {count}")

Number of files in /content/extracted/part1: 10135
Number of files in /content/extracted/part2: 10717
Number of files in /content/extracted/part3: 3249


In [None]:
import os
import cv2
import numpy as np
import pandas as pd

# Specify the directories
directories = [
    '/content/extracted/part1',
    '/content/extracted/part2',
    '/content/extracted/part3'
]

# Parallel lists: position k in each list refers to the same image.
images = []
ages = []
genders = []
ethnicities = []

for directory in directories:
    print(f"Processing from: {directory}")

    # Sorted so that the sample order does not depend on the order in which
    # the filesystem happens to list the files.
    files = sorted(os.listdir(directory))

    for file in files:
        file_path = os.path.join(directory, file)

        # The first three '_'-separated fields of the name are read as
        # age, gender and ethnicity. Parse them before appending anything so a
        # malformed name cannot leave the four lists with different lengths.
        try:
            age, gender, ethnicity = (int(field) for field in file.split('_')[:3])
        except ValueError:
            print(f"Skipping file with unparsable name: {file_path}")
            continue

        # Read the image; cv2.imread returns None when it cannot decode the file.
        image = cv2.imread(file_path)
        if image is None:
            print(f"Failed to load image: {file_path}")
            continue

        # OpenCV loads BGR; store RGB. The array keeps its 3-D shape (it is not flattened).
        images.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        ages.append(age)
        genders.append(gender)
        ethnicities.append(ethnicity)


Processing from: /content/extracted/part1
Processing from: /content/extracted/part2
Processing from: /content/extracted/part3


In [None]:
# Sanity check after loading: the image list and the three label lists are
# filled together, so they must all have the same length.
print(f"Length of ages: {len(ages)}")
print(f"Length of genders: {len(genders)}")
print(f"Length of ethnicities: {len(ethnicities)}")
print(f"Length of images: {len(images)}")

# Fail loudly instead of relying on someone comparing the printed numbers by eye.
assert len(images) == len(ages) == len(genders) == len(ethnicities), \
    "image and label lists are out of sync"

Length of ages: 24101
Length of genders: 24101
Length of ethnicities: 24101
Length of images: 24101


# Preprocessing

In [None]:
# Normalize the images in place to save memory
#for i in range(len(images)):
    #images[i] = images[i] / 255.0

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed so the train/test partition is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    images, ages, test_size=0.2, shuffle=True, random_state=42
)

# `images` and `ages` are Python lists, so the split returns lists as well.
# Stack them into NumPy arrays, which is what Keras fit/predict expect.
# This requires every image to have the same shape (see the resize step above).
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train, dtype="float32")
y_test = np.asarray(y_test, dtype="float32")

# ResNet

In [None]:
from keras import layers
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import keras
from keras.layers import *
from keras.models import *
from keras import backend as K
from keras.applications import InceptionV3
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# ImageNet-pretrained ResNet50 without its top (classification) layers.
conv_base = keras.applications.ResNet50(weights="imagenet", include_top=False)

# Freeze the backbone so its pretrained weights are not updated during training.
conv_base.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


In [None]:
# Input shape is taken from the first loaded image: (height, width, channels).
inputs = keras.Input(shape=images[0].shape)

# conv_base is ResNet50, so apply ResNet50's own ImageNet preprocessing.
# The Inception-v3 variant used before rescales pixels differently from what
# the pretrained ResNet50 weights were trained on.
x = keras.applications.resnet50.preprocess_input(inputs)
x = conv_base(x)
x = layers.Flatten()(x)

# Hidden layers need a non-linearity; two stacked Dense layers without an
# activation collapse into a single linear map.
x = layers.Dense(512, activation="relu")(x)
x = layers.Dense(512, activation="relu")(x)
x = layers.Dropout(0.1)(x)

# Single-unit regression head predicting the age.
# NOTE(review): a ReLU on the only output unit can get stuck at 0 if its
# pre-activation turns negative; consider a linear output. Left unchanged here.
outputs = layers.Dense(1, activation="relu")(x)

# The variable is called `inception` although the backbone is ResNet50; the
# name is kept because the training and evaluation cells below refer to it.
inception = keras.Model(inputs=inputs, outputs=outputs)
inception.summary()


# Set custom learning rate for Adam optimizer
learning_rate = 0.001  # Change this value to your desired learning rate
adam_optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# Mean absolute error is both the training loss and the reported metric.
inception.compile(loss="mae",
                  optimizer=adam_optimizer,
                  metrics=["mae"])

In [None]:
# Keras needs array inputs, not Python lists of per-image arrays.
# np.asarray does not copy data that is already an ndarray of the requested dtype,
# so this is safe whether or not the split cell already converted the data.
inception_history = inception.fit(
    np.asarray(x_train),
    np.asarray(y_train, dtype="float32"),
    epochs=10,
    validation_data=(
        np.asarray(x_test),
        np.asarray(y_test, dtype="float32"),
    ),
)

In [None]:
# The model has a single regression output trained with MAE, so predict()
# yields one predicted age per image - not class probabilities. Taking argmax
# over that single column would always give 0.
y_pred_ages = inception.predict(np.asarray(x_test)).ravel()
y_true_ages = np.asarray(y_test, dtype="float32").ravel()

# Headline regression metric on the held-out split.
test_mae = float(np.mean(np.abs(y_pred_ages - y_true_ages)))
print(f"Test MAE: {test_mae:.2f}")

# A confusion matrix needs discrete classes, so bucket the ages into brackets.
# Adjust the edges and the matching labels together if other brackets are wanted.
age_bin_edges = [10, 20, 30, 40, 50, 60, 70]
class_names = ["<10", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70+"]

# np.digitize maps each age to the index of its bracket (0 .. len(age_bin_edges)).
y_pred = np.digitize(y_pred_ages, age_bin_edges)
y_true = np.digitize(y_true_ages, age_bin_edges)

# Passing `labels` keeps the matrix aligned with class_names even when a
# bracket does not occur in the test split.
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))

# Plot confusion matrix with labels
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()