### Model 1: Classical Model

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from dataloader import CustomDataloader
from utils import CustomDataset
import tqdm

features = ['age_st', 'haircuts_st', 'gender_gender_female', 'gender_gender_male',
            'has_tiktok_has_tiktok_no', 'has_tiktok_has_tiktok_yes',
            'remembers_disco_remembers_disco_no', 'remembers_disco_remembers_disco_yes',
            'uses_skincare_uses_skincare_no', 'uses_skincare_uses_skincare_yes']
target = ['age']

train_df = pd.read_csv('data/csv_training.csv')

x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(train_df[features], train_df[target], test_size=0.2, random_state=42)

# dataframes -> numpy arrays -> tensors
x_train_1, x_test_1 = x_train_1.to_numpy(), x_test_1.to_numpy()
y_train_1, y_test_1 = y_train_1.to_numpy(), y_test_1.to_numpy()

model_1 = LinearRegression()
model_1.fit(x_train_1, y_train_1)

model_1_predict = model_1.predict(x_test_1)

mse = mean_squared_error(y_test_1, model_1_predict)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 4.1694807625032735e-26


### Model 2: Neural Network Model

In [2]:
class CNNClassifier(nn.Module):
    def __init__(self, output_dim: int):
        super(CNNClassifier, self).__init__()
        assert output_dim > 0, "Output dimension must be a positive integer"
        self.conv1 = nn.Conv2d(
            in_channels = 1,
            out_channels = 16,
            kernel_size = (5, 5), 
            stride = (1, 1),
            padding = (0, 0)
        )
        self.maxpool1 = nn.MaxPool2d(
            kernel_size = (3,3),
            stride = (2,2),
            padding = (0,0)
        )
        self.conv2 = nn.Conv2d(
            in_channels = 16, 
            out_channels = 64, 
            kernel_size = (3, 3), 
            stride = (2, 2), 
            padding = (0, 0)
        )
        self.maxpool2 = nn.MaxPool2d(
            kernel_size = (5,5),
            stride = (2,2),
            padding = (0,0)
        )
        self.linear1 = nn.Linear(
            in_features=64,
            out_features=output_dim
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.maxpool1(x)
        x = self.relu(self.conv2(x))
        x = self.maxpool2(x)
        # reshape for linear layer
        # note that the output of maxpool 2 is (*,64,1,1) so we just need to take the first column and row. 
        # If the output size is not 1,1, we have to flatten x before going into linear using torch.flatten
        x = x[:,:,0,0] 
        x = self.linear1(x)     
        x = torch.sigmoid(x)  
        return x

In [7]:
# Assuming you have your data and labels (x and y) ready
# Replace this with your actual data
df = pd.read_csv('data/UTKFaceAugmented.csv')

# Split the data into train, validation, and test sets
x_train, x_temp, y_train, y_temp = train_test_split(df, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

# Define transformations for image preprocessing
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
])

batch_size = 16  # Set your desired batch size

In [8]:
train_dataloader = CustomDataloader(x = x_train, y = y_train, batch_size=16, randomize=True)
val_dataloader = CustomDataloader(x = x_val, y = y_val, batch_size=16, randomize=False)

model = CNNClassifier(32)

# instantiate your optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# log your losses
train_losses = []
val_losses = []

# define how many epochs to train on
epochs = 25

# define your loss function for multiclass classification task
# BCE does binary cross entropy automatically for each class
loss_fn = torch.nn.BCELoss(reduction='none')

for _ in tqdm.tqdm(range(epochs)):
    losses = []
    for _ in range(train_dataloader.num_batches_per_epoch):
        # training data forward pass
        optimizer.zero_grad()
        train_batch = train_dataloader.fetch_batch()
        yhat = model(train_batch['x_batch'])
        train_loss = torch.mean(loss_fn(yhat, train_batch['y_batch']))

        # training data backward pass
        train_loss.backward()
        optimizer.step()
        losses.append(train_loss.detach().numpy())

    # personally, I like to visualize the loss per every iteration, rather than every epoch. I find it more useful to diagnose issues
    train_losses.extend(losses)
    
    losses = []
    for _ in range(val_dataloader.num_batches_per_epoch):
        # validation data forward pass only
        val_batch = val_dataloader.fetch_batch()
        yhat = model(val_batch['x_batch'])
        val_loss = torch.mean(loss_fn(yhat, val_batch['y_batch']),axis=0)
        losses.append(val_loss.detach().numpy())
    # epoch-level logging for validation though usually makes the most sense
    val_losses.append(np.mean(losses))

  0%|          | 0/100 [00:00<?, ?it/s]


KeyError: tensor([ 8617,  6879, 13406,  ..., 10739,  3570, 16448])