In [14]:
#Loading data, class definitions
import torch
from torch import nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import ToTensor, Normalize, Resize, Compose
import torchvision
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from pathlib import Path
from PIL import Image
import mlflow
import mlflow.pytorch
import random
import requests
import os

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

class MultimodalClassifier(nn.Module):
    def __init__(self, text_vocab_size, text_embedding_dim, text_seq_length, num_classes):
        super(MultimodalClassifier, self).__init__()

        # Image model (ResNet50)
        self.image_model = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)
        for param in self.image_model.parameters():
            param.requires_grad = False
        self.image_model.fc = nn.Linear(2048, 512)

        # Text model
        self.text_embedding = nn.Embedding(text_vocab_size, text_embedding_dim)
        self.text_conv = nn.Conv1d(in_channels=text_embedding_dim, out_channels=256, kernel_size=5, padding=2)
        self.text_lstm = nn.LSTM(input_size=256, hidden_size=256, batch_first=True, bidirectional=True)
        self.text_fc = nn.Linear(512, 512)

        # Combined classifier
        self.fc = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, image, text):
        # Image branch
        image_features = self.image_model(image)
    
        # Text branch
        text_embedded = self.text_embedding(text)
        text_embedded = text_embedded.permute(0, 2, 1)
    
        # Apply the convolution
        text_conv_out = self.text_conv(text_embedded)
        
        # Apply LSTM
        text_features, _ = self.text_lstm(text_conv_out)
        text_features = self.text_fc(text_features[:, -1, :])
    
        # Combine features
        combined_features = torch.cat((image_features, text_features), dim=1)
        return self.fc(combined_features)


# Data preparation for images
image_dataset_path = "./datasets/dataset_clean/dataset_clean.csv"
image_dataset_images_path = Path("./datasets/dataset_clean/images/")

image_dataset = pd.read_csv(image_dataset_path)

indices = list(range(len(image_dataset)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)

image_dataset['image_path'] = image_dataset['image_path'].apply(lambda x: image_dataset_images_path / Path(x).name)
image_categories = sorted(image_dataset['category'].unique())

image_train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.CenterCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.ToTensor()
])

image_test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.CenterCrop(224),
    transforms.ToTensor()
])

image_train_data = image_dataset.iloc[train_indices]
image_test_data = image_dataset.iloc[test_indices]

class ImageDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.data = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = row['image_path']
        label = row['category_encoded']

        try:
            image = Image.open(image_path).convert("RGB")
        except FileNotFoundError:
            print(f"Image not found: {image_path}")
            raise

        if self.transform:
            image = self.transform(image)

        return image, label

image_train_dataset = ImageDataset(image_train_data, transform=image_train_transform)
image_test_dataset = ImageDataset(image_test_data, transform=image_test_transform)

image_train_loader = DataLoader(image_train_dataset, batch_size=64, shuffle=False)
image_test_loader = DataLoader(image_test_dataset, batch_size=64, shuffle=False)

# Data preparation for text
text_dataset_path = './datasets/dataset_clean/dataset_clean.csv'
text_dataset = pd.read_csv(text_dataset_path)

VOCAB_SIZE = 10000
MAX_LENGTH = 256

text_tokenizer = {}
word_index = {}

def build_tokenizer_and_vocab(descriptions, vocab_size):
    from collections import Counter
    global text_tokenizer, word_index
    tokenizer = Counter()
    for desc in descriptions:
        tokenizer.update(desc.lower().split())
    most_common = tokenizer.most_common(vocab_size - 1)
    text_tokenizer = {word: i + 1 for i, (word, _) in enumerate(most_common)}
    word_index = {word: i + 1 for i, (word, _) in enumerate(most_common)}

def tokenize_text(description, max_length):
    global text_tokenizer
    tokens = [text_tokenizer.get(word, 0) for word in description.lower().split()]
    if len(tokens) < max_length:
        tokens += [0] * (max_length - len(tokens))
    return tokens[:max_length]

build_tokenizer_and_vocab(text_dataset['description'], VOCAB_SIZE)
text_dataset['tokenized'] = text_dataset['description'].apply(lambda x: tokenize_text(x, MAX_LENGTH))

text_X = np.array(text_dataset['tokenized'].tolist())
text_y = np.array(text_dataset['category_encoded'])

text_X_train = text_X[train_indices]
text_X_test = text_X[test_indices]
text_y_train = text_y[train_indices]
text_y_test = text_y[test_indices]

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.long)

text_train_dataset = TextDataset(text_X_train, text_y_train)
text_test_dataset = TextDataset(text_X_test, text_y_test)

text_train_loader = DataLoader(text_train_dataset, batch_size=64, shuffle=False)
text_test_loader = DataLoader(text_test_dataset, batch_size=64, shuffle=False)

# Combined dataset
class CombinedDataset(Dataset):
    def __init__(self, image_dataset, text_dataset, p_image=0.33, p_text=0.33, p_both=0.34):
        """
        Args:
            image_dataset: Dataset containing images.
            text_dataset: Dataset containing texts.
            p_image: Probability of returning only the image.
            p_text: Probability of returning only the text.
            p_both: Probability of returning both image and text.
        """
        assert abs(p_image + p_text + p_both - 1.0) < 1e-6, "Probabilities must sum to 1"
        self.image_dataset = image_dataset
        self.text_dataset = text_dataset
        self.p_image = p_image
        self.p_text = p_text
        self.p_both = p_both

    def __len__(self):
        return min(len(self.image_dataset), len(self.text_dataset))

    def __getitem__(self, idx):
        image, image_label = self.image_dataset[idx]
        text, text_label = self.text_dataset[idx]
        assert image_label == text_label, "Labels for image and text must match"

        mode = random.choices(['image', 'text', 'both'], weights=[self.p_image, self.p_text, self.p_both])[0]

        if mode == 'image':
            # Dummy text tensor
            text = torch.zeros(MAX_LENGTH).long()
        elif mode == 'text':
            # Dummy image tensor
            image = torch.zeros(3, 224, 224)
        # Return selected data
        return image, text, image_label

combined_train_dataset = CombinedDataset(image_train_dataset, text_train_dataset)
combined_test_dataset = CombinedDataset(image_test_dataset, text_test_dataset)
combined_test_dataset_both = CombinedDataset(image_test_dataset, text_test_dataset, p_image=0, p_text=0, p_both=1)
combined_test_dataset_image = CombinedDataset(image_test_dataset, text_test_dataset, p_image=1, p_text=0, p_both=0)
combined_test_dataset_text = CombinedDataset(image_test_dataset, text_test_dataset, p_image=0, p_text=1, p_both=0)

combined_train_loader = DataLoader(combined_train_dataset, batch_size=64, shuffle=True)
combined_test_loader = DataLoader(combined_test_dataset, batch_size=64, shuffle=True)
combined_test_loader_both = DataLoader(combined_test_dataset_both, batch_size=64, shuffle=True)
combined_test_loader_image = DataLoader(combined_test_dataset_image, batch_size=64, shuffle=True)
combined_test_loader_text = DataLoader(combined_test_dataset_text, batch_size=64, shuffle=True)

# Parameters
text_vocab_size = VOCAB_SIZE
text_embedding_dim = 128
text_seq_length = MAX_LENGTH
num_classes = len(image_categories)

print("Data loaded")

cuda
Data loaded


In [17]:
# Model definition, training and eval
# Model instantiation
model = MultimodalClassifier(text_vocab_size, text_embedding_dim, text_seq_length, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Evaluation mixed (sometimes image+text, sometimes only image or text - like in training data)
def evaluate_image_text(model, dataloader, device, classes, title):
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for images, texts, labels in dataloader:
            images, texts, labels = images.to(device), texts.to(device), labels.to(device)

            outputs = model(images, texts)
            _, preds = torch.max(outputs, 1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(title)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}\n\n")
    print(f"Classification report:\n\n{classification_report(all_labels, all_preds, target_names=classes)}\n\n")
    return accuracy, precision, recall, f1

# Evaluation image only
def evaluate_image(model, dataloader, device, classes):
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for images, texts, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            
            # Dummy tensor for text (since we're evaluating only with images)
            dummy_text_tensor = torch.zeros(images.size(0), MAX_LENGTH).to(device).long()
            
            # Pass only the image and dummy text tensor to the model
            outputs = model(images, dummy_text_tensor)
            _, preds = torch.max(outputs, 1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print("Evaluation (image only):")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}\n\n")
    print(f"Classification report:\n\n{classification_report(all_labels, all_preds, target_names=classes)}\n\n")
    return accuracy, precision, recall, f1

# Evaluation text only
def evaluate_text(model, dataloader, device, classes):
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for images, texts, labels in dataloader:
            texts, labels = texts.to(device), labels.to(device)

            # Dummy tensor for image (since we're evaluating only with text)
            dummy_image_tensor = torch.zeros(texts.size(0), 3, 224, 224).to(device)
            
            # Pass only the text and dummy image tensor to the model
            outputs = model(dummy_image_tensor, texts)
            _, preds = torch.max(outputs, 1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print("Evaluation (text only):")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}\n\n")
    print(f"Classification report:\n\n{classification_report(all_labels, all_preds, target_names=classes)}\n\n")
    return accuracy, precision, recall, f1

# Training loop
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for images, texts, labels in dataloader:
        images, texts, labels = images.to(device), texts.to(device), labels.to(device)

        # Forward pass
        outputs = model(images, texts)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

# mlflow auth
# for production should be moved to env variables
os.environ['MLFLOW_TRACKING_USERNAME'] = 'iis_user'
os.environ['MLFLOW_TRACKING_PASSWORD'] = 'iis_password'

mlflow.set_tracking_uri("https://bojan-radulovic.xyz/mlflow/")
with mlflow.start_run():
    # Log hyperparameters
    mlflow.log_param("text_vocab_size", text_vocab_size)
    mlflow.log_param("text_embedding_dim", text_embedding_dim)
    mlflow.log_param("text_seq_length", text_seq_length)
    mlflow.log_param("num_classes", num_classes)

    mlflow.log_param("loss_fun", "CrossEntropyLoss")
    mlflow.log_param("lr", 0.001)

    # Main execution loop
    num_epochs = 10
    mlflow.log_param("num_epochs", num_epochs)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_loss = train(model, combined_train_loader, criterion, optimizer, device)
        mlflow.log_metric("train_loss", train_loss, step=epoch+1)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {train_loss:.4f}")

        # Evaluation on validation/test data
        # There is no validation dataset so validation is also done on test dataset
        val_accuracy, val_precision, val_recall, val_f1 = evaluate_image_text(
            model, combined_test_loader, device, image_categories, "Evaluation (mixed):"
        )
        
        # Log metrics for this epoch
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)
        mlflow.log_metric("val_precision", val_precision, step=epoch)
        mlflow.log_metric("val_recall", val_recall, step=epoch)
        mlflow.log_metric("val_f1", val_f1, step=epoch)

    # Log final metrics after evaluation
    print("\nEvaluating on test data...")
    accuracy_image_and_text, precision_image_and_text, recall_image_and_text, f1_image_and_text = evaluate_image_text(model, combined_test_loader_both, device, image_categories, "Evaluation (image and text):")
    accuracy_mixed, precision_mixed, recall_mixed, f1_mixed = evaluate_image_text(model, combined_test_loader, device, image_categories, "Evaluation (mixed):")
    accuracy_image, precision_image, recall_image, f1_image = evaluate_image(model, combined_test_loader_image, device, image_categories)
    accuracy_text, precision_text, recall_text, f1_text = evaluate_text(model, combined_test_loader_text, device, image_categories)

    mlflow.log_metric("test_accuracy_image_and_text", accuracy_image_and_text)
    mlflow.log_metric("test_precision_image_and_text", precision_image_and_text)
    mlflow.log_metric("test_recall_image_and_text", recall_image_and_text)
    mlflow.log_metric("test_f1_image_and_text", f1_image_and_text)
    
    mlflow.log_metric("test_accuracy_mixed", accuracy_mixed)
    mlflow.log_metric("test_precision_mixed", precision_mixed)
    mlflow.log_metric("test_recall_mixed", recall_mixed)
    mlflow.log_metric("test_f1_mixed", f1_mixed)

    mlflow.log_metric("test_accuracy_image", accuracy_image)
    mlflow.log_metric("test_precision_image", precision_image)
    mlflow.log_metric("test_recall_image", recall_image)
    mlflow.log_metric("test_f1_image", f1_image)

    mlflow.log_metric("test_accuracy_text", accuracy_text)
    mlflow.log_metric("test_precision_text", precision_text)
    mlflow.log_metric("test_recall_text", recall_text)
    mlflow.log_metric("test_f1_text", f1_text)

    # Log the model
    mlflow.pytorch.log_model(model, "model")
print("Training and evaluation completed")

Starting training...
Epoch [1/10], Loss: 1.0691
Evaluation (mixed):
Accuracy: 0.6959
Precision: 0.7036758072591334
Recall: 0.6959
F1 score: 0.6961703189026446


Classification report:

                   precision    recall  f1-score   support

      amazon home       0.68      0.58      0.63      2017
       automotive       0.69      0.85      0.77      2020
             baby       0.79      0.71      0.75      1991
     pet supplies       0.82      0.74      0.78      1983
sports & outdoors       0.53      0.59      0.56      1989

         accuracy                           0.70     10000
        macro avg       0.70      0.70      0.70     10000
     weighted avg       0.70      0.70      0.70     10000



Epoch [2/10], Loss: 0.7496
Evaluation (mixed):
Accuracy: 0.7506
Precision: 0.7523422727360207
Recall: 0.7506
F1 score: 0.747924734622753


Classification report:

                   precision    recall  f1-score   support

      amazon home       0.76      0.68      0.72      20



Evaluation (text only):
Accuracy: 0.8879
Precision: 0.8893982029171351
Recall: 0.8879
F1 score: 0.8884644965747488


Classification report:

                   precision    recall  f1-score   support

      amazon home       0.84      0.85      0.85      2017
       automotive       0.91      0.91      0.91      2020
             baby       0.92      0.92      0.92      1991
     pet supplies       0.97      0.92      0.95      1983
sports & outdoors       0.81      0.84      0.82      1989

         accuracy                           0.89     10000
        macro avg       0.89      0.89      0.89     10000
     weighted avg       0.89      0.89      0.89     10000







Training and evaluation completed


In [18]:
#model saving
save_path = './multimodal_classifier_10.pth'
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")

Model saved to ./multimodal_classifier_10.pth


In [19]:
# Inference
# Preprocessing function for the image
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.CenterCrop(224),
    transforms.ToTensor()
])

# Inference function
def predict(image_path=None, text=None, model=None, device=None, categories=None):
    image_tensor = None
    text_tensor = None
    
    # If the image is provided, load and transform it
    if image_path:
        try:
            image = Image.open(image_path).convert("RGB")
        except FileNotFoundError:
            print(f"Image not found: {image_path}")
            return None
        image_tensor = image_transform(image).unsqueeze(0).to(device)  # Add batch dimension and move to device
    
    # If the text is provided, tokenize it
    if text:
        text_tokens = tokenize_text(text, MAX_LENGTH)
        text_tensor = torch.tensor(text_tokens).unsqueeze(0).to(device).long()  # Ensure it's Long for text input
    
    # Forward pass through the model
    with torch.no_grad():
        if image_tensor is not None and text_tensor is not None:
            output = model(image_tensor, text_tensor)
        elif image_tensor is not None:  # Only image input
            output = model(image_tensor, torch.zeros(1, MAX_LENGTH).to(device).long())  # Dummy tensor for text (Long type)
        elif text_tensor is not None:  # Only text input
            output = model(torch.zeros(1, 3, 224, 224).to(device).float(), text_tensor)  # Dummy tensor for image (Float type)
        else:
            print("Both image and text are missing.")
            return None

        _, predicted_class = torch.max(output, 1)
    
    # Get the class name from the index
    predicted_class_name = categories[predicted_class.item()]
    
    return predicted_class_name

# Example usage:
image_path = "datasets/dataset_clean/images/product_299.jpg"
text_input = "Pet dog collar for your little dog."

# Predicting with both image and text
predicted_class_name = predict(image_path=image_path, text=text_input, model=model, device=device, categories=image_categories)
print(f"Predicted class (image + text): {predicted_class_name}")

# Predicting with only the image
predicted_class_name = predict(image_path=image_path, text=None, model=model, device=device, categories=image_categories)
print(f"Predicted class (image only): {predicted_class_name}")

# Predicting with only the text
predicted_class_name = predict(image_path=None, text=text_input, model=model, device=device, categories=image_categories)
print(f"Predicted class (text only): {predicted_class_name}")


Predicted class (image + text): pet supplies
Predicted class (image only): pet supplies
Predicted class (text only): pet supplies
