<p style="text-align:center;font-weight: 900; font-size:40px;"> Multimodal Sentiment Analysis Higher Accuracy </p>

**More robust vgg19 and xlm-roberta******

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os


# Load and preprocess data
def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


def augment_images(images):
    """Apply data augmentation"""
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    return datagen.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        for layer in vgg19.layers[:-8]:  # Fine-tune deeper layers
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=1e-4)  # Reduce learning rate
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using XLM-Roberta tokenizer"""
        encodings = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=3,
            min_lr=1e-6
        )
    ]

    # Data augmentation for images
    train_images_augmented = augment_images(train_images)

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]

    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import os
import math


# Load and preprocess data
def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


def augment_images(images):
    """Apply stronger data augmentation"""
    datagen = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.3,
        zoom_range=0.3,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[0.8, 1.2]
    )
    return datagen.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        for layer in vgg19.layers[:-8]:  # Fine-tune deeper layers
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.4)(x)
        image_features = BatchNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = BatchNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(combined)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=1e-4)  # Reduce learning rate
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using XLM-Roberta tokenizer"""
        encodings = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Cosine Annealing Learning Rate Scheduler
def cosine_decay(epoch):
    initial_lr = 1e-4
    return initial_lr * (1 + math.cos(epoch * math.pi / 20)) / 2


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Compute class weights for imbalanced datasets
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights = dict(enumerate(class_weights))

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=2,  # Stop earlier
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(cosine_decay)
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]

    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


This vgg19 and bert model

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=0.001)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=30):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=10):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=30):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Less layer

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)
        
        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False
            
        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)
        
        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)
        
        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


Edited

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm


def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, class_weights, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )

    return model, history


def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs,
        class_weights_dict
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Evaluate model
    print(classification_report(test_df['Label_Sentiment'].map(label_map), predicted_labels))
    print(confusion_matrix(test_df['Label_Sentiment'].map(label_map), predicted_labels))

    # Save predictions
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")


if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import math
import tensorflow_addons as tfa

# Load and preprocess data
def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


def augment_images(images):
    """Apply stronger data augmentation"""
    datagen = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.3,
        zoom_range=0.3,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[0.8, 1.2]
    )
    return datagen.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        for layer in vgg19.layers[:-8]:  # Fine-tune deeper layers
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.4)(x)
        image_features = BatchNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = BatchNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(combined)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using XLM-Roberta tokenizer"""
        encodings = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Cosine Annealing Learning Rate Scheduler
def lr_schedule(epoch):
    initial_lr = 1e-4
    if epoch < 5:  # Warm-up phase
        return initial_lr * (epoch + 1) / 5
    else:
        return initial_lr * tf.math.exp(-0.1 * (epoch - 5))


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Compute class weights for imbalanced datasets
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights = dict(enumerate(class_weights))

    # Focal loss for better handling of class imbalance
    loss = tfa.losses.SigmoidFocalCrossEntropy()

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss=loss,
        metrics=['accuracy']
    )

    # Callbacks
    lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        lr_callback
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=32,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]

    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


Final

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import math

# Load and preprocess data
def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


def augment_images(images):
    """Apply stronger data augmentation"""
    datagen = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.3,
        zoom_range=0.3,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[0.8, 1.2]
    )
    return datagen.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        for layer in vgg19.layers[:-8]:  # Fine-tune deeper layers
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.4)(x)
        image_features = BatchNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = BatchNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(combined)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using XLM-Roberta tokenizer"""
        encodings = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Cosine Annealing Learning Rate Scheduler
def lr_schedule(epoch):
    initial_lr = 1e-4
    if epoch < 5:  # Warm-up phase
        return initial_lr * (epoch + 1) / 5
    else:
        return initial_lr * tf.math.exp(-0.1 * (epoch - 5))


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Convert labels to one-hot encoding
    train_labels_onehot = tf.keras.utils.to_categorical(train_labels, num_classes=3)
    val_labels_onehot = tf.keras.utils.to_categorical(val_labels, num_classes=3)

    # Compute class weights for imbalanced datasets
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights = dict(enumerate(class_weights))

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Callbacks
    lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        lr_callback
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels_onehot,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels_onehot
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=32,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]

    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


In [None]:
pip install tensorflow numpy pandas transformers vit-keras tqdm scikit-learn pillow


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from vit_keras import vit
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.imagenet_utils.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (Vision Transformer)
        vit_model = vit.vit_b16(
            image_size=224,
            pretrained=True,
            include_top=False,
            pretrained_top=False
        )
        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vit_model(image_input)  # Output: (None, 768)
        x = Dense(256, activation='relu')(x)  # Add Dense layer instead of pooling
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


In [None]:
pip install tensorflow numpy pandas transformers vit-keras tqdm scikit-learn pillow


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from vit_keras import vit
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.imagenet_utils.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (Vision Transformer)
        vit_model = vit.vit_b16(
            image_size=224,
            pretrained=True,
            include_top=False,
            pretrained_top=False
        )
        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vit_model(image_input)  # Output: (None, 768)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)  # Add Dense layer with L2 regularization
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(combined)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.3)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    class_weight_dict = dict(enumerate(class_weights))

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',  # Monitor validation loss
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-5 if epoch > 10 else 2e-5)
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,  # Decreased batch size for better generalization
        class_weight=class_weight_dict,  # Add class weights
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()



In [None]:
pip install tensorflow pandas numpy tqdm scikit-learn transformers vit-keras


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from vit_keras import vit
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.imagenet_utils.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))

def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)

# Image augmentation
def augment_images(images):
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    return datagen.flow(images, batch_size=len(images), shuffle=False).next()

class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        
    def build_model(self):
        # Image branch (Vision Transformer)
        vit_model = vit.vit_b16(
            image_size=224,
            pretrained=True,
            include_top=False,
            pretrained_top=False
        )
        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vit_model(image_input)  # Output: (None, 768)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)  # Add Dense layer with L2 regularization
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(combined)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.3)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()
    
    # Augment training images
    train_images = augment_images(train_images)

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    class_weight_dict = dict(enumerate(class_weights))

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-5 if epoch > 10 else 2e-5)
    ]
    
    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        class_weight=class_weight_dict,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    
    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()


# VGG19 + BERT with Regularizaition and 0.4 dropout 
Added regularization and high dropout rate to avoid overfitting.
Batch size: 32
Layer: 256

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm


def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load and preprocess a single image"""
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return np.zeros(target_size + (3,))


def process_images(image_paths, target_size=(224, 224)):
    """Process all images with progress bar"""
    images = []
    for path in tqdm(image_paths, desc="Processing images"):
        img = preprocess_image(path, target_size)
        images.append(img)
    return np.array(images)


class MultimodalSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Fine-tune only the top layers
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])[0]
        pooled_output = tf.reduce_mean(bert_outputs, axis=1)
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.bert_tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, class_weights, epochs=20):
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )

    return model, history


def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs,
        class_weights_dict
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Evaluate model
    print(classification_report(test_df['Label_Sentiment'].map(label_map), predicted_labels))
    print(confusion_matrix(test_df['Label_Sentiment'].map(label_map), predicted_labels))

    # Save predictions
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")


if __name__ == "__main__":
    main()

# VisualBERT multimodal model default settings

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import TFVisualBertModel, VisualBertConfig, BertTokenizer
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import torch
from torchvision.models import resnet50
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

def preprocess_image(image_path, transform):
    """Load and preprocess a single image using PyTorch transforms"""
    try:
        img = Image.open(image_path).convert('RGB')
        img = transform(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return torch.zeros((3, 224, 224))

def process_images(image_paths):
    """Process all images and extract features using ResNet"""
    transform = Compose([
        Resize((224, 224)),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    # Load ResNet model
    resnet = resnet50(pretrained=True)
    resnet = torch.nn.Sequential(*(list(resnet.children())[:-1]))  # Remove classification layer
    resnet.eval()
    
    features = []
    with torch.no_grad():
        for path in tqdm(image_paths, desc="Processing images"):
            img = preprocess_image(path, transform)
            img = img.unsqueeze(0)  # Add batch dimension
            feat = resnet(img)
            features.append(feat.squeeze().numpy())
    
    return np.array(features)

class VisualBertSentimentModel:
    def __init__(self, num_classes=3, max_length=128):
        self.num_classes = num_classes
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.config = VisualBertConfig.from_pretrained('uclanlp/visualbert-vqa')
        self.visual_bert = TFVisualBertModel.from_pretrained('uclanlp/visualbert-vqa', config=self.config)
        
    def build_model(self):
        # Inputs
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        visual_features = Input(shape=(2048,), dtype=tf.float32, name='visual_features')
        visual_attention_mask = Input(shape=(1,), dtype=tf.int32, name='visual_attention_mask')
        
        # Expand visual features dimensions to match VisualBERT requirements
        visual_embeds = tf.expand_dims(visual_features, axis=1)  # Replace unsqueeze with expand_dims
        
        # Pass through VisualBERT
        outputs = self.visual_bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            visual_embeddings=visual_embeds,
            visual_attention_mask=visual_attention_mask
        )
        
        # Get pooled output
        pooled_output = outputs[1]
        
        # Classification layers
        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, visual_features, visual_attention_mask],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer"""
        encodings = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=30):
    # Create model instance
    model_handler = VisualBertSentimentModel()
    model = model_handler.build_model()
    
    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
    
    # Prepare visual attention masks
    train_visual_attention_mask = tf.ones((train_images.shape[0], 1), dtype=tf.int32)
    val_visual_attention_mask = tf.ones((val_images.shape[0], 1), dtype=tf.int32)
    
    # Convert images to float32
    train_images = tf.cast(train_images, tf.float32)
    val_images = tf.cast(val_images, tf.float32)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        {
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask,
            'visual_features': train_images,
            'visual_attention_mask': train_visual_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask,
                'visual_features': val_images,
                'visual_attention_mask': val_visual_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths and extract features
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    train_features = process_images(train_image_paths)
    test_features = process_images(test_image_paths)
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_feats, val_feats, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_features, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_feats, train_texts, train_labs,
        val_feats, val_texts, val_labs
    )
    
    # Prepare test data
    model_handler = VisualBertSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    test_visual_attention_mask = tf.ones((test_features.shape[0], 1), dtype=tf.int32)
    test_features = tf.cast(test_features, tf.float32)
    
    # Make predictions
    predictions = model.predict({
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask,
        'visual_features': test_features,
        'visual_attention_mask': test_visual_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()

# CLIP model with early stopping

Stopped at: epoch 14
Epoch: 30\
Accuracy: 0.7384\
Val. accuracy: 0.7295\
Layers: 512->256\
Batch size: 32

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

class CLIPSentimentModel:
    def __init__(self, num_classes=3):
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Process a batch of images and texts through CLIP"""
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding=True,
            truncation=True
        )
        return inputs
    
    def build_model(self):
        # Define inputs
        input_ids = Input(shape=(77,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(77,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head
        x = Dense(512, activation='relu')(combined_features)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Process data in batches to avoid memory issues"""
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=30):
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=32,
        callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()

# CLIP without early stopping
Epoch: 30\
Accuracy: 0.7929\
Val. accuracy: 0.7371\
Layers: 512->256\
Batch size: 32

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

class CLIPSentimentModel:
    def __init__(self, num_classes=3):
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Process a batch of images and texts through CLIP"""
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding=True,
            truncation=True
        )
        return inputs
    
    def build_model(self):
        # Define inputs
        input_ids = Input(shape=(77,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(77,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head
        x = Dense(512, activation='relu')(combined_features)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Process data in batches to avoid memory issues"""
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=30):
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=32,
        # callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()

# CLIP without early stopping v-2.0
Epoch: 20\
Accuracy: 0.7535\
Val. accuracy: 0.7295\
Layers: 256->128\
Batch size: 16

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

class CLIPSentimentModel:
    def __init__(self, num_classes=3):
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Process a batch of images and texts through CLIP"""
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding=True,
            truncation=True
        )
        return inputs
    
    def build_model(self):
        # Define inputs
        input_ids = Input(shape=(77,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(77,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head
        x = Dense(256, activation='relu')(combined_features)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Process data in batches to avoid memory issues"""
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=16,
        # callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()

# CLIP modified v-3
## Increased dropout, decreased batch size
Epoch: 20\
Accuracy: 0.7576\
Val. accuracy: 0.7371\
Layers: 512->256\
Batch size: 16\
Dropout: 0.4

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

class CLIPSentimentModel:
    def __init__(self, num_classes=3):
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Process a batch of images and texts through CLIP"""
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding=True,
            truncation=True
        )
        return inputs
    
    def build_model(self):
        # Define inputs
        input_ids = Input(shape=(77,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(77,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head
        x = Dense(512, activation='relu')(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Process data in batches to avoid memory issues"""
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=16,
        # callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()

# CLIP modified v-4
## Added kernel regularizer
Epoch: 20\
Accuracy: 0.7613\
Val. accuracy: 0.7390\
Layers: 512->256\
Batch size: 16\
Dropout: 0.4

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

class CLIPSentimentModel:
    def __init__(self, num_classes=3):
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Process a batch of images and texts through CLIP"""
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding=True,
            truncation=True
        )
        return inputs
    
    def build_model(self):
        # Define inputs
        input_ids = Input(shape=(77,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(77,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head
        x = Dense(512, activation='relu', kernel_regularizer=l2(0.01))(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Process data in batches to avoid memory issues"""
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=20):
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=16,
        # callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()

# CLIP modified v-5
## Increase batch size and decay
Epoch: 30\
Accuracy: 0.7613\
Val. accuracy: 0.7390\
Layers: 512->256\
Batch size: 32\
Dropout: 0.4\
Regularization decay: 0.2

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

class CLIPSentimentModel:
    def __init__(self, num_classes=3):
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Process a batch of images and texts through CLIP"""
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding=True,
            truncation=True
        )
        return inputs
    
    def build_model(self):
        # Define inputs
        input_ids = Input(shape=(77,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(77,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head
        x = Dense(512, activation='relu', kernel_regularizer=l2(0.02))(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=l2(0.02))(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Process data in batches to avoid memory issues"""
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=50):
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=32,
        # callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()

# CLIP modified v-6
## Same as v-5 but increased dropout in two layers instead of just one
Epoch: 50\
Accuracy: 0.7818\
Val. accuracy: 0.7486\
Layers: 512->256\
Batch size: 32\
Dropout: 0.4\
Regularization decay: 0.2

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

class CLIPSentimentModel:
    def __init__(self, num_classes=3):
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Process a batch of images and texts through CLIP"""
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding=True,
            truncation=True
        )
        return inputs
    
    def build_model(self):
        # Define inputs
        input_ids = Input(shape=(77,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(77,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head
        x = Dense(512, activation='relu', kernel_regularizer=l2(0.02))(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=l2(0.02))(x)
        x = Dropout(0.4)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Process data in batches to avoid memory issues"""
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=50):
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=32,
        # callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()

# CLIP modified v-7
## Same as v-6 but used 30 epochs
Epoch: 30\
Accuracy: 0.7613\
Val. accuracy: 0.7390\
Layers: 512->256\
Batch size: 32\
Dropout: 0.4\
Regularization decay: 0.2

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load training and test data"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Get full paths for images"""
    image_paths = {img: os.path.join(directory, img) for img in image_names}
    return [image_paths[img] for img in image_names if img in image_paths]

class CLIPSentimentModel:
    def __init__(self, num_classes=3):
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Process a batch of images and texts through CLIP"""
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding=True,
            truncation=True
        )
        return inputs
    
    def build_model(self):
        # Define inputs
        input_ids = Input(shape=(77,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(77,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head
        x = Dense(512, activation='relu', kernel_regularizer=l2(0.02))(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=l2(0.02))(x)
        x = Dropout(0.4)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Process data in batches to avoid memory issues"""
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=30):
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=32,
        # callbacks=callbacks
    )
    
    return model, history

def main():
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Split data
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert predictions to labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")
    
    return model, history

if __name__ == "__main__":
    main()

# FLAVA model v-1
Default FLAVA implemenation on the best result of CLIP

Epoch: 30\
Accuracy: 74.92%\
Val. accuracy: 74.10%\
Layers: 512->512->512->256\
Batch size: 16\
Dropout: 0.4\
Weight decay: 0.2

In [4]:
!pip install transformers torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

In [3]:
import torch
import numpy as np
import pandas as pd
from transformers import FlavaProcessor, FlavaModel
from torch import nn
import torch.optim as optim
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

class FLAVASentimentModel(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        self.num_classes = num_classes
        self.processor = FlavaProcessor.from_pretrained("facebook/flava-full")
        self.flava = FlavaModel.from_pretrained("facebook/flava-full")
        
        for param in self.flava.parameters():
            param.requires_grad = False
            
        # Get multimodal output size from FLAVA config
        hidden_size = self.flava.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.LayerNorm(512),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, image_inputs, text_inputs):
        outputs = self.flava(
            input_ids=text_inputs['input_ids'],
            attention_mask=text_inputs['attention_mask'],
            pixel_values=image_inputs['pixel_values'],
            return_dict=True
        )
        
        # Get first token ([CLS]) of multimodal output
        multimodal_output = outputs.multimodal_embeddings[:, 0]
        return self.classifier(multimodal_output)

def process_data_batch(image_paths, texts, processor, batch_size=16, device='cuda'):
    all_image_inputs = []
    all_text_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        inputs = processor(
            images=batch_images,
            text=batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77
        )
        
        all_image_inputs.append({
            'pixel_values': inputs['pixel_values'].to(device)
        })
        all_text_inputs.append({
            'input_ids': inputs['input_ids'].to(device),
            'attention_mask': inputs['attention_mask'].to(device)
        })
    
    return all_image_inputs, all_text_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, 
                device='cuda', epochs=30, batch_size=16):
    
    model = FLAVASentimentModel().to(device)
    optimizer = optim.Adam(model.parameters(), lr=2e-5, weight_decay=0.02)
    criterion = nn.CrossEntropyLoss()
    
    print("Processing training data...")
    train_image_inputs, train_text_inputs = process_data_batch(
        train_image_paths, train_texts, model.processor, batch_size=batch_size, device=device
    )
    
    print("Processing validation data...")
    val_image_inputs, val_text_inputs = process_data_batch(
        val_image_paths, val_texts, model.processor, batch_size=batch_size, device=device
    )
    
    train_labels = torch.tensor(train_labels, device=device)
    val_labels = torch.tensor(val_labels, device=device)
    
    best_val_acc = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for i in range(len(train_image_inputs)):
            optimizer.zero_grad()
            outputs = model(train_image_inputs[i], train_text_inputs[i])
            batch_labels = train_labels[i * batch_size:(i + 1) * batch_size]
            
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += batch_labels.size(0)
            correct += predicted.eq(batch_labels).sum().item()
        
        model.eval()
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for i in range(len(val_image_inputs)):
                outputs = model(val_image_inputs[i], val_text_inputs[i])
                batch_labels = val_labels[i * batch_size:(i + 1) * batch_size]
                
                _, predicted = outputs.max(1)
                val_total += batch_labels.size(0)
                val_correct += predicted.eq(batch_labels).sum().item()
        
        val_acc = 100. * val_correct / val_total
        print(f'Epoch {epoch + 1}: Train Acc: {100.*correct/total:.2f}% Val Acc: {val_acc:.2f}%')
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')
    
    return model

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_size = 16
    
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    model = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs,
        device=device,
        batch_size=batch_size
    )
    
    print("Processing test data...")
    test_image_inputs, test_text_inputs = process_data_batch(
        test_image_paths, test_df['Captions'].tolist(),
        model.processor, batch_size=batch_size, device=device
    )
    
    model.eval()
    predictions = []
    with torch.no_grad():
        for i in range(len(test_image_inputs)):
            outputs = model(test_image_inputs[i], test_text_inputs[i])
            _, predicted = outputs.max(1)
            predictions.extend(predicted.cpu().numpy())
    
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predictions]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

if __name__ == "__main__":
    main()

Some weights of the model checkpoint at facebook/flava-full were not used when initializing FlavaModel: ['image_codebook.blocks.group_4.group.block_2.res_path.path.conv_2.bias', 'mmm_image_head.decoder.bias', 'itm_head.seq_relationship.bias', 'mlm_head.decoder.bias', 'image_codebook.blocks.group_4.group.block_2.res_path.path.conv_3.weight', 'mmm_image_head.bias', 'mmm_image_head.decoder.weight', 'image_codebook.blocks.group_2.group.block_1.res_path.path.conv_2.bias', 'itm_head.seq_relationship.weight', 'itm_head.pooler.dense.bias', 'image_codebook.blocks.group_4.group.block_1.res_path.path.conv_2.bias', 'image_codebook.blocks.group_1.group.block_1.res_path.path.conv_4.bias', 'image_codebook.blocks.group_1.group.block_1.res_path.path.conv_2.weight', 'image_codebook.blocks.group_2.group.block_1.res_path.path.conv_4.weight', 'image_codebook.blocks.group_1.group.block_1.res_path.path.conv_3.weight', 'image_codebook.blocks.group_4.group.block_2.res_path.path.conv_2.weight', 'image_codebook.

Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...




Epoch 1: Train Acc: 61.72% Val Acc: 61.90%
Epoch 2: Train Acc: 63.67% Val Acc: 62.86%
Epoch 3: Train Acc: 63.23% Val Acc: 65.33%
Epoch 4: Train Acc: 64.78% Val Acc: 65.14%
Epoch 5: Train Acc: 64.21% Val Acc: 66.10%
Epoch 6: Train Acc: 64.51% Val Acc: 67.24%
Epoch 7: Train Acc: 66.36% Val Acc: 68.95%
Epoch 8: Train Acc: 65.69% Val Acc: 67.62%
Epoch 9: Train Acc: 66.09% Val Acc: 68.57%
Epoch 10: Train Acc: 67.64% Val Acc: 69.33%
Epoch 11: Train Acc: 67.37% Val Acc: 69.52%
Epoch 12: Train Acc: 67.54% Val Acc: 70.29%
Epoch 13: Train Acc: 68.75% Val Acc: 71.05%
Epoch 14: Train Acc: 68.86% Val Acc: 71.43%
Epoch 15: Train Acc: 69.16% Val Acc: 72.19%
Epoch 16: Train Acc: 71.08% Val Acc: 71.62%
Epoch 17: Train Acc: 70.71% Val Acc: 72.57%
Epoch 18: Train Acc: 71.18% Val Acc: 72.38%
Epoch 19: Train Acc: 71.35% Val Acc: 72.38%
Epoch 20: Train Acc: 72.32% Val Acc: 73.71%
Epoch 21: Train Acc: 72.22% Val Acc: 73.33%
Epoch 22: Train Acc: 72.53% Val Acc: 73.71%
Epoch 23: Train Acc: 73.27% Val Acc: 74.6

# BLIP-2 model v-1
Default BLIP-2 implemenation on the best result of CLIP

Epoch: 30\
Accuracy: 74.92%\
Val. accuracy: 74.10%\
Layers: 512->512->512->256\
Batch size: 16\
Dropout: 0.4\
Weight decay: 0.2

In [7]:
!pip install --upgrade transformers torch torchvision pillow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torch
  Downloading torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torchvision
  Downloading torchvision-0.14.1-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.2/24.

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoProcessor, BlipForConditionalGeneration
from torch import nn
import torch.optim as optim
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

class BLIPSentimentModel(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        self.num_classes = num_classes
        self.processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.blip = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        
        for param in self.blip.parameters():
            param.requires_grad = False
            
        self.classifier = nn.Sequential(
            nn.Linear(768, 512),  # BLIP base hidden size is 768
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.LayerNorm(512),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, image_inputs, text_inputs):
        outputs = self.blip(
            input_ids=text_inputs['input_ids'],
            attention_mask=text_inputs['attention_mask'],
            pixel_values=image_inputs['pixel_values'],
            return_dict=True
        )
        
        # Get image features
        image_features = outputs.image_embeds[:, 0]  # [CLS] token
        return self.classifier(image_features)

def process_data_batch(image_paths, texts, processor, batch_size=8, device='cuda'):
    all_image_inputs = []
    all_text_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        inputs = processor(
            images=batch_images,
            text=batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77
        )
        
        all_image_inputs.append({
            'pixel_values': inputs['pixel_values'].to(device)
        })
        all_text_inputs.append({
            'input_ids': inputs['input_ids'].to(device),
            'attention_mask': inputs['attention_mask'].to(device)
        })
    
    return all_image_inputs, all_text_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, 
                device='cuda', epochs=30, batch_size=8):
    
    print("Initializing model...")
    model = BLIPSentimentModel().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=0.02)
    criterion = nn.CrossEntropyLoss()
    
    print("Processing training data...")
    train_image_inputs, train_text_inputs = process_data_batch(
        train_image_paths, train_texts, model.processor, batch_size=batch_size, device=device
    )
    
    print("Processing validation data...")
    val_image_inputs, val_text_inputs = process_data_batch(
        val_image_paths, val_texts, model.processor, batch_size=batch_size, device=device
    )
    
    train_labels = torch.tensor(train_labels, device=device)
    val_labels = torch.tensor(val_labels, device=device)
    
    best_val_acc = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for i in range(len(train_image_inputs)):
            optimizer.zero_grad()
            outputs = model(train_image_inputs[i], train_text_inputs[i])
            batch_labels = train_labels[i * batch_size:(i + 1) * batch_size]
            
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += batch_labels.size(0)
            correct += predicted.eq(batch_labels).sum().item()
        
        model.eval()
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for i in range(len(val_image_inputs)):
                outputs = model(val_image_inputs[i], val_text_inputs[i])
                batch_labels = val_labels[i * batch_size:(i + 1) * batch_size]
                
                _, predicted = outputs.max(1)
                val_total += batch_labels.size(0)
                val_correct += predicted.eq(batch_labels).sum().item()
        
        val_acc = 100. * val_correct / val_total
        print(f'Epoch {epoch + 1}: Train Acc: {100.*correct/total:.2f}% Val Acc: {val_acc:.2f}%')
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')
    
    return model

def main():
    print("Installing required packages...")
    try:
        import subprocess
        subprocess.check_call(["pip", "install", "--upgrade", "transformers", "torch", "torchvision"])
    except:
        print("Warning: Could not install packages. Please ensure they are installed manually.")
        
    batch_size = 8
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    print("Loading data...")
    train_df = pd.read_csv('/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv')
    test_df = pd.read_csv('/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv')
    
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = [os.path.join(memes_folder, img) for img in train_df['image_name']]
    test_image_paths = [os.path.join(memes_folder, img) for img in test_df['image_name']]
    
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    model = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs,
        device=device,
        batch_size=batch_size
    )
    
    print("Processing test data...")
    test_image_inputs, test_text_inputs = process_data_batch(
        test_image_paths, test_df['Captions'].tolist(),
        model.processor, batch_size=batch_size, device=device
    )
    
    model.eval()
    predictions = []
    with torch.no_grad():
        for i in range(len(test_image_inputs)):
            outputs = model(test_image_inputs[i], test_text_inputs[i])
            _, predicted = outputs.max(1)
            predictions.extend(predicted.cpu().numpy())
    
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predictions]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

if __name__ == "__main__":
    main()

ImportError: cannot import name 'BlipForConditionalGeneration' from 'transformers' (/opt/conda/lib/python3.7/site-packages/transformers/__init__.py)

In [4]:
import transformers
print(transformers.__version__)


4.20.1


# VILT model v-1 (Too memory hungry)
Default VILT implemenation

In [8]:
import torch
import numpy as np
import pandas as pd
from transformers import ViltProcessor, ViltModel
from torch import nn
import torch.optim as optim
from PIL import Image
from sklearn.model_selection import train_test_split
import os

class VILTSentimentModel(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        self.num_classes = num_classes
        self.processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        self.vilt = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")
        
        for param in self.vilt.parameters():
            param.requires_grad = False
            
        hidden_size = self.vilt.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.LayerNorm(512),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, image_inputs, text_inputs):
        outputs = self.vilt(
            input_ids=text_inputs['input_ids'],
            attention_mask=text_inputs['attention_mask'],
            pixel_values=image_inputs['pixel_values'],
            pixel_mask=image_inputs['pixel_mask'],
            return_dict=True
        )
        
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

def process_data_batch(image_paths, texts, processor, batch_size=16, device='cuda'):
    all_image_inputs = []
    all_text_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        for path in image_paths[i:i + batch_size]:
            try:
                image = Image.open(path).convert('RGB')
                batch_images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        inputs = processor(
            images=batch_images,
            text=batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
        
        all_image_inputs.append({
            'pixel_values': inputs['pixel_values'].to(device),
            'pixel_mask': inputs['pixel_mask'].to(device)
        })
        all_text_inputs.append({
            'input_ids': inputs['input_ids'].to(device),
            'attention_mask': inputs['attention_mask'].to(device)
        })
    
    return all_image_inputs, all_text_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, 
                device='cuda', epochs=30, batch_size=16):
    
    model = VILTSentimentModel().to(device)
    optimizer = optim.Adam(model.parameters(), lr=2e-5, weight_decay=0.02)
    criterion = nn.CrossEntropyLoss()
    
    print("Processing training data...")
    train_image_inputs, train_text_inputs = process_data_batch(
        train_image_paths, train_texts, model.processor, batch_size=batch_size, device=device
    )
    
    print("Processing validation data...")
    val_image_inputs, val_text_inputs = process_data_batch(
        val_image_paths, val_texts, model.processor, batch_size=batch_size, device=device
    )
    
    train_labels = torch.tensor(train_labels, device=device)
    val_labels = torch.tensor(val_labels, device=device)
    
    best_val_acc = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for i in range(len(train_image_inputs)):
            optimizer.zero_grad()
            outputs = model(train_image_inputs[i], train_text_inputs[i])
            batch_labels = train_labels[i * batch_size:(i + 1) * batch_size]
            
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += batch_labels.size(0)
            correct += predicted.eq(batch_labels).sum().item()
        
        model.eval()
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for i in range(len(val_image_inputs)):
                outputs = model(val_image_inputs[i], val_text_inputs[i])
                batch_labels = val_labels[i * batch_size:(i + 1) * batch_size]
                
                _, predicted = outputs.max(1)
                val_total += batch_labels.size(0)
                val_correct += predicted.eq(batch_labels).sum().item()
        
        val_acc = 100. * val_correct / val_total
        print(f'Epoch {epoch + 1}: Train Acc: {100.*correct/total:.2f}% Val Acc: {val_acc:.2f}%')
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')
    
    return model

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_size = 16
    
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    model = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs,
        device=device,
        batch_size=batch_size
    )
    
    print("Processing test data...")
    test_image_inputs, test_text_inputs = process_data_batch(
        test_image_paths, test_df['Captions'].tolist(),
        model.processor, batch_size=batch_size, device=device
    )
    
    model.eval()
    predictions = []
    with torch.no_grad():
        for i in range(len(test_image_inputs)):
            outputs = model(test_image_inputs[i], test_text_inputs[i])
            _, predicted = outputs.max(1)
            predictions.extend(predicted.cpu().numpy())
    
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predictions]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

if __name__ == "__main__":
    main()

Some weights of the model checkpoint at dandelin/vilt-b32-mlm were not used when initializing ViltModel: ['mlm_score.transform.dense.bias', 'mlm_score.decoder.weight', 'mlm_score.transform.dense.weight', 'mlm_score.transform.LayerNorm.bias', 'mlm_score.bias', 'mlm_score.transform.LayerNorm.weight']
- This IS expected if you are initializing ViltModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViltModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 14.74 GiB total capacity; 14.11 GiB already allocated; 20.12 MiB free; 14.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Multimodal BERT (MBMT) v-1
Default MMBT implemenation

Epoch: 30\
Accuracy: 74.92%\
Val. accuracy: 74.10%\

Batch size: 32\
Dropout: 0.4\
Weight decay: 0.02

In [11]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
from torchvision import transforms, models
from torch import nn
import torch.optim as optim
from PIL import Image
import os
from sklearn.model_selection import train_test_split

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

class ImageEncoder(nn.Module):
    def __init__(self, hidden_size=512):
        super().__init__()
        resnet = models.resnet18(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.fc = nn.Linear(512, hidden_size)
        
    def forward(self, x):
        x = self.resnet(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

class MMBTSentimentModel(nn.Module):
    def __init__(self, num_classes=3, hidden_size=512):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.image_encoder = ImageEncoder(hidden_size)
        
        for param in self.bert.parameters():
            param.requires_grad = False
            
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + self.bert.config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, num_classes)
        )
        
        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])
        ])
    
    def process_text(self, texts, device):
        inputs = self.tokenizer(texts, padding=True, truncation=True, 
                              max_length=128, return_tensors="pt")
        return {k: v.to(device) for k, v in inputs.items()}
    
    def process_image(self, image_paths, device):
        images = []
        for path in image_paths:
            try:
                image = Image.open(path).convert('RGB')
                image = self.image_transform(image)
                images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                dummy = torch.zeros(3, 224, 224)
                images.append(dummy)
        return torch.stack(images).to(device)
    
    def forward(self, image_inputs, text_inputs):
        text_outputs = self.bert(**text_inputs)
        text_embeddings = text_outputs.pooler_output
        
        image_embeddings = self.image_encoder(image_inputs)
        combined = torch.cat([text_embeddings, image_embeddings], dim=1)
        
        return self.classifier(combined)

def process_data_batch(image_paths, texts, model, batch_size=32, device='cuda'):
    all_image_inputs = []
    all_text_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = image_paths[i:i + batch_size]
        batch_texts = texts[i:i + batch_size]
        
        images = model.process_image(batch_images, device)
        text_inputs = model.process_text(batch_texts, device)
        
        all_image_inputs.append(images)
        all_text_inputs.append(text_inputs)
    
    return all_image_inputs, all_text_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, 
                device='cuda', epochs=30, batch_size=32,
                gradient_accumulation_steps=4):
    
    model = MMBTSentimentModel().to(device)
    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.2)
    criterion = nn.CrossEntropyLoss()
    
    print("Processing training data...")
    train_image_inputs, train_text_inputs = process_data_batch(
        train_image_paths, train_texts, model, batch_size=batch_size, device=device
    )
    
    print("Processing validation data...")
    val_image_inputs, val_text_inputs = process_data_batch(
        val_image_paths, val_texts, model, batch_size=batch_size, device=device
    )
    
    train_labels = torch.tensor(train_labels, device=device)
    val_labels = torch.tensor(val_labels, device=device)
    
    best_val_acc = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for i in range(len(train_image_inputs)):
            outputs = model(train_image_inputs[i], train_text_inputs[i])
            batch_labels = train_labels[i * batch_size:(i + 1) * batch_size]
            
            loss = criterion(outputs, batch_labels) / gradient_accumulation_steps
            loss.backward()
            
            if (i + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            
            total_loss += loss.item() * gradient_accumulation_steps
            _, predicted = outputs.max(1)
            total += batch_labels.size(0)
            correct += predicted.eq(batch_labels).sum().item()
        
        model.eval()
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for i in range(len(val_image_inputs)):
                outputs = model(val_image_inputs[i], val_text_inputs[i])
                batch_labels = val_labels[i * batch_size:(i + 1) * batch_size]
                
                _, predicted = outputs.max(1)
                val_total += batch_labels.size(0)
                val_correct += predicted.eq(batch_labels).sum().item()
        
        val_acc = 100. * val_correct / val_total
        print(f'Epoch {epoch + 1}: Train Acc: {100.*correct/total:.2f}% Val Acc: {val_acc:.2f}%')
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')
    
    return model

def load_data(train_path, test_path):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(folder, image_names):
    return [os.path.join(folder, name) for name in image_names]

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_size = 8
    
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    model = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs,
        device=device,
        batch_size=batch_size
    )
    
    print("Processing test data...")
    test_image_inputs, test_text_inputs = process_data_batch(
        test_image_paths, test_df['Captions'].tolist(),
        model, batch_size=batch_size, device=device
    )
    
    model.eval()
    predictions = []
    with torch.no_grad():
        for i in range(len(test_image_inputs)):
            outputs = model(test_image_inputs[i], test_text_inputs[i])
            _, predicted = outputs.max(1)
            predictions.extend(predicted.cpu().numpy())
    
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predictions]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

if __name__ == "__main__":
    main()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...
Epoch 1: Train Acc: 69.73% Val Acc: 76.38%
Epoch 2: Train Acc: 82.05% Val Acc: 71.62%
Epoch 3: Train Acc: 93.37% Val Acc: 70.86%
Epoch 4: Train Acc: 96.80% Val Acc: 69.71%
Epoch 5: Train Acc: 97.31% Val Acc: 70.86%
Epoch 6: Train Acc: 98.18% Val Acc: 73.71%
Epoch 7: Train Acc: 98.79% Val Acc: 74.29%
Epoch 8: Train Acc: 99.46% Val Acc: 74.86%
Epoch 9: Train Acc: 99.46% Val Acc: 73.33%


KeyboardInterrupt: 

# VisualBERT

In [13]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, VisualBertModel, VisualBertConfig
from torchvision import transforms, models
from torch import nn
import torch.optim as optim
from PIL import Image
import os
from sklearn.model_selection import train_test_split

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

class ImageEncoder(nn.Module):
    def __init__(self, output_size=2048):
        super().__init__()
        resnet = models.resnet50(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        
    def forward(self, x):
        x = self.resnet(x)
        return x.view(x.size(0), -1)

class VisualBERTSentimentModel(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        self.num_classes = num_classes
        
        config = VisualBertConfig.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
        self.visual_bert = VisualBertModel(config)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.image_encoder = ImageEncoder()
        
        for param in self.visual_bert.parameters():
            param.requires_grad = False
            
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, num_classes)
        )
        
        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])
        ])
    
    def process_text(self, texts, device):
        inputs = self.tokenizer(texts, padding=True, truncation=True, 
                              max_length=128, return_tensors="pt")
        return {k: v.to(device) for k, v in inputs.items()}
    
    def process_image(self, image_paths, device):
        images = []
        for path in image_paths:
            try:
                image = Image.open(path).convert('RGB')
                image = self.image_transform(image)
                images.append(image)
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                dummy = torch.zeros(3, 224, 224)
                images.append(dummy)
        return torch.stack(images).to(device)
    
    def forward(self, image_inputs, text_inputs):
        batch_size = image_inputs.size(0)
        visual_embeds = self.image_encoder(image_inputs)
            
        # Reshape visual embeds to (batch_size, num_visual_features, visual_embedding_dim)
        visual_embeds = visual_embeds.view(batch_size, 1, -1)
            
        # Create correct dimension masks
        visual_attention_mask = torch.ones(batch_size, 1, dtype=torch.float).to(image_inputs.device)
        visual_token_type_ids = torch.ones(batch_size, 1, dtype=torch.long).to(image_inputs.device)
            
        outputs = self.visual_bert(
            input_ids=text_inputs['input_ids'],
            attention_mask=text_inputs['attention_mask'],
            visual_embeds=visual_embeds,
            visual_token_type_ids=visual_token_type_ids,
            visual_attention_mask=visual_attention_mask,
            return_dict=True
        )
            
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

def process_data_batch(image_paths, texts, model, batch_size=8, device='cuda'):
    all_image_inputs = []
    all_text_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = image_paths[i:i + batch_size]
        batch_texts = texts[i:i + batch_size]
        
        images = model.process_image(batch_images, device)
        text_inputs = model.process_text(batch_texts, device)
        
        all_image_inputs.append(images)
        all_text_inputs.append(text_inputs)
    
    return all_image_inputs, all_text_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, 
                device='cuda', epochs=30, batch_size=8,
                gradient_accumulation_steps=4):
    
    model = VisualBERTSentimentModel().to(device)
    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
    criterion = nn.CrossEntropyLoss()
    
    print("Processing training data...")
    train_image_inputs, train_text_inputs = process_data_batch(
        train_image_paths, train_texts, model, batch_size=batch_size, device=device
    )
    
    print("Processing validation data...")
    val_image_inputs, val_text_inputs = process_data_batch(
        val_image_paths, val_texts, model, batch_size=batch_size, device=device
    )
    
    train_labels = torch.tensor(train_labels, device=device)
    val_labels = torch.tensor(val_labels, device=device)
    
    best_val_acc = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for i in range(len(train_image_inputs)):
            outputs = model(train_image_inputs[i], train_text_inputs[i])
            batch_labels = train_labels[i * batch_size:(i + 1) * batch_size]
            
            loss = criterion(outputs, batch_labels) / gradient_accumulation_steps
            loss.backward()
            
            if (i + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            
            total_loss += loss.item() * gradient_accumulation_steps
            _, predicted = outputs.max(1)
            total += batch_labels.size(0)
            correct += predicted.eq(batch_labels).sum().item()
        
        model.eval()
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for i in range(len(val_image_inputs)):
                outputs = model(val_image_inputs[i], val_text_inputs[i])
                batch_labels = val_labels[i * batch_size:(i + 1) * batch_size]
                
                _, predicted = outputs.max(1)
                val_total += batch_labels.size(0)
                val_correct += predicted.eq(batch_labels).sum().item()
        
        val_acc = 100. * val_correct / val_total
        print(f'Epoch {epoch + 1}: Train Acc: {100.*correct/total:.2f}% Val Acc: {val_acc:.2f}%')
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')
    
    return model

def load_data(train_path, test_path):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

def get_image_paths(folder, image_names):
    return [os.path.join(folder, name) for name in image_names]

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_size = 8
    
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )
    
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    model = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs,
        device=device,
        batch_size=batch_size
    )
    
    print("Processing test data...")
    test_image_inputs, test_text_inputs = process_data_batch(
        test_image_paths, test_df['Captions'].tolist(),
        model, batch_size=batch_size, device=device
    )
    
    model.eval()
    predictions = []
    with torch.no_grad():
        for i in range(len(test_image_inputs)):
            outputs = model(test_image_inputs[i], test_text_inputs[i])
            _, predicted = outputs.max(1)
            predictions.extend(predicted.cpu().numpy())
    
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predictions]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

if __name__ == "__main__":
    main()

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 14.74 GiB total capacity; 12.82 GiB already allocated; 16.12 MiB free; 13.70 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF