<p style="text-align:center;font-weight: 900; font-size:40px;"> Multimodal Sentiment Analysis with Higher Accuracy </p>

**More robust VGG19 and XLM-RoBERTa**

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os


# Load and preprocess data
def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Location of the training CSV (anything ``pd.read_csv`` accepts).
        test_path: Location of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.
    """
    # The training file is read first, then the test file.
    return tuple(pd.read_csv(csv_path) for csv_path in (train_path, test_path))


def get_image_paths(directory, image_names):
    """Build the full path of every image name, preserving input order.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        A list holding ``os.path.join(directory, name)`` for each name, in
        input order (duplicates are kept). Paths are not checked for existence.
    """
    # A single pass is enough: no lookup table or membership filter is needed,
    # and this form also works when image_names is a one-shot iterator.
    return [os.path.join(directory, name) for name in image_names]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply the VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to load.
        target_size: Size the image is resized to on load; passed straight to
            ``load_img``.

    Returns:
        The preprocessed image array, or an all-zero float32 array of shape
        ``tuple(target_size) + (3,)`` when loading or preprocessing fails.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"Error processing image {image_path}: {str(e)}")
        # float32 keeps a single placeholder from upcasting the stacked image
        # array to float64; tuple() lets a list target_size work as well.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)


def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every image path and stack the results into one array.

    Args:
        image_paths: Iterable of image file paths.
        target_size: Forwarded to ``preprocess_image`` for each path.

    Returns:
        ``np.array`` built from the per-image results, in input order.
    """
    # tqdm only wraps the iterable to show a progress bar.
    progress = tqdm(image_paths, desc="Processing images")
    return np.array([preprocess_image(path, target_size) for path in progress])


def augment_images(images):
    """Wrap ``images`` in a shuffled, randomly augmenting batch iterator.

    Args:
        images: Image array handed to ``ImageDataGenerator.flow``.

    Returns:
        The iterator returned by ``flow`` (batches of 16, shuffled).
    """
    augmentation_options = {
        'rotation_range': 20,
        'width_shift_range': 0.2,
        'height_shift_range': 0.2,
        'shear_range': 0.2,
        'zoom_range': 0.2,
        'horizontal_flip': True,
        'fill_mode': 'nearest',
    }
    generator = ImageDataGenerator(**augmentation_options)
    return generator.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    """VGG19 + XLM-RoBERTa classifier over (image, caption) pairs.

    Holds the pretrained tokenizer and text encoder and builds the fused
    Keras model on demand.
    """

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained XLM-RoBERTa tokenizer and encoder.

        Args:
            num_classes: Size of the softmax output layer.
            max_length: Token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        """Build and compile the two-branch Keras model.

        Inputs are named ``image_input`` (224x224x3), ``input_ids`` and
        ``attention_mask`` (both of length ``max_length``).

        Returns:
            A compiled ``Model`` with a ``num_classes``-way softmax output,
            using Adam (1e-4) and sparse categorical cross-entropy.
        """
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Freeze everything except the last 8 layers, which stay trainable.
        for layer in vgg19.layers[:-8]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Fuse both modalities and classify
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=1e-4)  # Reduce learning rate
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions into fixed-length TensorFlow tensors.

        Args:
            texts: Captions as a pandas Series / NumPy array (anything with
                ``tolist()``) or any other iterable of strings.

        Returns:
            ``(input_ids, attention_mask)`` taken from the tokenizer output;
            every row is padded or truncated to ``max_length`` tokens.
        """
        # Accept plain lists/tuples as well as Series and arrays.
        text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        encodings = self.tokenizer(
            text_list,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    """Build the multimodal model and fit it on the training split.

    Args:
        train_images: Preprocessed training images fed to ``image_input``.
        train_texts: Training captions; tokenized with the model's tokenizer.
        train_labels: Integer class labels of the training samples.
        val_images: Preprocessed validation images.
        val_texts: Validation captions.
        val_labels: Integer class labels of the validation samples.
        epochs: Maximum number of epochs (early stopping may end sooner).

    Returns:
        ``(model, history)``: the fitted Keras model and the ``fit`` history.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=3,
            min_lr=1e-6
        )
    ]

    # NOTE(review): an image-only iterator from augment_images() used to be
    # created here but was never passed to fit(), so training has always run
    # on the un-augmented images. The dead call is removed; wiring in real
    # augmentation needs a generator that yields images, token ids, masks and
    # labels together. TODO: add one if augmentation is wanted.

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    """Run the whole pipeline: load data, train, predict, write submission.csv.

    Returns:
        ``(model, history)`` as returned by ``train_model``.
    """
    # Read the competition CSVs
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve image locations for both sets, then preprocess them
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_names = train_df['image_name'].tolist()
    train_image_paths = get_image_paths(memes_folder, train_names)
    test_names = test_df['image_name'].tolist()
    test_image_paths = get_image_paths(memes_folder, test_names)

    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Encode sentiment strings as integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded = [label_map[sentiment] for sentiment in train_df['Label_Sentiment']]
    train_labels = np.array(encoded)

    # Stratified 85/15 train/validation split
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created only to tokenize the test captions
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)

    # Map class ids back to their sentiment strings
    reverse_label_map = dict(zip(label_map.values(), label_map.keys()))
    test_df['Label'] = [reverse_label_map[class_id] for class_id in predicted_labels]

    # Write the submission file
    submission = test_df[['Id', 'Label']]
    submission.to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


# Entry point: run the full pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import os
import math


# Load and preprocess data
def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Location of the training CSV (anything ``pd.read_csv`` accepts).
        test_path: Location of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.
    """
    # The training file is read first, then the test file.
    return tuple(pd.read_csv(csv_path) for csv_path in (train_path, test_path))


def get_image_paths(directory, image_names):
    """Build the full path of every image name, preserving input order.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        A list holding ``os.path.join(directory, name)`` for each name, in
        input order (duplicates are kept). Paths are not checked for existence.
    """
    # A single pass is enough: no lookup table or membership filter is needed,
    # and this form also works when image_names is a one-shot iterator.
    return [os.path.join(directory, name) for name in image_names]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply the VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to load.
        target_size: Size the image is resized to on load; passed straight to
            ``load_img``.

    Returns:
        The preprocessed image array, or an all-zero float32 array of shape
        ``tuple(target_size) + (3,)`` when loading or preprocessing fails.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"Error processing image {image_path}: {str(e)}")
        # float32 keeps a single placeholder from upcasting the stacked image
        # array to float64; tuple() lets a list target_size work as well.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)


def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every image path and stack the results into one array.

    Args:
        image_paths: Iterable of image file paths.
        target_size: Forwarded to ``preprocess_image`` for each path.

    Returns:
        ``np.array`` built from the per-image results, in input order.
    """
    # tqdm only wraps the iterable to show a progress bar.
    progress = tqdm(image_paths, desc="Processing images")
    return np.array([preprocess_image(path, target_size) for path in progress])


def augment_images(images):
    """Wrap ``images`` in a shuffled iterator with stronger random augmentation.

    Args:
        images: Image array handed to ``ImageDataGenerator.flow``.

    Returns:
        The iterator returned by ``flow`` (batches of 16, shuffled).
    """
    augmentation_options = {
        'rotation_range': 30,
        'width_shift_range': 0.3,
        'height_shift_range': 0.3,
        'shear_range': 0.3,
        'zoom_range': 0.3,
        'horizontal_flip': True,
        'fill_mode': 'nearest',
        'brightness_range': [0.8, 1.2],
    }
    generator = ImageDataGenerator(**augmentation_options)
    return generator.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    """VGG19 + XLM-RoBERTa classifier over (image, caption) pairs.

    This variant uses 512-unit heads with L2 regularization and batch
    normalization. It holds the pretrained tokenizer and text encoder and
    builds the fused Keras model on demand.
    """

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained XLM-RoBERTa tokenizer and encoder.

        Args:
            num_classes: Size of the softmax output layer.
            max_length: Token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        """Build and compile the two-branch Keras model.

        Inputs are named ``image_input`` (224x224x3), ``input_ids`` and
        ``attention_mask`` (both of length ``max_length``).

        Returns:
            A compiled ``Model`` with a ``num_classes``-way softmax output,
            using Adam (1e-4) and sparse categorical cross-entropy.
        """
        # Image branch (VGG19 with fine-tuning)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Freeze everything except the last 8 layers, which stay trainable.
        for layer in vgg19.layers[:-8]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.4)(x)
        image_features = BatchNormalization()(x)

        # Text branch (XLM-Roberta)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        bert_outputs = self.bert_model([input_ids, attention_mask])
        pooled_output = bert_outputs[1]  # Use the pooled output
        x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = BatchNormalization()(x)

        # Fuse both modalities and classify
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(combined)
        x = Dropout(0.3)(x)
        x = BatchNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=1e-4)  # Reduce learning rate
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions into fixed-length TensorFlow tensors.

        Args:
            texts: Captions as a pandas Series / NumPy array (anything with
                ``tolist()``) or any other iterable of strings.

        Returns:
            ``(input_ids, attention_mask)`` taken from the tokenizer output;
            every row is padded or truncated to ``max_length`` tokens.
        """
        # Accept plain lists/tuples as well as Series and arrays.
        text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        encodings = self.tokenizer(
            text_list,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


# Cosine Annealing Learning Rate Scheduler
def cosine_decay(epoch, *, initial_lr=1e-4, total_epochs=20):
    """Return the cosine-annealed learning rate for ``epoch``.

    The rate falls from ``initial_lr`` at epoch 0 to 0 at ``total_epochs``
    along half a cosine period and stays at 0 afterwards.

    Args:
        epoch: Zero-based epoch index.
        initial_lr: Learning rate at epoch 0.
        total_epochs: Number of epochs over which the rate decays to zero.

    Returns:
        The learning rate as a float.

    Raises:
        ValueError: If ``total_epochs`` is not positive.
    """
    # The extra parameters are keyword-only so the function still takes exactly
    # one positional argument, like the original one-argument schedule.
    if total_epochs <= 0:
        raise ValueError("total_epochs must be positive")
    # Clamp so the cosine does not climb back up once total_epochs is passed.
    clamped_epoch = min(max(epoch, 0), total_epochs)
    return initial_lr * (1 + math.cos(clamped_epoch * math.pi / total_epochs)) / 2


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    """Build the multimodal model and fit it with balanced class weights.

    Args:
        train_images: Preprocessed training images fed to ``image_input``.
        train_texts: Training captions; tokenized with the model's tokenizer.
        train_labels: Integer class labels of the training samples.
        val_images: Preprocessed validation images.
        val_texts: Validation captions.
        val_labels: Integer class labels of the validation samples.
        epochs: Maximum number of epochs (early stopping may end sooner).

    Returns:
        ``(model, history)``: the fitted Keras model and the ``fit`` history.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Compute class weights for imbalanced datasets
    classes = np.unique(train_labels)
    weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=train_labels
    )
    # Key each weight by its actual label value; enumerate() would mis-assign
    # weights whenever the labels present are not exactly 0..k-1.
    class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}

    # Callbacks
    # NOTE(review): LearningRateScheduler sets the rate at the start of every
    # epoch, which presumably overrides any reduction ReduceLROnPlateau made.
    # Confirm which of the two schedules is intended.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=2,  # Stop earlier
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(cosine_decay)
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=16,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    """Run the whole pipeline: load data, train, predict, write submission.csv.

    Returns:
        ``(model, history)`` as returned by ``train_model``.
    """
    # Read the competition CSVs
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve image locations for both sets, then preprocess them
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_names = train_df['image_name'].tolist()
    train_image_paths = get_image_paths(memes_folder, train_names)
    test_names = test_df['image_name'].tolist()
    test_image_paths = get_image_paths(memes_folder, test_names)

    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Encode sentiment strings as integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded = [label_map[sentiment] for sentiment in train_df['Label_Sentiment']]
    train_labels = np.array(encoded)

    # Stratified 85/15 train/validation split
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created only to tokenize the test captions
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)

    # Map class ids back to their sentiment strings
    reverse_label_map = dict(zip(label_map.values(), label_map.keys()))
    test_df['Label'] = [reverse_label_map[class_id] for class_id in predicted_labels]

    # Write the submission file
    submission = test_df[['Id', 'Label']]
    submission.to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


# Entry point: run the full pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()


This is the VGG19 and BERT model.

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Location of the training CSV (anything ``pd.read_csv`` accepts).
        test_path: Location of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.
    """
    # The training file is read first, then the test file.
    return tuple(pd.read_csv(csv_path) for csv_path in (train_path, test_path))

def get_image_paths(directory, image_names):
    """Build the full path of every image name, preserving input order.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        A list holding ``os.path.join(directory, name)`` for each name, in
        input order (duplicates are kept). Paths are not checked for existence.
    """
    # A single pass is enough: no lookup table or membership filter is needed,
    # and this form also works when image_names is a one-shot iterator.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply the VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to load.
        target_size: Size the image is resized to on load; passed straight to
            ``load_img``.

    Returns:
        The preprocessed image array, or an all-zero float32 array of shape
        ``tuple(target_size) + (3,)`` when loading or preprocessing fails.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"Error processing image {image_path}: {str(e)}")
        # float32 keeps a single placeholder from upcasting the stacked image
        # array to float64; tuple() lets a list target_size work as well.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every image path and stack the results into one array.

    Args:
        image_paths: Iterable of image file paths.
        target_size: Forwarded to ``preprocess_image`` for each path.

    Returns:
        ``np.array`` built from the per-image results, in input order.
    """
    # tqdm only wraps the iterable to show a progress bar.
    progress = tqdm(image_paths, desc="Processing images")
    return np.array([preprocess_image(path, target_size) for path in progress])

class MultimodalSentimentModel:
    """VGG19 + multilingual BERT classifier over (image, caption) pairs.

    Holds the pretrained tokenizer and text encoder and builds the fused
    Keras model on demand.
    """

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained multilingual BERT tokenizer and encoder.

        Args:
            num_classes: Size of the softmax output layer.
            max_length: Token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Build and compile the two-branch Keras model.

        Inputs are named ``image_input`` (224x224x3), ``input_ids`` and
        ``attention_mask`` (both of length ``max_length``).

        Returns:
            A compiled ``Model`` with a ``num_classes``-way softmax output,
            using Adam (0.001) and sparse categorical cross-entropy.
        """
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Fine-tune only the top layers: freeze all but the last 4.
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        token_embeddings = self.bert_model([input_ids, attention_mask])[0]
        # Masked mean pooling: captions are padded to max_length, so a plain
        # mean over axis 1 would average in the padding positions as well.
        mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), token_embeddings.dtype)
        summed = tf.reduce_sum(token_embeddings * mask, axis=1)
        # Guard against division by zero for an all-padding row.
        token_counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
        pooled_output = summed / token_counts
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Fuse both modalities and classify
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        # NOTE(review): 0.001 is high for fine-tuning a pretrained transformer;
        # confirm this rate is intended.
        optimizer = Adam(learning_rate=0.001)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions into fixed-length TensorFlow tensors.

        Args:
            texts: Captions as a pandas Series / NumPy array (anything with
                ``tolist()``) or any other iterable of strings.

        Returns:
            ``(input_ids, attention_mask)`` taken from the tokenizer output;
            every row is padded or truncated to ``max_length`` tokens.
        """
        # Accept plain lists/tuples as well as Series and arrays.
        text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        encodings = self.bert_tokenizer(
            text_list,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    """Build the multimodal model and fit it on the training split.

    Args:
        train_images: Preprocessed training images fed to ``image_input``.
        train_texts: Training captions; tokenized with the model's tokenizer.
        train_labels: Integer class labels of the training samples.
        val_images: Preprocessed validation images.
        val_texts: Validation captions.
        val_labels: Integer class labels of the validation samples.
        epochs: Maximum number of epochs (early stopping may end sooner).

    Returns:
        ``(model, history)``: the fitted Keras model and the ``fit`` history.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        # Without this the early-stopping and LR callbacks above never run.
        callbacks=callbacks
    )

    return model, history

def main():
    """Run the whole pipeline: load data, train, predict, write submission.csv.

    Returns:
        ``(model, history)`` as returned by ``train_model``.
    """
    # Read the competition CSVs
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve image locations for both sets, then preprocess them
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_names = train_df['image_name'].tolist()
    train_image_paths = get_image_paths(memes_folder, train_names)
    test_names = test_df['image_name'].tolist()
    test_image_paths = get_image_paths(memes_folder, test_names)

    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Encode sentiment strings as integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded = [label_map[sentiment] for sentiment in train_df['Label_Sentiment']]
    train_labels = np.array(encoded)

    # Stratified 85/15 train/validation split
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created only to tokenize the test captions
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)

    # Map class ids back to their sentiment strings
    reverse_label_map = dict(zip(label_map.values(), label_map.keys()))
    test_df['Label'] = [reverse_label_map[class_id] for class_id in predicted_labels]

    # Write the submission file
    submission = test_df[['Id', 'Label']]
    submission.to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

# Entry point: run the full pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Location of the training CSV (anything ``pd.read_csv`` accepts).
        test_path: Location of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.
    """
    # The training file is read first, then the test file.
    return tuple(pd.read_csv(csv_path) for csv_path in (train_path, test_path))

def get_image_paths(directory, image_names):
    """Build the full path of every image name, preserving input order.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        A list holding ``os.path.join(directory, name)`` for each name, in
        input order (duplicates are kept). Paths are not checked for existence.
    """
    # A single pass is enough: no lookup table or membership filter is needed,
    # and this form also works when image_names is a one-shot iterator.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply the VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to load.
        target_size: Size the image is resized to on load; passed straight to
            ``load_img``.

    Returns:
        The preprocessed image array, or an all-zero float32 array of shape
        ``tuple(target_size) + (3,)`` when loading or preprocessing fails.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"Error processing image {image_path}: {str(e)}")
        # float32 keeps a single placeholder from upcasting the stacked image
        # array to float64; tuple() lets a list target_size work as well.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every image path and stack the results into one array.

    Args:
        image_paths: Iterable of image file paths.
        target_size: Forwarded to ``preprocess_image`` for each path.

    Returns:
        ``np.array`` built from the per-image results, in input order.
    """
    # tqdm only wraps the iterable to show a progress bar.
    progress = tqdm(image_paths, desc="Processing images")
    return np.array([preprocess_image(path, target_size) for path in progress])

class MultimodalSentimentModel:
    """VGG19 + multilingual BERT classifier over (image, caption) pairs.

    Holds the pretrained tokenizer and text encoder and builds the fused
    Keras model on demand.
    """

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained multilingual BERT tokenizer and encoder.

        Args:
            num_classes: Size of the softmax output layer.
            max_length: Token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Build and compile the two-branch Keras model.

        Inputs are named ``image_input`` (224x224x3), ``input_ids`` and
        ``attention_mask`` (both of length ``max_length``).

        Returns:
            A compiled ``Model`` with a ``num_classes``-way softmax output,
            using Adam (2e-5) and sparse categorical cross-entropy.
        """
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Fine-tune only the top layers: freeze all but the last 4.
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        token_embeddings = self.bert_model([input_ids, attention_mask])[0]
        # Masked mean pooling: captions are padded to max_length, so a plain
        # mean over axis 1 would average in the padding positions as well.
        mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), token_embeddings.dtype)
        summed = tf.reduce_sum(token_embeddings * mask, axis=1)
        # Guard against division by zero for an all-padding row.
        token_counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
        pooled_output = summed / token_counts
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Fuse both modalities and classify
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions into fixed-length TensorFlow tensors.

        Args:
            texts: Captions as a pandas Series / NumPy array (anything with
                ``tolist()``) or any other iterable of strings.

        Returns:
            ``(input_ids, attention_mask)`` taken from the tokenizer output;
            every row is padded or truncated to ``max_length`` tokens.
        """
        # Accept plain lists/tuples as well as Series and arrays.
        text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        encodings = self.bert_tokenizer(
            text_list,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=30):
    """Build the multimodal model and fit it on the training split.

    Args:
        train_images: Preprocessed training images fed to ``image_input``.
        train_texts: Training captions; tokenized with the model's tokenizer.
        train_labels: Integer class labels of the training samples.
        val_images: Preprocessed validation images.
        val_texts: Validation captions.
        val_labels: Integer class labels of the validation samples.
        epochs: Maximum number of epochs (early stopping may end sooner).

    Returns:
        ``(model, history)``: the fitted Keras model and the ``fit`` history.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        # Without this the early-stopping and LR callbacks above never run.
        callbacks=callbacks
    )

    return model, history

def main():
    """Run the whole pipeline: load data, train, predict, write submission.csv.

    Returns:
        ``(model, history)`` as returned by ``train_model``.
    """
    # Read the competition CSVs
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve image locations for both sets, then preprocess them
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_names = train_df['image_name'].tolist()
    train_image_paths = get_image_paths(memes_folder, train_names)
    test_names = test_df['image_name'].tolist()
    test_image_paths = get_image_paths(memes_folder, test_names)

    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Encode sentiment strings as integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded = [label_map[sentiment] for sentiment in train_df['Label_Sentiment']]
    train_labels = np.array(encoded)

    # Stratified 85/15 train/validation split
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created only to tokenize the test captions
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)

    # Map class ids back to their sentiment strings
    reverse_label_map = dict(zip(label_map.values(), label_map.keys()))
    test_df['Label'] = [reverse_label_map[class_id] for class_id in predicted_labels]

    # Write the submission file
    submission = test_df[['Id', 'Label']]
    submission.to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

# Entry point: run the full pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames, read in that
        order.
    """
    return tuple(pd.read_csv(csv_file) for csv_file in (train_path, test_path))

def get_image_paths(directory, image_names):
    """Join each image file name onto ``directory``.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        A list with one full path per entry of ``image_names``, in the same
        order (duplicates included). No existence check is performed.
    """
    # The earlier dict + membership filter could never drop an entry (the
    # dict was built from the very same names), so build the list directly.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to read.
        target_size: Two-element size the image is resized to on load.

    Returns:
        The preprocessed image array. If loading or preprocessing fails for
        any reason, the error is printed and an all-zero float32 array of
        shape ``target_size + (3,)`` is returned instead, so every input path
        still yields exactly one output.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Deliberately broad: one unreadable file should not abort the run.
        print(f"Error processing image {image_path}: {str(e)}")
        # float32 matches img_to_array's default output (assuming the Keras
        # floatx setting is unchanged); a float64 placeholder would make
        # np.array() upcast the whole batch. tuple() also accepts a list size.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every path in ``image_paths`` and stack the results.

    A tqdm progress bar labelled "Processing images" tracks the loop.

    Returns:
        ``np.array`` built from the per-image outputs of ``preprocess_image``.
    """
    progress = tqdm(image_paths, desc="Processing images")
    batch = [preprocess_image(item, target_size) for item in progress]
    return np.array(batch)

class MultimodalSentimentModel:
    """Build and feed a VGG19 + multilingual-BERT sentiment classifier.

    The instance owns the BERT tokenizer and encoder. ``build_model`` wires
    them together with a VGG19 image branch into a compiled Keras model, and
    ``prepare_text`` tokenizes captions to the fixed length that model expects.
    """

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained tokenizer and encoder.

        Args:
            num_classes: Size of the softmax output layer.
            max_length: Token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the two-branch classifier.

        Returns:
            A compiled Keras ``Model`` with inputs ``image_input``,
            ``input_ids`` and ``attention_mask`` and a softmax output of
            ``num_classes`` units.
        """
        # Image branch (VGG19 without its classifier head)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Freeze everything except the last four VGG19 layers so that only
        # the top of the backbone is fine-tuned.
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        # Element 0 of the encoder output is the per-token hidden states.
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]

        # Masked mean pooling: captions are padded to max_length, so a plain
        # mean over axis 1 would average in the padding positions. Weight
        # each token by its attention mask and divide by the real token count.
        token_mask = tf.expand_dims(tf.cast(attention_mask, bert_outputs.dtype), axis=-1)
        summed = tf.reduce_sum(tf.multiply(bert_outputs, token_mask), axis=1)
        token_counts = tf.reduce_sum(token_mask, axis=1)
        # divide_no_nan yields 0 for an all-padding row instead of NaN.
        pooled_output = tf.math.divide_no_nan(summed, token_counts)

        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Fusion head on the concatenated image and text features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        # Sparse loss: labels are integer class ids, not one-hot vectors.
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions with the BERT tokenizer.

        Args:
            texts: A pandas Series, NumPy array, list or other iterable of
                captions; a single string is treated as one caption. Missing
                values (``None``/NaN) are tokenized as the empty string.

        Returns:
            ``(input_ids, attention_mask)`` taken from the tokenizer output,
            padded/truncated to ``max_length``.
        """
        if isinstance(texts, str):
            texts = [texts]
        # Missing captions would otherwise reach the tokenizer as None/NaN.
        cleaned = ["" if pd.isna(text) else str(text) for text in texts]
        encodings = self.bert_tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=10):
    """Build the multimodal model, fit it, and return it with its history.

    Args:
        train_images, val_images: Image arrays fed to ``image_input``.
        train_texts, val_texts: Captions, tokenized via ``prepare_text``.
        train_labels, val_labels: Integer class labels.
        epochs: Maximum number of epochs passed to ``fit``.

    Returns:
        ``(model, history)`` where ``history`` is the value returned by
        ``model.fit``.
    """
    handler = MultimodalSentimentModel()
    network = handler.build_model()

    def as_inputs(images, texts):
        # Tokenize the captions and bundle everything under the input names
        # the model was built with.
        ids, mask = handler.prepare_text(texts)
        return {'image_input': images, 'input_ids': ids, 'attention_mask': mask}

    fit_inputs = as_inputs(train_images, train_texts)
    val_inputs = as_inputs(val_images, val_texts)

    # Stop once val_accuracy stalls (keeping the best weights) and shrink the
    # learning rate when val_loss plateaus.
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=3, restore_best_weights=True
    )
    shrink_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6
    )

    fit_history = network.fit(
        fit_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=8,
        callbacks=[stop_early, shrink_lr]
    )

    return network, fit_history

def main():
    """Run the end-to-end pipeline: load data, train, predict, save.

    Reads the competition CSVs and meme images, trains on an 85/15 stratified
    split, predicts the test set and writes ``submission.csv``.

    Returns:
        The trained model and the history returned by ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve image locations for both splits, then load the pixels.
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_names = train_df['image_name'].tolist()
    test_names = test_df['image_name'].tolist()
    train_image_paths = get_image_paths(memes_folder, train_names)
    test_image_paths = get_image_paths(memes_folder, test_names)
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Sentiment names -> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[name] for name in train_df['Label_Sentiment']])

    # Stratified hold-out split for validation.
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created only to tokenize the test captions.
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    class_ids = np.argmax(model.predict(test_inputs), axis=1)

    # Integer class ids -> sentiment names for the submission file.
    id_to_name = {class_id: name for name, class_id in label_map.items()}
    test_df['Label'] = [id_to_name[class_id] for class_id in class_ids]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

# Entry point: run the pipeline only when this code is executed directly.
if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames, read in that
        order.
    """
    return tuple(pd.read_csv(csv_file) for csv_file in (train_path, test_path))

def get_image_paths(directory, image_names):
    """Join each image file name onto ``directory``.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        A list with one full path per entry of ``image_names``, in the same
        order (duplicates included). No existence check is performed.
    """
    # The earlier dict + membership filter could never drop an entry (the
    # dict was built from the very same names), so build the list directly.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to read.
        target_size: Two-element size the image is resized to on load.

    Returns:
        The preprocessed image array. If loading or preprocessing fails for
        any reason, the error is printed and an all-zero float32 array of
        shape ``target_size + (3,)`` is returned instead, so every input path
        still yields exactly one output.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Deliberately broad: one unreadable file should not abort the run.
        print(f"Error processing image {image_path}: {str(e)}")
        # float32 matches img_to_array's default output (assuming the Keras
        # floatx setting is unchanged); a float64 placeholder would make
        # np.array() upcast the whole batch. tuple() also accepts a list size.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every path in ``image_paths`` and stack the results.

    A tqdm progress bar labelled "Processing images" tracks the loop.

    Returns:
        ``np.array`` built from the per-image outputs of ``preprocess_image``.
    """
    progress = tqdm(image_paths, desc="Processing images")
    batch = [preprocess_image(item, target_size) for item in progress]
    return np.array(batch)

class MultimodalSentimentModel:
    """Build and feed a VGG19 + multilingual-BERT sentiment classifier.

    The instance owns the BERT tokenizer and encoder. ``build_model`` wires
    them together with a VGG19 image branch into a compiled Keras model, and
    ``prepare_text`` tokenizes captions to the fixed length that model expects.
    """

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained tokenizer and encoder.

        Args:
            num_classes: Size of the softmax output layer.
            max_length: Token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the two-branch classifier.

        Returns:
            A compiled Keras ``Model`` with inputs ``image_input``,
            ``input_ids`` and ``attention_mask`` and a softmax output of
            ``num_classes`` units.
        """
        # Image branch (VGG19 without its classifier head)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Freeze everything except the last four VGG19 layers so that only
        # the top of the backbone is fine-tuned.
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        # Element 0 of the encoder output is the per-token hidden states.
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]

        # Masked mean pooling: captions are padded to max_length, so a plain
        # mean over axis 1 would average in the padding positions. Weight
        # each token by its attention mask and divide by the real token count.
        token_mask = tf.expand_dims(tf.cast(attention_mask, bert_outputs.dtype), axis=-1)
        summed = tf.reduce_sum(tf.multiply(bert_outputs, token_mask), axis=1)
        token_counts = tf.reduce_sum(token_mask, axis=1)
        # divide_no_nan yields 0 for an all-padding row instead of NaN.
        pooled_output = tf.math.divide_no_nan(summed, token_counts)

        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Fusion head on the concatenated image and text features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        # Sparse loss: labels are integer class ids, not one-hot vectors.
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions with the BERT tokenizer.

        Args:
            texts: A pandas Series, NumPy array, list or other iterable of
                captions; a single string is treated as one caption. Missing
                values (``None``/NaN) are tokenized as the empty string.

        Returns:
            ``(input_ids, attention_mask)`` taken from the tokenizer output,
            padded/truncated to ``max_length``.
        """
        if isinstance(texts, str):
            texts = [texts]
        # Missing captions would otherwise reach the tokenizer as None/NaN.
        cleaned = ["" if pd.isna(text) else str(text) for text in texts]
        encodings = self.bert_tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=30):
    """Build the multimodal model, fit it, and return it with its history.

    Args:
        train_images, val_images: Image arrays fed to ``image_input``.
        train_texts, val_texts: Captions, tokenized via ``prepare_text``.
        train_labels, val_labels: Integer class labels.
        epochs: Maximum number of epochs passed to ``fit``.

    Returns:
        ``(model, history)`` where ``history`` is the value returned by
        ``model.fit``.
    """
    handler = MultimodalSentimentModel()
    network = handler.build_model()

    def as_inputs(images, texts):
        # Tokenize the captions and bundle everything under the input names
        # the model was built with.
        ids, mask = handler.prepare_text(texts)
        return {'image_input': images, 'input_ids': ids, 'attention_mask': mask}

    fit_inputs = as_inputs(train_images, train_texts)
    val_inputs = as_inputs(val_images, val_texts)

    # Stop once val_accuracy stalls (keeping the best weights) and shrink the
    # learning rate when val_loss plateaus.
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=3, restore_best_weights=True
    )
    shrink_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6
    )

    fit_history = network.fit(
        fit_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=8,
        callbacks=[stop_early, shrink_lr]
    )

    return network, fit_history

def main():
    """Run the end-to-end pipeline: load data, train, predict, save.

    Reads the competition CSVs and meme images, trains on an 85/15 stratified
    split, predicts the test set and writes ``submission.csv``.

    Returns:
        The trained model and the history returned by ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve image locations for both splits, then load the pixels.
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_names = train_df['image_name'].tolist()
    test_names = test_df['image_name'].tolist()
    train_image_paths = get_image_paths(memes_folder, train_names)
    test_image_paths = get_image_paths(memes_folder, test_names)
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Sentiment names -> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[name] for name in train_df['Label_Sentiment']])

    # Stratified hold-out split for validation.
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created only to tokenize the test captions.
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    class_ids = np.argmax(model.predict(test_inputs), axis=1)

    # Integer class ids -> sentiment names for the submission file.
    id_to_name = {class_id: name for name, class_id in label_map.items()}
    test_df['Label'] = [id_to_name[class_id] for class_id in class_ids]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

# Entry point: run the pipeline only when this code is executed directly.
if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames, read in that
        order.
    """
    return tuple(pd.read_csv(csv_file) for csv_file in (train_path, test_path))

def get_image_paths(directory, image_names):
    """Join each image file name onto ``directory``.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        A list with one full path per entry of ``image_names``, in the same
        order (duplicates included). No existence check is performed.
    """
    # The earlier dict + membership filter could never drop an entry (the
    # dict was built from the very same names), so build the list directly.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to read.
        target_size: Two-element size the image is resized to on load.

    Returns:
        The preprocessed image array. If loading or preprocessing fails for
        any reason, the error is printed and an all-zero float32 array of
        shape ``target_size + (3,)`` is returned instead, so every input path
        still yields exactly one output.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Deliberately broad: one unreadable file should not abort the run.
        print(f"Error processing image {image_path}: {str(e)}")
        # float32 matches img_to_array's default output (assuming the Keras
        # floatx setting is unchanged); a float64 placeholder would make
        # np.array() upcast the whole batch. tuple() also accepts a list size.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every path in ``image_paths`` and stack the results.

    A tqdm progress bar labelled "Processing images" tracks the loop.

    Returns:
        ``np.array`` built from the per-image outputs of ``preprocess_image``.
    """
    progress = tqdm(image_paths, desc="Processing images")
    batch = [preprocess_image(item, target_size) for item in progress]
    return np.array(batch)

class MultimodalSentimentModel:
    """Build and feed a VGG19 + multilingual-BERT sentiment classifier.

    The instance owns the BERT tokenizer and encoder. ``build_model`` wires
    them together with a VGG19 image branch into a compiled Keras model, and
    ``prepare_text`` tokenizes captions to the fixed length that model expects.
    """

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained tokenizer and encoder.

        Args:
            num_classes: Size of the softmax output layer.
            max_length: Token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the two-branch classifier.

        Returns:
            A compiled Keras ``Model`` with inputs ``image_input``,
            ``input_ids`` and ``attention_mask`` and a softmax output of
            ``num_classes`` units.
        """
        # Image branch (VGG19 without its classifier head)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Freeze everything except the last four VGG19 layers so that only
        # the top of the backbone is fine-tuned.
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(512, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        # Element 0 of the encoder output is the per-token hidden states.
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]

        # Masked mean pooling: captions are padded to max_length, so a plain
        # mean over axis 1 would average in the padding positions. Weight
        # each token by its attention mask and divide by the real token count.
        token_mask = tf.expand_dims(tf.cast(attention_mask, bert_outputs.dtype), axis=-1)
        summed = tf.reduce_sum(tf.multiply(bert_outputs, token_mask), axis=1)
        token_counts = tf.reduce_sum(token_mask, axis=1)
        # divide_no_nan yields 0 for an all-padding row instead of NaN.
        pooled_output = tf.math.divide_no_nan(summed, token_counts)

        x = Dense(512, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Fusion head on the concatenated image and text features
        combined = Concatenate()([image_features, text_features])
        x = Dense(512, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        # Sparse loss: labels are integer class ids, not one-hot vectors.
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions with the BERT tokenizer.

        Args:
            texts: A pandas Series, NumPy array, list or other iterable of
                captions; a single string is treated as one caption. Missing
                values (``None``/NaN) are tokenized as the empty string.

        Returns:
            ``(input_ids, attention_mask)`` taken from the tokenizer output,
            padded/truncated to ``max_length``.
        """
        if isinstance(texts, str):
            texts = [texts]
        # Missing captions would otherwise reach the tokenizer as None/NaN.
        cleaned = ["" if pd.isna(text) else str(text) for text in texts]
        encodings = self.bert_tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20, use_callbacks=False):
    """Build the multimodal model and fit it on the training split.

    Args:
        train_images, val_images: Image arrays fed to ``image_input``.
        train_texts, val_texts: Captions, tokenized via ``prepare_text``.
        train_labels, val_labels: Integer class labels.
        epochs: Number of epochs passed to ``fit``.
        use_callbacks: When True, attach EarlyStopping (monitoring
            ``val_accuracy``) and ReduceLROnPlateau (monitoring ``val_loss``).
            The default False keeps the earlier behaviour, where the callback
            list was built but never handed to ``fit``, so training always
            runs for the full ``epochs``.

    Returns:
        ``(model, history)`` where ``history`` is the value returned by
        ``model.fit``.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks are only constructed when requested; previously this list
    # was dead code because it was never passed to fit().
    callbacks = None
    if use_callbacks:
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_accuracy',
                patience=3,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=2,
                min_lr=1e-6
            )
        ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        callbacks=callbacks
    )

    return model, history

def main():
    """Run the end-to-end pipeline: load data, train, predict, save.

    Reads the competition CSVs and meme images, trains on an 85/15 stratified
    split, predicts the test set and writes ``submission.csv``.

    Returns:
        The trained model and the history returned by ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve image locations for both splits, then load the pixels.
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_names = train_df['image_name'].tolist()
    test_names = test_df['image_name'].tolist()
    train_image_paths = get_image_paths(memes_folder, train_names)
    test_image_paths = get_image_paths(memes_folder, test_names)
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Sentiment names -> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[name] for name in train_df['Label_Sentiment']])

    # Stratified hold-out split for validation.
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created only to tokenize the test captions.
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    class_ids = np.argmax(model.predict(test_inputs), axis=1)

    # Integer class ids -> sentiment names for the submission file.
    id_to_name = {class_id: name for name, class_id in label_map.items()}
    test_df['Label'] = [id_to_name[class_id] for class_id in class_ids]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

# Entry point: run the pipeline only when this code is executed directly.
if __name__ == "__main__":
    main()


Smaller layers (fewer units per Dense layer)

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames, read in that
        order.
    """
    return tuple(pd.read_csv(csv_file) for csv_file in (train_path, test_path))

def get_image_paths(directory, image_names):
    """Join each image file name onto ``directory``.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        A list with one full path per entry of ``image_names``, in the same
        order (duplicates included). No existence check is performed.
    """
    # The earlier dict + membership filter could never drop an entry (the
    # dict was built from the very same names), so build the list directly.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to read.
        target_size: Two-element size the image is resized to on load.

    Returns:
        The preprocessed image array. If loading or preprocessing fails for
        any reason, the error is printed and an all-zero float32 array of
        shape ``target_size + (3,)`` is returned instead, so every input path
        still yields exactly one output.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Deliberately broad: one unreadable file should not abort the run.
        print(f"Error processing image {image_path}: {str(e)}")
        # float32 matches img_to_array's default output (assuming the Keras
        # floatx setting is unchanged); a float64 placeholder would make
        # np.array() upcast the whole batch. tuple() also accepts a list size.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every path in ``image_paths`` and stack the results.

    A tqdm progress bar labelled "Processing images" tracks the loop.

    Returns:
        ``np.array`` built from the per-image outputs of ``preprocess_image``.
    """
    progress = tqdm(image_paths, desc="Processing images")
    batch = [preprocess_image(item, target_size) for item in progress]
    return np.array(batch)

class MultimodalSentimentModel:
    """Build and feed a VGG19 + multilingual-BERT sentiment classifier.

    The instance owns the BERT tokenizer and encoder. ``build_model`` wires
    them together with a VGG19 image branch into a compiled Keras model, and
    ``prepare_text`` tokenizes captions to the fixed length that model expects.
    """

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained tokenizer and encoder.

        Args:
            num_classes: Size of the softmax output layer.
            max_length: Token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the two-branch classifier.

        Returns:
            A compiled Keras ``Model`` with inputs ``image_input``,
            ``input_ids`` and ``attention_mask`` and a softmax output of
            ``num_classes`` units.
        """
        # Image branch (VGG19 without its classifier head)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Freeze everything except the last four VGG19 layers so that only
        # the top of the backbone is fine-tuned.
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        # Element 0 of the encoder output is the per-token hidden states.
        bert_outputs = self.bert_model([input_ids, attention_mask])[0]

        # Masked mean pooling: captions are padded to max_length, so a plain
        # mean over axis 1 would average in the padding positions. Weight
        # each token by its attention mask and divide by the real token count.
        token_mask = tf.expand_dims(tf.cast(attention_mask, bert_outputs.dtype), axis=-1)
        summed = tf.reduce_sum(tf.multiply(bert_outputs, token_mask), axis=1)
        token_counts = tf.reduce_sum(token_mask, axis=1)
        # divide_no_nan yields 0 for an all-padding row instead of NaN.
        pooled_output = tf.math.divide_no_nan(summed, token_counts)

        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Fusion head on the concatenated image and text features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        # Sparse loss: labels are integer class ids, not one-hot vectors.
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions with the BERT tokenizer.

        Args:
            texts: A pandas Series, NumPy array, list or other iterable of
                captions; a single string is treated as one caption. Missing
                values (``None``/NaN) are tokenized as the empty string.

        Returns:
            ``(input_ids, attention_mask)`` taken from the tokenizer output,
            padded/truncated to ``max_length``.
        """
        if isinstance(texts, str):
            texts = [texts]
        # Missing captions would otherwise reach the tokenizer as None/NaN.
        cleaned = ["" if pd.isna(text) else str(text) for text in texts]
        encodings = self.bert_tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20, use_callbacks=False):
    """Build the multimodal model and fit it on the training split.

    Args:
        train_images, val_images: Image arrays fed to ``image_input``.
        train_texts, val_texts: Captions, tokenized via ``prepare_text``.
        train_labels, val_labels: Integer class labels.
        epochs: Number of epochs passed to ``fit``.
        use_callbacks: When True, attach EarlyStopping (monitoring
            ``val_accuracy``) and ReduceLROnPlateau (monitoring ``val_loss``).
            The default False keeps the earlier behaviour, where the callback
            list was built but never handed to ``fit``, so training always
            runs for the full ``epochs``.

    Returns:
        ``(model, history)`` where ``history`` is the value returned by
        ``model.fit``.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks are only constructed when requested; previously this list
    # was dead code because it was never passed to fit().
    callbacks = None
    if use_callbacks:
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_accuracy',
                patience=3,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=2,
                min_lr=1e-6
            )
        ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        callbacks=callbacks
    )

    return model, history

def main():
    """Run the end-to-end pipeline: load data, train, predict, save.

    Reads the competition CSVs and meme images, trains on an 85/15 stratified
    split, predicts the test set and writes ``submission.csv``.

    Returns:
        The trained model and the history returned by ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve image locations for both splits, then load the pixels.
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_names = train_df['image_name'].tolist()
    test_names = test_df['image_name'].tolist()
    train_image_paths = get_image_paths(memes_folder, train_names)
    test_image_paths = get_image_paths(memes_folder, test_names)
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Sentiment names -> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[name] for name in train_df['Label_Sentiment']])

    # Stratified hold-out split for validation.
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created only to tokenize the test captions.
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])
    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    class_ids = np.argmax(model.predict(test_inputs), axis=1)

    # Integer class ids -> sentiment names for the submission file.
    id_to_name = {class_id: name for name, class_id in label_map.items()}
    test_df['Label'] = [id_to_name[class_id] for class_id in class_ids]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

# Entry point: run the pipeline only when this code is executed directly.
if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames, read in that
        order.
    """
    return tuple(pd.read_csv(csv_file) for csv_file in (train_path, test_path))

def get_image_paths(directory, image_names):
    """Join each image file name onto ``directory``.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        A list with one full path per entry of ``image_names``, in the same
        order (duplicates included). No existence check is performed.
    """
    # The earlier dict + membership filter could never drop an entry (the
    # dict was built from the very same names), so build the list directly.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to read.
        target_size: Two-element size the image is resized to on load.

    Returns:
        The preprocessed image array. If loading or preprocessing fails for
        any reason, the error is printed and an all-zero float32 array of
        shape ``target_size + (3,)`` is returned instead, so every input path
        still yields exactly one output.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Deliberately broad: one unreadable file should not abort the run.
        print(f"Error processing image {image_path}: {str(e)}")
        # float32 matches img_to_array's default output (assuming the Keras
        # floatx setting is unchanged); a float64 placeholder would make
        # np.array() upcast the whole batch. tuple() also accepts a list size.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every path in ``image_paths`` and stack the results.

    A tqdm progress bar labelled "Processing images" tracks the loop.

    Returns:
        ``np.array`` built from the per-image outputs of ``preprocess_image``.
    """
    progress = tqdm(image_paths, desc="Processing images")
    batch = [preprocess_image(item, target_size) for item in progress]
    return np.array(batch)

class MultimodalSentimentModel:
    """Image + caption sentiment classifier combining VGG19 and multilingual BERT."""

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained tokenizer/encoder and store the model settings.

        Args:
            num_classes: Width of the final softmax layer.
            max_length: Token length every text is padded or truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the two-branch Keras model.

        Returns:
            A compiled ``Model`` with inputs ``image_input``, ``input_ids`` and
            ``attention_mask`` and a softmax output over ``num_classes``.
        """
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Fine-tune only the top layers: everything but the last four is frozen
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        token_embeddings = self.bert_model([input_ids, attention_mask])[0]
        # Mean-pool over real tokens only. Every caption is padded to
        # max_length, so a plain mean over the sequence axis would average in
        # the padding positions and dilute short captions.
        token_mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), token_embeddings.dtype)
        summed = tf.reduce_sum(token_embeddings * token_mask, axis=1)
        token_counts = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # avoid 0-division
        pooled_output = summed / token_counts
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        # Integer class ids are expected as targets (sparse loss).
        model.compile(
            optimizer=Adam(learning_rate=2e-5),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer.

        Args:
            texts: A pandas Series / NumPy array (anything with ``tolist``) or
                any other iterable of strings.

        Returns:
            The ``input_ids`` and ``attention_mask`` entries of the tokenizer
            output, padded/truncated to ``max_length``.
        """
        text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        encodings = self.bert_tokenizer(
            text_list,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    """Build the multimodal model and fit it on the training split.

    Args:
        train_images: Preprocessed training images.
        train_texts: Training captions, tokenized via ``prepare_text``.
        train_labels: Integer class ids for the training samples.
        val_images: Preprocessed validation images.
        val_texts: Validation captions.
        val_labels: Integer class ids for the validation samples.
        epochs: Maximum number of epochs (early stopping may end sooner).

    Returns:
        ``(model, history)``: the fitted Keras model and the object returned
        by ``model.fit``.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        )
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=16,
        # The callbacks must be handed to fit(); otherwise early stopping and
        # the learning-rate reduction configured above never run.
        callbacks=callbacks
    )

    return model, history

def main():
    """Run the whole pipeline and write ``submission.csv``.

    Reads the train/test CSVs, preprocesses the meme images, holds out a
    stratified 15% validation split, trains the model, predicts on the test
    set and saves the ``Id``/``Label`` columns.

    Returns:
        ``(model, history)`` as returned by ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve the image files referenced by each CSV, then preprocess them
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Encode the sentiment strings as integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[sentiment] for sentiment in train_df['Label_Sentiment']])

    # Stratified hold-out split of images, captions and labels
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created to tokenize the test captions
    tokenizer_owner = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = tokenizer_owner.prepare_text(test_df['Captions'])
    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    class_ids = np.argmax(model.predict(test_inputs), axis=1)

    # Map the predicted class ids back to their sentiment strings
    id_to_label = {class_id: name for name, class_id in label_map.items()}
    test_df['Label'] = [id_to_label[class_id] for class_id in class_ids]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

if __name__ == "__main__":
    main()


Edited

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm


def load_data(train_path, test_path):
    """Load both dataset splits from CSV.

    Args:
        train_path: Source of the training CSV, passed to ``pd.read_csv``.
        test_path: Source of the test CSV, passed to ``pd.read_csv``.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    train_df, test_df = map(pd.read_csv, (train_path, test_path))
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Join every image name onto ``directory``.

    Args:
        directory: Folder that holds the image files.
        image_names: Iterable of image file names.

    Returns:
        A list with one ``os.path.join(directory, name)`` entry per element of
        ``image_names``, in the same order and with duplicates kept. The files
        are not checked for existence.
    """
    # A single pass is enough: no name is ever filtered out, and iterating
    # only once also works when ``image_names`` is a one-shot iterator.
    return [os.path.join(directory, name) for name in image_names]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply the VGG19 input preprocessing.

    Args:
        image_path: Path of the image file.
        target_size: Two-element size passed to ``load_img`` as ``target_size``;
            a tuple or a list is accepted.

    Returns:
        The preprocessed image array. If loading or preprocessing raises, the
        error is printed and an all-zero float32 array of shape
        ``(*target_size, 3)`` is returned so batch processing can continue.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the run.
        print(f"Error processing image {image_path}: {e}")
        # np.zeros defaults to float64; float32 (Keras' default float type)
        # keeps a single failed image from upcasting the whole stacked batch.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)


def process_images(image_paths, target_size=(224, 224)):
    """Run ``preprocess_image`` over all paths, showing a progress bar.

    Args:
        image_paths: Iterable of image file paths.
        target_size: Size forwarded to ``preprocess_image`` for every image.

    Returns:
        ``np.array`` built from the per-image results, in input order.
    """
    processed = [
        preprocess_image(single_path, target_size)
        for single_path in tqdm(image_paths, desc="Processing images")
    ]
    return np.array(processed)


class MultimodalSentimentModel:
    """Image + caption sentiment classifier combining VGG19 and multilingual BERT."""

    def __init__(self, num_classes=3, max_length=128):
        """Load the pretrained tokenizer/encoder and store the model settings.

        Args:
            num_classes: Width of the final softmax layer.
            max_length: Token length every text is padded or truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the two-branch Keras model.

        Returns:
            A compiled ``Model`` with inputs ``image_input``, ``input_ids`` and
            ``attention_mask`` and a softmax output over ``num_classes``.
        """
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Fine-tune only the top layers: everything but the last four is frozen
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        token_embeddings = self.bert_model([input_ids, attention_mask])[0]
        # Mean-pool over real tokens only. Every caption is padded to
        # max_length, so a plain mean over the sequence axis would average in
        # the padding positions and dilute short captions.
        token_mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), token_embeddings.dtype)
        summed = tf.reduce_sum(token_embeddings * token_mask, axis=1)
        token_counts = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # avoid 0-division
        pooled_output = summed / token_counts
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        # Integer class ids are expected as targets (sparse loss).
        model.compile(
            optimizer=Adam(learning_rate=2e-5),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize texts using BERT tokenizer.

        Args:
            texts: A pandas Series / NumPy array (anything with ``tolist``) or
                any other iterable of strings.

        Returns:
            The ``input_ids`` and ``attention_mask`` entries of the tokenizer
            output, padded/truncated to ``max_length``.
        """
        text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        encodings = self.bert_tokenizer(
            text_list,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, class_weights, epochs=20):
    """Build the multimodal model and fit it with per-class loss weights.

    Args:
        train_images: Preprocessed training images.
        train_texts: Training captions, tokenized via ``prepare_text``.
        train_labels: Integer class ids for the training samples.
        val_images: Preprocessed validation images.
        val_texts: Validation captions.
        val_labels: Integer class ids for the validation samples.
        class_weights: Mapping passed to ``model.fit`` as ``class_weight``.
        epochs: Maximum number of epochs (early stopping may end sooner).

    Returns:
        ``(model, history)``: the fitted Keras model and the object returned
        by ``model.fit``.
    """
    handler = MultimodalSentimentModel()
    network = handler.build_model()

    # Tokenize both splits up front (training first, then validation)
    train_ids, train_mask = handler.prepare_text(train_texts)
    val_ids, val_mask = handler.prepare_text(val_texts)

    train_inputs = {
        'image_input': train_images,
        'input_ids': train_ids,
        'attention_mask': train_mask
    }
    val_inputs = {
        'image_input': val_images,
        'input_ids': val_ids,
        'attention_mask': val_mask
    }

    # Stop once validation accuracy stalls; shrink the LR when val loss plateaus
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=3, restore_best_weights=True
    )
    shrink_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6
    )

    fit_history = network.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=16,
        callbacks=[stop_early, shrink_lr]
    )

    return network, fit_history


def main():
    """Run the whole pipeline: train, write ``submission.csv``, report metrics.

    The submission is written before any evaluation so that a problem while
    computing metrics cannot lose the predictions. Metrics are computed on the
    test set when it carries a ``Label_Sentiment`` column, otherwise on the
    held-out validation split.

    Returns:
        ``(model, history)`` as returned by ``train_model``.
    """
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Compute class weights from the training split only, keyed by the actual
    # class id rather than by position in the weight array.
    classes = np.unique(train_labs)
    weights = compute_class_weight('balanced', classes=classes, y=train_labs)
    class_weights_dict = {int(cls): float(weight) for cls, weight in zip(classes, weights)}

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs,
        class_weights_dict
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Save predictions first
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    # Evaluate model
    if 'Label_Sentiment' in test_df.columns:
        true_labels = test_df['Label_Sentiment'].map(label_map)
        print(classification_report(true_labels, predicted_labels))
        print(confusion_matrix(true_labels, predicted_labels))
    else:
        # No ground truth in the test CSV: report on the validation split.
        val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)
        val_predictions = model.predict({
            'image_input': val_imgs,
            'input_ids': val_input_ids,
            'attention_mask': val_attention_mask
        })
        val_predicted = np.argmax(val_predictions, axis=1)
        print("Test labels unavailable; reporting metrics on the validation split")
        print(classification_report(val_labs, val_predicted))
        print(confusion_matrix(val_labs, val_predicted))

    return model, history


if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import math
import tensorflow_addons as tfa

# Load and preprocess data
def load_data(train_path, test_path):
    """Return the training and test tables as DataFrames.

    Args:
        train_path: Source of the training CSV, passed to ``pd.read_csv``.
        test_path: Source of the test CSV, passed to ``pd.read_csv``.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    return pd.read_csv(train_path), pd.read_csv(test_path)


def get_image_paths(directory, image_names):
    """Join every image name onto ``directory``.

    Args:
        directory: Folder that holds the image files.
        image_names: Iterable of image file names.

    Returns:
        A list with one ``os.path.join(directory, name)`` entry per element of
        ``image_names``, in the same order and with duplicates kept. The files
        are not checked for existence.
    """
    # A single pass is enough: no name is ever filtered out, and iterating
    # only once also works when ``image_names`` is a one-shot iterator.
    return [os.path.join(directory, name) for name in image_names]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply the VGG19 input preprocessing.

    Args:
        image_path: Path of the image file.
        target_size: Two-element size passed to ``load_img`` as ``target_size``;
            a tuple or a list is accepted.

    Returns:
        The preprocessed image array. If loading or preprocessing raises, the
        error is printed and an all-zero float32 array of shape
        ``(*target_size, 3)`` is returned so batch processing can continue.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the run.
        print(f"Error processing image {image_path}: {e}")
        # np.zeros defaults to float64; float32 (Keras' default float type)
        # keeps a single failed image from upcasting the whole stacked batch.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)


def process_images(image_paths, target_size=(224, 224)):
    """Preprocess each path in turn and stack the results.

    Args:
        image_paths: Iterable of image file paths.
        target_size: Size forwarded to ``preprocess_image`` for every image.

    Returns:
        ``np.array`` built from the per-image results, in input order.
    """
    progress = tqdm(image_paths, desc="Processing images")
    return np.array([preprocess_image(image_path, target_size) for image_path in progress])


def augment_images(images):
    """Wrap ``images`` in a randomly augmenting batch iterator.

    Args:
        images: Image array handed to ``ImageDataGenerator.flow``.

    Returns:
        The iterator returned by ``flow``, configured for shuffled batches of
        16 with rotation, shift, shear, zoom, horizontal flip and brightness
        jitter.
    """
    augmentation_settings = dict(
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.3,
        zoom_range=0.3,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[0.8, 1.2],
    )
    generator = ImageDataGenerator(**augmentation_settings)
    return generator.flow(images, batch_size=16, shuffle=True)


# Multimodal Model Definition
class MultimodalSentimentModel:
    """Two-branch sentiment classifier: VGG19 for images, XLM-RoBERTa for captions."""

    def __init__(self, num_classes=3, max_length=128):
        """Store the settings and load the pretrained tokenizer and encoder.

        Args:
            num_classes: Width of the final softmax layer.
            max_length: Token length every text is padded or truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        """Assemble the (uncompiled) fusion model.

        Returns:
            A Keras ``Model`` with inputs ``image_input``, ``input_ids`` and
            ``attention_mask`` and a softmax output over ``num_classes``.
            Compilation is left to the caller.
        """
        def projection_head(tensor, drop_rate):
            # Shared pattern: L2-regularized Dense(256) -> Dropout -> BatchNorm
            tensor = Dense(
                256,
                activation='relu',
                kernel_regularizer=tf.keras.regularizers.l2(1e-4)
            )(tensor)
            tensor = Dropout(drop_rate)(tensor)
            return BatchNormalization()(tensor)

        # Image branch: VGG19 backbone with all but its last 8 layers frozen
        image_input = Input(shape=(224, 224, 3), name='image_input')
        backbone = VGG19(weights='imagenet', include_top=False)
        for frozen_layer in backbone.layers[:-8]:
            frozen_layer.trainable = False
        pooled_image = GlobalAveragePooling2D()(backbone(image_input))
        image_features = projection_head(pooled_image, 0.4)

        # Text branch: second output of the XLM-RoBERTa encoder
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        encoder_outputs = self.bert_model([input_ids, attention_mask])
        text_features = projection_head(encoder_outputs[1], 0.4)

        # Fusion head
        fused = Concatenate()([image_features, text_features])
        fused = projection_head(fused, 0.3)
        outputs = Dense(self.num_classes, activation='softmax')(fused)

        return Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

    def prepare_text(self, texts):
        """Tokenize ``texts`` with the XLM-RoBERTa tokenizer.

        Args:
            texts: Object exposing ``tolist()`` (e.g. a pandas Series).

        Returns:
            The ``input_ids`` and ``attention_mask`` entries of the tokenizer
            output, padded/truncated to ``max_length``.
        """
        tokenized = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return tokenized['input_ids'], tokenized['attention_mask']


# Learning rate schedule: linear warm-up followed by exponential decay
def lr_schedule(epoch):
    """Return the learning rate for a zero-based ``epoch`` index.

    Epochs 0-4 ramp linearly from ``initial_lr / 5`` up to ``initial_lr``;
    from epoch 5 on the rate decays as ``initial_lr * exp(-0.1 * (epoch - 5))``.

    Args:
        epoch: Zero-based epoch number.

    Returns:
        The learning rate as a plain Python ``float``.
    """
    initial_lr = 1e-4
    if epoch < 5:  # Warm-up phase
        return float(initial_lr * (epoch + 1) / 5)
    # math.exp keeps the result a float; a tf tensor is not accepted by every
    # Keras version of LearningRateScheduler.
    return initial_lr * math.exp(-0.1 * (epoch - 5))


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    """Build, compile and fit the multimodal model with a focal loss.

    Args:
        train_images: Preprocessed training images.
        train_texts: Training captions, tokenized via ``prepare_text``.
        train_labels: Integer class ids for the training samples.
        val_images: Preprocessed validation images.
        val_texts: Validation captions.
        val_labels: Integer class ids for the validation samples.
        epochs: Maximum number of epochs (early stopping may end sooner).

    Returns:
        ``(model, history)``: the fitted Keras model and the object returned
        by ``model.fit``.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # The focal loss compares targets and predictions element-wise, so the
    # integer class ids are expanded to one-hot rows matching the softmax
    # output. Raw ids would be broadcast as if they were binary targets.
    num_classes = model_handler.num_classes
    train_targets = tf.keras.utils.to_categorical(train_labels, num_classes=num_classes)
    val_targets = tf.keras.utils.to_categorical(val_labels, num_classes=num_classes)

    # Compute class weights for imbalanced datasets (from the integer ids)
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights = dict(enumerate(class_weights))

    # Focal loss for better handling of class imbalance
    loss = tfa.losses.SigmoidFocalCrossEntropy()

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss=loss,
        metrics=['accuracy']
    )

    # Callbacks
    lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            restore_best_weights=True
        ),
        lr_callback
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_targets,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_targets
        ),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=32,
        callbacks=callbacks
    )

    return model, history


# Main Function
def main():
    """Run the whole pipeline and write ``submission.csv``.

    Reads the train/test CSVs, preprocesses the meme images, holds out a
    stratified 15% validation split, trains the model, predicts on the test
    set and saves the ``Id``/``Label`` columns.

    Returns:
        ``(model, history)`` as returned by ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve the image files referenced by each CSV, then preprocess them
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Encode the sentiment strings as integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[sentiment] for sentiment in train_df['Label_Sentiment']])

    # Stratified hold-out split of images, captions and labels
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A fresh handler is created to tokenize the test captions
    tokenizer_owner = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = tokenizer_owner.prepare_text(test_df['Captions'])
    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    class_ids = np.argmax(model.predict(test_inputs), axis=1)

    # Map the predicted class ids back to their sentiment strings
    id_to_label = {class_id: name for name, class_id in label_map.items()}
    test_df['Label'] = [id_to_label[class_id] for class_id in class_ids]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


Final

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import math

# Load and preprocess data
def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Source of the training CSV, passed to ``pd.read_csv``.
        test_path: Source of the test CSV, passed to ``pd.read_csv``.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    return tuple(pd.read_csv(csv_source) for csv_source in (train_path, test_path))


def get_image_paths(directory, image_names):
    """Join every image name onto ``directory``.

    Args:
        directory: Folder that holds the image files.
        image_names: Iterable of image file names.

    Returns:
        A list with one ``os.path.join(directory, name)`` entry per element of
        ``image_names``, in the same order and with duplicates kept. The files
        are not checked for existence.
    """
    # A single pass is enough: no name is ever filtered out, and iterating
    # only once also works when ``image_names`` is a one-shot iterator.
    return [os.path.join(directory, name) for name in image_names]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply the VGG19 input preprocessing.

    Args:
        image_path: Path of the image file.
        target_size: Two-element size passed to ``load_img`` as ``target_size``;
            a tuple or a list is accepted.

    Returns:
        The preprocessed image array. If loading or preprocessing raises, the
        error is printed and an all-zero float32 array of shape
        ``(*target_size, 3)`` is returned so batch processing can continue.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        return tf.keras.applications.vgg19.preprocess_input(img)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the run.
        print(f"Error processing image {image_path}: {e}")
        # np.zeros defaults to float64; float32 (Keras' default float type)
        # keeps a single failed image from upcasting the whole stacked batch.
        return np.zeros(tuple(target_size) + (3,), dtype=np.float32)


def process_images(image_paths, target_size=(224, 224)):
    """Run ``preprocess_image`` over all paths, showing a progress bar.

    Args:
        image_paths: Iterable of image file paths.
        target_size: Size forwarded to ``preprocess_image`` for every image.

    Returns:
        ``np.array`` built from the per-image results, in input order.
    """
    processed = [
        preprocess_image(single_path, target_size)
        for single_path in tqdm(image_paths, desc="Processing images")
    ]
    return np.array(processed)


def augment_images(images):
    """Create an augmenting batch iterator over ``images``.

    Args:
        images: Image array handed to ``ImageDataGenerator.flow``.

    Returns:
        The iterator returned by ``flow``, yielding shuffled batches of 16
        with random rotation, shift, shear, zoom, horizontal flip and
        brightness changes.
    """
    # Shift, shear and zoom all use the same strength
    geometric_strength = 0.3
    generator = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=geometric_strength,
        height_shift_range=geometric_strength,
        shear_range=geometric_strength,
        zoom_range=geometric_strength,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[0.8, 1.2]
    )
    batches = generator.flow(images, batch_size=16, shuffle=True)
    return batches


# Multimodal Model Definition
class MultimodalSentimentModel:
    """Fusion classifier pairing a VGG19 image branch with an XLM-RoBERTa text branch."""

    def __init__(self, num_classes=3, max_length=128):
        """Keep the settings and load the pretrained tokenizer and encoder.

        Args:
            num_classes: Width of the final softmax layer.
            max_length: Token length every text is padded or truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        # Use XLM-Roberta for multilingual capabilities
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.bert_model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')

    def build_model(self):
        """Create the (uncompiled) two-input-branch Keras model.

        Returns:
            A Keras ``Model`` with inputs ``image_input``, ``input_ids`` and
            ``attention_mask`` and a softmax output over ``num_classes``.
            The caller is responsible for compiling it.
        """
        def regularized_dense():
            # Every hidden projection uses the same width and L2 penalty
            return Dense(
                256,
                activation='relu',
                kernel_regularizer=tf.keras.regularizers.l2(1e-4)
            )

        # Image branch: only the last 8 VGG19 layers stay trainable
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg_backbone = VGG19(weights='imagenet', include_top=False)
        for layer in vgg_backbone.layers[:-8]:
            layer.trainable = False

        image_branch = vgg_backbone(image_input)
        image_branch = GlobalAveragePooling2D()(image_branch)
        image_branch = regularized_dense()(image_branch)
        image_branch = Dropout(0.4)(image_branch)
        image_features = BatchNormalization()(image_branch)

        # Text branch: the encoder's second output feeds the projection
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')
        pooled_output = self.bert_model([input_ids, attention_mask])[1]

        text_branch = regularized_dense()(pooled_output)
        text_branch = Dropout(0.4)(text_branch)
        text_features = BatchNormalization()(text_branch)

        # Fusion head
        merged = Concatenate()([image_features, text_features])
        merged = regularized_dense()(merged)
        merged = Dropout(0.3)(merged)
        merged = BatchNormalization()(merged)
        class_probabilities = Dense(self.num_classes, activation='softmax')(merged)

        return Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=class_probabilities
        )

    def prepare_text(self, texts):
        """Tokenize ``texts`` with the XLM-RoBERTa tokenizer.

        Args:
            texts: Object exposing ``tolist()`` (e.g. a pandas Series).

        Returns:
            The ``input_ids`` and ``attention_mask`` entries of the tokenizer
            output, padded/truncated to ``max_length``.
        """
        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf',
        )
        encoded = self.tokenizer(texts.tolist(), **tokenizer_options)
        return encoded['input_ids'], encoded['attention_mask']


# Learning rate schedule: linear warm-up followed by exponential decay
def lr_schedule(epoch):
    """Return the learning rate for a zero-based ``epoch`` index.

    Epochs 0-4 ramp linearly from ``initial_lr / 5`` up to ``initial_lr``;
    from epoch 5 on the rate decays as ``initial_lr * exp(-0.1 * (epoch - 5))``.

    Args:
        epoch: Zero-based epoch number.

    Returns:
        The learning rate as a plain Python ``float``.
    """
    initial_lr = 1e-4
    if epoch < 5:  # Warm-up phase
        return float(initial_lr * (epoch + 1) / 5)
    # math.exp keeps the result a float; a tf tensor is not accepted by every
    # Keras version of LearningRateScheduler.
    return initial_lr * math.exp(-0.1 * (epoch - 5))


# Model Training
def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    """Build, compile and fit the multimodal model on one-hot targets.

    Args:
        train_images: Preprocessed training images.
        train_texts: Training captions, tokenized via ``prepare_text``.
        train_labels: Integer class ids for the training samples.
        val_images: Preprocessed validation images.
        val_texts: Validation captions.
        val_labels: Integer class ids for the validation samples.
        epochs: Maximum number of epochs (early stopping may end sooner).

    Returns:
        ``(model, history)``: the fitted Keras model and the object returned
        by ``model.fit``.
    """
    handler = MultimodalSentimentModel()
    network = handler.build_model()

    # Tokenize both splits (training first, then validation)
    train_ids, train_mask = handler.prepare_text(train_texts)
    val_ids, val_mask = handler.prepare_text(val_texts)

    # categorical_crossentropy needs one-hot rows rather than class ids
    one_hot = tf.keras.utils.to_categorical
    train_targets = one_hot(train_labels, num_classes=3)
    val_targets = one_hot(val_labels, num_classes=3)

    # 'balanced' weights for the classes present in the training labels,
    # keyed by their position in the sorted unique-label array
    balanced = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    per_class_weight = {position: weight for position, weight in enumerate(balanced)}

    network.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    schedule_cb = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=3, restore_best_weights=True
    )

    train_inputs = {
        'image_input': train_images,
        'input_ids': train_ids,
        'attention_mask': train_mask
    }
    val_inputs = {
        'image_input': val_images,
        'input_ids': val_ids,
        'attention_mask': val_mask
    }

    fit_history = network.fit(
        train_inputs,
        train_targets,
        validation_data=(val_inputs, val_targets),
        class_weight=per_class_weight,
        epochs=epochs,
        batch_size=32,
        callbacks=[stop_early, schedule_cb]
    )

    return network, fit_history


# Main Function
def main():
    """Train on the meme dataset and write test predictions to ``submission.csv``.

    Returns:
        ``(model, history)`` as returned by ``train_model``.
    """
    # --- Data loading ---------------------------------------------------
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_names = train_df['image_name'].tolist()
    train_image_paths = get_image_paths(memes_folder, train_names)
    test_names = test_df['image_name'].tolist()
    test_image_paths = get_image_paths(memes_folder, test_names)

    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # --- Labels and stratified 85/15 split ------------------------------
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded = [label_map[raw_label] for raw_label in train_df['Label_Sentiment']]
    train_labels = np.array(encoded)

    (train_imgs, val_imgs,
     train_texts, val_texts,
     train_labs, val_labs) = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )

    # --- Training ---------------------------------------------------------
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # --- Inference on the test set --------------------------------------
    test_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = test_handler.prepare_text(test_df['Captions'])
    probabilities = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_ids = np.argmax(probabilities, axis=1)

    # --- Submission file ------------------------------------------------
    id_to_name = dict(zip(label_map.values(), label_map.keys()))
    test_df['Label'] = [id_to_name[predicted_id] for predicted_id in predicted_ids]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


if __name__ == "__main__":
    main()


In [None]:
pip install tensorflow numpy pandas transformers vit-keras tqdm scikit-learn pillow


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from vit_keras import vit
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load both dataset splits from CSV.

    Args:
        train_path: Source of the training CSV, passed to ``pd.read_csv``.
        test_path: Source of the test CSV, passed to ``pd.read_csv``.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    return pd.read_csv(train_path), pd.read_csv(test_path)

def get_image_paths(directory, image_names):
    """Join every image name onto ``directory``.

    Args:
        directory: Folder that holds the image files.
        image_names: Iterable of image file names.

    Returns:
        A list with one ``os.path.join(directory, name)`` entry per element of
        ``image_names``, in the same order and with duplicates kept. The files
        are not checked for existence.
    """
    # A single pass is enough: no name is ever filtered out, and iterating
    # only once also works when ``image_names`` is a one-shot iterator.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and scale it for the ViT image branch.

    Args:
        image_path: Path of the image file to read.
        target_size: Two-element (height, width) sequence the image is
            resized to.

    Returns:
        The preprocessed image array. If loading or preprocessing fails,
        the error is printed and an all-zero float32 placeholder of shape
        ``(*target_size, 3)`` is returned, so the caller's batch stays
        aligned with its labels.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        # mode="tf" scales pixels to [-1, 1], which is what vit_keras'
        # own preprocess_inputs does. The default "caffe" mode (BGR order
        # plus ImageNet mean subtraction) targets VGG/ResNet-style nets.
        img = tf.keras.applications.imagenet_utils.preprocess_input(img, mode="tf")
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        # Unpacking accepts list or tuple sizes; float32 keeps the stacked
        # batch from being upcast to float64 by a single placeholder.
        return np.zeros((*target_size, 3), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every path in order and stack the results.

    A tqdm progress bar labelled "Processing images" tracks the loop.

    Returns:
        ``np.array`` built from the per-image results of
        ``preprocess_image``.
    """
    progress = tqdm(image_paths, desc="Processing images")
    return np.array([preprocess_image(item, target_size) for item in progress])

class MultimodalSentimentModel:
    """Two-branch classifier: ViT-B/16 for the image, multilingual BERT for the caption."""

    def __init__(self, num_classes=3, max_length=128):
        """Store the sizes and load the pretrained BERT tokenizer and encoder.

        Args:
            num_classes: Width of the softmax output.
            max_length: Fixed token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the fused Keras model.

        Inputs are named ``image_input``, ``input_ids`` and
        ``attention_mask``; the output is a softmax over ``num_classes``.

        Returns:
            The compiled ``Model`` (Adam, lr 2e-5, sparse categorical
            cross-entropy, accuracy metric).
        """
        # Image branch (Vision Transformer)
        vit_model = vit.vit_b16(
            image_size=224,
            pretrained=True,
            include_top=False,
            pretrained_top=False
        )
        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vit_model(image_input)  # Output: (None, 768)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.3)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        token_states = self.bert_model([input_ids, attention_mask])[0]
        # Mask-aware mean pooling: captions are padded to max_length, so a
        # plain mean over axis 1 would average in the padding positions.
        mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), token_states.dtype)
        token_count = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)  # guard all-padding rows
        pooled_output = tf.reduce_sum(token_states * mask, axis=1) / token_count
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.3)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions with the BERT tokenizer.

        Args:
            texts: pandas Series, NumPy array or any iterable of captions.
                Missing values become empty strings and other non-string
                values are converted with ``str`` so the tokenizer never
                receives a float/None entry.

        Returns:
            ``(input_ids, attention_mask)`` from the tokenizer output,
            padded/truncated to ``max_length``.
        """
        raw = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        cleaned = [
            t if isinstance(t, str) else ('' if pd.isna(t) else str(t))
            for t in raw
        ]
        encodings = self.bert_tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    """Build a fresh multimodal model and fit it on the training split.

    Captions are tokenized here; images and labels are used as given.
    Training stops early on stalled ``val_accuracy`` (best weights
    restored) and the learning rate is cut when ``val_loss`` plateaus.

    Returns:
        ``(model, history)``: the fitted Keras model and the ``fit`` history.
    """
    handler = MultimodalSentimentModel()
    net = handler.build_model()

    def as_inputs(images, texts):
        # Tokenize and bundle everything under the model's input names.
        ids, mask = handler.prepare_text(texts)
        return {
            'image_input': images,
            'input_ids': ids,
            'attention_mask': mask
        }

    fit_inputs = as_inputs(train_images, train_texts)
    holdout_inputs = as_inputs(val_images, val_texts)

    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=3, restore_best_weights=True
    )
    shrink_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6
    )

    history = net.fit(
        fit_inputs,
        train_labels,
        validation_data=(holdout_inputs, val_labels),
        epochs=epochs,
        batch_size=16,
        callbacks=[stop_early, shrink_lr]
    )

    return net, history

def main():
    """Run the pipeline end to end: load data, train, predict, write submission.csv.

    Returns:
        ``(model, history)`` as produced by ``train_model``.
    """
    # Inputs on disk
    train_csv = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv'
    test_csv = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_df, test_df = load_data(train_csv, test_csv)

    # Image tensors for both splits
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Sentiment names -> integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array(list(map(label_map.__getitem__, train_df['Label_Sentiment'])))

    # Stratified 85/15 train/validation split
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A second handler is created only to tokenize the test captions.
    tokenizing_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = tokenizing_handler.prepare_text(test_df['Captions'])

    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    predicted_labels = np.argmax(model.predict(test_inputs), axis=1)

    # Class ids back to sentiment names
    id_to_name = dict((idx, name) for name, idx in label_map.items())
    test_df['Label'] = [id_to_name[idx] for idx in predicted_labels]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

# Run the full pipeline only when this cell/file is executed directly.
if __name__ == "__main__":
    main()


In [None]:
pip install tensorflow numpy pandas transformers vit-keras tqdm scikit-learn pillow


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from vit_keras import vit
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

def load_data(train_path, test_path):
    """Read the training CSV and the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames, read in that order.
    """
    train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Join every image file name onto ``directory``.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of file names; order and duplicates are kept.

    Returns:
        List of full paths, one per entry of ``image_names``.
    """
    # A single pass is enough: the previous name->path dict plus the
    # "name in dict" filter was always true, and iterating twice silently
    # returned [] when a one-shot iterator was passed in.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and scale it for the ViT image branch.

    Args:
        image_path: Path of the image file to read.
        target_size: Two-element (height, width) sequence the image is
            resized to.

    Returns:
        The preprocessed image array. If loading or preprocessing fails,
        the error is printed and an all-zero float32 placeholder of shape
        ``(*target_size, 3)`` is returned, so the caller's batch stays
        aligned with its labels.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        # mode="tf" scales pixels to [-1, 1], which is what vit_keras'
        # own preprocess_inputs does. The default "caffe" mode (BGR order
        # plus ImageNet mean subtraction) targets VGG/ResNet-style nets.
        img = tf.keras.applications.imagenet_utils.preprocess_input(img, mode="tf")
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        # Unpacking accepts list or tuple sizes; float32 keeps the stacked
        # batch from being upcast to float64 by a single placeholder.
        return np.zeros((*target_size, 3), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every path in order and stack the results.

    A tqdm progress bar labelled "Processing images" tracks the loop.

    Returns:
        ``np.array`` built from the per-image results of
        ``preprocess_image``.
    """
    progress = tqdm(image_paths, desc="Processing images")
    return np.array([preprocess_image(item, target_size) for item in progress])

class MultimodalSentimentModel:
    """Two-branch classifier (ViT-B/16 + multilingual BERT) with L2-regularized dense heads."""

    def __init__(self, num_classes=3, max_length=128):
        """Store the sizes and load the pretrained BERT tokenizer and encoder.

        Args:
            num_classes: Width of the softmax output.
            max_length: Fixed token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the fused Keras model.

        Inputs are named ``image_input``, ``input_ids`` and
        ``attention_mask``; the output is a softmax over ``num_classes``.
        Every hidden Dense layer carries an L2(0.01) kernel penalty.

        Returns:
            The compiled ``Model`` (Adam, lr 2e-5, sparse categorical
            cross-entropy, accuracy metric).
        """
        # Image branch (Vision Transformer)
        vit_model = vit.vit_b16(
            image_size=224,
            pretrained=True,
            include_top=False,
            pretrained_top=False
        )
        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vit_model(image_input)  # Output: (None, 768)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        token_states = self.bert_model([input_ids, attention_mask])[0]
        # Mask-aware mean pooling: captions are padded to max_length, so a
        # plain mean over axis 1 would average in the padding positions.
        mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), token_states.dtype)
        token_count = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)  # guard all-padding rows
        pooled_output = tf.reduce_sum(token_states * mask, axis=1) / token_count
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(combined)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.3)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions with the BERT tokenizer.

        Args:
            texts: pandas Series, NumPy array or any iterable of captions.
                Missing values become empty strings and other non-string
                values are converted with ``str`` so the tokenizer never
                receives a float/None entry.

        Returns:
            ``(input_ids, attention_mask)`` from the tokenizer output,
            padded/truncated to ``max_length``.
        """
        raw = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        cleaned = [
            t if isinstance(t, str) else ('' if pd.isna(t) else str(t))
            for t in raw
        ]
        encodings = self.bert_tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    """Build a fresh multimodal model and fit it with balanced class weights.

    Args:
        train_images, val_images: Preprocessed image batches.
        train_texts, val_texts: Captions; tokenized here.
        train_labels, val_labels: Integer class ids.
        epochs: Maximum number of epochs.

    Returns:
        ``(model, history)``: the fitted Keras model and the ``fit`` history.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Compute class weights. Key each weight by its actual class id:
    # enumerate() would mislabel the weights whenever the ids present in
    # train_labels are not exactly 0..k-1.
    classes = np.unique(train_labels)
    class_weights = compute_class_weight('balanced', classes=classes, y=train_labels)
    class_weight_dict = {int(c): float(w) for c, w in zip(classes, class_weights)}

    def cap_lr_late(epoch, lr):
        # Cap the rate at 1e-5 after epoch 10 but otherwise keep the
        # optimizer's current value. Returning a fixed constant each epoch
        # would overwrite every reduction made by ReduceLROnPlateau.
        return float(min(lr, 1e-5)) if epoch > 10 else float(lr)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',  # Monitor validation loss
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(cap_lr_late)
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        class_weight=class_weight_dict,
        callbacks=callbacks
    )

    return model, history

def main():
    """Run the pipeline end to end: load data, train, predict, write submission.csv.

    Returns:
        ``(model, history)`` as produced by ``train_model``.
    """
    # Inputs on disk
    train_csv = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv'
    test_csv = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_df, test_df = load_data(train_csv, test_csv)

    # Image tensors for both splits
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Sentiment names -> integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array(list(map(label_map.__getitem__, train_df['Label_Sentiment'])))

    # Stratified 85/15 train/validation split
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A second handler is created only to tokenize the test captions.
    tokenizing_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = tokenizing_handler.prepare_text(test_df['Captions'])

    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    predicted_labels = np.argmax(model.predict(test_inputs), axis=1)

    # Class ids back to sentiment names
    id_to_name = dict((idx, name) for name, idx in label_map.items())
    test_df['Label'] = [id_to_name[idx] for idx in predicted_labels]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

# Run the full pipeline only when this cell/file is executed directly.
if __name__ == "__main__":
    main()



In [None]:
pip install tensorflow pandas numpy tqdm scikit-learn transformers vit-keras


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from transformers import TFBertModel, BertTokenizer
from vit_keras import vit
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

def load_data(train_path, test_path):
    """Read the training CSV and the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames, read in that order.
    """
    train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Join every image file name onto ``directory``.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of file names; order and duplicates are kept.

    Returns:
        List of full paths, one per entry of ``image_names``.
    """
    # A single pass is enough: the previous name->path dict plus the
    # "name in dict" filter was always true, and iterating twice silently
    # returned [] when a one-shot iterator was passed in.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and scale it for the ViT image branch.

    Args:
        image_path: Path of the image file to read.
        target_size: Two-element (height, width) sequence the image is
            resized to.

    Returns:
        The preprocessed image array. If loading or preprocessing fails,
        the error is printed and an all-zero float32 placeholder of shape
        ``(*target_size, 3)`` is returned, so the caller's batch stays
        aligned with its labels.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        # mode="tf" scales pixels to [-1, 1], which is what vit_keras'
        # own preprocess_inputs does. The default "caffe" mode (BGR order
        # plus ImageNet mean subtraction) targets VGG/ResNet-style nets.
        img = tf.keras.applications.imagenet_utils.preprocess_input(img, mode="tf")
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        # Unpacking accepts list or tuple sizes; float32 keeps the stacked
        # batch from being upcast to float64 by a single placeholder.
        return np.zeros((*target_size, 3), dtype=np.float32)

def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every path in order and stack the results.

    A tqdm progress bar labelled "Processing images" tracks the loop.

    Returns:
        ``np.array`` built from the per-image results of
        ``preprocess_image``.
    """
    progress = tqdm(image_paths, desc="Processing images")
    return np.array([preprocess_image(item, target_size) for item in progress])

# Image augmentation
def augment_images(images):
    """Return one randomly transformed copy of every image, in input order.

    The whole array is drawn as a single unshuffled batch, so the result
    stays aligned with the caller's captions and labels. Transformations:
    rotation, shifts, shear, zoom and horizontal flip.
    """
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    batches = datagen.flow(images, batch_size=len(images), shuffle=False)
    # Builtin next() goes through the iterator protocol, which works on
    # Keras versions where the iterator no longer exposes a .next() method.
    return next(batches)

class MultimodalSentimentModel:
    """Two-branch classifier (ViT-B/16 + multilingual BERT) with L2-regularized dense heads."""

    def __init__(self, num_classes=3, max_length=128):
        """Store the sizes and load the pretrained BERT tokenizer and encoder.

        Args:
            num_classes: Width of the softmax output.
            max_length: Fixed token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the fused Keras model.

        Inputs are named ``image_input``, ``input_ids`` and
        ``attention_mask``; the output is a softmax over ``num_classes``.
        Every hidden Dense layer carries an L2(0.01) kernel penalty.

        Returns:
            The compiled ``Model`` (Adam, lr 2e-5, sparse categorical
            cross-entropy, accuracy metric).
        """
        # Image branch (Vision Transformer)
        vit_model = vit.vit_b16(
            image_size=224,
            pretrained=True,
            include_top=False,
            pretrained_top=False
        )
        image_input = Input(shape=(224, 224, 3), name='image_input')
        x = vit_model(image_input)  # Output: (None, 768)
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        token_states = self.bert_model([input_ids, attention_mask])[0]
        # Mask-aware mean pooling: captions are padded to max_length, so a
        # plain mean over axis 1 would average in the padding positions.
        mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), token_states.dtype)
        token_count = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)  # guard all-padding rows
        pooled_output = tf.reduce_sum(token_states * mask, axis=1) / token_count
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(combined)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.3)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions with the BERT tokenizer.

        Args:
            texts: pandas Series, NumPy array or any iterable of captions.
                Missing values become empty strings and other non-string
                values are converted with ``str`` so the tokenizer never
                receives a float/None entry.

        Returns:
            ``(input_ids, attention_mask)`` from the tokenizer output,
            padded/truncated to ``max_length``.
        """
        raw = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        cleaned = [
            t if isinstance(t, str) else ('' if pd.isna(t) else str(t))
            for t in raw
        ]
        encodings = self.bert_tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=20):
    """Build a fresh multimodal model and fit it on augmented images with class weights.

    Args:
        train_images, val_images: Preprocessed image batches.
        train_texts, val_texts: Captions; tokenized here.
        train_labels, val_labels: Integer class ids.
        epochs: Maximum number of epochs.

    Returns:
        ``(model, history)``: the fitted Keras model and the ``fit`` history.
    """
    # Create model instance
    model_handler = MultimodalSentimentModel()
    model = model_handler.build_model()

    # Augment training images.
    # NOTE: this swaps each training image for one randomly transformed
    # copy, once, before training; the same copies are reused every epoch.
    train_images = augment_images(train_images)

    # Prepare text data
    train_input_ids, train_attention_mask = model_handler.prepare_text(train_texts)
    val_input_ids, val_attention_mask = model_handler.prepare_text(val_texts)

    # Compute class weights. Key each weight by its actual class id:
    # enumerate() would mislabel the weights whenever the ids present in
    # train_labels are not exactly 0..k-1.
    classes = np.unique(train_labels)
    class_weights = compute_class_weight('balanced', classes=classes, y=train_labels)
    class_weight_dict = {int(c): float(w) for c, w in zip(classes, class_weights)}

    def cap_lr_late(epoch, lr):
        # Cap the rate at 1e-5 after epoch 10 but otherwise keep the
        # optimizer's current value. Returning a fixed constant each epoch
        # would overwrite every reduction made by ReduceLROnPlateau.
        return float(min(lr, 1e-5)) if epoch > 10 else float(lr)

    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2,
            min_lr=1e-6
        ),
        tf.keras.callbacks.LearningRateScheduler(cap_lr_late)
    ]

    # Train model
    history = model.fit(
        {
            'image_input': train_images,
            'input_ids': train_input_ids,
            'attention_mask': train_attention_mask
        },
        train_labels,
        validation_data=(
            {
                'image_input': val_images,
                'input_ids': val_input_ids,
                'attention_mask': val_attention_mask
            },
            val_labels
        ),
        epochs=epochs,
        batch_size=8,
        class_weight=class_weight_dict,
        callbacks=callbacks
    )

    return model, history

def main():
    """Run the pipeline end to end: load data, train, predict, write submission.csv.

    Returns:
        ``(model, history)`` as produced by ``train_model``.
    """
    # Inputs on disk
    train_csv = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv'
    test_csv = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_df, test_df = load_data(train_csv, test_csv)

    # Image tensors for both splits
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Sentiment names -> integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array(list(map(label_map.__getitem__, train_df['Label_Sentiment'])))

    # Stratified 85/15 train/validation split
    split = train_test_split(
        train_images, train_df['Captions'], train_labels,
        test_size=0.15, random_state=42, stratify=train_labels
    )
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs
    )

    # A second handler is created only to tokenize the test captions.
    tokenizing_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = tokenizing_handler.prepare_text(test_df['Captions'])

    test_inputs = {
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    }
    predicted_labels = np.argmax(model.predict(test_inputs), axis=1)

    # Class ids back to sentiment names
    id_to_name = dict((idx, name) for name, idx in label_map.items())
    test_df['Label'] = [id_to_name[idx] for idx in predicted_labels]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

# Run the full pipeline only when this cell/file is executed directly.
if __name__ == "__main__":
    main()


# VGG19 + BERT with Regularization and 0.4 dropout
Added regularization and high dropout rate to avoid overfitting.
Batch size: 32
Layer: 256

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input, Dropout, Concatenate, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from transformers import TFBertModel, BertTokenizer
import os
from tqdm import tqdm


def load_data(train_path, test_path):
    """Read the training CSV and the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames, read in that order.
    """
    train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
    return train_df, test_df


def get_image_paths(directory, image_names):
    """Join every image file name onto ``directory``.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of file names; order and duplicates are kept.

    Returns:
        List of full paths, one per entry of ``image_names``.
    """
    # A single pass is enough: the previous name->path dict plus the
    # "name in dict" filter was always true, and iterating twice silently
    # returned [] when a one-shot iterator was passed in.
    return [os.path.join(directory, name) for name in image_names]


def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image and apply the VGG19 input preprocessing.

    Args:
        image_path: Path of the image file to read.
        target_size: Two-element (height, width) sequence the image is
            resized to.

    Returns:
        The preprocessed image array. If loading or preprocessing fails,
        the error is printed and an all-zero float32 placeholder of shape
        ``(*target_size, 3)`` is returned, so the caller's batch stays
        aligned with its labels.
    """
    try:
        img = load_img(image_path, target_size=target_size)
        img = img_to_array(img)
        img = tf.keras.applications.vgg19.preprocess_input(img)
        return img
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        # Unpacking accepts list or tuple sizes; float32 keeps the stacked
        # batch from being upcast to float64 by a single placeholder.
        return np.zeros((*target_size, 3), dtype=np.float32)


def process_images(image_paths, target_size=(224, 224)):
    """Preprocess every path in order and stack the results.

    A tqdm progress bar labelled "Processing images" tracks the loop.

    Returns:
        ``np.array`` built from the per-image results of
        ``preprocess_image``.
    """
    progress = tqdm(image_paths, desc="Processing images")
    return np.array([preprocess_image(item, target_size) for item in progress])


class MultimodalSentimentModel:
    """Two-branch classifier: partially fine-tuned VGG19 for the image, multilingual BERT for the caption."""

    def __init__(self, num_classes=3, max_length=128):
        """Store the sizes and load the pretrained BERT tokenizer and encoder.

        Args:
            num_classes: Width of the softmax output.
            max_length: Fixed token length every caption is padded/truncated to.
        """
        self.num_classes = num_classes
        self.max_length = max_length
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

    def build_model(self):
        """Assemble and compile the fused Keras model.

        Inputs are named ``image_input``, ``input_ids`` and
        ``attention_mask``; the output is a softmax over ``num_classes``.

        Returns:
            The compiled ``Model`` (Adam, lr 2e-5, sparse categorical
            cross-entropy, accuracy metric).
        """
        # Image branch (VGG19)
        image_input = Input(shape=(224, 224, 3), name='image_input')
        vgg19 = VGG19(weights='imagenet', include_top=False)

        # Fine-tune only the top layers: everything except the last four
        # VGG19 layers is frozen.
        for layer in vgg19.layers[:-4]:
            layer.trainable = False

        x = vgg19(image_input)
        x = GlobalAveragePooling2D()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.4)(x)
        image_features = LayerNormalization()(x)

        # Text branch (BERT)
        input_ids = Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        token_states = self.bert_model([input_ids, attention_mask])[0]
        # Mask-aware mean pooling: captions are padded to max_length, so a
        # plain mean over axis 1 would average in the padding positions.
        mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), token_states.dtype)
        token_count = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)  # guard all-padding rows
        pooled_output = tf.reduce_sum(token_states * mask, axis=1) / token_count
        x = Dense(256, activation='relu')(pooled_output)
        x = Dropout(0.4)(x)
        text_features = LayerNormalization()(x)

        # Combine features
        combined = Concatenate()([image_features, text_features])
        x = Dense(256, activation='relu')(combined)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[image_input, input_ids, attention_mask],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def prepare_text(self, texts):
        """Tokenize captions with the BERT tokenizer.

        Args:
            texts: pandas Series, NumPy array or any iterable of captions.
                Missing values become empty strings and other non-string
                values are converted with ``str`` so the tokenizer never
                receives a float/None entry.

        Returns:
            ``(input_ids, attention_mask)`` from the tokenizer output,
            padded/truncated to ``max_length``.
        """
        raw = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        cleaned = [
            t if isinstance(t, str) else ('' if pd.isna(t) else str(t))
            for t in raw
        ]
        encodings = self.bert_tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']


def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, class_weights, epochs=20):
    """Build a fresh multimodal model and fit it with the given class weights.

    Captions are tokenized here; images and labels are used as given.
    ``class_weights`` is forwarded unchanged to ``fit``. Training stops
    early on stalled ``val_accuracy`` (best weights restored) and the
    learning rate is cut when ``val_loss`` plateaus.

    Returns:
        ``(model, history)``: the fitted Keras model and the ``fit`` history.
    """
    handler = MultimodalSentimentModel()
    net = handler.build_model()

    def as_inputs(images, texts):
        # Tokenize and bundle everything under the model's input names.
        ids, mask = handler.prepare_text(texts)
        return {
            'image_input': images,
            'input_ids': ids,
            'attention_mask': mask
        }

    fit_inputs = as_inputs(train_images, train_texts)
    holdout_inputs = as_inputs(val_images, val_texts)

    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=3, restore_best_weights=True
    )
    shrink_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6
    )

    history = net.fit(
        fit_inputs,
        train_labels,
        validation_data=(holdout_inputs, val_labels),
        class_weight=class_weights,
        epochs=epochs,
        batch_size=16,
        callbacks=[stop_early, shrink_lr]
    )

    return net, history


def main():
    """Run the VGG19 + BERT pipeline: load data, train, predict, write submission.csv.

    The submission file is written before any evaluation, and the
    classification report / confusion matrix are printed only when the
    test frame actually has a 'Label_Sentiment' column.
    """
    # Load data
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Get image paths
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Process images
    train_images = process_images(train_image_paths)
    test_images = process_images(test_image_paths)

    # Convert labels
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])

    # Compute class weights. Key each weight by its actual class id:
    # enumerate() would mislabel the weights whenever the ids present in
    # train_labels are not exactly 0..k-1.
    classes = np.unique(train_labels)
    class_weights = compute_class_weight('balanced', classes=classes, y=train_labels)
    class_weights_dict = {int(c): float(w) for c, w in zip(classes, class_weights)}

    # Split data
    train_imgs, val_imgs, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_images, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )

    # Train model
    model, history = train_model(
        train_imgs, train_texts, train_labs,
        val_imgs, val_texts, val_labs,
        class_weights_dict
    )

    # Prepare test data
    model_handler = MultimodalSentimentModel()
    test_input_ids, test_attention_mask = model_handler.prepare_text(test_df['Captions'])

    # Make predictions
    predictions = model.predict({
        'image_input': test_images,
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask
    })
    predicted_labels = np.argmax(predictions, axis=1)

    # Save predictions first so that a failure while evaluating can never
    # lose the submission file.
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    # Evaluate model only when ground-truth labels are available; an
    # unlabeled test file would otherwise raise KeyError here.
    if 'Label_Sentiment' in test_df.columns:
        true_labels = test_df['Label_Sentiment'].map(label_map)
        print(classification_report(true_labels, predicted_labels))
        print(confusion_matrix(true_labels, predicted_labels))
    else:
        print("Test data has no 'Label_Sentiment' column; skipping evaluation.")


# Run the full pipeline only when this cell/file is executed directly.
if __name__ == "__main__":
    main()

# VisualBERT multimodal model default settings

In [6]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import TFVisualBertModel, VisualBertConfig, BertTokenizer
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import torch
from torchvision.models import resnet50
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training CSV and the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames, read in that order.
    """
    train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Build the full path of every image by joining its name onto ``directory``.

    Args:
        directory: Folder that holds the images.
        image_names: Sequence of image file names.

    Returns:
        List with one path per entry of ``image_names``, in the same order
        (duplicates kept). The files are not checked for existence.
    """
    # The previous name->path dict and its membership filter were no-ops:
    # every name was by construction a key of the dict built from the same list.
    return [os.path.join(directory, name) for name in image_names]

def preprocess_image(image_path, transform):
    """Load one image as RGB and apply ``transform`` to it.

    Args:
        image_path: Path of the image file to open with PIL.
        transform: Callable applied to the RGB PIL image; its result is
            returned unchanged.

    Returns:
        ``transform(image)`` on success. If anything fails, the error is
        printed and an all-zero tensor of shape (3, 224, 224) is returned so
        that one unreadable file does not abort the whole run.
    """
    try:
        # Image.open is lazy and keeps the file handle open; the context
        # manager closes it as soon as convert() has produced an in-memory copy.
        with Image.open(image_path) as raw:
            rgb = raw.convert('RGB')
        return transform(rgb)
    except Exception as e:  # deliberate best-effort fallback
        print(f"Error processing image {image_path}: {str(e)}")
        return torch.zeros((3, 224, 224))

def process_images(image_paths):
    """Convert image files into feature vectors with a pretrained ResNet-50.

    Each image is loaded through ``preprocess_image``, pushed through the
    network one at a time, and the squeezed output is collected.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        ``np.array`` built from the per-image feature arrays.
    """
    # Resize, convert to tensor and normalise per channel
    # (mean/std are presumably the ImageNet statistics -- TODO confirm).
    pipeline = Compose([
        Resize((224, 224)),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Keep every child module except the last one (the classification layer).
    backbone = resnet50(pretrained=True)
    backbone = torch.nn.Sequential(*list(backbone.children())[:-1])
    backbone.eval()

    # Inference only: no gradients are needed.
    with torch.no_grad():
        extracted = [
            # unsqueeze(0) adds the batch dimension the network expects
            backbone(preprocess_image(path, pipeline).unsqueeze(0)).squeeze().numpy()
            for path in tqdm(image_paths, desc="Processing images")
        ]

    return np.array(extracted)

class VisualBertSentimentModel:
    """BERT tokenizer + VisualBERT encoder topped with a dense softmax head.

    NOTE(review): in the recorded notebook run this cell stopped with
    "ImportError: cannot import name 'TFVisualBertModel' from 'transformers'",
    so the code below was never exercised there.
    """

    def __init__(self, num_classes=3, max_length=128):
        """Store the sizes and load the pretrained tokenizer, config and encoder."""
        self.num_classes = num_classes
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        checkpoint = 'uclanlp/visualbert-vqa'
        self.config = VisualBertConfig.from_pretrained(checkpoint)
        self.visual_bert = TFVisualBertModel.from_pretrained(checkpoint, config=self.config)

    def build_model(self):
        """Assemble and compile the Keras classifier.

        Returns:
            A compiled ``Model`` with inputs ``input_ids``, ``attention_mask``,
            ``visual_features`` and ``visual_attention_mask`` and a softmax
            output over ``num_classes`` classes.
        """
        token_shape = (self.max_length,)
        input_ids = Input(shape=token_shape, dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=token_shape, dtype=tf.int32, name='attention_mask')
        visual_features = Input(shape=(2048,), dtype=tf.float32, name='visual_features')
        visual_attention_mask = Input(shape=(1,), dtype=tf.int32, name='visual_attention_mask')

        # Insert a length-1 axis so each sample carries a single visual token.
        visual_embeds = tf.expand_dims(visual_features, axis=1)

        encoded = self.visual_bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            visual_embeddings=visual_embeds,
            visual_attention_mask=visual_attention_mask
        )

        # Element 1 of the encoder output is used as the pooled representation.
        hidden = encoded[1]

        # Dense head: 512 -> dropout -> layer norm -> 256 -> dropout -> softmax
        hidden = Dense(512, activation='relu')(hidden)
        hidden = Dropout(0.3)(hidden)
        hidden = LayerNormalization()(hidden)
        hidden = Dense(256, activation='relu')(hidden)
        hidden = Dropout(0.2)(hidden)
        probabilities = Dense(self.num_classes, activation='softmax')(hidden)

        model = Model(
            inputs=[input_ids, attention_mask, visual_features, visual_attention_mask],
            outputs=probabilities
        )
        model.compile(
            optimizer=Adam(learning_rate=2e-5),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        return model

    def prepare_text(self, texts):
        """Tokenize ``texts`` to fixed-length TensorFlow tensors.

        Args:
            texts: Object exposing ``tolist()`` (e.g. a pandas Series of strings).

        Returns:
            Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
        """
        encoded = self.tokenizer(
            texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )
        return encoded['input_ids'], encoded['attention_mask']

def train_model(train_images, train_texts, train_labels, val_images, val_texts, val_labels, epochs=30):
    """Build the VisualBERT classifier and fit it on the training split.

    Args:
        train_images / val_images: Per-sample visual feature arrays.
        train_texts / val_texts: Captions, passed to ``prepare_text``.
        train_labels / val_labels: Integer class labels.
        epochs: Upper bound on training epochs (early stopping may end sooner).

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    handler = VisualBertSentimentModel()
    model = handler.build_model()

    def _as_inputs(texts, feats):
        # One dict per split, keyed by the model's input layer names.
        ids, mask = handler.prepare_text(texts)
        return {
            'input_ids': ids,
            'attention_mask': mask,
            'visual_features': tf.cast(feats, tf.float32),
            # a single visual token per sample, always attended to
            'visual_attention_mask': tf.ones((feats.shape[0], 1), dtype=tf.int32)
        }

    train_inputs = _as_inputs(train_texts, train_images)
    val_inputs = _as_inputs(val_texts, val_images)

    # Stop once validation accuracy stalls (keeping the best weights) and
    # shrink the learning rate when validation loss plateaus.
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=3, restore_best_weights=True
    )
    shrink_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6
    )

    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=8,
        callbacks=[stop_early, shrink_lr]
    )

    return model, history

def main():
    """Run the VisualBERT pipeline end to end.

    Loads the CSVs, extracts image features, trains on an 85/15 stratified
    split, predicts the test set and writes ``submission.csv``.

    Returns:
        Tuple ``(model, history)`` from ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve image locations, then turn every image into a feature vector.
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    train_features = process_images(train_image_paths)
    test_features = process_images(test_image_paths)

    # Sentiment names -> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[sentiment] for sentiment in train_df['Label_Sentiment']])

    # Stratified hold-out so the validation set keeps the class balance.
    split = train_test_split(
        train_features, train_df['Captions'],
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    train_feats, val_feats, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_feats, train_texts, train_labs,
        val_feats, val_texts, val_labs
    )

    # A fresh handler is created here only to tokenize the test captions.
    tokenizer_owner = VisualBertSentimentModel()
    test_input_ids, test_attention_mask = tokenizer_owner.prepare_text(test_df['Captions'])
    test_inputs = {
        'input_ids': test_input_ids,
        'attention_mask': test_attention_mask,
        'visual_features': tf.cast(test_features, tf.float32),
        'visual_attention_mask': tf.ones((test_features.shape[0], 1), dtype=tf.int32)
    }

    # Highest-probability class per sample, mapped back to its name.
    predicted_labels = np.argmax(model.predict(test_inputs), axis=1)
    id_to_name = {idx: name for name, idx in label_map.items()}
    test_df['Label'] = [id_to_name[idx] for idx in predicted_labels]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

if __name__ == "__main__":
    main()

ImportError: cannot import name 'TFVisualBertModel' from 'transformers' (/opt/conda/lib/python3.7/site-packages/transformers/__init__.py)

# CLIP model with early stopping

Stopped at: epoch 14\
Max epochs: 30\
Accuracy: 0.7384\
Val. accuracy: 0.7295\
Layers: 512->256\
Batch size: 32

In [7]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Load the train and test CSVs as DataFrames.

    Args:
        train_path: Location of the training CSV.
        test_path: Location of the test CSV.

    Returns:
        Tuple ``(train_df, test_df)``.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    return frames[0], frames[1]

def get_image_paths(directory, image_names):
    """Return ``directory`` joined with each image name.

    Args:
        directory: Folder that holds the images.
        image_names: Sequence of image file names.

    Returns:
        List of paths, one per name and in the same order; existence of the
        files is not checked.
    """
    # A direct comprehension replaces the old dict + always-true
    # ``if img in image_paths`` filter, which never dropped anything.
    return [os.path.join(directory, name) for name in image_names]

class CLIPSentimentModel:
    """Frozen CLIP encoder plus a small dense head for sentiment classification."""

    # Token length of the text inputs. process_batch pads/truncates to this
    # length so every batch matches the Input layers created in build_model.
    MAX_TEXT_LENGTH = 77

    def __init__(self, num_classes=3):
        """Load the pretrained CLIP processor and model and freeze the model.

        Args:
            num_classes: Size of the softmax output layer.
        """
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")

        # Freeze CLIP layers: only the classification head is trained
        self.clip.trainable = False

    def process_batch(self, images, texts):
        """Run a batch of images and texts through the CLIP processor.

        Args:
            images: List of PIL images.
            texts: List of caption strings, aligned with ``images``.

        Returns:
            The processor output (TensorFlow tensors), with the text padded or
            truncated to exactly ``MAX_TEXT_LENGTH`` tokens.
        """
        # padding=True would pad only to the longest caption of *this* batch,
        # so batches could end up with different widths; a fixed length keeps
        # them concatenable and consistent with the model's (77,) text inputs.
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding="max_length",
            max_length=self.MAX_TEXT_LENGTH,
            truncation=True
        )
        return inputs

    def build_model(self):
        """Assemble and compile the Keras classifier on top of CLIP.

        Returns:
            A compiled ``Model`` with inputs ``input_ids``, ``attention_mask``
            and ``pixel_values`` and a softmax output over ``num_classes``.
        """
        # Define inputs
        input_ids = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')

        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )

        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])

        # Classification head
        x = Dense(512, activation='relu')(combined_features)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Preprocess all (image, text) pairs in chunks of ``batch_size``.

    Args:
        image_paths: Sequence of image file paths.
        texts: Sequence of captions aligned with ``image_paths``.
        model_handler: Object whose ``process_batch(images, texts)`` returns a
            mapping with 'input_ids', 'attention_mask' and 'pixel_values'.
        batch_size: Number of samples handed to ``process_batch`` at once.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'pixel_values', each the
        per-batch tensors concatenated along axis 0. Note that the full result
        is held in memory; only the preprocessing itself is batched.
    """
    all_inputs = []

    for start in range(0, len(image_paths), batch_size):
        stop = start + batch_size
        batch_images = []

        # Load images for current batch
        for path in image_paths[start:stop]:
            try:
                # Image.open is lazy and keeps the file handle open; close it
                # once convert() has produced an in-memory RGB copy.
                with Image.open(path) as raw:
                    image = raw.convert('RGB')
                batch_images.append(image)
            except Exception as e:  # deliberate best-effort: keep sample count intact
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))

        # Process batch
        all_inputs.append(model_handler.process_batch(batch_images, texts[start:stop]))

    # Combine all batches
    combined_inputs = {
        key: tf.concat([batch[key] for batch in all_inputs], axis=0)
        for key in ('input_ids', 'attention_mask', 'pixel_values')
    }

    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=30):
    """Build the CLIP classifier and fit it with early stopping.

    Args:
        train_image_paths / val_image_paths: Image file paths per split.
        train_texts / val_texts: Captions aligned with the image paths.
        train_labels / val_labels: Integer class labels.
        epochs: Upper bound on training epochs (early stopping may end sooner).

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    handler = CLIPSentimentModel()
    model = handler.build_model()

    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, handler)

    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, handler)

    # Stop once validation accuracy stalls (keeping the best weights) and
    # shrink the learning rate when validation loss plateaus.
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=3, restore_best_weights=True
    )
    shrink_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6
    )

    fit_options = dict(
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=32,
        callbacks=[stop_early, shrink_lr]
    )
    history = model.fit(train_inputs, train_labels, **fit_options)

    return model, history

def main():
    """Run the CLIP pipeline end to end.

    Loads the CSVs, trains on an 85/15 stratified split, predicts the test
    set and writes ``submission.csv``.

    Returns:
        Tuple ``(model, history)`` from ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Sentiment names -> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[sentiment] for sentiment in train_df['Label_Sentiment']])

    # Stratified hold-out so the validation set keeps the class balance.
    split = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )

    print("Processing test data...")
    # NOTE(review): this builds a second CLIPSentimentModel (reloading CLIP)
    # although only its process_batch method is used below.
    test_handler = CLIPSentimentModel()
    test_captions = test_df['Captions'].tolist()
    test_inputs = process_data_batch(test_image_paths, test_captions, test_handler)

    # Highest-probability class per sample, mapped back to its name.
    predicted_labels = np.argmax(model.predict(test_inputs), axis=1)
    id_to_name = {idx: name for name, idx in label_map.items()}
    test_df['Label'] = [id_to_name[idx] for idx in predicted_labels]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

if __name__ == "__main__":
    main()

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/842k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/577M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Processing test data...


All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Predictions saved to submission.csv


# CLIP without early stopping
Epochs: 30\
Accuracy: 0.7929\
Val. accuracy: 0.7371\
Layers: 512->256\
Batch size: 32

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read both dataset CSVs.

    Args:
        train_path: Location of the training CSV.
        test_path: Location of the test CSV.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.
    """
    train_df, test_df = map(pd.read_csv, (train_path, test_path))
    return train_df, test_df

def get_image_paths(directory, image_names):
    """Prefix every image name with ``directory``.

    Args:
        directory: Folder that holds the images.
        image_names: Sequence of image file names.

    Returns:
        List of joined paths in input order (duplicates kept); the files are
        not checked for existence.
    """
    # The intermediate dict and its membership test were redundant: each name
    # was always present in the dict built from the very same list.
    return [os.path.join(directory, name) for name in image_names]

class CLIPSentimentModel:
    """Frozen CLIP encoder plus a small dense head for sentiment classification."""

    # Token length of the text inputs. process_batch pads/truncates to this
    # length so every batch matches the Input layers created in build_model.
    MAX_TEXT_LENGTH = 77

    def __init__(self, num_classes=3):
        """Load the pretrained CLIP processor and model and freeze the model.

        Args:
            num_classes: Size of the softmax output layer.
        """
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")

        # Freeze CLIP layers: only the classification head is trained
        self.clip.trainable = False

    def process_batch(self, images, texts):
        """Run a batch of images and texts through the CLIP processor.

        Args:
            images: List of PIL images.
            texts: List of caption strings, aligned with ``images``.

        Returns:
            The processor output (TensorFlow tensors), with the text padded or
            truncated to exactly ``MAX_TEXT_LENGTH`` tokens.
        """
        # padding=True would pad only to the longest caption of *this* batch,
        # so batches could end up with different widths; a fixed length keeps
        # them concatenable and consistent with the model's (77,) text inputs.
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding="max_length",
            max_length=self.MAX_TEXT_LENGTH,
            truncation=True
        )
        return inputs

    def build_model(self):
        """Assemble and compile the Keras classifier on top of CLIP.

        Returns:
            A compiled ``Model`` with inputs ``input_ids``, ``attention_mask``
            and ``pixel_values`` and a softmax output over ``num_classes``.
        """
        # Define inputs
        input_ids = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')

        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )

        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])

        # Classification head
        x = Dense(512, activation='relu')(combined_features)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Preprocess all (image, text) pairs in chunks of ``batch_size``.

    Args:
        image_paths: Sequence of image file paths.
        texts: Sequence of captions aligned with ``image_paths``.
        model_handler: Object whose ``process_batch(images, texts)`` returns a
            mapping with 'input_ids', 'attention_mask' and 'pixel_values'.
        batch_size: Number of samples handed to ``process_batch`` at once.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'pixel_values', each the
        per-batch tensors concatenated along axis 0. Note that the full result
        is held in memory; only the preprocessing itself is batched.
    """
    all_inputs = []

    for start in range(0, len(image_paths), batch_size):
        stop = start + batch_size
        batch_images = []

        # Load images for current batch
        for path in image_paths[start:stop]:
            try:
                # Image.open is lazy and keeps the file handle open; close it
                # once convert() has produced an in-memory RGB copy.
                with Image.open(path) as raw:
                    image = raw.convert('RGB')
                batch_images.append(image)
            except Exception as e:  # deliberate best-effort: keep sample count intact
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))

        # Process batch
        all_inputs.append(model_handler.process_batch(batch_images, texts[start:stop]))

    # Combine all batches
    combined_inputs = {
        key: tf.concat([batch[key] for batch in all_inputs], axis=0)
        for key in ('input_ids', 'attention_mask', 'pixel_values')
    }

    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=30):
    """Build the CLIP classifier and fit it for a fixed number of epochs.

    This variant trains without early stopping or learning-rate scheduling:
    no callbacks are passed to ``fit``, so all ``epochs`` are always run.

    Args:
        train_image_paths / val_image_paths: Image file paths per split.
        train_texts / val_texts: Captions aligned with the image paths.
        train_labels / val_labels: Integer class labels.
        epochs: Number of training epochs.

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()

    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)

    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)

    # The EarlyStopping / ReduceLROnPlateau objects that used to be built here
    # were never handed to fit(); they were dead code and have been removed.
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=32
    )

    return model, history

def main():
    """Run the CLIP pipeline (no early stopping) end to end.

    Loads the CSVs, trains on an 85/15 stratified split, predicts the test
    set and writes ``submission.csv``.

    Returns:
        Tuple ``(model, history)`` from ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Sentiment names -> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[sentiment] for sentiment in train_df['Label_Sentiment']])

    # Stratified hold-out so the validation set keeps the class balance.
    split = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = split

    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )

    print("Processing test data...")
    # NOTE(review): this builds a second CLIPSentimentModel (reloading CLIP)
    # although only its process_batch method is used below.
    test_handler = CLIPSentimentModel()
    test_captions = test_df['Captions'].tolist()
    test_inputs = process_data_batch(test_image_paths, test_captions, test_handler)

    # Highest-probability class per sample, mapped back to its name.
    predicted_labels = np.argmax(model.predict(test_inputs), axis=1)
    id_to_name = {idx: name for name, idx in label_map.items()}
    test_df['Label'] = [id_to_name[idx] for idx in predicted_labels]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history

if __name__ == "__main__":
    main()

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Processing test data...


All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Predictions saved to submission.csv


# CLIP without early stopping v-2.0
Epochs: 20\
Accuracy: 0.7535\
Val. accuracy: 0.7295\
Layers: 256->128\
Batch size: 16

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Return the training and test tables read from their CSV files.

    Args:
        train_path: Location of the training CSV.
        test_path: Location of the test CSV.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.
    """
    train_table = pd.read_csv(train_path)
    return train_table, pd.read_csv(test_path)

def get_image_paths(directory, image_names):
    """Join ``directory`` with each image name to get full paths.

    Args:
        directory: Folder that holds the images.
        image_names: Sequence of image file names.

    Returns:
        List of paths, one per name, order and duplicates preserved; the
        files are not checked for existence.
    """
    # Simplified from a dict lookup guarded by an always-true membership test.
    return [os.path.join(directory, name) for name in image_names]

class CLIPSentimentModel:
    """Frozen CLIP encoder plus a compact (256 -> 128) dense head for sentiment classification."""

    # Token length of the text inputs. process_batch pads/truncates to this
    # length so every batch matches the Input layers created in build_model.
    MAX_TEXT_LENGTH = 77

    def __init__(self, num_classes=3):
        """Load the pretrained CLIP processor and model and freeze the model.

        Args:
            num_classes: Size of the softmax output layer.
        """
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")

        # Freeze CLIP layers: only the classification head is trained
        self.clip.trainable = False

    def process_batch(self, images, texts):
        """Run a batch of images and texts through the CLIP processor.

        Args:
            images: List of PIL images.
            texts: List of caption strings, aligned with ``images``.

        Returns:
            The processor output (TensorFlow tensors), with the text padded or
            truncated to exactly ``MAX_TEXT_LENGTH`` tokens.
        """
        # padding=True would pad only to the longest caption of *this* batch,
        # so batches could end up with different widths; a fixed length keeps
        # them concatenable and consistent with the model's (77,) text inputs.
        inputs = self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding="max_length",
            max_length=self.MAX_TEXT_LENGTH,
            truncation=True
        )
        return inputs

    def build_model(self):
        """Assemble and compile the Keras classifier on top of CLIP.

        Returns:
            A compiled ``Model`` with inputs ``input_ids``, ``attention_mask``
            and ``pixel_values`` and a softmax output over ``num_classes``.
        """
        # Define inputs
        input_ids = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')

        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )

        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])

        # Classification head
        x = Dense(256, activation='relu')(combined_features)
        x = Dropout(0.3)(x)
        x = LayerNormalization()(x)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)

        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )

        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Preprocess all (image, text) pairs in chunks of ``batch_size``.

    Args:
        image_paths: Sequence of image file paths.
        texts: Sequence of captions aligned with ``image_paths``.
        model_handler: Object whose ``process_batch(images, texts)`` returns a
            mapping with 'input_ids', 'attention_mask' and 'pixel_values'.
        batch_size: Number of samples handed to ``process_batch`` at once.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'pixel_values', each the
        per-batch tensors concatenated along axis 0. Note that the full result
        is held in memory; only the preprocessing itself is batched.
    """
    all_inputs = []

    for start in range(0, len(image_paths), batch_size):
        stop = start + batch_size
        batch_images = []

        # Load images for current batch
        for path in image_paths[start:stop]:
            try:
                # Image.open is lazy and keeps the file handle open; close it
                # once convert() has produced an in-memory RGB copy.
                with Image.open(path) as raw:
                    image = raw.convert('RGB')
                batch_images.append(image)
            except Exception as e:  # deliberate best-effort: keep sample count intact
                print(f"Error loading image {path}: {str(e)}")
                # Create a blank image as fallback
                batch_images.append(Image.new('RGB', (224, 224), color='black'))

        # Process batch
        all_inputs.append(model_handler.process_batch(batch_images, texts[start:stop]))

    # Combine all batches
    combined_inputs = {
        key: tf.concat([batch[key] for batch in all_inputs], axis=0)
        for key in ('input_ids', 'attention_mask', 'pixel_values')
    }

    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=20):
    """Build the CLIP classifier and fit it for a fixed number of epochs.

    This variant trains without early stopping or learning-rate scheduling:
    no callbacks are passed to ``fit``, so all ``epochs`` are always run.

    Args:
        train_image_paths / val_image_paths: Image file paths per split.
        train_texts / val_texts: Captions aligned with the image paths.
        train_labels / val_labels: Integer class labels.
        epochs: Number of training epochs.

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()

    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)

    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)

    # The EarlyStopping / ReduceLROnPlateau objects that used to be built here
    # were never handed to fit(); they were dead code and have been removed.
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=16
    )

    return model, history

def main(data_dir='/kaggle/input/multimodal-sentiment-analysis-cuet-nlp',
         submission_path='submission.csv'):
    """Train the CLIP sentiment classifier and write test-set predictions.

    Args:
        data_dir: Dataset root containing ``train.csv``, ``test.csv`` and the
            ``Memes/Memes`` image folder. Defaults to the Kaggle input path.
        submission_path: Destination CSV for the ``Id``/``Label`` predictions.

    Returns:
        The ``(model, history)`` pair returned by ``train_model``.
    """
    # Load data
    train_df, test_df = load_data(
        os.path.join(data_dir, 'train.csv'),
        os.path.join(data_dir, 'test.csv')
    )
    
    # Get image paths
    memes_folder = os.path.join(data_dir, 'Memes', 'Memes')
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert string sentiment labels to integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Hold out 15% of the training data for validation, stratified by label
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    # NOTE(review): a fresh handler is created only so process_data_batch can
    # call its process_batch; train_model does not expose the one it built.
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions: pick the highest-scoring class per sample
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert class ids back to their string labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv(submission_path, index=False)
    print(f"Predictions saved to {submission_path}")
    
    return model, history

# Run the full train/predict pipeline only when this code is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/842k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/577M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Processing test data...


All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Predictions saved to submission.csv


# CLIP modified v-3
## Increased dropout, decreased batch size
Epoch: 20\
Accuracy: 0.7576\
Val. accuracy: 0.7371\
Layers: 512->256\
Batch size: 16\
Dropout: 0.4

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Location of the training CSV.
        test_path: Location of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames, in that order.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    return frames[0], frames[1]

def get_image_paths(directory, image_names):
    """Build the full path of every image, preserving input order.

    Args:
        directory: Folder that contains the image files.
        image_names: Iterable of image file names (duplicates are kept).

    Returns:
        A list with ``os.path.join(directory, name)`` for each name, in the
        same order as ``image_names``. File existence is not checked.
    """
    # Every name maps to exactly one path, so no lookup table or filtering is
    # needed; a single pass also works for one-shot iterables (generators).
    return [os.path.join(directory, name) for name in image_names]

class CLIPSentimentModel:
    """Frozen CLIP backbone with a small dense head for sentiment classification.

    The instance owns the CLIP processor and the pretrained CLIP model;
    ``build_model`` wires them into a compiled Keras classifier.
    """

    # Fixed token length of the text inputs. ``process_batch`` pads/truncates
    # to this length so it always matches the Input shapes in ``build_model``.
    MAX_TEXT_LENGTH = 77

    def __init__(self, num_classes=3):
        """Load the pretrained CLIP processor and model, and freeze the model.

        Args:
            num_classes: Number of output classes of the classification head.
        """
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers so only the classification head is trained
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Run the CLIP processor over a batch of images and texts.

        Args:
            images: Images of the batch.
            texts: Captions of the batch.

        Returns:
            The processor output, requested as TensorFlow tensors
            (``return_tensors="tf"``).
        """
        # Pad to a fixed length instead of the longest caption in the batch:
        # otherwise batches can have different widths, which cannot be
        # concatenated and would not match the (77,) model inputs.
        return self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding="max_length",
            max_length=self.MAX_TEXT_LENGTH,
            truncation=True
        )
    
    def build_model(self):
        """Assemble and compile the Keras classifier on top of CLIP.

        Returns:
            A compiled ``Model`` taking ``input_ids``, ``attention_mask`` and
            ``pixel_values`` and producing softmax scores over
            ``num_classes`` classes.
        """
        # Define inputs
        input_ids = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head: 512 -> 256 -> num_classes
        x = Dense(512, activation='relu')(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        # Sparse loss: labels are integer class ids, not one-hot vectors
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Load images and run them with their captions through the processor in chunks.

    Args:
        image_paths: Sequence of image file paths.
        texts: Sequence of captions aligned index-by-index with ``image_paths``.
        model_handler: Object exposing ``process_batch(images, texts)`` whose
            result provides ``input_ids``, ``attention_mask`` and
            ``pixel_values``.
        batch_size: Number of samples handed to the processor per call.

    Returns:
        A dict with keys ``input_ids``, ``attention_mask`` and
        ``pixel_values``, each the concatenation along axis 0 of the
        per-chunk tensors.
    """
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                # The context manager closes the underlying file as soon as
                # the pixel data has been copied into the converted image.
                with Image.open(path) as raw_image:
                    batch_images.append(raw_image.convert('RGB'))
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Best effort: substitute a blank image so the batch stays
                # aligned with its captions
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=20,
                batch_size=16, use_callbacks=False):
    """Build the CLIP sentiment classifier and fit it on the given split.

    Args:
        train_image_paths: Image paths of the training samples.
        train_texts: Captions aligned with ``train_image_paths``.
        train_labels: Integer class labels of the training samples.
        val_image_paths: Image paths of the validation samples.
        val_texts: Captions aligned with ``val_image_paths``.
        val_labels: Integer class labels of the validation samples.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        use_callbacks: When True, train with early stopping on
            ``val_accuracy`` and learning-rate reduction on ``val_loss``.
            Off by default, so training always runs for ``epochs`` epochs.

    Returns:
        A ``(model, history)`` tuple: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks are only constructed when they will actually be used
    callbacks = None
    if use_callbacks:
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_accuracy',
                patience=3,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=2,
                min_lr=1e-6
            )
        ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks
    )
    
    return model, history

def main(data_dir='/kaggle/input/multimodal-sentiment-analysis-cuet-nlp',
         submission_path='submission.csv'):
    """Train the CLIP sentiment classifier and write test-set predictions.

    Args:
        data_dir: Dataset root containing ``train.csv``, ``test.csv`` and the
            ``Memes/Memes`` image folder. Defaults to the Kaggle input path.
        submission_path: Destination CSV for the ``Id``/``Label`` predictions.

    Returns:
        The ``(model, history)`` pair returned by ``train_model``.
    """
    # Load data
    train_df, test_df = load_data(
        os.path.join(data_dir, 'train.csv'),
        os.path.join(data_dir, 'test.csv')
    )
    
    # Get image paths
    memes_folder = os.path.join(data_dir, 'Memes', 'Memes')
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert string sentiment labels to integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Hold out 15% of the training data for validation, stratified by label
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    # NOTE(review): this loads CLIP a second time only so process_data_batch
    # can call process_batch; train_model does not expose the handler it built.
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions: pick the highest-scoring class per sample
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert class ids back to their string labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv(submission_path, index=False)
    print(f"Predictions saved to {submission_path}")
    
    return model, history

# Run the full train/predict pipeline only when this code is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/842k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/577M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Processing test data...


All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Predictions saved to submission.csv


# CLIP modified v-4
## Added kernel regularizer
Epoch: 20\
Accuracy: 0.7613\
Val. accuracy: 0.7390\
Layers: 512->256\
Batch size: 16\
Dropout: 0.4

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Location of the training CSV.
        test_path: Location of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames, in that order.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    return frames[0], frames[1]

def get_image_paths(directory, image_names):
    """Build the full path of every image, preserving input order.

    Args:
        directory: Folder that contains the image files.
        image_names: Iterable of image file names (duplicates are kept).

    Returns:
        A list with ``os.path.join(directory, name)`` for each name, in the
        same order as ``image_names``. File existence is not checked.
    """
    # Every name maps to exactly one path, so no lookup table or filtering is
    # needed; a single pass also works for one-shot iterables (generators).
    return [os.path.join(directory, name) for name in image_names]

class CLIPSentimentModel:
    """Frozen CLIP backbone with an L2-regularized dense head for sentiment classification.

    The instance owns the CLIP processor and the pretrained CLIP model;
    ``build_model`` wires them into a compiled Keras classifier.
    """

    # Fixed token length of the text inputs. ``process_batch`` pads/truncates
    # to this length so it always matches the Input shapes in ``build_model``.
    MAX_TEXT_LENGTH = 77

    def __init__(self, num_classes=3):
        """Load the pretrained CLIP processor and model, and freeze the model.

        Args:
            num_classes: Number of output classes of the classification head.
        """
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers so only the classification head is trained
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Run the CLIP processor over a batch of images and texts.

        Args:
            images: Images of the batch.
            texts: Captions of the batch.

        Returns:
            The processor output, requested as TensorFlow tensors
            (``return_tensors="tf"``).
        """
        # Pad to a fixed length instead of the longest caption in the batch:
        # otherwise batches can have different widths, which cannot be
        # concatenated and would not match the (77,) model inputs.
        return self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding="max_length",
            max_length=self.MAX_TEXT_LENGTH,
            truncation=True
        )
    
    def build_model(self):
        """Assemble and compile the Keras classifier on top of CLIP.

        Returns:
            A compiled ``Model`` taking ``input_ids``, ``attention_mask`` and
            ``pixel_values`` and producing softmax scores over
            ``num_classes`` classes.
        """
        # Define inputs
        input_ids = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head: 512 -> 256 -> num_classes, with L2 weight
        # penalties (factor 0.01) on the two hidden layers
        x = Dense(512, activation='relu', kernel_regularizer=l2(0.01))(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        # Sparse loss: labels are integer class ids, not one-hot vectors
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Load images and run them with their captions through the processor in chunks.

    Args:
        image_paths: Sequence of image file paths.
        texts: Sequence of captions aligned index-by-index with ``image_paths``.
        model_handler: Object exposing ``process_batch(images, texts)`` whose
            result provides ``input_ids``, ``attention_mask`` and
            ``pixel_values``.
        batch_size: Number of samples handed to the processor per call.

    Returns:
        A dict with keys ``input_ids``, ``attention_mask`` and
        ``pixel_values``, each the concatenation along axis 0 of the
        per-chunk tensors.
    """
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                # The context manager closes the underlying file as soon as
                # the pixel data has been copied into the converted image.
                with Image.open(path) as raw_image:
                    batch_images.append(raw_image.convert('RGB'))
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Best effort: substitute a blank image so the batch stays
                # aligned with its captions
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=20,
                batch_size=16, use_callbacks=False):
    """Build the CLIP sentiment classifier and fit it on the given split.

    Args:
        train_image_paths: Image paths of the training samples.
        train_texts: Captions aligned with ``train_image_paths``.
        train_labels: Integer class labels of the training samples.
        val_image_paths: Image paths of the validation samples.
        val_texts: Captions aligned with ``val_image_paths``.
        val_labels: Integer class labels of the validation samples.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        use_callbacks: When True, train with early stopping on
            ``val_accuracy`` and learning-rate reduction on ``val_loss``.
            Off by default, so training always runs for ``epochs`` epochs.

    Returns:
        A ``(model, history)`` tuple: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks are only constructed when they will actually be used
    callbacks = None
    if use_callbacks:
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_accuracy',
                patience=3,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=2,
                min_lr=1e-6
            )
        ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks
    )
    
    return model, history

def main(data_dir='/kaggle/input/multimodal-sentiment-analysis-cuet-nlp',
         submission_path='submission.csv'):
    """Train the CLIP sentiment classifier and write test-set predictions.

    Args:
        data_dir: Dataset root containing ``train.csv``, ``test.csv`` and the
            ``Memes/Memes`` image folder. Defaults to the Kaggle input path.
        submission_path: Destination CSV for the ``Id``/``Label`` predictions.

    Returns:
        The ``(model, history)`` pair returned by ``train_model``.
    """
    # Load data
    train_df, test_df = load_data(
        os.path.join(data_dir, 'train.csv'),
        os.path.join(data_dir, 'test.csv')
    )
    
    # Get image paths
    memes_folder = os.path.join(data_dir, 'Memes', 'Memes')
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert string sentiment labels to integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Hold out 15% of the training data for validation, stratified by label
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    # NOTE(review): this loads CLIP a second time only so process_data_batch
    # can call process_batch; train_model does not expose the handler it built.
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions: pick the highest-scoring class per sample
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert class ids back to their string labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv(submission_path, index=False)
    print(f"Predictions saved to {submission_path}")
    
    return model, history

# Run the full train/predict pipeline only when this code is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Processing test data...


All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Predictions saved to submission.csv


# CLIP modified v-5
## Increase batch size and decay
Epoch: 30\
Accuracy: 0.7613\
Val. accuracy: 0.7390\
Layers: 512->256\
Batch size: 32\
Dropout: 0.4\
L2 regularization factor: 0.02

In [5]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Location of the training CSV.
        test_path: Location of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames, in that order.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    return frames[0], frames[1]

def get_image_paths(directory, image_names):
    """Build the full path of every image, preserving input order.

    Args:
        directory: Folder that contains the image files.
        image_names: Iterable of image file names (duplicates are kept).

    Returns:
        A list with ``os.path.join(directory, name)`` for each name, in the
        same order as ``image_names``. File existence is not checked.
    """
    # Every name maps to exactly one path, so no lookup table or filtering is
    # needed; a single pass also works for one-shot iterables (generators).
    return [os.path.join(directory, name) for name in image_names]

class CLIPSentimentModel:
    """Frozen CLIP backbone with an L2-regularized dense head for sentiment classification.

    The instance owns the CLIP processor and the pretrained CLIP model;
    ``build_model`` wires them into a compiled Keras classifier.
    """

    # Fixed token length of the text inputs. ``process_batch`` pads/truncates
    # to this length so it always matches the Input shapes in ``build_model``.
    MAX_TEXT_LENGTH = 77

    def __init__(self, num_classes=3):
        """Load the pretrained CLIP processor and model, and freeze the model.

        Args:
            num_classes: Number of output classes of the classification head.
        """
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers so only the classification head is trained
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Run the CLIP processor over a batch of images and texts.

        Args:
            images: Images of the batch.
            texts: Captions of the batch.

        Returns:
            The processor output, requested as TensorFlow tensors
            (``return_tensors="tf"``).
        """
        # Pad to a fixed length instead of the longest caption in the batch:
        # otherwise batches can have different widths, which cannot be
        # concatenated and would not match the (77,) model inputs.
        return self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding="max_length",
            max_length=self.MAX_TEXT_LENGTH,
            truncation=True
        )
    
    def build_model(self):
        """Assemble and compile the Keras classifier on top of CLIP.

        Returns:
            A compiled ``Model`` taking ``input_ids``, ``attention_mask`` and
            ``pixel_values`` and producing softmax scores over
            ``num_classes`` classes.
        """
        # Define inputs
        input_ids = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head: 512 -> 256 -> num_classes, with L2 weight
        # penalties (factor 0.02) on the two hidden layers
        x = Dense(512, activation='relu', kernel_regularizer=l2(0.02))(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=l2(0.02))(x)
        x = Dropout(0.2)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        # Sparse loss: labels are integer class ids, not one-hot vectors
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Load images and run them with their captions through the processor in chunks.

    Args:
        image_paths: Sequence of image file paths.
        texts: Sequence of captions aligned index-by-index with ``image_paths``.
        model_handler: Object exposing ``process_batch(images, texts)`` whose
            result provides ``input_ids``, ``attention_mask`` and
            ``pixel_values``.
        batch_size: Number of samples handed to the processor per call.

    Returns:
        A dict with keys ``input_ids``, ``attention_mask`` and
        ``pixel_values``, each the concatenation along axis 0 of the
        per-chunk tensors.
    """
    all_inputs = []
    
    for i in range(0, len(image_paths), batch_size):
        batch_images = []
        batch_texts = texts[i:i + batch_size]
        
        # Load images for current batch
        for path in image_paths[i:i + batch_size]:
            try:
                # The context manager closes the underlying file as soon as
                # the pixel data has been copied into the converted image.
                with Image.open(path) as raw_image:
                    batch_images.append(raw_image.convert('RGB'))
            except Exception as e:
                print(f"Error loading image {path}: {str(e)}")
                # Best effort: substitute a blank image so the batch stays
                # aligned with its captions
                batch_images.append(Image.new('RGB', (224, 224), color='black'))
        
        # Process batch
        inputs = model_handler.process_batch(batch_images, batch_texts)
        all_inputs.append(inputs)
    
    # Combine all batches
    combined_inputs = {
        'input_ids': tf.concat([x['input_ids'] for x in all_inputs], axis=0),
        'attention_mask': tf.concat([x['attention_mask'] for x in all_inputs], axis=0),
        'pixel_values': tf.concat([x['pixel_values'] for x in all_inputs], axis=0)
    }
    
    return combined_inputs

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=50,
                batch_size=32, use_callbacks=False):
    """Build the CLIP sentiment classifier and fit it on the given split.

    Args:
        train_image_paths: Image paths of the training samples.
        train_texts: Captions aligned with ``train_image_paths``.
        train_labels: Integer class labels of the training samples.
        val_image_paths: Image paths of the validation samples.
        val_texts: Captions aligned with ``val_image_paths``.
        val_labels: Integer class labels of the validation samples.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        use_callbacks: When True, train with early stopping on
            ``val_accuracy`` and learning-rate reduction on ``val_loss``.
            Off by default, so training always runs for ``epochs`` epochs.

    Returns:
        A ``(model, history)`` tuple: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Create model instance
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()
    
    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)
    
    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)
    
    # Callbacks are only constructed when they will actually be used
    callbacks = None
    if use_callbacks:
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_accuracy',
                patience=3,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=2,
                min_lr=1e-6
            )
        ]
    
    # Train model
    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks
    )
    
    return model, history

def main(data_dir='/kaggle/input/multimodal-sentiment-analysis-cuet-nlp',
         submission_path='submission.csv'):
    """Train the CLIP sentiment classifier and write test-set predictions.

    Args:
        data_dir: Dataset root containing ``train.csv``, ``test.csv`` and the
            ``Memes/Memes`` image folder. Defaults to the Kaggle input path.
        submission_path: Destination CSV for the ``Id``/``Label`` predictions.

    Returns:
        The ``(model, history)`` pair returned by ``train_model``.
    """
    # Load data
    train_df, test_df = load_data(
        os.path.join(data_dir, 'train.csv'),
        os.path.join(data_dir, 'test.csv')
    )
    
    # Get image paths
    memes_folder = os.path.join(data_dir, 'Memes', 'Memes')
    train_image_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    test_image_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())
    
    # Convert string sentiment labels to integer class ids
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    train_labels = np.array([label_map[label] for label in train_df['Label_Sentiment']])
    
    # Hold out 15% of the training data for validation, stratified by label
    train_paths, val_paths, train_texts, val_texts, train_labs, val_labs = train_test_split(
        train_image_paths, train_df['Captions'].tolist(),
        train_labels, test_size=0.15,
        random_state=42, stratify=train_labels
    )
    
    # Train model
    model, history = train_model(
        train_paths, train_texts, train_labs,
        val_paths, val_texts, val_labs
    )
    
    # Process test data
    print("Processing test data...")
    # NOTE(review): this loads CLIP a second time only so process_data_batch
    # can call process_batch; train_model does not expose the handler it built.
    model_handler = CLIPSentimentModel()
    test_inputs = process_data_batch(test_image_paths, test_df['Captions'].tolist(), model_handler)
    
    # Make predictions: pick the highest-scoring class per sample
    predictions = model.predict(test_inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Convert class ids back to their string labels
    reverse_label_map = {v: k for k, v in label_map.items()}
    test_df['Label'] = [reverse_label_map[label] for label in predicted_labels]
    
    # Save predictions
    test_df[['Id', 'Label']].to_csv(submission_path, index=False)
    print(f"Predictions saved to {submission_path}")
    
    return model, history

# Run the full train/predict pipeline only when this code is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Processing test data...


All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Predictions saved to submission.csv


# CLIP modified v-6
## Same as v-5 but increased dropout in two layers instead of just one
Epoch: 50\
Accuracy: 0.7818\
Val. accuracy: 0.7486\
Layers: 512->256\
Batch size: 32\
Dropout: 0.4\
L2 regularization factor: 0.02

In [6]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Location of the training CSV.
        test_path: Location of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames, in that order.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    return frames[0], frames[1]

def get_image_paths(directory, image_names):
    """Build the full path of every image, preserving input order.

    Args:
        directory: Folder that contains the image files.
        image_names: Iterable of image file names (duplicates are kept).

    Returns:
        A list with ``os.path.join(directory, name)`` for each name, in the
        same order as ``image_names``. File existence is not checked.
    """
    # Every name maps to exactly one path, so no lookup table or filtering is
    # needed; a single pass also works for one-shot iterables (generators).
    return [os.path.join(directory, name) for name in image_names]

class CLIPSentimentModel:
    """Frozen CLIP backbone with an L2-regularized dense head for sentiment classification.

    The instance owns the CLIP processor and the pretrained CLIP model;
    ``build_model`` wires them into a compiled Keras classifier.
    """

    # Fixed token length of the text inputs. ``process_batch`` pads/truncates
    # to this length so it always matches the Input shapes in ``build_model``.
    MAX_TEXT_LENGTH = 77

    def __init__(self, num_classes=3):
        """Load the pretrained CLIP processor and model, and freeze the model.

        Args:
            num_classes: Number of output classes of the classification head.
        """
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers so only the classification head is trained
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Run the CLIP processor over a batch of images and texts.

        Args:
            images: Images of the batch.
            texts: Captions of the batch.

        Returns:
            The processor output, requested as TensorFlow tensors
            (``return_tensors="tf"``).
        """
        # Pad to a fixed length instead of the longest caption in the batch:
        # otherwise batches can have different widths, which cannot be
        # concatenated and would not match the (77,) model inputs.
        return self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding="max_length",
            max_length=self.MAX_TEXT_LENGTH,
            truncation=True
        )
    
    def build_model(self):
        """Assemble and compile the Keras classifier on top of CLIP.

        Returns:
            A compiled ``Model`` taking ``input_ids``, ``attention_mask`` and
            ``pixel_values`` and producing softmax scores over
            ``num_classes`` classes.
        """
        # Define inputs
        input_ids = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head: 512 -> 256 -> num_classes, with L2 weight
        # penalties (factor 0.02) and 0.4 dropout on both hidden layers
        x = Dense(512, activation='relu', kernel_regularizer=l2(0.02))(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=l2(0.02))(x)
        x = Dropout(0.4)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        # Sparse loss: labels are integer class ids, not one-hot vectors
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Preprocess images and captions batch by batch, then merge the results.

    Args:
        image_paths: Paths of the image files; entry ``i`` pairs with ``texts[i]``.
        texts: Captions, sliced with the same batch boundaries as the paths.
        model_handler: Object exposing ``process_batch(images, texts)``.
        batch_size: Number of samples handed to ``process_batch`` per call.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``pixel_values``, each
        being the per-batch tensors concatenated along axis 0.
    """
    all_inputs = []

    for start in range(0, len(image_paths), batch_size):
        stop = start + batch_size
        batch_images = []

        for path in image_paths[start:stop]:
            try:
                # convert() returns an independent in-memory image, so the
                # underlying file can be closed as soon as it has been read.
                with Image.open(path) as raw_image:
                    batch_images.append(raw_image.convert('RGB'))
            except Exception as e:
                # Best effort: report the unreadable file and substitute a
                # black image so images and captions stay aligned.
                print(f"Error loading image {path}: {str(e)}")
                batch_images.append(Image.new('RGB', (224, 224), color='black'))

        all_inputs.append(model_handler.process_batch(batch_images, texts[start:stop]))

    # Stitch the per-batch tensors back together in the original sample order.
    return {
        key: tf.concat([batch[key] for batch in all_inputs], axis=0)
        for key in ('input_ids', 'attention_mask', 'pixel_values')
    }

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=50,
                use_callbacks=False):
    """Build the CLIP classifier, preprocess both splits and fit the model.

    Args:
        train_image_paths: Image paths of the training split.
        train_texts: Captions of the training split.
        train_labels: Integer class labels of the training split.
        val_image_paths: Image paths of the validation split.
        val_texts: Captions of the validation split.
        val_labels: Integer class labels of the validation split.
        epochs: Number of training epochs.
        use_callbacks: When true, train with early stopping on
            ``val_accuracy`` and learning-rate reduction on ``val_loss``.
            Defaults to ``False``, which runs every epoch unconditionally.

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()

    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)

    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)

    # The callbacks used to be constructed and then ignored; they are now
    # built only when the caller opts in.
    callbacks = None
    if use_callbacks:
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_accuracy',
                patience=3,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=2,
                min_lr=1e-6
            )
        ]

    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=32,
        callbacks=callbacks
    )

    return model, history

def main():
    """Run the CLIP pipeline end to end and write ``submission.csv``.

    Loads the competition CSV files, trains on a stratified 85/15 split of the
    training data, predicts a label for every test row and saves the ``Id``
    and ``Label`` columns.

    Returns:
        Tuple ``(model, history)`` as produced by ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve every image name against the meme folder.
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    all_train_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    all_test_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Sentiment names <-> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded_labels = np.array([label_map[name] for name in train_df['Label_Sentiment']])

    # Stratified hold-out so the validation set keeps the class balance.
    split = train_test_split(
        all_train_paths, train_df['Captions'].tolist(), encoded_labels,
        test_size=0.15, random_state=42, stratify=encoded_labels
    )
    fit_paths, val_paths, fit_texts, val_texts, fit_labels, val_labels = split

    model, history = train_model(
        fit_paths, fit_texts, fit_labels,
        val_paths, val_texts, val_labels
    )

    print("Processing test data...")
    test_handler = CLIPSentimentModel()
    test_captions = test_df['Captions'].tolist()
    test_inputs = process_data_batch(all_test_paths, test_captions, test_handler)

    # Highest-scoring class per row, mapped back to its sentiment name.
    class_ids = np.argmax(model.predict(test_inputs), axis=1)
    id_to_name = {class_id: name for name, class_id in label_map.items()}
    test_df['Label'] = [id_to_name[class_id] for class_id in class_ids]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


# Run the pipeline only when executed as a script or notebook cell.
if __name__ == "__main__":
    main()

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Processing test data...


All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Predictions saved to submission.csv


# CLIP modified v-7
## Same as v-6 but used 30 epochs
Epoch: 30\
Accuracy: 0.7613\
Val. accuracy: 0.7390\
Layers: 512->256\
Batch size: 32\
Dropout: 0.4\
Regularization decay: 0.02

In [7]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import CLIPProcessor, TFCLIPModel
from tensorflow.keras.layers import Dense, Input, Dropout, LayerNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.
    """
    # Training file is read first, then the test file.
    train_frame, test_frame = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
    return train_frame, test_frame

def get_image_paths(directory, image_names):
    """Return the full path of every image name, in input order.

    The previous implementation built a name -> path dict and then filtered
    the names by membership in that same dict. The filter could never reject
    anything, and iterating ``image_names`` twice produced an empty result
    for one-shot iterables such as generators.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        List with ``directory`` joined to each name; duplicates are kept.
    """
    return [os.path.join(directory, name) for name in image_names]

class CLIPSentimentModel:
    """Frozen CLIP encoders plus a trainable dense sentiment classifier.

    The instance owns the CLIP processor (used to turn raw images and captions
    into tensors) and the CLIP model (used inside the Keras graph built by
    ``build_model``).
    """

    # Fixed caption length in tokens; shared by the processor padding and the
    # Keras input shapes so both always agree.
    MAX_TEXT_LENGTH = 77

    def __init__(self, num_classes=3):
        """Load the pretrained CLIP processor and model.

        Args:
            num_classes: Number of output classes of the classifier head.
        """
        self.num_classes = num_classes
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        # Freeze CLIP layers
        self.clip.trainable = False
    
    def process_batch(self, images, texts):
        """Run the CLIP processor over one batch of images and captions.

        Args:
            images: Images of the batch, forwarded as the processor's ``images``.
            texts: Captions of the batch, forwarded as the processor's ``text``.

        Returns:
            The processor output, requested as TensorFlow tensors.
        """
        # Pad (and truncate) every caption to the same fixed length.
        # ``padding=True`` only pads to the longest caption of the current
        # batch, so batches could come out with different widths and then
        # neither match the model inputs nor concatenate along axis 0.
        return self.processor(
            images=images,
            text=texts,
            return_tensors="tf",
            padding="max_length",
            max_length=self.MAX_TEXT_LENGTH,
            truncation=True,
        )
    
    def build_model(self):
        """Assemble and compile the classifier that sits on top of CLIP.

        Returns:
            A compiled Keras ``Model`` that takes ``input_ids``,
            ``attention_mask`` and ``pixel_values`` and outputs softmax scores
            over ``self.num_classes`` classes.
        """
        # Define inputs
        input_ids = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='input_ids')
        attention_mask = Input(shape=(self.MAX_TEXT_LENGTH,), dtype=tf.int32, name='attention_mask')
        pixel_values = Input(shape=(3, 224, 224), dtype=tf.float32, name='pixel_values')
        
        # Get CLIP embeddings
        clip_outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values
        )
        
        # Concatenate image and text features
        image_features = clip_outputs.image_embeds
        text_features = clip_outputs.text_embeds
        combined_features = Concatenate()([image_features, text_features])
        
        # Classification head: two L2-regularised dense stages with dropout.
        x = Dense(512, activation='relu', kernel_regularizer=l2(0.02))(combined_features)
        x = Dropout(0.4)(x)
        x = LayerNormalization()(x)
        x = Dense(256, activation='relu', kernel_regularizer=l2(0.02))(x)
        x = Dropout(0.4)(x)
        outputs = Dense(self.num_classes, activation='softmax')(x)
        
        model = Model(
            inputs=[input_ids, attention_mask, pixel_values],
            outputs=outputs
        )
        
        # Integer class labels are expected, hence the sparse loss.
        optimizer = Adam(learning_rate=2e-5)
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        
        return model

def process_data_batch(image_paths, texts, model_handler, batch_size=32):
    """Preprocess images and captions batch by batch, then merge the results.

    Args:
        image_paths: Paths of the image files; entry ``i`` pairs with ``texts[i]``.
        texts: Captions, sliced with the same batch boundaries as the paths.
        model_handler: Object exposing ``process_batch(images, texts)``.
        batch_size: Number of samples handed to ``process_batch`` per call.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``pixel_values``, each
        being the per-batch tensors concatenated along axis 0.
    """
    all_inputs = []

    for start in range(0, len(image_paths), batch_size):
        stop = start + batch_size
        batch_images = []

        for path in image_paths[start:stop]:
            try:
                # convert() returns an independent in-memory image, so the
                # underlying file can be closed as soon as it has been read.
                with Image.open(path) as raw_image:
                    batch_images.append(raw_image.convert('RGB'))
            except Exception as e:
                # Best effort: report the unreadable file and substitute a
                # black image so images and captions stay aligned.
                print(f"Error loading image {path}: {str(e)}")
                batch_images.append(Image.new('RGB', (224, 224), color='black'))

        all_inputs.append(model_handler.process_batch(batch_images, texts[start:stop]))

    # Stitch the per-batch tensors back together in the original sample order.
    return {
        key: tf.concat([batch[key] for batch in all_inputs], axis=0)
        for key in ('input_ids', 'attention_mask', 'pixel_values')
    }

def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, epochs=30,
                use_callbacks=False):
    """Build the CLIP classifier, preprocess both splits and fit the model.

    Args:
        train_image_paths: Image paths of the training split.
        train_texts: Captions of the training split.
        train_labels: Integer class labels of the training split.
        val_image_paths: Image paths of the validation split.
        val_texts: Captions of the validation split.
        val_labels: Integer class labels of the validation split.
        epochs: Number of training epochs.
        use_callbacks: When true, train with early stopping on
            ``val_accuracy`` and learning-rate reduction on ``val_loss``.
            Defaults to ``False``, which runs every epoch unconditionally.

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    model_handler = CLIPSentimentModel()
    model = model_handler.build_model()

    print("Processing training data...")
    train_inputs = process_data_batch(train_image_paths, train_texts, model_handler)

    print("Processing validation data...")
    val_inputs = process_data_batch(val_image_paths, val_texts, model_handler)

    # The callbacks used to be constructed and then ignored; they are now
    # built only when the caller opts in.
    callbacks = None
    if use_callbacks:
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_accuracy',
                patience=3,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=2,
                min_lr=1e-6
            )
        ]

    history = model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=epochs,
        batch_size=32,
        callbacks=callbacks
    )

    return model, history

def main():
    """Run the CLIP pipeline end to end and write ``submission.csv``.

    Loads the competition CSV files, trains on a stratified 85/15 split of the
    training data, predicts a label for every test row and saves the ``Id``
    and ``Label`` columns.

    Returns:
        Tuple ``(model, history)`` as produced by ``train_model``.
    """
    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve every image name against the meme folder.
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    all_train_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    all_test_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Sentiment names <-> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded_labels = np.array([label_map[name] for name in train_df['Label_Sentiment']])

    # Stratified hold-out so the validation set keeps the class balance.
    split = train_test_split(
        all_train_paths, train_df['Captions'].tolist(), encoded_labels,
        test_size=0.15, random_state=42, stratify=encoded_labels
    )
    fit_paths, val_paths, fit_texts, val_texts, fit_labels, val_labels = split

    model, history = train_model(
        fit_paths, fit_texts, fit_labels,
        val_paths, val_texts, val_labels
    )

    print("Processing test data...")
    test_handler = CLIPSentimentModel()
    test_captions = test_df['Captions'].tolist()
    test_inputs = process_data_batch(all_test_paths, test_captions, test_handler)

    # Highest-scoring class per row, mapped back to its sentiment name.
    class_ids = np.argmax(model.predict(test_inputs), axis=1)
    id_to_name = {class_id: name for name, class_id in label_map.items()}
    test_df['Label'] = [id_to_name[class_id] for class_id in class_ids]

    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

    return model, history


# Run the pipeline only when executed as a script or notebook cell.
if __name__ == "__main__":
    main()

All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Processing test data...


All model checkpoint layers were used when initializing TFCLIPModel.

All the layers of TFCLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCLIPModel for predictions without further training.


Predictions saved to submission.csv


# FLAVA model
Default FLAVA implementation, using the configuration that gave the best CLIP result

In [1]:
!pip install transformers torch

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import FlavaProcessor, FlavaForPreTraining
from torch import nn
import torch.optim as optim
from PIL import Image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.
    """
    # Training file is read first, then the test file.
    train_frame, test_frame = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
    return train_frame, test_frame

def get_image_paths(directory, image_names):
    """Return the full path of every image name, in input order.

    The previous implementation built a name -> path dict and then filtered
    the names by membership in that same dict. The filter could never reject
    anything, and iterating ``image_names`` twice produced an empty result
    for one-shot iterables such as generators.

    Args:
        directory: Folder that contains the images.
        image_names: Iterable of image file names.

    Returns:
        List with ``directory`` joined to each name; duplicates are kept.
    """
    return [os.path.join(directory, name) for name in image_names]

class FLAVASentimentModel(nn.Module):
    """Frozen FLAVA encoders followed by a trainable sentiment classifier.

    The classifier consumes one pooled vector per modality (image and text),
    concatenated into a ``hidden_size * 2`` feature vector.
    """

    def __init__(self, num_classes=3):
        """Load pretrained FLAVA, freeze it and create the classifier head.

        Args:
            num_classes: Number of output classes (logits) of the classifier.
        """
        super().__init__()
        self.num_classes = num_classes
        self.processor = FlavaProcessor.from_pretrained("facebook/flava-full")
        self.flava = FlavaForPreTraining.from_pretrained("facebook/flava-full")
        
        # Only the classifier head is trained; FLAVA stays frozen.
        for param in self.flava.parameters():
            param.requires_grad = False
            
        self.classifier = nn.Sequential(
            nn.Linear(self.flava.config.hidden_size * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.LayerNorm(512),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, num_classes)
        )

    @staticmethod
    def _pool(embeddings):
        """Reduce per-token embeddings to one vector per sample.

        A 3-D tensor ``(batch, seq, hidden)`` is reduced to its first token
        (assumed to be the [CLS] position — confirm against the FLAVA docs);
        a tensor that is already 2-D is returned unchanged.
        """
        if embeddings.dim() == 3:
            return embeddings[:, 0]
        return embeddings
    
    def forward(self, image_inputs, text_inputs):
        """Compute class logits for one batch.

        Args:
            image_inputs: Dict holding ``pixel_values``.
            text_inputs: Dict holding ``input_ids`` and ``attention_mask``.

        Returns:
            Tensor of logits with one row per sample and ``num_classes``
            columns.
        """
        # Use the inner FlavaModel directly: the pretraining wrapper's own
        # forward pass is not needed for feature extraction.
        with torch.no_grad():
            flava_outputs = self.flava.flava(
                input_ids=text_inputs['input_ids'],
                attention_mask=text_inputs['attention_mask'],
                pixel_values=image_inputs['pixel_values'],
                bool_masked_pos=None,
                return_dict=True
            )
        
        # The embeddings are per-token sequences whose lengths differ between
        # image and text. Concatenating them on dim=1 would join the
        # sequences instead of the features and would not match the
        # classifier's ``hidden_size * 2`` input, so pool each modality first.
        image_features = self._pool(flava_outputs.image_embeddings)
        text_features = self._pool(flava_outputs.text_embeddings)
        combined_features = torch.cat((image_features, text_features), dim=1)
        
        return self.classifier(combined_features)

def process_data_batch(image_paths, texts, processor, batch_size=32, device='cuda'):
    """Preprocess images and captions into per-batch tensors on ``device``.

    Args:
        image_paths: Paths of the image files; entry ``i`` pairs with ``texts[i]``.
        texts: Captions, sliced with the same batch boundaries as the paths.
        processor: Callable processor invoked with ``images=`` and ``text=``;
            its result must expose ``pixel_values``, ``input_ids`` and
            ``attention_mask``.
        batch_size: Number of samples per batch.
        device: Device every tensor is moved to.

    Returns:
        Tuple ``(all_image_inputs, all_text_inputs)``: two parallel lists with
        one dict per batch. Image dicts hold ``pixel_values``; text dicts hold
        ``input_ids`` and ``attention_mask``.
    """
    all_image_inputs = []
    all_text_inputs = []

    for start in range(0, len(image_paths), batch_size):
        stop = start + batch_size
        batch_images = []

        for path in image_paths[start:stop]:
            try:
                # convert() returns an independent in-memory image, so the
                # underlying file can be closed as soon as it has been read.
                with Image.open(path) as raw_image:
                    batch_images.append(raw_image.convert('RGB'))
            except Exception as e:
                # Best effort: report the unreadable file and substitute a
                # black image so images and captions stay aligned.
                print(f"Error loading image {path}: {str(e)}")
                batch_images.append(Image.new('RGB', (224, 224), color='black'))

        # Codebook pixels are no longer requested: only pixel_values,
        # input_ids and attention_mask are kept below, so they were computed
        # and then thrown away.
        inputs = processor(
            images=batch_images,
            text=texts[start:stop],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77
        )

        all_image_inputs.append({
            'pixel_values': inputs.pixel_values.to(device)
        })
        all_text_inputs.append({
            'input_ids': inputs.input_ids.to(device),
            'attention_mask': inputs.attention_mask.to(device)
        })

    return all_image_inputs, all_text_inputs

def _labelled_batches(image_inputs, text_inputs, labels):
    """Yield ``(image_batch, text_batch, batch_labels)`` for every batch.

    Labels are sliced by the real size of each batch instead of a hard-coded
    32, so the pairing stays correct for any preprocessing batch size.
    """
    offset = 0
    for image_batch, text_batch in zip(image_inputs, text_inputs):
        size = text_batch['input_ids'].size(0)
        yield image_batch, text_batch, labels[offset:offset + size]
        offset += size


def train_model(train_image_paths, train_texts, train_labels, 
                val_image_paths, val_texts, val_labels, 
                device='cuda', epochs=30):
    """Train the FLAVA sentiment classifier and return the best model.

    Args:
        train_image_paths: Image paths of the training split.
        train_texts: Captions of the training split.
        train_labels: Integer class labels of the training split.
        val_image_paths: Image paths of the validation split.
        val_texts: Captions of the validation split.
        val_labels: Integer class labels of the validation split.
        device: Device used for the model and all tensors.
        epochs: Number of training epochs.

    Returns:
        The trained ``FLAVASentimentModel``. If any epoch improved validation
        accuracy, the classifier weights of the best epoch are restored before
        returning (previously the last-epoch weights were returned while the
        best ones only went to ``best_model.pth``).
    """
    model = FLAVASentimentModel().to(device)
    optimizer = optim.Adam(model.parameters(), lr=2e-5, weight_decay=0.02)
    criterion = nn.CrossEntropyLoss()

    print("Processing training data...")
    train_image_inputs, train_text_inputs = process_data_batch(
        train_image_paths, train_texts, model.processor, device=device
    )

    print("Processing validation data...")
    val_image_inputs, val_text_inputs = process_data_batch(
        val_image_paths, val_texts, model.processor, device=device
    )

    train_labels = torch.tensor(train_labels, device=device)
    val_labels = torch.tensor(val_labels, device=device)

    best_val_acc = 0
    best_head_state = None
    for epoch in range(epochs):
        model.train()
        correct = 0
        total = 0

        for image_batch, text_batch, batch_labels in _labelled_batches(
                train_image_inputs, train_text_inputs, train_labels):
            optimizer.zero_grad()
            outputs = model(image_batch, text_batch)

            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

            _, predicted = outputs.max(1)
            total += batch_labels.size(0)
            correct += predicted.eq(batch_labels).sum().item()

        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for image_batch, text_batch, batch_labels in _labelled_batches(
                    val_image_inputs, val_text_inputs, val_labels):
                outputs = model(image_batch, text_batch)

                _, predicted = outputs.max(1)
                val_total += batch_labels.size(0)
                val_correct += predicted.eq(batch_labels).sum().item()

        # Guard against empty splits so the report never divides by zero.
        train_acc = 100. * correct / total if total else 0.0
        val_acc = 100. * val_correct / val_total if val_total else 0.0
        print(f'Epoch {epoch + 1}: Train Acc: {train_acc:.2f}% Val Acc: {val_acc:.2f}%')

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')
            # FLAVA is frozen, so the classifier head is the only part that
            # changes between epochs; keep an in-memory copy of the best one.
            best_head_state = {
                name: tensor.detach().clone()
                for name, tensor in model.classifier.state_dict().items()
            }

    if best_head_state is not None:
        model.classifier.load_state_dict(best_head_state)

    return model

def main():
    """Run the FLAVA pipeline end to end and write ``submission.csv``.

    Loads the competition CSV files, trains on a stratified 85/15 split of the
    training data, predicts a label for every test row and saves the ``Id``
    and ``Label`` columns.
    """
    # Prefer the GPU when one is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_df, test_df = load_data(
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/train.csv',
        '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/test.csv'
    )

    # Resolve every image name against the meme folder.
    memes_folder = '/kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes'
    all_train_paths = get_image_paths(memes_folder, train_df['image_name'].tolist())
    all_test_paths = get_image_paths(memes_folder, test_df['image_name'].tolist())

    # Sentiment names <-> integer class ids.
    label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded_labels = np.array([label_map[name] for name in train_df['Label_Sentiment']])

    # Stratified hold-out so the validation set keeps the class balance.
    split = train_test_split(
        all_train_paths, train_df['Captions'].tolist(), encoded_labels,
        test_size=0.15, random_state=42, stratify=encoded_labels
    )
    fit_paths, val_paths, fit_texts, val_texts, fit_labels, val_labels = split

    model = train_model(
        fit_paths, fit_texts, fit_labels,
        val_paths, val_texts, val_labels,
        device=device
    )

    print("Processing test data...")
    test_image_inputs, test_text_inputs = process_data_batch(
        all_test_paths, test_df['Captions'].tolist(),
        model.processor, device=device
    )

    # Inference: take the highest-scoring class of every test sample.
    model.eval()
    predictions = []
    with torch.no_grad():
        for image_batch, text_batch in zip(test_image_inputs, test_text_inputs):
            batch_classes = model(image_batch, text_batch).max(1)[1]
            predictions.extend(batch_classes.cpu().numpy())

    id_to_name = {class_id: name for name, class_id in label_map.items()}
    test_df['Label'] = [id_to_name[class_id] for class_id in predictions]
    test_df[['Id', 'Label']].to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")


# Run the pipeline only when executed as a script or notebook cell.
if __name__ == "__main__":
    main()

Downloading:   0%|          | 0.00/881 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/358 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/695k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Processing training data...
Error loading image /kaggle/input/multimodal-sentiment-analysis-cuet-nlp/Memes/Memes/nurani-memes (149).jpg: image file is truncated (3 bytes not processed)
Processing validation data...


`input_ids_masked` isn't passed which means MLM loss won't be calculated correctlySetting it to `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if you are doing inference on unmasked text...


ValueError: `codebook_pixel_value` are required to generate `mim_labels` if loss is expected. Call `FlavaProcessor` with `return_codebook_pixels` set to True