# Load Project Dataset

In [15]:
import torch
import torchvision.transforms as transforms
from transformers import AutoTokenizer
from PIL import Image
import pandas as pd
import os

In [32]:
# Read the training split of the meme dataset from its CSV file.
df = pd.read_csv(filepath_or_buffer="Dataset/Split Dataset/Training_meme_dataset.csv")

# Quick sanity check: show the column index, then the first rows returned by head().
print(df.columns, df.head(), sep="\n")

Index(['image_name', 'sentence', 'label'], dtype='object')
    image_name                                           sentence  \
0  LJ3r8Gy.png  OFFICIAL BERNIE SANDERS DRINKING GAME ! Every ...   
1  qDnIIHA.png  2:28 PM THIS IS A WALL INSIDE A NAZI GAS CHAMB...   
2  1JQk5NF.png                o shit waddup ! BERNIE SANDERS COM    
3    iMMNq.png  `` MITT ROMNEY IS THE WORST REPUBLICAN IN THE ...   
4  jAi3iI1.png  Anonymous ( ID : duqdA1io a 08/05/16 ( Fri ) 1...   

          label  
0  Non-offensiv  
1     offensive  
2     offensive  
3  Non-offensiv  
4  Non-offensiv  


In [17]:
# Names of the dataset columns holding the target label and the meme text.
LABEL_COLUMN = "label"
TEXT_COLUMN = "sentence"

# Folder that the file names in the `image_name` column are resolved against.
IMAGE_FOLDER = "Dataset/Labelled Images"

# Add a column with the image path: IMAGE_FOLDER joined with each file name.
IMAGE_COLUMN = "image_path"
df[IMAGE_COLUMN] = df["image_name"].map(lambda name: os.path.join(IMAGE_FOLDER, name))

# Tokenizer used by tokenize_text() below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_text(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The tokenizer is asked for PyTorch tensors, with padding and truncation
    switched on, and its result is returned unchanged.
    """
    encode_options = {"return_tensors": "pt", "padding": True, "truncation": True}
    return tokenizer(text, **encode_options)

# Preprocess images
def preprocess_image(image_path, normalize=False):
    """Load an image file and convert it into a single-item tensor batch.

    Args:
        image_path: Path of the image file to open.
        normalize: When True, also apply the ImageNet per-channel mean/std
            normalisation that torchvision's pretrained classification
            weights expect. Defaults to False, which keeps the plain
            ``ToTensor`` output so existing callers see no change.

    Returns:
        Tensor of shape (1, 3, 224, 224): the RGB image resized to 224x224
        with a leading batch dimension added.
    """
    steps = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
    if normalize:
        steps.append(
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        )
    transform = transforms.Compose(steps)

    # Image.open keeps the file handle open until the image is closed; the
    # context manager releases it as soon as the RGB copy has been made, so
    # looping over a whole dataset does not accumulate open files.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    return transform(image).unsqueeze(0)

# Smoke-test both helpers on the first row of the dataset.
text_tokens = tokenize_text(df.iloc[0][TEXT_COLUMN])
image_tensor = preprocess_image(df.iloc[0][IMAGE_COLUMN])
print(
    "Image Shape:", image_tensor.shape,
    "Text Tokens:", text_tokens,
)


Image Shape: torch.Size([1, 3, 224, 224]) Text Tokens: {'input_ids': tensor([[  101,  2880, 15941, 12055,  5948,  2208,   999,  2296,  2051,  1996,
         16595,  6238,  9704,  1037,  2489,  2231,  2565,  1010, 14684,  2290,
          8307,  2842,  1005,  1055,  5404,   999,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}


# Select and Fine-Tune Relevant Models

In [19]:
from transformers import AutoModel, VisionEncoderDecoderModel
import torch.nn as nn

In [23]:
class MultiModalModel(nn.Module):
    """Late-fusion classifier over a text encoder and an image backbone.

    The hidden state at the first token position of the text encoder is
    concatenated with the output of the image backbone (whose own
    classification head is removed) and fed to a single linear layer.

    Args:
        text_model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        image_model_name: Entry-point name in the ``pytorch/vision`` hub repo.
            The loaded model must expose its classification head as an
            ``nn.Linear`` attribute named ``fc`` (as ResNet models do).
        output_classes: Number of logits produced by the final layer.
    """

    def __init__(self, text_model_name, image_model_name, output_classes):
        super().__init__()

        # Text branch.
        self.text_encoder = AutoModel.from_pretrained(text_model_name)

        # Image branch: remember the width feeding the original head before
        # replacing that head with a pass-through.
        self.image_encoder = torch.hub.load("pytorch/vision", image_model_name, pretrained=True)
        image_feature_dim = self.image_encoder.fc.in_features
        self.image_encoder.fc = nn.Identity()

        # Size the classifier from the encoders themselves. A hard-coded input
        # width (previously 512 + 768 + 128 = 1408) does not match what
        # forward() concatenates (e.g. 768 + 2048 for BERT-base + ResNet-50)
        # and makes the first forward pass fail with a shape error.
        text_feature_dim = self.text_encoder.config.hidden_size
        self.fc_combined = nn.Linear(text_feature_dim + image_feature_dim, output_classes)

    def forward(self, text_tokens, image_tensor):
        """Return class logits for a batch of (text, image) pairs.

        Args:
            text_tokens: Mapping of tokenizer outputs, unpacked as keyword
                arguments into the text encoder.
            image_tensor: Image batch passed directly to the image backbone.

        Returns:
            Output of the final linear layer applied to the concatenated
            text and image features.
        """
        # Position 0 of the last hidden state serves as the sentence summary.
        text_features = self.text_encoder(**text_tokens).last_hidden_state[:, 0, :]
        image_features = self.image_encoder(image_tensor)

        combined = torch.cat((text_features, image_features), dim=1)
        return self.fc_combined(combined)

# Build the fusion model; the number of output logits equals the number of
# distinct values found in the label column.
model = MultiModalModel(
    "bert-base-uncased",
    "resnet50",
    len(pd.unique(df[LABEL_COLUMN])),
)

print("Model Ready:", model)


Using cache found in /Users/nanxuan/.cache/torch/hub/pytorch_vision_main
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/nanxuan/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:04<00:00, 20.8MB/s]

Model Ready: MultiModalModel(
  (text_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),




# Save and Verify the Model

In [24]:
# Write the model's current parameters to disk.
# NOTE(review): no training step is visible before this cell, so the saved
# weights appear to be the pretrained encoders plus an untrained classifier.
torch.save(model.state_dict(), "multi_modal_model.pth")
print("Model saved successfully")

# Report whether the checkpoint file exists on disk.
print(
    "Model file found"
    if os.path.exists("multi_modal_model.pth")
    else "Model file is missing Train and save it again"
)


Model saved successfully
Model file found


# Flask API 

In [28]:
pip install flask flask-cors torch torchvision transformers pillow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting flask
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting flask-cors
  Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting Werkzeug>=3.1 (from flask)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Collecting itsdangerous>=2.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Downloading flask-3.1.0-py3-none-any.whl (102 kB)
Downloading Flask_Cors-5.0.0-py2.py3-none-any.whl (14 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading werkzeug-3.1.3-py3-none-any.whl (224 kB)
Installing collected packages: Werkzeug, itsdangerous, flask, flask-cors
Successfully installed Werkzeug-3.1.3 flask-3.1.0 flask-cors-5.0.0 itsdangerous-2.2.0
Note: you may need to restart the kernel to use updated packages.


In [38]:
# Launch the Flask app in a subprocess. This cell blocks while the server is
# running; interrupt the kernel (CTRL+C) to stop it.
!python app.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using cache found in /Users/nanxuan/.cache/torch/hub/pytorch_vision_main
Model loaded successfully
 * Serving Flask app 'app'
 * Debug mode: on
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:9000
 * Running on http://192.168.1.153:9000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Using cache found in /Users/nanxuan/.cache/torch/hub/pytorch_vision_main
Model loaded successfully
 * Debugger is active!
 * Debugger PIN: 111-630-473
192.168.1.153 - - [06/Feb/2025 13:36:37] "[33mGET / HTTP/1.1[0m" 404 -
192.168.1.153 - - [06/Feb/2025 13:36:38] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
192.168.1.153 - - [06/Feb/2025 13:37:28] "[33mGET / HTTP/1.1[0m" 404 -
192.168.1.153 - - [06/Feb/2025 13:37:29] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
^C
