In [None]:
!pip install transformers datasets torch torchvision albumentations pandas matplotlib




In [None]:
import os

import pandas as pd
import torch
from PIL import Image
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from transformers import ViTFeatureExtractor, ViTForImageClassification, ViTImageProcessor, Trainer, TrainingArguments


In [None]:
import zipfile
import os

zip_path = "/content/images.zip"  # Archive containing the meme images
extract_path = "meme_images"  # Destination folder for the unpacked images

# Unpack only once: an existing destination folder is taken to mean that a
# previous run already extracted the archive.
already_extracted = os.path.exists(extract_path)
if not already_extracted:
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)

print("Extraction complete. Images are in:", extract_path)

Extraction complete. Images are in: meme_images


In [None]:
# Annotation table: one row per meme with image_name, OCR text and several
# sentiment columns; later cells reduce it to image_name plus an integer label.
df = pd.read_csv("/content/labels.csv")

# ViTImageProcessor is the maintained replacement for the deprecated
# ViTFeatureExtractor and exposes the same image_mean / image_std attributes.
feature_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

# Preprocessing pipeline: resize to 224x224, convert to a [0, 1] float tensor,
# then normalize each channel with the checkpoint's mean and std.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std),
])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [None]:
# Inspect the raw annotation table: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6992 entries, 0 to 6991
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         6992 non-null   int64 
 1   image_name         6992 non-null   object
 2   text_ocr           6831 non-null   object
 3   text_corrected     6987 non-null   object
 4   humour             6992 non-null   object
 5   sarcasm            6992 non-null   object
 6   offensive          6992 non-null   object
 7   motivational       6992 non-null   object
 8   overall_sentiment  6992 non-null   object
dtypes: int64(1), object(8)
memory usage: 491.8+ KB


In [None]:
# Columns that are not used for image-only sentiment classification.
columns= ['Unnamed: 0', 'text_ocr', 'text_corrected', 'humour', 'sarcasm', 'offensive', 'motivational']
# Reassign instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=columns, errors="ignore")

In [None]:
# Fold the two extreme sentiment classes into their base polarity so that only
# positive / negative / neutral remain.
df['overall_sentiment'] = df['overall_sentiment'].replace(
    {'very_positive': 'positive', 'very_negative': 'negative'}
)

In [None]:
# Encode the three sentiment classes as integer ids
# (neutral = 0, negative = 1, positive = 2), then drop the text column.
sentiment_to_id = {'positive': 2, 'negative': 1, 'neutral': 0}
df["label"] = df['overall_sentiment'].map(sentiment_to_id).astype(int)
df.drop(columns=['overall_sentiment'], inplace=True)

In [None]:
# Check the table after preprocessing: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6992 entries, 0 to 6991
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   image_name  6992 non-null   object
 1   label       6992 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 109.4+ KB


In [None]:
from PIL import Image, ImageFile
def load_image(index):
    """Load one meme image and its label as model-ready tensors.

    Args:
        index: Positional row index into the module-level ``df``.

    Returns:
        ``(pixel_values, label)`` where ``pixel_values`` is the image after
        ``transform`` (RGB, resized to 224x224, normalized) and ``label`` is a
        0-d ``torch.long`` tensor. Returns ``(None, None)`` when the image
        file cannot be opened or decoded.
    """
    # Look the fields up by name so the function does not depend on the
    # number or order of columns in ``df``.
    row = df.iloc[index]
    img_name, label = row["image_name"], row["label"]
    img_path = os.path.join("/content/meme_images/images", img_name)  # Path to image

    # Configure PIL to handle truncated images instead of raising on them.
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    try:
        # The context manager closes the underlying file; convert() returns an
        # independent RGB copy that stays valid afterwards.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
    except OSError as e:
        print(f"Error loading image: {img_path}. Skipping...")
        print(e)
        return None, None  # Signal failure for both image and label

    # ``transform`` already resizes, scales to [0, 1] and normalizes. Clamping
    # that result and feeding it to ``feature_extractor`` again rescaled and
    # normalized the pixels a second time (the "rescale already rescaled
    # images" warning), so the tensor is returned as-is.
    pixel_values = transform(image)
    return pixel_values, torch.tensor(int(label), dtype=torch.long)


In [None]:

# load_image returns (None, None) for files that cannot be read; drop those
# entries so collate_fn never has to stack a None.
dataset = [
    sample
    for sample in (load_image(i) for i in range(len(df)))
    if sample[0] is not None
]  # List of (image_tensor, label)


It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


In [None]:
# 80 / 20 train-validation split. A seeded generator makes the split identical
# on every run, so metrics are comparable across runs.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
def collate_fn(batch):
    """Merge a list of ``(image_tensor, label)`` samples into one batch dict.

    Args:
        batch: Sequence of samples; element 0 of each sample is the image
            tensor and element 1 is its label.

    Returns:
        dict with ``"pixel_values"`` (image tensors stacked along a new first
        dimension) and ``"labels"`` (a tensor built from the labels).
    """
    images, targets = [], []
    for sample in batch:
        images.append(sample[0])
        targets.append(sample[1])
    return {
        "pixel_values": torch.stack(images),
        "labels": torch.tensor(targets),
    }

# Batched iterators over the two splits; only the training loader shuffles.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_fn,
)


In [None]:
# Class index -> sentiment name, matching the integer labels in the dataframe;
# the reverse mapping is derived from it so the two cannot drift apart.
id2label = {0: "neutral", 1: "negative", 2: "positive"}
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained ViT backbone with a 3-way classification head.
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (true class indices)
            and ``predictions`` (per-class scores; the argmax over the last
            axis is taken as the predicted class).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    labels = pred.label_ids  # was misspelled `label_idsxx`, which raised AttributeError
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best validation accuracy when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    learning_rate=5e-5,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)



In [None]:
# Wire model, data splits, batching and metrics into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` is the
    # replacement argument and keeps the image processor saved with checkpoints.
    processing_class=feature_extractor,
    data_collator=collate_fn,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8935,0.912898,0.587563,0.434918
2,0.8801,0.907911,0.587563,0.434918


TrainOutput(global_step=1400, training_loss=0.8871240670340402, metrics={'train_runtime': 463.1777, 'train_samples_per_second': 24.151, 'train_steps_per_second': 3.023, 'total_flos': 8.668331649106698e+17, 'train_loss': 0.8871240670340402, 'epoch': 2.0})