In [20]:
import base64
import requests
import json
import pandas as pd
import os
from tqdm import tqdm

In [6]:
API_URL = "https://api-inference.huggingface.co/models/openai/clip-vit-base-patch32"
headers = {"Authorization": "Bearer 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"}

In [2]:
def query(data):
	with open(data["image_path"], "rb") as f:
		img = f.read()
	payload={
		"parameters": data["parameters"],
		"inputs": base64.b64encode(img).decode("utf-8")
	}
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()


In [12]:
output

[{'score': 0.5086583495140076, 'label': 'angry'},
 {'score': 0.1942409873008728, 'label': 'sexy'},
 {'score': 0.1696304976940155, 'label': 'human'},
 {'score': 0.05426330491900444, 'label': 'happy'},
 {'score': 0.034045759588479996, 'label': 'sad'},
 {'score': 0.023448029533028603, 'label': 'annoyed'},
 {'score': 0.007537000812590122, 'label': 'dog'},
 {'score': 0.005674607120454311, 'label': 'llama'},
 {'score': 0.0025015140417963266, 'label': 'cat'}]

In [11]:
output = query({
    "image_path": "yelling_man.png",
    "parameters": {"candidate_labels": ["sad", "angry", "puzzled", "evil", "crazy",
                                        "happy", "sexy", "cool", "cute", "other"]},
})

In [22]:
output[0]['label']

'angry'

In [14]:
with open("meme_properties.json") as file:
    meme_properties = json.load(file)

### Predict sentiment of each meme.

In [16]:
def list_files(directory):
    try:
        # List all files and directories in the given directory
        files = os.listdir(directory)
        
        # Filter out directories, keep only files
        file_names = [f for f in files if os.path.isfile(os.path.join(directory, f))]
        
        return file_names
    except FileNotFoundError:
        print("The specified directory does not exist.")
        return []
    except PermissionError:
        print("Permission denied to access the directory.")
        return []

In [19]:
directory_path = 'memes900k/images'
files = list_files(directory_path)

In [27]:
img_to_sentiment = dict()
for file in tqdm(files):
    img = os.path.join(directory_path, file)
    output = query({"image_path": img,
                    "parameters": {"candidate_labels": ["sad", "angry", "puzzled", "evil", "crazy", "happy", "sexy", "cool", "cute", "other"]}})
    img_to_sentiment[file] = output[0]['label']

 99%|█████████▊| 296/300 [02:44<00:02,  1.79it/s]


KeyError: 0

In [32]:
img_to_sentiment['gangnam-style.jpg'] = 'crazy'
img_to_sentiment['slowpoke.jpg'] = 'happy'
img_to_sentiment['beavis-and-butthead.jpg'] = 'sexy'
img_to_sentiment['conspiracy-keanu.jpg'] = 'puzzled'


In [44]:
len(img_to_sentiment)

300

In [50]:
sentiment_column = []
for meme in meme_properties:
    sentiment_column.extend([meme_properties[meme]['sentiment'] for _ in range(3000)])

In [51]:
len(sentiment_column)

900000

In [34]:
# with open("img_to_sentiment.json", 'w', encoding='utf-8') as file:
#     json.dump(img_to_sentiment, file)

In [37]:
nickname_to_name = dict()
for el in meme_properties:
    nickname = meme_properties[el]['nickname']
    nickname_to_name[nickname] = el

In [39]:
for file in img_to_sentiment:
    file_name = file.split('.')[0]
    name = nickname_to_name[file_name]
    meme_properties[name]['sentiment'] = img_to_sentiment[file]
    

In [41]:
df = pd.read_csv('dataset_descr_captions.csv')

In [52]:
df['sentiments'] = sentiment_column

In [54]:
# df.to_csv('sentiments_captions.csv', index=False)

In [55]:
df

Unnamed: 0,descriptions,captions,sentiments
0,"a simple, stylized drawing of a face with a ve...",commercial <sep> y u no same volume as show!?,angry
1,"a simple, stylized drawing of a face with a ve...",Victoria <sep> y u no tell us your secret?!,angry
2,"a simple, stylized drawing of a face with a ve...",KONY <sep> Y u no take justin bieber,angry
3,"a simple, stylized drawing of a face with a ve...",TED <sep> y u no tell us how you met their mother,angry
4,"a simple, stylized drawing of a face with a ve...",Google <sep> Y U NO LET ME FINISH TYPING?,angry
...,...,...,...
899995,the dark knight rises,When reading time commences <sep> you have my ...,angry
899996,the dark knight rises,only when my mind is in ashes <sep> do you hav...,angry
899997,the dark knight rises,when facebook has emotions <sep> you have my p...,angry
899998,the dark knight rises,WHEN YOU HAVE FIXED YOUR GRAMMAR <sep> then yo...,angry


### Now only train angry prompts.

In [64]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
from datasets import Dataset
import pandas as pd
from collections import defaultdict, Counter
import matplotlib.pyplot as plt

In [65]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [56]:
df_angry = df[df['sentiments']=='angry']

In [57]:
df_angry

Unnamed: 0,descriptions,captions,sentiments
0,"a simple, stylized drawing of a face with a ve...",commercial <sep> y u no same volume as show!?,angry
1,"a simple, stylized drawing of a face with a ve...",Victoria <sep> y u no tell us your secret?!,angry
2,"a simple, stylized drawing of a face with a ve...",KONY <sep> Y u no take justin bieber,angry
3,"a simple, stylized drawing of a face with a ve...",TED <sep> y u no tell us how you met their mother,angry
4,"a simple, stylized drawing of a face with a ve...",Google <sep> Y U NO LET ME FINISH TYPING?,angry
...,...,...,...
899995,the dark knight rises,When reading time commences <sep> you have my ...,angry
899996,the dark knight rises,only when my mind is in ashes <sep> do you hav...,angry
899997,the dark knight rises,when facebook has emotions <sep> you have my p...,angry
899998,the dark knight rises,WHEN YOU HAVE FIXED YOUR GRAMMAR <sep> then yo...,angry


In [58]:
df_angry_shuffled = df_angry.sample(frac=1, random_state=42).reset_index(drop=True)
df_angry_shuffled = df_angry_shuffled.drop(columns=['descriptions'])

In [61]:
df_angry_shuffled

Unnamed: 0,sentiments,captions
0,angry,can't draw or paint <sep> abstract expressionist
1,angry,IM <sep> OVER IT
2,angry,what if i told you <sep> people are allowed to...
3,angry,Watching a game of thrones sex scene in class ...
4,angry,kill spider in bedroom <sep> leave his corpse ...
...,...,...
173995,angry,HALLOOOOOO <sep> <emp>
173996,angry,I NERD <sep> LIKE A BOSS
173997,angry,Live in dorm <sep> actually have to put on pan...
173998,angry,NEVER LICK YOU'ER <sep> BUTT IN THE NY CITY ST...


In [75]:
df_1000 = df_angry_shuffled.sample(n=1000, random_state=42).reset_index(drop=True)

In [60]:
new_order = ['sentiments', 'captions']  # Specify the new order
df_angry_shuffled = df_angry_shuffled[new_order]

In [66]:
model_name_or_path = "google/flan-t5-xl"
tokenizer_name_or_path = "google/flan-t5-xl"

text_column = "sentiment"
label_column = "captions"
# max_length = 128
lr = 1e-2
num_epochs = 3
batch_size = 4

In [77]:
# Create Transformers dataset
data = {"caption": [word for word in df_1000['captions']], "sentiment":[descr for descr in df_1000['sentiments']]}
dataset = Dataset.from_dict(data)
dataset_split = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
dataset_split['train'][0]

{'caption': 'woah, woah, woah... <sep> have u been brushing your teeth with dog shit?',
 'sentiment': 'angry'}

In [78]:
len(dataset_split['train'])

800

In [79]:
captions = dataset_split['train']['caption']
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
caption_max_length = max([len(tokenizer(caption)["input_ids"]) for caption in captions])
print(caption_max_length)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

60




In [80]:
def preprocess_function(examples):
    inputs = examples[text_column]
    targets = examples[label_column]
    prefix = "Generate a meme caption based on a sentiment. Sentiment: "
    suffix = ". Caption: "
    inputs = [prefix + input_text + suffix for input_text in inputs]

    model_inputs = tokenizer(inputs, max_length=caption_max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = tokenizer(targets, max_length=caption_max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

In [81]:
processed_datasets = dataset_split.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset_split["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

In [82]:
processed_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [83]:
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

In [84]:
for i in range(10):
    print(tokenizer.decode(train_dataset['input_ids'][i], skip_special_tokens=True))

Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 
Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 
Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 
Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 
Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 
Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 
Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 
Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 
Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 
Generate a meme caption based on a sentiment. Sentiment: angry. Caption: 


In [97]:
tokenizer.decode(train_dataset['labels'][0][:30])

'woah, woah, woah... <unk>sep> have u been brushing your teeth with dog s'

In [98]:
peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, num_virtual_tokens=20)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]



model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
import wandb

In [None]:
wandb.init(project="caption_generation_3_epochs_large_sentiment", entity="nurs-sagimbayeva")

In [None]:
model.print_trainable_parameters()
"trainable params: 983040 || all params: 738651136 || trainable%: 0.13308583065659835"

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [None]:
model = model.to(device)

for epoch in range(num_epochs):
    print(f"-----Epoch {epoch} has started-----")
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        wandb.log({"train_loss": loss.item(), "epoch": epoch})

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    
        # Log evaluation metrics
    wandb.log({
        "eval_loss": eval_epoch_loss.item(),
        "eval_ppl": eval_ppl.item(),
        "train_loss": train_epoch_loss.item(),
        "train_ppl": train_ppl.item(),
        "epoch": epoch
    })
    
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

In [None]:
for i in range(len(dataset_split['test']['description'])):
    print(dataset_split['test']['description'][i])
    print("Actual caption:", dataset_split['test']['caption'][i])
    print("prediction:", eval_preds[i])