In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from extractor import text_extraction, image_extraction

In [3]:
# Dataset selection: this notebook reads the Instagram export for one account.
# (A commented-out alternative previously pointed at
# '../Data/Oxford_HIC/oxford_hic_data.csv' with images in
# '../Data/Oxford_HIC/oxford_img/'; it is not used here.)
ACCOUNT = 'wendys'
TEXT_DIM = 768              # columns [0, 768) are turned into the text tensor
IMAGE_COL = TEXT_DIM        # column 768 is handed to image_extraction
SCORE_COL = TEXT_DIM + 1    # column 769 is turned into the funny-score tensor

dirPath = f'../Data/Instagram/Filter_{ACCOUNT}.csv'
imgPath = f'../Data/Instagram/{ACCOUNT}_img/'

# Load the CSV and run the project's text extraction over it.
data = text_extraction(pd.read_csv(dirPath), imgPath)

# Fixed 80/20 split; random_state pins the partition.
train, test = train_test_split(data, test_size=0.2, random_state=42)


def _frame_to_tensors(frame):
    """Return (text, image, funny_score) built from one split of the frame."""
    text = torch.tensor(frame.iloc[:, :TEXT_DIM].to_numpy())
    image = image_extraction(frame.iloc[:, IMAGE_COL])
    funny_score = torch.tensor(frame.iloc[:, SCORE_COL].to_numpy())
    return text, image, funny_score


train_text, train_image, train_funny_score = _frame_to_tensors(train)
test_text, test_image, test_funny_score = _frame_to_tensors(test)

100%|██████████| 293/293 [00:08<00:00, 33.90it/s]
100%|██████████| 74/74 [00:02<00:00, 34.43it/s]


In [4]:
BATCH_SIZE = 32

# Bundle (text, image, funny_score) so each loader yields aligned triples.
train_dataset = torch.utils.data.TensorDataset(train_text, train_image, train_funny_score)
test_dataset = torch.utils.data.TensorDataset(test_text, test_image, test_funny_score)

# Only the training split is shuffled; the test split keeps its order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
# --- Gemini candidate (unclear whether this checkpoint is an official Gemini release) ---
# Running this cell rebinds the module-level `tokenizer`; the Gemma cell below
# rebinds it again.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

GEMINI_CHECKPOINT = "describeai/gemini"
tokenizer = AutoTokenizer.from_pretrained(GEMINI_CHECKPOINT)
gemini = AutoModelForSeq2SeqLM.from_pretrained(GEMINI_CHECKPOINT)

In [5]:
# --- Official Gemma checkpoint ---
from transformers import AutoModelForCausalLM, AutoTokenizer

GEMMA_CHECKPOINT = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(GEMMA_CHECKPOINT)

# Weights are requested in bfloat16 with device_map="auto".
# (An earlier, commented-out variant requested revision="float16" instead.)
gemma = AutoModelForCausalLM.from_pretrained(
    GEMMA_CHECKPOINT,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [65]:
# --- LLM smoke test: generate a completion for one fixed prompt ---
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
gemma.to(device)

input_text = "Write a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

# max_length caps the total sequence length (prompt plus generated tokens).
outputs = gemma.generate(max_length=500, **input_ids)
decoded = tokenizer.decode(outputs[0])
print(decoded)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<bos>Write a poem about Machine Learning.

Answer:

Step 1/2
Machine Learning is a powerful tool that can help us make sense of the world around us. It can analyze vast amounts of data and make predictions based on that data. It can also learn from our own actions and behaviors, and adapt to new situations. But what does it mean to be a machine? Is it just a computer program that can learn? Or is it something more? Maybe it's a combination of both. Maybe it's a combination of human intelligence and computer power. Maybe it's a combination of both. But whatever it is, it's a powerful tool that can help us make sense of the world around us. And it's a tool that can help us make sense of ourselves.

Step 2/2
So, what does it mean to be a machine? It means that we can learn from our own actions and behaviors, and adapt to new situations. It means that we can make sense of the world around us, and make predictions based on that data. It means that we can learn from our own experiences, and 

In [97]:
# Second LLM smoke test: ask Gemma for a short list.
input_text = "Give me three best book."
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

# Generate with `gemma`, the causal LM loaded above. The name `model` is not
# defined at this point on a fresh top-to-bottom run, and further down it is
# bound to the Generator module, which has no generate() method.
outputs = gemma.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))

<bos>Give me three best book.

Answer:

1. The Great Gatsby
2. The Catcher in the Rye
3. The Grapes of Wrath<eos>


In [6]:
class Generator(nn.Module):
    """Fuse one text embedding per sample with a sequence of image features.

    The text vector is repeated once per image token, both streams go through
    attention blocks, are cross-attended against each other, summed and passed
    through a final feed-forward block.

    Args:
        embed_dim: Feature size of both the text and the image inputs.
            Defaults to 768, the value that was previously hard-coded.
        num_heads: Number of heads of the attention module shared by
            ``multi_head_attention`` and ``co_attention``. Defaults to 8.
            ``embed_dim`` must be divisible by it.

    NOTE(review): a single ``LayerNorm`` and a single ``Linear`` are shared by
    every sub-block, and ``multi_att`` is shared by the image self-attention
    and both co-attention calls. Confirm this weight sharing is intended.
    """

    def __init__(self, embed_dim=768, num_heads=8):
        super().__init__()
        # Creation order is kept so that seeded initialisation is unchanged.
        # Both attention modules use the default sequence-first layout.
        self.self_att = nn.MultiheadAttention(embed_dim, 1)
        self.multi_att = nn.MultiheadAttention(embed_dim, num_heads)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.linear = nn.Linear(embed_dim, embed_dim)

    def self_attention(self, x):
        """Single-head self-attention over ``x`` with residual + layer norm."""
        self_out, _ = self.self_att(x, x, x)
        self_out = self.layer_norm(self_out + x)
        return self_out

    def multi_head_attention(self, x):
        """Multi-head self-attention, linear projection, residual + layer norm."""
        multi_out, _ = self.multi_att(x, x, x)
        multi_out = self.linear(multi_out)
        multi_out = self.layer_norm(multi_out + x)
        return multi_out

    def co_attention(self, x, y):
        """Cross-attention with ``x`` as query and ``y`` as key and value.

        NOTE(review): the residual adds ``y`` (the attended stream), not the
        query ``x``. The shapes match in ``forward`` because both streams have
        the same length; confirm the choice of residual is intended.
        """
        # x: self, y: another
        co_out, _ = self.multi_att(x, y, y)
        co_out = self.linear(co_out)
        co_out = self.layer_norm(co_out + y)
        return co_out

    def feed_forward(self, x):
        """Linear projection with residual + layer norm."""
        ff_out = self.linear(x)
        ff_out = self.layer_norm(ff_out + x)
        return ff_out

    def forward(self, text, image):
        """Fuse ``text`` and ``image``.

        Args:
            text: Tensor of shape ``(batch, embed_dim)``.
            image: Tensor of shape ``(batch, tokens, embed_dim)``.

        Returns:
            Tensor of shape ``(tokens, batch, embed_dim)`` (sequence-first).
        """
        # Repeat the text vector once per image token. The count is read from
        # the image tensor rather than fixed at 64, so any number of image
        # tokens works.
        num_tokens = image.size(1)
        text = text.unsqueeze(1).expand(-1, num_tokens, -1)

        # Switch both streams to (tokens, batch, embed_dim) for the attention modules.
        text = text.transpose(0, 1)
        image = image.transpose(0, 1)

        text = self.self_attention(text)
        text = self.feed_forward(text)

        image = self.multi_head_attention(image)

        # The second call sees the text stream already updated by the first.
        text = self.co_attention(text, image)
        image = self.co_attention(image, text)

        output = text + image
        output = self.feed_forward(output)

        return output

In [8]:
# Seed torch before building the model so that weight initialisation (and any
# later draws from torch's global RNG) repeat across kernel restarts, in line
# with random_state=42 used for the train/test split.
torch.manual_seed(42)

model = Generator()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Put the trainable fusion model and the LLM on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
gemma.to(device)
print(1)

1


In [9]:
# Training / evaluation driver: for every epoch, iterate train_loader and then
# test_loader, and append the summed per-batch losses to train_losses / test_losses.
# NOTE(review): in its current state this cell is a debugging probe and cannot
# run to completion; see the notes on the individual lines below.
epochs = 10
train_losses = []
test_losses = []
for epoch in range(epochs):
    # Running sums over the batches of this epoch (not means).
    train_loss = 0
    test_loss = 0
    with tqdm(train_loader, unit="batch") as tepoch:
        for text, image, funny_score in tepoch:
            optimizer.zero_grad()
            # Inputs are moved to `device` and cast to float32 before the Generator forward pass.
            temp_output = model(text.to(device).to(torch.float32), image.to(device).to(torch.float32))
            # NOTE(review): temp_output is a Tensor, so `**temp_output` raises the
            # TypeError recorded in this cell's output ("argument after ** must be
            # a mapping, not Tensor"). How the Generator output is meant to be fed
            # to Gemma needs to be decided before this line can work.
            output = gemma.generate(**temp_output, max_new_tokens=200)
            print(output)
            # Debugging stop after the first batch; everything below it in this
            # inner loop is unreachable.
            break
            # NOTE(review): `criterion` is not defined in any cell visible in this
            # notebook, and funny_score is not moved to `device` here.
            loss = criterion(output, funny_score)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            tepoch.set_postfix(loss=train_loss)
    train_losses.append(train_loss)
    with tqdm(test_loader, unit="batch") as tepoch:
        for text, image, funny_score in tepoch:
            # NOTE(review): unlike the training branch, the inputs are neither moved
            # to `device` nor cast to float32, and the loop does not run under
            # torch.no_grad(); the Generator output is compared to funny_score
            # directly, without going through Gemma.
            output = model(text, image)
            loss = criterion(output, funny_score)
            test_loss += loss.item()
            tepoch.set_postfix(loss=test_loss)
    test_losses.append(test_loss)

  0%|          | 0/10 [00:00<?, ?batch/s]

torch.Size([32, 768]) torch.Size([32, 64, 768])





TypeError: transformers.generation.utils.GenerationMixin.generate() argument after ** must be a mapping, not Tensor

In [10]:
# Take the first sample out of the fused batch left in `temp_output` by the
# training cell. Index 0 on the second axis is the same slice as
# transpose(0, 1)[0].
temp = temp_output[:, 0]
temp.shape

torch.Size([64, 768])

In [11]:
def tensor_to_text(tensor: "torch.Tensor") -> str:
    """Serialise a tensor as the string form of its nested Python list.

    The tensor is converted with ``tolist()`` and the result is passed to
    ``str()``; for example a 2x2 integer tensor yields ``"[[1, 2], [3, 4]]"``.
    """
    return str(tensor.tolist())

# Stringify the (64, 768) slice `temp`; the result is tokenised as a prompt in the next cell.
# NOTE(review): that is 49,152 numbers written out as text, which presumably
# exceeds any practical prompt length. Confirm this is the intended way to
# hand the features to the LLM.
input_text = tensor_to_text(temp)

In [None]:
# Run Gemma on the CPU for the stringified-tensor prompt.
gemma.to("cpu")

# Keep the tokenised batch on the same device as the model. Sending it to
# `device` first and back to "cpu" afterwards is a redundant round trip whose
# second step only takes effect because BatchEncoding.to() mutates in place.
input_ids = tokenizer(input_text, return_tensors="pt").to("cpu")
outputs = gemma.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))

In [148]:
# NOTE(review): `temp` is a Tensor, so `**temp` fails with the TypeError shown
# in this cell's output ("argument after ** must be a mapping, not Tensor").
# generate() has to receive keyword inputs, e.g. the tokenizer output used in
# the cell above.
output = gemma.generate(**temp, max_new_tokens=200)

TypeError: transformers.generation.utils.GenerationMixin.generate() argument after ** must be a mapping, not Tensor