In [None]:
# run this only one time and then restart runtime, do not run upon restart again
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.4
    Uninstalling openai-1.54.4:
      Successfully uninstalled openai-1.54.4
Successfully installed openai-0.28.0


In [None]:
# Mount Google Drive at /content/drive; the dataset CSV and the image folder
# used below are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import ast
import os

import numpy as np
import openai
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import BertTokenizer

from google.colab import userdata

# Initialize the OpenAI API key from the Colab secret named 'api_key'.
# Every user has to generate their own key; it is required by the cells that
# request GPT representations and predictions.
openai.api_key = userdata.get('api_key')
# Load the dataset CSV from the mounted Drive
data_path = '/content/drive/MyDrive/NLP_fall_2024/processed_datasets/new_russian_processed_data.csv'
df = pd.read_csv(data_path)

# Split the dataset into training (70%) and testing (30%) with a fixed seed.
# NOTE(review): train_df/test_df are reassigned later from processed_dataset.csv
# with an 80/20 split, so this split only feeds the image-loading sanity check.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
print("After split, train_df columns:", train_df.columns)

# Folder holding the images; files are looked up as "<id>.<ext>" by the loaders below
img_folder_path = '/content/drive/MyDrive/NLP_fall_2024/russian-images-archive'

# Count the number of files in the folder
def count_images_in_folder(folder_path):
    """Count the image files located directly inside ``folder_path``.

    An entry counts as an image when its file extension is ``.png``, ``.jpg``
    or ``.jpeg``. The extension is compared case-insensitively and must be a
    real extension (preceded by a dot), so names such as ``notapng`` or ``jpg``
    are not counted while ``photo.PNG`` is.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        int: Number of entries with an image extension.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    image_extensions = {'.png', '.jpg', '.jpeg'}
    return sum(
        1
        for file in os.listdir(folder_path)
        # splitext isolates the real extension; lower() makes the check case-insensitive
        if os.path.splitext(file)[1].lower() in image_extensions
    )

# Report how many image files the folder contains, as a quick check of the Drive path.
image_count = count_images_in_folder(img_folder_path)
print(f"Number of image files in the folder: {image_count}")

After split, train_df columns: Index(['id', 'text', 'label'], dtype='object')
Number of image files in the folder: 196


In [None]:
# Show the raw directory listing to inspect how image files are named.
# NOTE(review): this prints every filename in the folder; consider truncating
# the listing if the folder grows.
print("Files in image folder:")
print(os.listdir(img_folder_path))

def load_image(image_id, img_folder_path):
    """Load the image stored as ``<image_id>.<ext>`` and return it as an array.

    The extensions ``png``, ``jpg`` and ``jpeg`` are probed in that order and
    the first existing file wins. The image is converted to RGB and resized to
    224x224 before being turned into a NumPy array.

    Args:
        image_id: Identifier used as the file stem (formatted with ``str``).
        img_folder_path: Directory that contains the image files.

    Returns:
        numpy.ndarray of the resized RGB image, or ``None`` when no file with
        one of the probed extensions exists for ``image_id``.
    """
    for ext in ['png', 'jpg', 'jpeg']:
        image_path = os.path.join(img_folder_path, f"{image_id}.{ext}")
        if not os.path.exists(image_path):
            continue
        # The context manager closes the underlying file as soon as the pixel
        # data has been converted; convert()/resize() return independent images.
        with Image.open(image_path) as source:
            image = source.convert('RGB').resize((224, 224))
        print(f"Loaded image: {image_path}")
        return np.array(image)
    return None

# Sanity check: try to load the images of (up to) the first five training rows.
# head(5) keeps this safe for frames with fewer than five rows, and the id is
# read once per row instead of being looked up again for every message.
for image_id in train_df['id'].head(5):
    image = load_image(image_id, img_folder_path)
    if image is None:
        print(f"Image with ID {image_id} not loaded.")
    else:
        print(f"Image with ID {image_id} loaded successfully.")

Files in image folder:
['35.jpg', '36.jpg', '37.jpg', '67.png', '38.jpg', '68.png', '69.png', '70.png', '72.png', '71.png', '73.png', '74.png', '75.png', '76.png', '39.png', '77.png', '40.png', '78.png', '41.png', '42.png', '79.png', '43.png', '80.png', '44.png', '81.png', '45.png', '46.png', '47.png', '49.png', '48.png', '50.png', '52.png', '51.png', '53.png', '54.png', '82.png', '83.png', '57.png', '55.png', '56.png', '58.png', '60.png', '84.png', '61.png', '62.png', '63.png', '59.png', '21.jpg', '22.jpg', '23.jpg', '25.jpg', '26.jpg', '27.jpg', '28.jpg', '24.jpg', '29.jpg', '30.jpg', '31.jpg', '32.jpg', '33.jpg', '1.jpg', '2.jpg', '3.jpg', '5.jpg', '10.jpg', '11.jpg', '4.jpg', '13.jpg', '12.jpg', '8.jpg', '6.jpg', '7.jpg', '14.jpg', '15.jpg', '16.jpg', '18.jpg', '19.jpg', '20.jpg', '17.jpg', '34.jpg', '89.png', '90.png', '91.jpg', '92.png', '93.png', '94.png', '95.png', '101.jpg', '102.jpg', '103.jpg', '104.jpg', '105.jpg', '106.jpg', '107.jpg', '108.jpg', '109.jpg', '110.jpg', '111

In [None]:
def get_representation(text, model="gpt-4-turbo"):
    """
    Ask a chat model for a concise textual representation of ``text``.

    The request goes through the legacy ``openai.ChatCompletion`` interface
    (the one shipped with the pinned ``openai==0.28`` package).

    Args:
        text: Input text. Must be a non-blank string; anything else (for
            example the float NaN pandas uses for an empty CSV cell) is
            rejected without calling the API.
        model: Name of the chat model to query.

    Returns:
        The model's reply as a string, or ``None`` when the input is invalid
        or the request fails for any reason.
    """
    # Reject missing/blank input up front so no paid API call is spent on it.
    if not isinstance(text, str) or not text.strip():
        print(f"Error generating representation: invalid input text {text!r}")
        return None

    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an assistant that generates concise representations for embeddings."},
                {"role": "user", "content": text}
            ]
        )
        # Extract concise representation
        representation = response['choices'][0]['message']['content']
        print(f"Representation generated for text: {text[:50]}...")
        return representation
    except Exception as e:
        # Deliberate best-effort: one failed request must not abort a long
        # batch run; the caller treats None as "no representation".
        print(f"Error generating representation: {e}")
        return None

In [None]:
# Smoke test: send one short sentence through get_representation and show the
# reply (prints None if the request failed).
test_text = "This is a sample text for testing."
representation = get_representation(test_text)
print("Generated representation:", representation)

Representation generated for text: This is a sample text for testing....
Generated representation: This text is meant for testing purposes.


In [None]:
from transformers import AutoTokenizer

# Shared tokenizer that turns the generated representations into token-id lists.
# NOTE(review): "bert-base-uncased" is presumably an English-centric vocabulary,
# while the dataset text shown in the logs is Russian — confirm the generated
# representations are tokenized meaningfully.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def generate_and_tokenize_representations(dataframe, model="gpt-4-turbo"):
    """Build one token-id list per row of ``dataframe``.

    For every row the ``'text'`` value is sent to ``get_representation``; the
    reply is tokenized with the module-level ``tokenizer`` (truncated/padded to
    128 ids). Rows for which no representation came back are reported and
    contribute ``None`` so the result stays aligned with the dataframe rows.

    Args:
        dataframe: Frame with a ``'text'`` column.
        model: Chat model name forwarded to ``get_representation``.

    Returns:
        list: One entry per row, either a list of input ids or ``None``.
    """
    token_id_rows = []
    for row_label, record in dataframe.iterrows():
        summary = get_representation(record['text'], model)
        if not summary:
            # Keep a placeholder so positions still match the dataframe.
            print(f"Failed to generate representation for row {row_label}")
            token_id_rows.append(None)
            continue
        encoded = tokenizer(summary, truncation=True, padding="max_length", max_length=128)
        token_id_rows.append(encoded['input_ids'])
    return token_id_rows

# Generate and tokenize representations for the entire dataset.
# This makes one get_representation call per row of df; rows whose call
# failed hold None in the new 'tokenized' column.
df['tokenized'] = generate_and_tokenize_representations(df)

# Persist the token-id lists (as an object array) and the augmented dataframe
# so that later cells can reload them from disk.
np.save("tokenized_representations.npy", np.array(df['tokenized'].tolist(), dtype=object))
df.to_csv("processed_dataset.csv", index=False)
print("Tokenized representations saved to 'tokenized_representations.npy'")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Representation generated for text: открыл рецепт простого борща ? поздравляю , ты офи...
Representation generated for text: если я когда-нибудь напишу книгу , она будет назыв...
Representation generated for text: осенний суп . вкусно и просто за 5 минут ! сохрани...
Representation generated for text: - как ты ? - ниче , держусь , работаю...
Representation generated for text: надписи на футболке : камень , бумага , ножницы , ...
Representation generated for text: санёк , что ты носишься ? премию не дадут , подзат...
Representation generated for text: ах , я сошла с ума ! какая досада ! emoji_2 # сарк...
Representation generated for text: сегодня международный день защиты мужской нервной ...
Representation generated for text: # квадроберы # сарказм # стопдегротам # квадроберы...
Representation generated for text: парковаться задом # сарказм...
Representation generated for text: суперлуние по уральски . на данном кадре мы наблюд...
Representation generated for text: # сарказм следы более 

In [None]:
class MultimodalDatasetWithTokens(Dataset):
    """Dataset pairing each dataframe row with its image and token ids.

    Each row must provide ``'id'``, ``'text'``, ``'label'`` and ``'tokenized'``
    (a sequence of token ids). The image is looked up in ``img_folder`` as
    ``<id>.<ext>``. Samples without an image are returned as ``None`` so that
    the DataLoader's collate function can filter them out.
    """

    # Extensions probed, in this order, when resolving "<id>.<ext>".
    IMAGE_EXTENSIONS = ('png', 'jpg', 'jpeg')

    def __init__(self, dataframe, img_folder, transform=None):
        """
        Args:
            dataframe: Source rows; the index is reset so positions run 0..n-1.
            img_folder: Directory containing the image files.
            transform: Optional callable applied to the loaded RGB image.
        """
        self.dataframe = dataframe.reset_index(drop=True)
        self.img_folder = img_folder
        self.transform = transform

    def __len__(self):
        """Return the number of rows (including rows whose image is missing)."""
        return len(self.dataframe)

    def _find_image_path(self, image_id):
        """Return the first existing ``<image_id>.<ext>`` path, or ``None``."""
        for ext in self.IMAGE_EXTENSIONS:
            image_path = os.path.join(self.img_folder, f"{image_id}.{ext}")
            if os.path.exists(image_path):
                return image_path
        return None

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            ``None`` when no image exists for the row; otherwise a dict with
            ``"id"``, ``"text"``, ``"text_tokens"`` (long tensor), ``"image"``
            (transformed if a transform was given) and ``"label"`` (long
            tensor, or ``None`` when the row's label is ``None``).
        """
        row = self.dataframe.iloc[idx]
        image_id = row['id']

        # Resolve the image first: rows without an image are skipped, so their
        # token ids must not be parsed (a row that will be dropped anyway must
        # not be able to crash the loader).
        image_path = self._find_image_path(image_id)
        if image_path is None:
            return None  # Returning None allows the DataLoader's collate_fn to filter this out

        # Close the file handle as soon as the pixels have been converted.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        if self.transform:
            image = self.transform(image)

        label = row['label']
        return {
            "id": image_id,
            "text": row['text'],
            "text_tokens": torch.tensor(row['tokenized'], dtype=torch.long),
            "image": image,
            "label": torch.tensor(label, dtype=torch.long) if label is not None else None,
        }

In [None]:
# Image transformations: resize to 224x224, convert to a tensor and normalize
# each channel with the given mean/std.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load the preprocessed dataset
processed_df = pd.read_csv("processed_dataset.csv")
# Rows whose representation could not be generated were written with an empty
# 'tokenized' cell (read back as NaN); drop them instead of crashing on parse.
processed_df = processed_df.dropna(subset=['tokenized']).copy()
# The column holds the text form of a list of ints. literal_eval parses Python
# literals only, so nothing stored in the CSV can execute code (unlike eval).
processed_df['tokenized'] = processed_df['tokenized'].apply(ast.literal_eval)

# Create train/test splits again with tokenized processed dataset (80/20, fixed seed)
train_df = processed_df.sample(frac=0.8, random_state=42)
test_df = processed_df.drop(train_df.index)

# Create Datasets and DataLoaders
train_dataset = MultimodalDatasetWithTokens(train_df, img_folder_path, transform=transform)
test_dataset = MultimodalDatasetWithTokens(test_df, img_folder_path, transform=transform)

def custom_collate_fn(batch):
    """Collate dataset samples into a single batch dict, ignoring ``None`` samples.

    Args:
        batch: List of sample dicts (or ``None`` for samples to skip).

    Returns:
        ``None`` when no usable sample remains; otherwise a dict with the keys
        of the first kept sample. ``"id"`` and ``"text"`` are gathered into
        plain lists, every other field is stacked into one tensor.
    """
    kept = [item for item in batch if item is not None]
    if not kept:
        return None

    passthrough_fields = ("id", "text")  # non-numerical fields stay as lists
    return {
        field: (
            [item[field] for item in kept]
            if field in passthrough_fields
            else torch.stack([item[field] for item in kept])
        )
        for field in kept[0]
    }

# Create DataLoaders with the custom collate function.
# Both loaders use batches of 16; only the training loader shuffles. A yielded
# batch is None when every sample in it was skipped for lack of an image.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=custom_collate_fn)

In [None]:
import torchvision.models as models

class MultimodalGPTModel(nn.Module):
    """Two-class classifier that fuses a 128-dim text vector with ResNet-18 image features.

    Both modalities are projected to 256 dimensions, mixed with per-sample
    softmax weights, concatenated with the raw text vector and classified by a
    small MLP.
    """

    def __init__(self):
        """Build the image encoder, the two projection layers, the gate and the classifier head."""
        super(MultimodalGPTModel, self).__init__()
        # Pretrained ResNet
        # NOTE(review): `pretrained=True` is presumably the older torchvision
        # spelling of "load default weights" — confirm against the installed version.
        resnet = models.resnet18(pretrained=True)
        self.image_encoder = nn.Sequential(
            *(list(resnet.children())[:-1]),  # Remove the final classification layer
            nn.Flatten()
        )
        # Project the flattened ResNet features and the 128-dim text input to 256 dims each
        self.image_fc = nn.Linear(resnet.fc.in_features, 256)
        self.text_fc = nn.Linear(128, 256)
        self.weight_fc = nn.Linear(256, 2)  # Learnable weights for text and image
        # Classifier head over [fused 256-dim features | raw 128-dim text input]
        self.fc = nn.Sequential(
            nn.Linear(256 + 128, 128),
            nn.ReLU(),
            nn.Linear(128, 2)
        )

    def forward(self, text_embeddings, images):
        """Compute class logits for a batch.

        Args:
            text_embeddings: Float tensor of shape (batch_size, 128).
                NOTE(review): the training/testing cells pass BERT token ids
                cast to float here rather than learned embeddings — confirm
                this is intended.
            images: Image batch fed to the ResNet encoder (the shape check
                cell uses (batch_size, 3, 224, 224)).

        Returns:
            Tensor of shape (batch_size, 2) with unnormalized class scores.
        """
        image_features = self.image_encoder(images)
        image_features = self.image_fc(image_features)

        text_features = self.text_fc(text_embeddings)

        # Learnable modality weights
        weights = torch.softmax(self.weight_fc(text_features + image_features), dim=1)  # Size: (batch_size, 2)

        # Combine features using learned weights
        combined_weighted_features = weights[:, 0].unsqueeze(1) * text_features + weights[:, 1].unsqueeze(1) * image_features

        # Concatenate weighted combined features with original features
        combined_features = torch.cat((combined_weighted_features, text_embeddings), dim=1)  # Size: (batch_size, 384)

        outputs = self.fc(combined_features)
        return outputs

In [None]:
# Initialize Model, Optimizer, and Loss Function
# Use the GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): no random seed is set in the visible cells, so weight
# initialisation and batch shuffling differ from run to run.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalGPTModel().to(device)
# Adam over all model parameters; nothing is frozen in the visible code, so the
# ResNet backbone is fine-tuned together with the new layers.
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# Cross-entropy over the model's two output logits
criterion = nn.CrossEntropyLoss()



In [None]:
# Dummy data for testing
dummy_text_embeddings = torch.randn(32, 128).to(device)  # Batch of 32, 128-dimensional embeddings
dummy_images = torch.randn(32, 3, 224, 224).to(device)   # Batch of 32 images, 3 channels, 224x224 size

# Forward pass (shape check only).
# Run it in eval mode and without autograd: in train mode the BatchNorm layers
# of the pretrained ResNet would fold the random-noise batch into their running
# statistics before real training starts, and a gradient graph would be built
# for nothing. The previous train/eval mode is restored afterwards.
was_training = model.training
model.eval()
with torch.no_grad():
    outputs = model(dummy_text_embeddings, dummy_images)
model.train(was_training)
print(f"Model output shape: {outputs.shape}")  # Should be [32, 2] for binary classification

Model output shape: torch.Size([32, 2])


In [None]:
# Train for 10 epochs and print the mean batch loss of every epoch.
for epoch in range(10):  # Number of epochs
    model.train()
    total_loss = 0
    batches_seen = 0  # batches that actually contributed to total_loss

    for batch in train_loader:
        # The collate function yields None when every sample of a batch lacked an image.
        if batch is None:
            continue
        # NOTE(review): token ids are cast to float and fed to the model as if
        # they were embeddings — confirm this is the intended text input.
        text_tokens = batch["text_tokens"].to(device).float()
        images = batch["image"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(text_tokens, images)

        # Compute loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        batches_seen += 1

    # Average over the batches that were really processed; dividing by
    # len(train_loader) would understate the loss whenever batches were skipped.
    # max(..., 1) avoids a ZeroDivisionError if every batch was skipped.
    print(f"Epoch {epoch+1}, Loss: {total_loss/max(batches_seen, 1)}")


Epoch 1, Loss: 657.9050880432129
Epoch 2, Loss: 561.6492797851563
Epoch 3, Loss: 309.96873779296874
Epoch 4, Loss: 263.54943466186523
Epoch 5, Loss: 246.2924545288086
Epoch 6, Loss: 221.2427185058594
Epoch 7, Loss: 160.22191371917725
Epoch 8, Loss: 100.45460729598999
Epoch 9, Loss: 80.3496597290039
Epoch 10, Loss: 59.46617469787598


In [None]:
# do not use, used for full comparison between original label, our model label and gpt4 label
def openai_predict(text, model="gpt-4-turbo"):
    """Ask a chat model whether ``text`` is sarcastic.

    The model is instructed to answer with a single "yes" or "no". The reply is
    parsed defensively: a leading yes/no decides; otherwise a negated phrase
    such as "not sarcastic" maps to 0 before the bare word "sarcastic" is
    allowed to map to 1. (A plain substring test for "sarcastic" would label
    "not sarcastic" as 1 and a bare "Yes" as 0.)

    Args:
        text: Text to classify.
        model: Name of the chat model to query.

    Returns:
        int: 1 for sarcastic, 0 for not sarcastic (also used when the reply
        cannot be interpreted), -1 when the request itself failed.
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a classifier for sarcasm detection. Answer with exactly one word: 'yes' if the text is sarcastic, 'no' if it is not."},
                {"role": "user", "content": f"Is the following sarcastic? {text}"}
            ]
        )
        prediction = response['choices'][0]['message']['content'].strip().lower()

        # Preferred path: the reply starts with the requested yes/no.
        first_word = prediction.split()[0].strip(".,!?:;\"'") if prediction else ""
        if first_word == "yes":
            return 1
        if first_word == "no":
            return 0

        # Fallback for free-form replies: check negations before the bare keyword.
        negations = ("not sarcastic", "non-sarcastic", "isn't sarcastic", "not be sarcastic", "no sarcasm")
        if any(phrase in prediction for phrase in negations):
            return 0
        return 1 if "sarcastic" in prediction else 0  # Assuming binary classification
    except Exception as e:
        # Best-effort: a failed request is reported and flagged with -1 so the
        # comparison run can continue.
        print(f"OpenAI prediction failed: {e}")
        return -1  # Indicate failure

# Compare, sample by sample, the dataset label, the trained model's prediction
# and the GPT prediction on the test set, then save everything to a CSV.
model.eval()
true_labels = []
predicted_labels = []
openai_predictions = []
results = []  # To store detailed results for comparison

print("\nTesting starts...")
with torch.no_grad():
    for batch_idx, batch in enumerate(test_loader):
        # The collate function yields None when every sample of a batch lacked an image.
        if batch is None:
            continue

        images = batch["image"].to(device)
        text_tokens = batch["text_tokens"].to(device).float()
        labels = batch["label"].to(device)
        texts = batch["text"]

        # Forward pass through the trained model
        model_outputs = model(text_tokens, images)
        model_preds = torch.argmax(model_outputs, dim=1).cpu().tolist()

        # Get predictions from OpenAI for this batch only. They are kept in a
        # per-batch list because the cumulative openai_predictions list must
        # not be indexed with a within-batch position (that would reuse the
        # first batch's predictions for every later batch).
        batch_openai_preds = [openai_predict(text) for text in texts]
        openai_predictions.extend(batch_openai_preds)

        # Log results for comparison
        for idx in range(len(labels)):
            results.append({
                "text": texts[idx],
                "true_label": labels[idx].item(),
                "model_pred": model_preds[idx],
                "openai_pred": batch_openai_preds[idx]
            })

        true_labels.extend(labels.cpu().tolist())
        predicted_labels.extend(model_preds)

# Save detailed results for comparison
results_df = pd.DataFrame(results)
results_df.to_csv("full_comparison_results.csv", index=False)
print("Full comparison results saved to full_comparison_results.csv.")



Testing starts...
Full comparison results saved to full_comparison_results.csv.


In [None]:
# Testing loop: run the trained model over the test set, record one result row
# per sample and collect the labels/predictions needed for the metrics.
model.eval()
true_labels, predicted_labels, results = [], [], []

print("\nTesting starts...")
with torch.no_grad():
    for batch_idx, batch in enumerate(test_loader):
        print(f"\nBatch {batch_idx + 1}:")

        # custom_collate_fn yields None when it filtered out every sample
        if batch is None:
            print(f"Batch {batch_idx + 1} skipped due to missing data.")
            continue

        # Tensors go to the device; ids and texts stay plain Python lists
        images = batch["image"].to(device)
        text_tokens = batch["text_tokens"].to(device).float()
        labels = batch["label"].to(device)
        ids = batch["id"]
        texts = batch["text"]

        # Forward pass and hard predictions
        outputs = model(text_tokens, images)
        preds = torch.argmax(outputs, dim=1).cpu().tolist()
        batch_truth = labels.cpu().tolist()

        # One result row per sample: original label next to the model's prediction
        results.extend(
            {
                "id": sample_id,
                "text": sample_text,
                "true_label": truth,
                "predicted_label": guess,
            }
            for sample_id, sample_text, truth, guess in zip(ids, texts, batch_truth, preds)
        )

        # Accumulate for metric computation
        true_labels.extend(batch_truth)
        predicted_labels.extend(preds)
        print(f"Predictions for batch: {preds}")

# Save detailed results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("comparison_results.csv", index=False)
print("Detailed results saved to comparison_results.csv.")



Testing starts...

Batch 1:
Predictions for batch: [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1]

Batch 2:
Predictions for batch: [0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1]

Batch 3:
Predictions for batch: [0, 1, 1, 0, 0, 0, 1, 0]
Detailed results saved to comparison_results.csv.


In [None]:
# Calculate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Overall accuracy plus support-weighted precision, recall and F1
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1 = (
    scorer(true_labels, predicted_labels, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Two-column summary table (metric name, value) for display
metrics_df = pd.DataFrame(
    [
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-Score", f1),
    ],
    columns=["Metric", "Value"],
)

# Per-class breakdown first, then the summary table
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels, target_names=["Non-Sarcastic", "Sarcastic"]))

print("\nMetrics Summary:")
print(metrics_df)


Classification Report:
               precision    recall  f1-score   support

Non-Sarcastic       0.53      0.43      0.48        23
    Sarcastic       0.38      0.47      0.42        17

     accuracy                           0.45        40
    macro avg       0.45      0.45      0.45        40
 weighted avg       0.46      0.45      0.45        40


Metrics Summary:
      Metric     Value
0   Accuracy  0.450000
1  Precision  0.464536
2     Recall  0.450000
3   F1-Score  0.452757
