In [None]:
!unzip "/content/drive/MyDrive/Minor/unaugumented.zip"

Archive:  /content/drive/MyDrive/Minor/unaugumented.zip
   creating: unaugumented/
  inflating: unaugumented/final_dataset.csv  
   creating: unaugumented/img/
  inflating: unaugumented/img/img1.png  
  inflating: unaugumented/img/img10.png  
  inflating: unaugumented/img/img100.jpg  
  inflating: unaugumented/img/img101.jpg  
  inflating: unaugumented/img/img104.jpg  
  inflating: unaugumented/img/img105.jpg  
  inflating: unaugumented/img/img107.jpg  
  inflating: unaugumented/img/img11.png  
  inflating: unaugumented/img/img110.jpg  
  inflating: unaugumented/img/img111.jpg  
  inflating: unaugumented/img/img112.jpg  
  inflating: unaugumented/img/img113.jpg  
  inflating: unaugumented/img/img114.jpg  
  inflating: unaugumented/img/img115.jpg  
  inflating: unaugumented/img/img116.jpg  
  inflating: unaugumented/img/img118.jpg  
  inflating: unaugumented/img/img119.jpg  
  inflating: unaugumented/img/img12.png  
  inflating: unaugumented/img/img120.jpg  
  inflating: unaugumented/im

In [None]:
import pandas as pd
import os
# Load the annotation table: one row per sample (image name, comment, label).
data = pd.read_csv('/content/unaugumented/final_dataset.csv')
image_names = data['Image Name'].tolist()
comments = data['Comment'].tolist()
labels = data['Label'].tolist()  # sarcastic or non-sarcastic
image_dir = '/content/unaugumented/img'

# Index the image files on disk by extension-less stem and by exact file name.
# NOTE(review): assumes 'Image Name' holds the image file name, with or without
# its extension (e.g. "img1" or "img1.png") — confirm against the CSV.
image_files = sorted(
    fname for fname in os.listdir(image_dir)
    if fname.lower().endswith(('.png', '.jpg', '.jpeg'))
)
files_by_key = {os.path.splitext(fname)[0]: fname for fname in image_files}
files_by_key.update({fname: fname for fname in image_files})  # exact names take priority

# Build image_paths in CSV row order so image_paths[i], comments[i] and labels[i]
# always describe the same sample. A raw os.listdir() listing has arbitrary order
# and would pair image features with the wrong comment and label.
image_paths = []
missing_images = []
for name in image_names:
    key = os.path.basename(str(name).strip())
    fname = files_by_key.get(key) or files_by_key.get(os.path.splitext(key)[0])
    if fname is None:
        missing_images.append(key)
    else:
        image_paths.append(os.path.join(image_dir, fname))

# Fail loudly rather than silently producing misaligned features.
if missing_images:
    raise FileNotFoundError(
        f"{len(missing_images)} image(s) listed in the CSV were not found in "
        f"{image_dir}; first few: {missing_images[:5]}"
    )

In [None]:
# Minimal preprocessing for sarcasm detection
def preprocess_text(text):
    """Lowercase a comment, tolerating missing values.

    Args:
        text: The raw comment. Usually a ``str``; may be ``None`` or a float
            NaN, which is what pandas yields for an empty CSV cell.

    Returns:
        The lowercased comment, or ``''`` when the value is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # str() keeps non-string cells (e.g. a purely numeric comment) from crashing.
    return str(text).lower()

# Run every raw comment through the minimal preprocessing step, keeping row order
processed_comments = list(map(preprocess_text, comments))


In [None]:
import torch
from torchvision import models, transforms
from PIL import Image

# Pretrained model
# Request the ImageNet weights through the explicit `weights=` enum; the older
# `pretrained=True` flag is deprecated in torchvision. IMAGENET1K_V1 selects the
# same resnet50-0676ba61 checkpoint, so the extracted features are unchanged.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Drop the last child module (ResNet's fully connected classifier) so the
# network outputs pooled features instead of class scores.
model = torch.nn.Sequential(*list(model.children())[:-1])
# Inference mode: batch-norm layers use their stored running statistics.
model.eval()

# Image preprocessing
# Per-channel mean / std for normalisation (the standard ImageNet statistics
# used with torchvision's pretrained models).
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

_preprocess_steps = [
    transforms.Resize(256),       # resize so the shorter side is 256 px
    transforms.CenterCrop(224),   # take the central 224x224 patch
    transforms.ToTensor(),        # PIL image -> float tensor
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
preprocess = transforms.Compose(_preprocess_steps)

# Extract features from images
def extract_image_features(image_path):
    """Compute a feature vector for one image with the truncated ResNet.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        A NumPy array holding the model output with all singleton dimensions
        squeezed out (a flat feature vector for a single image).
    """
    # The context manager closes the underlying file as soon as the pixels are
    # decoded; convert() returns an independent in-memory RGB copy.
    with Image.open(image_path) as raw_img:
        img = raw_img.convert('RGB')
    img_tensor = preprocess(img).unsqueeze(0)  # Add batch dimension
    # No gradients are needed for feature extraction.
    with torch.no_grad():
        features = model(img_tensor)
    return features.squeeze().numpy()  # Return as numpy array

# One feature vector per entry of image_paths, in the same order
image_features = list(map(extract_image_features, image_paths))

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 58.4MB/s]


In [None]:
from transformers import BertTokenizer, BertModel

# Load the pretrained BERT tokenizer and encoder from the same checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

def extract_text_features(text):
    """Embed a comment as the mean of BERT's last-layer token states.

    Args:
        text: The comment to encode; tokenised with truncation at 512 tokens.

    Returns:
        A NumPy array: the last hidden state averaged over dimension 1 (the
        token axis), with singleton dimensions squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Forward pass only; no gradients are tracked.
    with torch.no_grad():
        bert_out = bert_model(**encoded)
    token_states = bert_out.last_hidden_state
    pooled = token_states.mean(dim=1)  # Average of last hidden state
    return pooled.squeeze().numpy()

# One embedding per preprocessed comment, in the same order
text_features = list(map(extract_text_features, processed_comments))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import numpy as np

# Concatenate image and text features
# zip() stops at the shorter input, so a count mismatch would silently drop
# samples; check the lengths explicitly first.
if len(image_features) != len(text_features):
    raise ValueError(
        f"Feature count mismatch: {len(image_features)} image feature vectors "
        f"vs {len(text_features)} text feature vectors"
    )

# Each fused vector is the image features followed by the text features.
concatenated_features = [
    np.concatenate((img_feat, text_feat))
    for img_feat, text_feat in zip(image_features, text_features)
]


In [None]:
# Build the feature table: one row per sample, one column per feature value,
# plus a trailing 'label' column
feature_frame = pd.DataFrame(concatenated_features)
concatenated_df = feature_frame.assign(label=labels)

# Persist the table to CSV without the row index
output_csv = 'concatenated_features_bert_unaug.csv'
concatenated_df.to_csv(output_csv, index=False)

In [None]:
# Preview the first rows of the fused feature table (feature columns plus 'label').
concatenated_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2807,2808,2809,2810,2811,2812,2813,2814,2815,label
0,0.023767,0.688126,0.187356,0.156375,0.635659,0.789264,0.642427,0.048202,0.229867,0.586493,...,-0.547676,0.336935,-0.183516,0.420551,-0.034758,-0.03764,-0.281426,-0.029275,-0.186172,1
1,0.085713,1.327546,0.316282,0.359921,1.023657,0.114407,0.994263,0.285618,0.250206,0.488024,...,-0.19727,-0.009535,-0.212124,0.02852,-0.307864,0.053031,-0.015583,-0.235781,-0.292257,1
2,0.139909,2.038962,0.099394,0.105803,0.453012,0.59024,0.655892,1.104721,0.083357,0.258617,...,0.021123,0.028912,0.018958,-0.112458,-0.548774,-0.058277,-0.217674,-0.080026,-0.283298,1
3,0.266485,0.958967,0.27802,0.220421,0.952355,1.06846,0.521217,0.347555,0.16694,0.405613,...,-0.260781,-0.248117,-0.521603,0.156626,-0.119494,0.151568,0.237638,-0.122058,-0.273515,1
4,0.122599,0.872869,1.730311,0.120516,0.930536,0.902072,0.360049,0.068582,0.565898,0.575123,...,-0.384094,0.012216,-0.317381,-0.163353,-0.44733,0.204188,0.015534,-0.230478,-0.115329,1


In [None]:
# (rows, columns) of the feature table.
# NOTE(review): the saved outputs in this notebook disagree on the row count —
# this cell shows 494 rows while info()/describe() report 1482 — so the cells
# were presumably executed against different data. Restart the kernel and run
# all cells to confirm which figure is current.
concatenated_df.shape

(494, 2817)

In [None]:
# Summarise the table: index range, column count, dtypes and memory usage.
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Columns: 2817 entries, 0 to label
dtypes: float32(2816), int64(1)
memory usage: 15.9 MB


In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
concatenated_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2807,2808,2809,2810,2811,2812,2813,2814,2815,label
count,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,...,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0
mean,0.304823,0.782872,0.448042,0.374694,0.526746,0.401468,0.483418,0.390814,0.273618,0.46148,...,-0.067225,-0.069976,-0.087555,0.049412,-0.140364,0.011582,-0.173546,0.055851,0.002471,0.491903
std,0.287271,0.586953,0.424811,0.329457,0.465482,0.359711,0.419924,0.371691,0.259559,0.383091,...,0.202495,0.210003,0.20785,0.196215,0.241978,0.179852,0.184308,0.173933,0.187055,0.500103
min,0.0,0.0,0.0,0.0,0.000897,0.0,0.0,0.0,0.0,0.0,...,-0.683201,-0.694169,-0.835839,-0.732374,-0.944323,-0.726999,-0.7709,-0.445314,-0.607458,0.0
25%,0.106978,0.366846,0.137457,0.136928,0.212712,0.143178,0.171393,0.129435,0.09319,0.188996,...,-0.208136,-0.218019,-0.223529,-0.074232,-0.3089,-0.110639,-0.295651,-0.060313,-0.125634,0.0
50%,0.222924,0.647263,0.307604,0.277609,0.410034,0.309294,0.375004,0.287678,0.202675,0.371946,...,-0.072324,-0.061569,-0.079224,0.049961,-0.141371,0.014149,-0.173915,0.050146,0.005842,0.0
75%,0.414492,1.065992,0.643637,0.517085,0.702688,0.532042,0.673385,0.520405,0.3757,0.632032,...,0.062892,0.073909,0.059336,0.187721,0.022156,0.137457,-0.053295,0.170903,0.12852,1.0
max,1.92682,3.540485,3.19161,2.098533,4.052948,2.531533,2.822402,3.11873,1.896577,2.454827,...,0.603689,0.663549,0.506204,0.709158,0.579975,0.60307,0.381173,0.748879,0.586631,1.0
