<a href="https://colab.research.google.com/github/omi1215/Text-Extraction-from-HarMeme-Dataset/blob/main/Feature_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import re

# Function to preprocess the text
def preprocess_text(text):
    """Normalise a raw text value into a lowercase, punctuation-free string.

    Processing order:
      1. Missing values (``None`` or float NaN) become ``''``.
      2. The value is converted with ``str`` and lowercased.
      3. Every character that is neither a word character (letters, digits,
         underscore) nor whitespace is removed. Digits are kept.
      4. Runs of whitespace (including newlines) collapse to one space and
         the result is trimmed.

    Args:
        text: Raw cell value; non-string values are converted with ``str``.

    Returns:
        str: The cleaned text, or ``''`` for a missing value.
    """
    # str(None) / str(nan) would otherwise leak the literal words
    # 'none' / 'nan' into the corpus. NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    # Remove punctuation and other symbols
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse whitespace last: stripping a symbol that sat between two spaces
    # (e.g. "home & hardware") would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Add a cleaned 'processed_text' column, derived from 'text', to each dataset
for meme_df in (covid_df, us_pol_df):
    meme_df['processed_text'] = meme_df['text'].apply(preprocess_text)

# Preview the first rows of both frames to verify the new column
covid_df.head(), us_pol_df.head()


(       image_name                                               text  \
 0  memes_4498.png  Alice\nGambell\nWe need a rewind 0 people have...   
 1  memes_4877.png  Proof the Republican Party doesn't discriminat...   
 2  memes_4394.png  Biden's campaign manager after every\nlive int...   
 3  memes_4672.png  WOODSTOCK\nT\nHOME & HARDWARE\nTRUMP'S WIVES W...   
 4  memes_6563.png  Brendan Bergen\n@carpetislava\nBIDEN: How come...   
 
                                       processed_text  
 0  alice gambell we need a rewind 0 people have b...  
 1  proof the republican party doesnt discriminate...  
 2  bidens campaign manager after every live inter...  
 3  woodstock t home  hardware trumps wives were i...  
 4  brendan bergen carpetislava biden how come you...  ,
        image_name                                               text  \
 0  memes_4498.png  Alice\nGambell\nWe need a rewind 0 people have...   
 1  memes_4877.png  Proof the Republican Party doesn't discriminat...   
 2  

In [7]:
# Destination CSV files for the cleaned text of each dataset
covid_output_path = '/content/drive/MyDrive/covid_processed_texts.csv'
us_pol_output_path = '/content/drive/MyDrive/us_pol_processed_texts.csv'

# Only the image identifier and its cleaned text are written out
export_columns = ['image_name', 'processed_text']
covid_df[export_columns].to_csv(covid_output_path, index=False)
us_pol_df[export_columns].to_csv(us_pol_output_path, index=False)

# Report where each file was written
print(f'CSV for Covid processed texts saved to: {covid_output_path}')
print(f'CSV for US Pol processed texts saved to: {us_pol_output_path}')


CSV for Covid processed texts saved to: /content/drive/MyDrive/covid_processed_texts.csv
CSV for US Pol processed texts saved to: /content/drive/MyDrive/us_pol_processed_texts.csv


In [2]:
from transformers import DistilBertTokenizer, DistilBertModel
from sentence_transformers import SentenceTransformer
import torch
import pandas as pd

# Read the cleaned text back from the CSV files on Google Drive
covid_df = pd.read_csv('/content/drive/MyDrive/covid_processed_texts.csv')
us_pol_df = pd.read_csv('/content/drive/MyDrive/us_pol_processed_texts.csv')

# Replace missing values with '' and force str so every row can be encoded
for text_df in (covid_df, us_pol_df):
    text_df['processed_text'] = text_df['processed_text'].fillna('').astype(str)

# DistilBERT tokenizer and model are loaded from the same checkpoint name
distilbert_checkpoint = 'distilbert-base-uncased'
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(distilbert_checkpoint)
distilbert_model = DistilBertModel.from_pretrained(distilbert_checkpoint)

# Sentence-BERT encoder
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract DistilBERT features
def extract_distilbert_features(text):
    """Embed ``text`` with DistilBERT by averaging its final-layer token states.

    The text is tokenised (with truncation enabled), run through
    ``distilbert_model`` without gradient tracking, and the model's
    ``last_hidden_state`` is averaged over dimension 1.

    Args:
        text: Text to embed.

    Returns:
        The averaged hidden state, squeezed and converted to a NumPy array.
    """
    encoded = distilbert_tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():  # inference only, no autograd graph needed
        hidden_states = distilbert_model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Function to extract SBERT features
def extract_sbert_features(text):
    """Return the embedding that ``sbert_model.encode`` produces for ``text``."""
    embedding = sbert_model.encode(text)
    return embedding

# Compute DistilBERT and SBERT embeddings row by row for each dataset
for text_df in (covid_df, us_pol_df):
    cleaned_text = text_df['processed_text']
    text_df['distilbert_features'] = cleaned_text.apply(extract_distilbert_features)
    text_df['sbert_features'] = cleaned_text.apply(extract_sbert_features)

# Preview both frames with their new feature columns
covid_df.head(), us_pol_df.head()


(       image_name                                     processed_text  \
 0  memes_4498.png  alice gambell we need a rewind 0 people have b...   
 1  memes_4877.png  proof the republican party doesnt discriminate...   
 2  memes_4394.png  bidens campaign manager after every live inter...   
 3  memes_4672.png  woodstock t home  hardware trumps wives were i...   
 4  memes_6563.png  brendan bergen carpetislava biden how come you...   
 
                                  distilbert_features  \
 0  [0.108546644, -0.14112994, 0.42620727, 0.05014...   
 1  [-0.07914518, 0.10318212, 0.087846756, 0.19117...   
 2  [0.031965554, -0.2188791, 0.23486552, 0.027350...   
 3  [0.16556902, 0.28730798, 0.32006925, -0.003805...   
 4  [0.19949307, -0.009880979, 0.04618714, -0.0006...   
 
                                       sbert_features  
 0  [-0.020168282, -0.008652444, 0.0170774, 0.0095...  
 1  [-0.047294147, 0.014942359, 0.027965542, 0.003...  
 2  [-0.06936304, -0.006860266, 0.09252098, -0.0

In [3]:
# One CSV per (dataset, text model) pair, each holding image_name plus one feature column
covid_distilbert_csv = '/content/drive/MyDrive/covid_distilbert_features.csv'
covid_sbert_csv = '/content/drive/MyDrive/covid_sbert_features.csv'
us_pol_distilbert_csv = '/content/drive/MyDrive/us_pol_distilbert_features.csv'
us_pol_sbert_csv = '/content/drive/MyDrive/us_pol_sbert_features.csv'

# NOTE(review): each feature cell holds an array object, so the CSV stores its
# string form - confirm that downstream readers parse that format.
covid_df[['image_name', 'distilbert_features']].to_csv(covid_distilbert_csv, index=False)
covid_df[['image_name', 'sbert_features']].to_csv(covid_sbert_csv, index=False)

us_pol_df[['image_name', 'distilbert_features']].to_csv(us_pol_distilbert_csv, index=False)
us_pol_df[['image_name', 'sbert_features']].to_csv(us_pol_sbert_csv, index=False)

# Display the written paths as the cell output
covid_distilbert_csv, covid_sbert_csv, us_pol_distilbert_csv, us_pol_sbert_csv

('/content/drive/MyDrive/covid_distilbert_features.csv',
 '/content/drive/MyDrive/covid_sbert_features.csv',
 '/content/drive/MyDrive/us_pol_distilbert_features.csv',
 '/content/drive/MyDrive/us_pol_sbert_features.csv')

In [None]:
import torch
from torchvision import models, transforms
from PIL import Image
import os
import pandas as pd

# Per-channel mean / std used to normalise inputs for the pretrained models
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Preprocessing pipeline: resize to 224x224, convert to tensor, normalise
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

# Load pretrained ResNet-152 and VGG19
resnet_model = models.resnet152(pretrained=True)
vgg19_model = models.vgg19(pretrained=True)

# Put both networks into evaluation mode before any inference
for pretrained_net in (resnet_model, vgg19_model):
    pretrained_net.eval()

# Function to extract image features from ResNet-152
def extract_resnet_features(image_path):
    """Run one image through ``resnet_model`` and return the output as NumPy.

    The image is loaded, converted to RGB, preprocessed with the module-level
    ``transform`` and passed through the model's complete forward pass. The
    returned value is therefore the output of ``resnet_model(...)`` itself,
    not the activation of an intermediate layer.

    Args:
        image_path: Path of the image file to load.

    Returns:
        numpy.ndarray: Model output with the batch dimension squeezed away.
    """
    # The context manager closes the underlying file handle deterministically;
    # convert() returns an independent in-memory image that outlives the file.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert("RGB")

    batch = transform(rgb_image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        features = resnet_model(batch)
    return features.squeeze().numpy()  # Remove batch dimension and convert to numpy

# Function to extract image features from VGG19
def extract_vgg19_features(image_path):
    """Run one image through ``vgg19_model.features`` and return a flat vector.

    The image is loaded, converted to RGB and preprocessed with the
    module-level ``transform``. Only the ``features`` sub-module of the model
    is applied; its output is flattened per batch element.

    Args:
        image_path: Path of the image file to load.

    Returns:
        numpy.ndarray: Flattened output of ``vgg19_model.features`` with the
        batch dimension squeezed away.
    """
    # The context manager closes the underlying file handle deterministically;
    # convert() returns an independent in-memory image that outlives the file.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert("RGB")

    batch = transform(rgb_image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        features = vgg19_model.features(batch)
        features = features.view(features.size(0), -1)  # Flatten the features
    return features.squeeze().numpy()  # Remove batch dimension and convert to numpy

# Function to process image folder and extract features
def process_image_folder(image_folder, model_type="resnet"):
    """Extract features for every ``.png`` / ``.jpg`` file in a folder.

    Args:
        image_folder: Directory whose entries are scanned (not recursive).
        model_type: ``"resnet"`` to use ``extract_resnet_features`` or
            ``"vgg19"`` to use ``extract_vgg19_features``.

    Returns:
        list[tuple]: ``(image_name, features)`` pairs, ordered by file name.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    # Validate before touching the folder: previously an unknown model_type
    # surfaced as an UnboundLocalError on the first image, or passed silently
    # when the folder contained no images.
    if model_type not in ("resnet", "vgg19"):
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'resnet' or 'vgg19'"
        )
    extractor = extract_resnet_features if model_type == "resnet" else extract_vgg19_features

    features_list = []
    # os.listdir returns entries in arbitrary order; sorting keeps repeated
    # runs (and the resnet / vgg19 outputs) in the same row order.
    for image_name in sorted(os.listdir(image_folder)):
        if not image_name.endswith(('.png', '.jpg')):
            continue
        image_path = os.path.join(image_folder, image_name)
        features_list.append((image_name, extractor(image_path)))
    return features_list

# Folders holding the meme images of each dataset
covid_image_folder = '/content/drive/MyDrive/HarMeme_Images/HarMeme_Images/harmeme_images_covid_19'
us_pol_image_folder = '/content/drive/MyDrive/HarMeme_Images/HarMeme_Images/harmeme_images_us_pol'

# For each folder, run the ResNet-152 extractor first and the VGG19 extractor second
covid_resnet_features, covid_vgg19_features = [
    process_image_folder(covid_image_folder, model_type=backbone)
    for backbone in ("resnet", "vgg19")
]

us_pol_resnet_features, us_pol_vgg19_features = [
    process_image_folder(us_pol_image_folder, model_type=backbone)
    for backbone in ("resnet", "vgg19")
]

# Column layouts shared by the two datasets
resnet_columns = ['image_name', 'resnet_features']
vgg19_columns = ['image_name', 'vgg19_features']

# Wrap each list of (image_name, features) pairs in a two-column DataFrame
covid_resnet_df = pd.DataFrame(covid_resnet_features, columns=resnet_columns)
covid_vgg19_df = pd.DataFrame(covid_vgg19_features, columns=vgg19_columns)

us_pol_resnet_df = pd.DataFrame(us_pol_resnet_features, columns=resnet_columns)
us_pol_vgg19_df = pd.DataFrame(us_pol_vgg19_features, columns=vgg19_columns)

# Save the extracted features to CSV files
import sys

import numpy as np

# Each feature cell is a NumPy array that to_csv writes via its string form.
# NumPy abbreviates arrays longer than its print threshold with '...', which
# would silently drop most values of long vectors such as the flattened VGG19
# feature maps. Raising the threshold while writing keeps every value; arrays
# that were already short enough are written exactly as before.
with np.printoptions(threshold=sys.maxsize):
    covid_resnet_df.to_csv('/content/drive/MyDrive/covid_resnet_features.csv', index=False)
    covid_vgg19_df.to_csv('/content/drive/MyDrive/covid_vgg19_features.csv', index=False)

    us_pol_resnet_df.to_csv('/content/drive/MyDrive/us_pol_resnet_features.csv', index=False)
    us_pol_vgg19_df.to_csv('/content/drive/MyDrive/us_pol_vgg19_features.csv', index=False)

# Return file paths to confirm
'/content/drive/MyDrive/covid_resnet_features.csv', '/content/drive/MyDrive/covid_vgg19_features.csv', '/content/drive/MyDrive/us_pol_resnet_features.csv', '/content/drive/MyDrive/us_pol_vgg19_features.csv'


Downloading: "https://download.pytorch.org/models/resnet152-394f9c45.pth" to /root/.cache/torch/hub/checkpoints/resnet152-394f9c45.pth
100%|██████████| 230M/230M [00:01<00:00, 180MB/s]
Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [00:03<00:00, 152MB/s]
