In [None]:
import os
import subprocess
import zipfile

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from torchvision.io import ImageReadMode, read_image
from transformers import BertTokenizer


In [None]:
# Extract images.zip into /content/images.
# check=True raises CalledProcessError when unzip exits with a non-zero status,
# so a failed extraction stops the notebook instead of passing silently.
subprocess.run(["unzip", "images.zip", "-d", "/content/images"], check=True)

CompletedProcess(args=['unzip', 'images.zip', '-d', '/content/images'], returncode=0)

In [None]:
# Mount Google Drive at /content/drive (the google.colab module is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **DF**

In [None]:
import os

file_path = "/content/drive/MyDrive/processed_df.pkl"
# Report whether the pickled DataFrame is present at the expected Drive path.
file_found = os.path.exists(file_path)
print("File exists:", file_found)

File exists: True


In [None]:
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code — only load this file if its origin is trusted.
df = pd.read_pickle("/content/drive/MyDrive/processed_df.pkl")
# Print the first rows as plain text.
print(df.head())

        author                                        clean_title  \
0       Vranak  president assad expresses appreciation towards...   
1  monkeypowah  uk policeman asks for names and addresses of p...   
2     Relandis  flypocalypse dad swatted these flies in the ki...   
3        dwaxe    life times pixar movies made us cry like babies   
4   SowerPlave                                  stay classy tiger   

    created_utc           domain  hasImage       id  \
0  1.388330e+09          cnn.com      True   1txzm9   
1  1.423516e+09  theguardian.com      True   2vcb92   
2  1.564338e+09        i.redd.it      True   ciz7vl   
3  1.470069e+09    clickhole.com      True   4vnbj9   
4  1.378134e+09                       True  cc084wv   

                                           image_url linked_submission_id  \
0  https://external-preview.redd.it/k3pMa3DrphExz...                        
1  https://external-preview.redd.it/72WISwugE1C3P...                        
2  https://preview.redd

In [None]:
# Render the loaded DataFrame as the cell output.
df

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,upvote_ratio,6_way_label
0,Vranak,president assad expresses appreciation towards...,1.388330e+09,cnn.com,True,1txzm9,https://external-preview.redd.it/k3pMa3DrphExz...,,2.0,13,upliftingnews,0.82,0
1,monkeypowah,uk policeman asks for names and addresses of p...,1.423516e+09,theguardian.com,True,2vcb92,https://external-preview.redd.it/72WISwugE1C3P...,,1.0,27,nottheonion,0.88,0
2,Relandis,flypocalypse dad swatted these flies in the ki...,1.564338e+09,i.redd.it,True,ciz7vl,https://preview.redd.it/v6t5ktde53d31.jpg?widt...,,3.0,4,mildlyinteresting,0.63,0
3,dwaxe,life times pixar movies made us cry like babies,1.470069e+09,clickhole.com,True,4vnbj9,https://external-preview.redd.it/LNyLFwFIi6Knk...,,0.0,5,theonion,0.67,1
4,SowerPlave,stay classy tiger,1.378134e+09,,True,cc084wv,http://i.imgur.com/IqgTf87.jpg,1lkj8g,,6,psbattle_artwork,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16915,piedmont6,after transplant saves her son mom returns the...,1.405109e+09,newser.com,True,2agd8x,https://external-preview.redd.it/BRsICYmjMl0FE...,,0.0,14,upliftingnews,0.84,0
16916,Pooch76,dog in a field,1.503420e+09,i.redd.it,True,6vc7fu,https://preview.redd.it/qqari2mtgbhz.jpg?width...,,4.0,3,photoshopbattles,1.0,0
16917,SomeGuyInNewZealand,police escort surgeon through auckland rush ho...,1.502103e+09,stuff.co.nz,True,6s4rmk,https://external-preview.redd.it/XZzRiIuvTT1xu...,,0.0,16,upliftingnews,0.85,0
16918,SiON42X,garth brooks gives unexpected answer to trump ...,1.484687e+09,archive.is,True,5okrm9,https://external-preview.redd.it/auG2N6WhVRaFa...,,7.0,104,savedyouaclick,0.89,5


# ***Images Data***

In [None]:
# Image archive stored on the mounted Drive.
zip_path = "/content/drive/MyDrive/Images 2.zip"
# Local directory the archive gets extracted into.
Destination_path = "/content/Images_data"

In [None]:
# Create the extraction directory (and any missing parents).
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(Destination_path, exist_ok=True)

In [None]:
# Unpack every member of the archive into the destination directory.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=Destination_path)
print("Extraction completed!")

Extraction completed!


In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _url_to_filename(url):
    """Return the last path component of ``url``, or ``None`` for non-string values."""
    # NOTE(review): a query string (e.g. "?width=...") stays part of the result —
    # confirm this matches how the image files are named on disk.
    return os.path.basename(url) if isinstance(url, str) else None


df["image_filename"] = df["image_url"].apply(_url_to_filename)


In [None]:
image_dir = "/content/Images_data/Images"
existing_images = set(os.listdir(image_dir))
# Keep only the rows whose image file is present in image_dir, then renumber them from 0.
on_disk = df["image_filename"].isin(existing_images)
df = df.loc[on_disk].reset_index(drop=True)

In [None]:
def tokenize_text(text, max_length=50):
    """Encode ``text`` with the module-level ``tokenizer`` into a tensor of token ids.

    Args:
        text: Input passed straight to the tokenizer.
        max_length: Length the encoding is padded / truncated to.

    Returns:
        The ``"input_ids"`` tensor with dimension 0 squeezed out when it has size 1.
    """
    encoding = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"]
    return input_ids.squeeze(0)

In [None]:
class FakeNewsDataset(Dataset):
    """Paired (title, image, label) samples built from one DataFrame row each.

    Three columns of ``dataframe`` are read: ``clean_title`` (text),
    ``image_filename`` (file name inside ``image_dir``) and ``6_way_label``
    (integer class id).

    Args:
        dataframe: Table holding the three columns listed above.
        image_dir: Directory that contains the image files.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``; its result
            must expose an ``"input_ids"`` tensor.
        max_length: Length every title is padded / truncated to.
        transform: Optional callable applied to the scaled image tensor.
    """

    def __init__(self, dataframe, image_dir, tokenizer, max_length=50, transform=None):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(text_tensor, image, label)`` for positional row ``idx``.

        Returns:
            text_tensor: Token ids reshaped to ``(1, num_tokens)``.
            image: Float tensor read as RGB and scaled to ``[0, 1]``, with
                ``transform`` applied when one was given.
            label: 0-dim ``torch.long`` tensor holding the ``6_way_label`` value.
        """
        # One positional lookup instead of one per column.
        row = self.dataframe.iloc[idx]

        text = row["clean_title"]
        if not isinstance(text, str):
            # Missing titles (e.g. NaN) are encoded as an empty string rather than crashing the tokenizer.
            text = ""
        # Use the tokenizer handed to this dataset, not a module-level global.
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Leading axis of size 1 is the "sentence" dimension.
        text_tensor = encoded["input_ids"].reshape(1, -1)

        image_path = os.path.join(self.image_dir, row["image_filename"])
        # Force 3 channels: grayscale or RGBA files would otherwise break the
        # 3-channel normalization and the ResNet input layer.
        image = read_image(image_path, mode=ImageReadMode.RGB).float() / 255.0

        if self.transform:
            image = self.transform(image)

        label = torch.tensor(int(row["6_way_label"]), dtype=torch.long)

        return text_tensor, image, label

# ***Attention Modules***

In [None]:
class WordAttention(nn.Module):
    """Bidirectional LSTM encoder followed by attention pooling over the sequence axis.

    Args:
        embed_dim: Feature size of each input time step.
        hidden_dim: Hidden size per LSTM direction; outputs carry ``2 * hidden_dim`` features.
    """

    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        encoded_dim = hidden_dim * 2  # forward and backward states concatenated
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.attn_fn = nn.Linear(encoded_dim, encoded_dim)
        self.context_vector = nn.Parameter(torch.rand(encoded_dim))

    def forward(self, x):
        """Pool ``x`` of shape (batch, seq_len, embed_dim) into (batch, hidden_dim*2)."""
        states = self.lstm(x)[0]                      # (batch, seq_len, hidden_dim*2)
        projected = self.attn_fn(states).tanh()
        scores = projected @ self.context_vector      # (batch, seq_len)
        alphas = scores.softmax(dim=1)                # normalized across the sequence
        return (states * alphas.unsqueeze(2)).sum(dim=1)

In [None]:
class SentencAttention(nn.Module):
    """Bidirectional LSTM over sentence vectors followed by attention pooling.

    Args:
        hidden_dim: Hidden size per LSTM direction. Inputs are expected to carry
            ``2 * hidden_dim`` features, and so does the pooled output.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        feature_dim = hidden_dim * 2
        self.lstm = nn.LSTM(feature_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.attn_fn = nn.Linear(feature_dim, feature_dim)
        self.context_vector = nn.Parameter(torch.rand(feature_dim))

    def forward(self, x):
        """Pool ``x`` of shape (batch, num_sentences, hidden_dim*2) into (batch, hidden_dim*2)."""
        encoded = self.lstm(x)[0]
        energies = self.attn_fn(encoded).tanh() @ self.context_vector
        # Weights sum to 1 over the sentence axis; the trailing axis lets them broadcast.
        importance = energies.softmax(dim=1).unsqueeze(2)
        return (encoded * importance).sum(dim=1)

# ***Hierarchical Attention Network for Text***

In [None]:
# Hierarchical Attention Network for text
class HierarchicalAttentionNetwork(nn.Module):
    """Two-level text encoder: word attention per sentence, then sentence attention.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding size per token.
        word_hidden_dim: Hidden size per direction of the word-level LSTM.
        sentence_hidden_dim: Hidden size per direction of the sentence-level LSTM.
        pretrained_embeddings: Optional tensor copied into the embedding weights.
    """

    def __init__(self, vocab_size, embed_dim, word_hidden_dim, sentence_hidden_dim, pretrained_embeddings=None):
        super(HierarchicalAttentionNetwork, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
        self.word_attn = WordAttention(embed_dim, word_hidden_dim)
        self.sentence_attn = SentencAttention(sentence_hidden_dim)
        if word_hidden_dim != sentence_hidden_dim:
            # SentencAttention sizes its LSTM input as 2 * sentence_hidden_dim, but the
            # features it receives here are 2 * word_hidden_dim wide. Rebuild the LSTM
            # with the matching input size so differing hidden sizes do not crash forward().
            self.sentence_attn.lstm = nn.LSTM(
                word_hidden_dim * 2, sentence_hidden_dim, bidirectional=True, batch_first=True
            )

    def forward(self, x):
        """Encode token ids of shape (batch, num_sentences, sentence_length).

        Returns:
            The output of the sentence-level attention, one vector per batch element.
        """
        batch_size, num_sentences, sentence_length = x.size()
        # reshape (rather than view) also accepts non-contiguous inputs.
        x = x.reshape(batch_size * num_sentences, sentence_length)
        x = self.embedding(x)
        word_level_features = self.word_attn(x)  # (batch*num_sentences, word_hidden_dim*2)
        # Regroup the per-sentence vectors by document.
        word_level_features = word_level_features.reshape(batch_size, num_sentences, -1)
        doc_representation = self.sentence_attn(word_level_features)
        return doc_representation


# ***Image Encoder Using Pre-trained ResNet-50***

In [None]:
class ImageEncoder(nn.Module):
    """Frozen ResNet-50 feature extractor followed by a trainable linear projection.

    Args:
        output_dim: Size of the projected image feature vector.
    """

    def __init__(self, output_dim):
        super(ImageEncoder, self).__init__()
        resnet = models.resnet50(pretrained=True)
        modules = list(resnet.children())[:-1]  # drop the final classification layer
        self.resnet = nn.Sequential(*modules)
        # The backbone only ever runs under no_grad, so mark its weights as frozen explicitly.
        for param in self.resnet.parameters():
            param.requires_grad_(False)
        self.resnet.eval()
        self.fc = nn.Linear(resnet.fc.in_features, output_dim)

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen backbone in eval mode.

        Without this, ``model.train()`` would let the backbone's BatchNorm layers keep
        updating their running statistics even though its weights are never trained.
        """
        super(ImageEncoder, self).train(mode)
        self.resnet.eval()
        return self

    def forward(self, x):
        """Map an image batch to features of shape (batch, output_dim)."""
        with torch.no_grad():
            features = self.resnet(x)
        features = torch.flatten(features, 1)  # collapse everything after the batch axis
        features = self.fc(features)
        return features


# ***Multimodal Fusion and Classification Model***

In [None]:
class MultimodalFakeNewsDetector(nn.Module):
    """Classifier that concatenates text and image features and scores them with a two-layer head.

    Args:
        vocab_size, embed_dim, word_hidden_dim, sentence_hidden_dim, pretrained_embeddings:
            Forwarded to ``HierarchicalAttentionNetwork``.
        text_output_dim: Width of the text encoder's output vector.
        image_output_dim: Width of the image encoder's output vector.
        fusion_hidden_dim: Width of the hidden fusion layer.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, word_hidden_dim, sentence_hidden_dim,
                 text_output_dim, image_output_dim, fusion_hidden_dim, num_classes=2, pretrained_embeddings=None):
        super().__init__()
        self.text_encoder = HierarchicalAttentionNetwork(
            vocab_size, embed_dim, word_hidden_dim, sentence_hidden_dim, pretrained_embeddings
        )
        self.image_encoder = ImageEncoder(image_output_dim)
        fused_width = text_output_dim + image_output_dim
        self.fusion_fc = nn.Linear(fused_width, fusion_hidden_dim)
        self.out_fc = nn.Linear(fusion_hidden_dim, num_classes)

    def forward(self, text, image):
        """Return class logits for a batch.

        Args:
            text: Token ids, (batch, num_sentences, sentence_length).
            image: Image batch, (batch, 3, H, W).
        """
        text_vec = self.text_encoder(text)
        image_vec = self.image_encoder(image)
        # Join both modalities along the feature axis.
        joint = torch.cat((text_vec, image_vec), dim=1)
        hidden = self.fusion_fc(joint).relu()
        return self.out_fc(hidden)


In [None]:
# Per-channel mean and standard deviation handed to Normalize.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

# Resize every image to 224x224, then normalize each channel.
image_transforms = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
])


In [None]:
# Draw 80% of the rows with a fixed seed for training; the rows not drawn form the test split.
train_df = df.sample(frac=0.8, random_state=42)
in_train = df.index.isin(train_df.index)
test_df = df[~in_train]

In [None]:
# Inspect the training split.
train_df

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,upvote_ratio,6_way_label
11648,maksimovica,i am honestly sorry for making this,1.555870e+09,,True,elfsxj7,https://i.imgur.com/AtT4L52.jpg,bfouyk,,1,psbattle_artwork,,4
13679,Desol_8,th year anniversary of germany helping defeat ...,1.559939e+09,i.redd.it,True,bxzepf,https://preview.redd.it/urwwoaynuz231.jpg?widt...,,6.0,49,fakehistoryporn,0.96,2
169,Jaxible,head expansion,1.520819e+09,i.imgur.com,True,83rdl6,https://external-preview.redd.it/1a1YPF5pBk2Dq...,,25.0,798,confusing_perspective,0.94,2
12618,SemiLoquacious,antipyramid scheme political cartoon albania,1.554737e+09,i.redd.it,True,bav0ib,https://preview.redd.it/jwj89ncd42r21.jpg?widt...,,4.0,26,propagandaposters,0.9,5
15778,TimelessCorruption,lil uzi vert jumping down a cliff,1.551391e+09,i.redd.it,True,avwppi,https://preview.redd.it/dg3kpokmtdj21.jpg?widt...,,1.0,3,photoshopbattles,0.64,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11205,2Thebreezes,americas next super villain,1.442418e+09,,True,cv3kz67,http://i.imgur.com/JY83PKA.jpg,3l6lpx,,180,psbattle_artwork,,4
7133,SmileyFace-_-,the last airbender and his companion,1.471287e+09,,True,d6inb7o,https://i.imgur.com/FEdNhjb.jpg,4xuycf,,103,psbattle_artwork,,4
2307,belder1990,this is what fire hydrants actually look like,1.569978e+09,i.redd.it,True,dc31au,https://preview.redd.it/y1gkblxs11q31.jpg?widt...,,27.0,283,mildlyinteresting,0.94,0
8804,phantomjm,get down from there,1.453821e+09,,True,czckn03,http://i.imgur.com/HGnVK0E.jpg,42nd1v,,7,psbattle_artwork,,4


In [None]:
# Inspect the test split.
test_df

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,upvote_ratio,6_way_label
1,monkeypowah,uk policeman asks for names and addresses of p...,1.423516e+09,theguardian.com,True,2vcb92,https://external-preview.redd.it/72WISwugE1C3P...,,1.0,27,nottheonion,0.88,0
5,SockofBadKarma,also i do agree that disneys been experimentin...,1.465442e+09,,True,d41uf6f,http://i.imgur.com/R3J7veq.jpg,4n5aje,,22,psbattle_artwork,,4
7,cboogie,new boots got jokes,1.505449e+09,i.redd.it,True,707luy,https://preview.redd.it/wqt5en9h4zlz.jpg?width...,,0.0,2,pareidolia,0.67,2
9,NewJerseyFreakshow,onion explains the rise of china,1.437108e+09,theonion.com,True,3dla4k,https://external-preview.redd.it/sQnNPBfBmZ2_A...,,0.0,2,theonion,0.67,1
11,,saw this toilet in a restaurant cornwall uk wa...,1.552575e+09,i.redd.it,True,b115ft,https://preview.redd.it/ts2df0g0k3m21.jpg?widt...,,4.0,13,mildlyinteresting,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16902,tre3fla_,joan of arc telling people she had vissions of...,1.551657e+09,i.redd.it,True,ax088d,https://preview.redd.it/21t5q725pzj21.png?widt...,,1.0,11,fakehistoryporn,0.88,2
16903,notParticularlyAnony,why anna wintour always wears sunglasses,1.565213e+09,archive.is,True,cnbtan,https://external-preview.redd.it/ih-hWKdm3iSHP...,,5.0,57,savedyouaclick,0.95,5
16905,AnakinKardashian,mike tomlin cheating on the sideline,1.385764e+09,cbssports.com,True,1rqizl,https://external-preview.redd.it/OQ3A6KPesqhU2...,,1.0,7,photoshopbattles,0.89,0
16906,M7madDKA,this zebra looking at lions have sex,1.505228e+09,i.redd.it,True,6znh8v,https://preview.redd.it/jl07ex0uuglz.jpg?width...,,33.0,691,photoshopbattles,0.98,0


In [None]:
# Wrap each split in a dataset that shares the image directory, tokenizer and transforms.
train_dataset = FakeNewsDataset(
    dataframe=train_df,
    image_dir=image_dir,
    tokenizer=tokenizer,
    transform=image_transforms,
)
test_dataset = FakeNewsDataset(
    dataframe=test_df,
    image_dir=image_dir,
    tokenizer=tokenizer,
    transform=image_transforms,
)


In [None]:
# Run on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = tokenizer.vocab_size
embed_dim = 128
word_hidden_dim = 64
sentence_hidden_dim = 64
text_output_dim = sentence_hidden_dim * 2  # HAN outputs hidden_dim*2 due to bidirectional LSTM
image_output_dim = 256  # size of the projected image feature vector
fusion_hidden_dim = 128
# Targets come from the '6_way_label' column (ids up to 5 appear in the data), so the
# classifier needs 6 outputs; with 2, CrossEntropyLoss rejects the targets as out of bounds.
num_classes = 6

In [None]:
# Instantiate the detector from the hyperparameters above and move it to the compute device.
model = MultimodalFakeNewsDetector(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    word_hidden_dim=word_hidden_dim,
    sentence_hidden_dim=sentence_hidden_dim,
    text_output_dim=text_output_dim,
    image_output_dim=image_output_dim,
    fusion_hidden_dim=fusion_hidden_dim,
    num_classes=num_classes,
)
model = model.to(device)



In [None]:
# Training configuration: epoch count, cross-entropy loss over the logits, Adam optimizer.
num_epochs = 5
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Display the DataFrame again.
df

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,upvote_ratio,6_way_label
0,Vranak,president assad expresses appreciation towards...,1.388330e+09,cnn.com,True,1txzm9,https://external-preview.redd.it/k3pMa3DrphExz...,,2.0,13,upliftingnews,0.82,0
1,monkeypowah,uk policeman asks for names and addresses of p...,1.423516e+09,theguardian.com,True,2vcb92,https://external-preview.redd.it/72WISwugE1C3P...,,1.0,27,nottheonion,0.88,0
2,Relandis,flypocalypse dad swatted these flies in the ki...,1.564338e+09,i.redd.it,True,ciz7vl,https://preview.redd.it/v6t5ktde53d31.jpg?widt...,,3.0,4,mildlyinteresting,0.63,0
3,dwaxe,life times pixar movies made us cry like babies,1.470069e+09,clickhole.com,True,4vnbj9,https://external-preview.redd.it/LNyLFwFIi6Knk...,,0.0,5,theonion,0.67,1
4,SowerPlave,stay classy tiger,1.378134e+09,,True,cc084wv,http://i.imgur.com/IqgTf87.jpg,1lkj8g,,6,psbattle_artwork,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16915,piedmont6,after transplant saves her son mom returns the...,1.405109e+09,newser.com,True,2agd8x,https://external-preview.redd.it/BRsICYmjMl0FE...,,0.0,14,upliftingnews,0.84,0
16916,Pooch76,dog in a field,1.503420e+09,i.redd.it,True,6vc7fu,https://preview.redd.it/qqari2mtgbhz.jpg?width...,,4.0,3,photoshopbattles,1.0,0
16917,SomeGuyInNewZealand,police escort surgeon through auckland rush ho...,1.502103e+09,stuff.co.nz,True,6s4rmk,https://external-preview.redd.it/XZzRiIuvTT1xu...,,0.0,16,upliftingnews,0.85,0
16918,SiON42X,garth brooks gives unexpected answer to trump ...,1.484687e+09,archive.is,True,5okrm9,https://external-preview.redd.it/auG2N6WhVRaFa...,,7.0,104,savedyouaclick,0.89,5


In [None]:
# List the DataFrame's column names.
print(df.columns)

Index(['author', 'clean_title', 'created_utc', 'domain', 'hasImage', 'id',
       'image_url', 'linked_submission_id', 'num_comments', 'score',
       'subreddit', 'upvote_ratio', '6_way_label'],
      dtype='object')


In [None]:
def __getitem__(self, idx):
    """Build the ``(text_tensor, image, label)`` triple for positional row ``idx``.

    Defined at module level in this cell, outside any class body. Reads
    ``self.dataframe``, ``self.max_length``, ``self.image_dir`` and ``self.transform``.
    """
    row = self.dataframe.iloc[idx]

    # Title -> token ids, with a leading axis of size 1 added.
    title = row["clean_title"]
    text_tensor = tokenize_text(title, self.max_length).unsqueeze(0)

    # Image file -> float tensor scaled by 1/255.
    image_path = os.path.join(self.image_dir, row["image_filename"])
    image = read_image(image_path).float() / 255.0
    if self.transform:
        image = self.transform(image)

    # Class id from the '6_way_label' column.
    label = torch.tensor(row["6_way_label"], dtype=torch.long)

    return text_tensor, image, label

In [None]:
train_dataset = FakeNewsDataset(train_df, image_dir, tokenizer, transform=image_transforms)
test_dataset = FakeNewsDataset(test_df, image_dir, tokenizer, transform=image_transforms)

# The training and evaluation cells iterate over train_loader / test_loader, which were
# never defined; build them here so the notebook runs on a fresh kernel.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # reshuffle every epoch
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Train for num_epochs passes over train_loader, reporting mean batch loss and accuracy per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch in train_loader:
        # Move the three batch tensors to the compute device.
        text_batch, image_batch, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(text_batch, image_batch)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = outputs.argmax(dim=1)  # predicted class = highest logit
        correct += int((preds == labels).sum())
        total += len(labels)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}, Accuracy: {correct/total:.4f}")

KeyError: 'text'

In [None]:
# Measure accuracy on test_loader with the model in eval mode and gradients disabled.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        text_batch, image_batch, labels = (tensor.to(device) for tensor in batch)
        outputs = model(text_batch, image_batch)
        preds = outputs.argmax(dim=1)
        correct += int((preds == labels).sum())
        total += len(labels)
print(f"Test Accuracy: {correct/total:.4f}")

KeyError: 'text'