# Data Preprocessing

In [None]:
import requests
import pandas as pd
from io import BytesIO
from PIL import Image, UnidentifiedImageError


def extract_samples(dataframe, subreddits, sample_size, timeout=10):
    """Collect up to ``sample_size`` posts that have a usable title and image.

    Rows belonging to ``subreddits`` are visited in a fixed shuffled order
    (``random_state=42``). A row is accepted when its title has at least five
    words and its image can be downloaded, fully decoded and is not one of
    the known error-image sizes. Each accepted image is written to
    ``images/<id>.jpg`` (the ``images`` directory must already exist).

    Args:
        dataframe: Posts with ``subreddit``, ``clean_title``, ``image_url``
            and ``id`` columns.
        subreddits: Subreddit names to keep.
        sample_size: Maximum number of rows to return.
        timeout: Seconds to wait for each image download before skipping it.

    Returns:
        A DataFrame with the accepted rows (empty when none qualified).
    """
    # Sizes treated as error images rather than real content
    error_image_sizes = {(108, 54), (161, 81), (130, 60)}

    candidates = dataframe[dataframe.subreddit.isin(subreddits)].sample(frac=1, random_state=42)
    samples = []
    for _, row in candidates.iterrows():
        # Checked before any download so that sample_size=0 does no work
        if len(samples) >= sample_size:
            break

        # Check if title has at least 5 words
        if len(str(row.clean_title).split()) < 5:
            continue

        # Check if image is available
        if not isinstance(row.image_url, str) or row.image_url == '':
            continue
        try:
            response = requests.get(row.image_url, timeout=timeout)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
            # Image.open is lazy: decode now so truncated files are skipped here
            # instead of crashing later while converting or saving
            image.load()
        except (requests.RequestException, UnidentifiedImageError, OSError):
            # Unreachable URL, HTTP error, or data that is not a readable image
            continue

        # Check if image is not an error image
        if image.size in error_image_sizes:
            continue

        # Save image as jpg (palette/alpha modes cannot be written as JPEG)
        if image.mode != 'RGB':
            image = image.convert('RGB')
        image.save(f'images/{row.id}.jpg')

        samples.append(row)

    return pd.DataFrame(samples)


# Read the training split and replace missing values with empty strings
data = pd.read_csv('data/multimodal_train.tsv', sep='\t').fillna('')

# Subreddits that make up each class
true_subreddits = ['nottheonion', 'neutralnews', 'usanews', 'upliftingnews', 'mildlyinteresting', 'usnews']
fake_subreddits = ['satire', 'theonion', 'misleadingthumbnails', 'fakehistoryporn']

# Draw the same number of samples for both classes (true first, then fake)
samples_per_class = 10000
true_samples, fake_samples = (
    extract_samples(data, class_subreddits, samples_per_class)
    for class_subreddits in (true_subreddits, fake_subreddits)
)

# Merge both classes, keep the needed columns and shuffle with a fixed seed
kept_columns = ['id', 'clean_title', 'subreddit', '2_way_label']
samples = pd.concat([true_samples, fake_samples])[kept_columns].sample(frac=1, random_state=42)

# Store the result
samples.to_csv('data/samples.csv', index=False)

# OpenFlamingo Loading

In [None]:
import torch
from open_flamingo import create_model_and_transforms


import os

# Checkpoint location. Set the OPENFLAMINGO_CHECKPOINT environment variable to
# point elsewhere instead of editing this machine-specific default.
checkpoint_path = os.environ.get(
    'OPENFLAMINGO_CHECKPOINT',
    '/media/disk1/marcop/openFlamingo/checkpoint.pt'
)

model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path='ViT-L-14',
    clip_vision_encoder_pretrained='openai',
    lang_encoder_path='zpn/llama-7b',
    tokenizer_path='zpn/llama-7b',
    cross_attn_every_n_layers=4
)

# map_location='cpu' deserializes the tensors on the CPU regardless of the
# device they were saved from; the model is moved to the GPU below.
# strict=False tolerates keys that are missing from or unknown to the checkpoint.
model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'), strict=False)

# Half precision, inference mode, on the GPU (trailing ';' hides the cell output)
model.half().eval().to('cuda');

# Demo News

In [1]:
import glob
import itertools
import ipyplot


# Hand selected news for few-shot learning
# The sorted demo files receive the "Real: " / "Fake: " prefixes in alternation,
# followed by the part of the file name between the first and second dot
images = sorted(glob.glob('data/demo/*.jpg'))
prefixes = itertools.cycle(['Real: ', 'Fake: '])
titles = [prefix + path.split('.')[1] for prefix, path in zip(prefixes, images)]
ipyplot.plot_images(images, titles, show_url=False)

# OpenFlamingo: Few-Shot Learning

In [17]:
import glob
import itertools
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import classification_report


# Define prompt format
def create_example(title, answer=None):
    """Build one prompt chunk for a news title.

    Without ``answer`` the prompt ends at ``Answer:`` so the model completes
    it; with ``answer`` the answer and the ``<|endofchunk|>`` marker are
    appended, producing a finished demonstration.
    """
    prompt = f'<image>"{title}". Question: Is this news real? Answer:'
    if answer is None:
        return prompt
    return prompt + f' {answer}<|endofchunk|>'

# Load demo texts (answers are assigned by cycling Yes/No over the sorted demo files)
answers = itertools.cycle(['Yes', 'No'])
filenames = sorted(glob.glob('data/demo/*.jpg'))
demo_texts = [create_example(name.split('.')[1], next(answers)) for name in filenames]
demo_texts = ''.join(demo_texts)

# Load demo images; each file is closed as soon as it has been preprocessed
demo_images = []
for name in filenames:
    with Image.open(name) as demo_image:
        demo_images.append(image_processor(demo_image).unsqueeze(0))

# Load test data
dataframe = pd.read_csv('data/samples.csv')

outputs = []
tokenizer.padding_side = 'left'
# Iterating through tqdm closes the progress bar once the loop finishes
for _, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0]):

    # Load and preprocess query image
    with Image.open(f'images/{row.id}.jpg') as query_image:
        query_tensor = image_processor(query_image).unsqueeze(0)

    # Preprocess input
    lang_x = tokenizer([demo_texts + create_example(row.clean_title)], return_tensors='pt').to('cuda')
    vision_x = torch.cat(demo_images + [query_tensor], dim=0)
    vision_x = vision_x.unsqueeze(1).unsqueeze(0).half().to('cuda')

    # Generate text
    with torch.no_grad():
        generated_text = model.generate(
            vision_x=vision_x,
            lang_x=lang_x['input_ids'],
            attention_mask=lang_x['attention_mask'],
            max_new_tokens=1,
            num_beams=3
            # Beam search reduces the risk of missing high probability word sequences by keeping the most likely num_beams hypotheses
            # at each time step and eventually choosing the hypothesis with the overall highest probability
        )
    generated_text = tokenizer.decode(generated_text[0])
    # Keep what follows the last "Answer: "; strip whitespace so the label lookup below can match
    outputs.append(generated_text.split('Answer: ')[-1].strip())

# Save outputs
pd.DataFrame(outputs, columns=['output']).to_csv('data/fewshot.csv', index=False)

# Evaluate results
label_ids = {'No': 0, 'Yes': 1}
true_labels = dataframe['2_way_label'].tolist()

# An output that is neither "Yes" nor "No" has no label. It is reported and scored
# as the wrong class (1 - true label) instead of passing None to classification_report.
unparsed_count = sum(output not in label_ids for output in outputs)
if unparsed_count:
    print(f'Warning: {unparsed_count} outputs were neither "Yes" nor "No" and are counted as misclassified')
predictions = [label_ids.get(output, 1 - label) for output, label in zip(outputs, true_labels)]
print(classification_report(true_labels, predictions, target_names=['Fake', 'Real']))

              precision    recall  f1-score   support

        Fake       0.67      0.75      0.71     10000
        Real       0.71      0.63      0.67     10000

    accuracy                           0.69     20000
   macro avg       0.69      0.69      0.69     20000
weighted avg       0.69      0.69      0.69     20000



# OpenFlamingo: Zero-Shot Generalization

In [18]:
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import classification_report


def _remove_repeated_sentences(text):
    """Drop trailing sentences (or truncated fragments) that already occur earlier in ``text``."""
    while True:
        start, separator, end = text.rpartition('. ')
        # Stop when there is no '. ' left to split on. This also covers empty text,
        # which would otherwise loop forever because '' is always contained in ''.
        if not separator or end not in start:
            return text
        text = start


# Load test data
dataframe = pd.read_csv('data/samples.csv')

outputs = []
tokenizer.padding_side = 'left'
# Iterating through tqdm closes the progress bar once the loop finishes
for _, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0]):

    # Load and preprocess query image; the file is closed right after preprocessing
    with Image.open(f'images/{row.id}.jpg') as query_image:
        vision_x = image_processor(query_image).unsqueeze(0)

    # Preprocess input
    lang_x = tokenizer([f'<image>"{row.clean_title}". Question: Is this news real or fake? Explain your reasoning step by step by comparing text and image. Answer:'], return_tensors='pt').to('cuda')
    vision_x = vision_x.unsqueeze(1).unsqueeze(0).half().to('cuda')

    # Generate text
    with torch.no_grad():
        generated_text = model.generate(
            vision_x=vision_x,
            lang_x=lang_x['input_ids'],
            attention_mask=lang_x['attention_mask'],
            max_new_tokens=50,
            num_beams=3
            # Beam search reduces the risk of missing high probability word sequences by keeping the most likely num_beams hypotheses
            # at each time step and eventually choosing the hypothesis with the overall highest probability
        )
    generated_text = tokenizer.decode(generated_text[0])
    # Text after the first "Answer: "; empty when the decoded text does not contain it
    generated_text = generated_text.partition('Answer: ')[-1]

    # Remove repeated sentences
    outputs.append(_remove_repeated_sentences(generated_text))

    # Save checkpoint
    if len(outputs) % 1000 == 0:
        pd.DataFrame(outputs, columns=['output']).to_csv('data/zeroshot.csv', index=False)


# Save outputs
pd.DataFrame(outputs, columns=['output']).to_csv('data/zeroshot.csv', index=False)

# Evaluate results: a rationale counts as "Real" only when it contains the phrase "is real"
predictions = [1 if 'is real' in output else 0 for output in outputs]
print(classification_report(dataframe['2_way_label'], predictions, target_names=['Fake', 'Real']))

# Print sample outputs
pd.set_option('display.max_colwidth', None)
display(dataframe[['clean_title', '2_way_label']].head(20))
display(outputs[:20])

              precision    recall  f1-score   support

        Fake       0.56      0.73      0.63     10000
        Real       0.61      0.43      0.51     10000

    accuracy                           0.58     20000
   macro avg       0.59      0.58      0.57     20000
weighted avg       0.59      0.58      0.57     20000



Unnamed: 0,clean_title,2_way_label
0,debbie harry sits angrily backstage before a blondie concert in,0
1,made some raptors bean bag toss with rgb leds,1
2,this museum in berlin has touchable versions of their paintings for blind people,1
3,us now has more spanish speakers than spain only mexico has more,1
4,spiderman sporting a monster dong,0
5,cia operatives attempt to assassinate fidel castro circa,0
6,hillary clintons presidential concession speech after the election,0
7,my neighbour loves cars and just got his first one today along with his license thought id make his day with this present kacow,1
8,the apostle thomas reaction to seeing jesus christ after the resurrection ad,0
9,this one little white grape on a bunch of red grapes,1


['This news is real because it is a real picture of Sailor Moon. This is a real picture of Sailor Moon',
 'This news is fake because it is not real. It is fake because it is not real',
 'I think this is real because it is in a museum and it is in Germany',
 'This news is real because it is a fact that the U.S. has more Spanish speakers than Spain and Mexico. This is real because it is a fact that the U.S',
 'This news is fake because it is a joke. It is a joke because it is not real. It is not real because it is a joke',
 'This is a fake news article because it says that the CIA operatives attempted to assassinate Fidel Castro in 1960. This is fake because Fidel Castro was not assassinated in 1960',
 "This is a fake news because it is a parody of Hillary Clinton's presidential concession speech after the 2016 presidential election",
 'I think this is fake because the car looks like a toy car and the license looks like a toy license',
 'This news is fake because the apostle thomas react

# Embeddings Computation

In [None]:
import resource
import pandas as pd
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer


from tqdm import tqdm

# Device passed to every encode call in this cell
embedding_device = 'mps'

# The sample table supplies both the titles and the image ids
samples_frame = pd.read_csv('data/samples.csv')

mpnet_model = SentenceTransformer('all-mpnet-base-v2')

# Compute title embeddings
titles = samples_frame.clean_title.tolist()
title_embeddings = mpnet_model.encode(titles, show_progress_bar=True, device=embedding_device)
np.save('features/titles.npy', title_embeddings)

# Compute rationale embeddings
# (values that read_csv parses as missing come back as NaN, so restore them to empty strings)
rationales = pd.read_csv('data/zeroshot.csv').output.fillna('').tolist()
rationale_embeddings = mpnet_model.encode(rationales, show_progress_bar=True, device=embedding_device)
np.save('features/rationales.npy', rationale_embeddings)

clip_model = SentenceTransformer('clip-ViT-L-14')

# Compute image embeddings batch by batch. Only one batch of files is open at a time,
# so the process file-descriptor limit no longer has to be raised with resource.setrlimit.
ids = samples_frame.id.tolist()
image_batch_size = 64
image_embeddings = []
for batch_start in tqdm(range(0, len(ids), image_batch_size)):
    batch_ids = ids[batch_start:batch_start + image_batch_size]
    batch_images = [Image.open(f'images/{sample_id}.jpg') for sample_id in batch_ids]
    try:
        image_embeddings.append(
            clip_model.encode(batch_images, show_progress_bar=False, device=embedding_device)
        )
    finally:
        # Release file handles and decoded pixels before loading the next batch
        for batch_image in batch_images:
            batch_image.close()
image_embeddings = np.concatenate(image_embeddings, axis=0)
np.save('features/images.npy', image_embeddings)

# Classifier Evaluation

In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, cross_val_score


# Load the per-modality embeddings
titles = np.load('features/titles.npy')
images = np.load('features/images.npy')
rationales = np.load('features/rationales.npy')

# Feature sets to compare, keyed by the row label shown in the results table;
# combined sets place the individual embeddings side by side
feature_sets = {
    'Title': titles,
    'Image': images,
    'Rationale': rationales,
    'Title + Image': np.hstack([titles, images]),
    'Title + Rationale': np.hstack([titles, rationales]),
    'Image + Rationale': np.hstack([images, rationales]),
    'Title + Image + Rationale': np.hstack([titles, images, rationales]),
}

# Load labels
labels = pd.read_csv('data/samples.csv')['2_way_label'].tolist()

# Mean 10-fold cross-validation score of a freshly initialised MLP per feature set
results = [
    cross_val_score(
        MLPClassifier(max_iter=500, random_state=42),
        embeddings,
        labels,
        cv=KFold(n_splits=10),
    ).mean()
    for embeddings in feature_sets.values()
]

pd.DataFrame(results, index=list(feature_sets), columns=['Acccuracy'])

Unnamed: 0,Acccuracy
Title,0.8444
Image,0.85095
Rationale,0.7581
Title + Image,0.8958
Title + Rationale,0.84895
Image + Rationale,0.8682
Title + Image + Rationale,0.89895
