# Download images

## Imports

In [1]:
# %pip install google-search-results
# %pip install ftfy regex tqdm
# %pip install git+https://github.com/openai/CLIP.git
# %pip install tree

import os, urllib.request, json
from serpapi import GoogleSearch
from urllib.error import HTTPError
import pandas as pd
import numpy as np
import torch
import clip
import os
import skimage
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image
from collections import OrderedDict

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

pd.set_option("colwidth", None)


## Functions to download images

In [2]:
def check_folders(path):
    """Make sure the directory ``path`` exists, creating missing parents.

    Args:
        path: Directory path to create when it is not already present.

    Returns:
        None. An already existing directory is left untouched.
    """
    # exist_ok=True replaces the former "exists? -> makedirs" pair, which could
    # raise FileExistsError if the directory appeared between the two calls.
    os.makedirs(path, exist_ok=True)

In [3]:
def get_config(index, lang, titles, num_images):
    """Build the configuration dict for one headline in one language.

    Args:
        index: Row label of the headline in ``titles`` (looked up with ``.at``).
        lang: Name of the ``titles`` column that holds the headline text in
            the wanted language; it is also stored as the language code.
        titles: DataFrame with one column per language plus the ``'label'``
            and ``'filename'`` columns.
        num_images: Number of images wanted; stored unchanged.

    Returns:
        dict with the keys ``"lang"``, ``"query"``, ``"label"``,
        ``"filename"`` and ``"num_images"``.
    """
    label = titles.at[index, 'label']
    # Strip the extension whatever its length; the former ``[:-4]`` slice only
    # worked for three-letter extensions such as ".txt".
    stem = os.path.splitext(titles.at[index, 'filename'])[0]
    config = {
        "lang": lang,
        "query": titles.at[index, lang],
        # String repetition maps label 1 -> 'fake' and label 0 -> 'legit'.
        "label": 'fake' * label + 'legit' * (1 - label),
        "filename": stem,
        "num_images": num_images,
    }
    return config

In [4]:
def get_google_images(config, serpapi_key, verbosity=0):
    """Search Google Images through SerpApi and save the first hits to disk.

    Images are written to
    ``./images/<label>/<filename>/<lang>/<filename>_<lang>_img<k>.jpg`` where
    ``k`` counts the successfully downloaded images, starting at 0.

    Args:
        config: Dict with the keys ``"query"``, ``"label"``, ``"filename"``,
            ``"lang"`` and ``"num_images"`` (the shape built by ``get_config``).
        serpapi_key: API key passed to SerpApi.
        verbosity: 0 prints nothing, 1 prints a summary, 2 additionally
            prints a line for every download attempt and every skipped image.

    Returns:
        None. The result is the set of files written to disk.
    """
    params = {
        "api_key": serpapi_key,
        "engine": "google",
        "q": config["query"],
        "tbm": "isch",
    }
    path = f"./images/{config['label']}/{config['filename']}/{config['lang']}/"
    check_folders(path)

    results = GoogleSearch(params).get_dict()

    # Install the browser-like User-Agent once; it does not change between
    # images, so there is no need to rebuild the opener inside the loop.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
    urllib.request.install_opener(opener)

    k = 0  # number of images saved so far; also the index used in the file name
    for image in results.get('images_results', []):
        if k >= config["num_images"]:
            break
        if verbosity > 1:
            print(f"Downloading {k} image for {config['filename']} news in {config['lang']} language.")
        try:
            urllib.request.urlretrieve(image['original'], f"{path}{config['filename']}_{config['lang']}_img{k}.jpg")
            k += 1
        except Exception:
            # Best effort: a failed download is skipped and the same index is
            # reused for the next candidate.
            if verbosity > 1:
                print("Image skipped due to error")

    # Report after the loop so the summary is printed even when the search
    # results run out exactly at (or before) the requested number of images.
    if verbosity > 0:
        if k >= config["num_images"]:
            print(f"Downloaded images for {config['filename']} news in {config['lang']} language.")
        else:
            print(f"Downloaded only {k} of {config['num_images']} images for {config['filename']} news in {config['lang']} language.")

In [5]:
def download_one_news(index, titles, languages, num_images, serpapi_key, verbosity=0):
    """Download images for one headline in every requested language.

    Args:
        index: Row label of the headline in ``titles``.
        titles: DataFrame of headlines, passed through to ``get_config``.
        languages: Iterable of language codes (column names in ``titles``).
        num_images: Number of images to download per language.
        serpapi_key: API key passed to ``get_google_images``.
        verbosity: Verbosity level passed to ``get_google_images``.

    Returns:
        None.
    """
    config = None
    for lang in languages:
        config = get_config(index, lang, titles, num_images)
        get_google_images(config, serpapi_key, verbosity=verbosity)

    # With no languages the loop never runs and there is no config to report
    # on; previously this case crashed on the unbound ``config`` name.
    if config is None:
        print(f"No languages given, nothing downloaded for news {index}.")
        return
    print(f"Downloaded all images for {config['filename']} news.")

## Load the titles

In [6]:
# Load the headline table from the CSV file in the working directory.
titles = pd.read_csv("./125fake_125legit.csv")
# Show the first rows as the cell output.
titles.head()

Unnamed: 0,file,headline,fr,de,es,ru,is_fake
0,007fake.txt,Jennifer Aniston on the Exact Moment She Had It With the Pregnancy Rumors,Jennifer Aniston au moment exact où elle a eu avec les rumeurs de grossesse.,"Jennifer Aniston in dem Moment, in dem sie es mit den Schwangerschaftsgerüchten hatte.",Jennifer Aniston en el momento exacto que lo tuvo con los rumores de embarazo.,"Дженнифер Энистон в тот момент, когда у нее есть слухи о беременности.",1
1,022fake.txt,Taylor Swift 'files documents to launch streaming service',Taylor Swift «Documents de fichiers pour lancer le service de streaming».,"Taylor Swift 'Dateien dokumentiert, um den Streaming -Dienst zu starten'.",Taylor Swift 'archiva documentos para iniciar el servicio de transmisión'.,Taylor Swift 'Files Documents для запуска потоковой службы'.,1
2,009fake.txt,Miley Cyrus Wedding Rumors: Will Miley Force Liam Hemsworth To Sign A Prenup?,Miley Cyrus Rumeurs de mariage: Miley forcera-t-elle Liam Hemsworth à signer un contrat de contrat?.,"Miley Cyrus Hochzeitsgerüchte: Wird Miley Liam Hemsworth dazu zwingen, ein Prenup zu unterschreiben?.",Rumores de boda de Miley Cyrus: ¿Miley forzará a Liam Hemsworth firmar un prenupc?.,Свадебные слухи Майли Сайрус: Сможет ли Майли Форс Лиам Хемсворт подписать премьер?.,1
3,024fake.txt,Justin Bieber pokes fun at himself after stepping out with an wet mark,Justin Bieber se moque de lui-même après avoir quitté une marque humide.,"Justin Bieber macht sich lustig an sich, nachdem er mit einer nassen Marke ausgetreten ist.",Justin Bieber se burla de sí mismo después de salir con una marca húmeda.,"Джастин Бибер высмеивает себя после того, как вышел с влажной отметки.",1
4,002fake.txt,Brad Pitt Texts Jennifer Aniston Nonstop: Seeks Intense Emotional Support After Angelina Jolie Divorce! | Celeb Dirty Laundry,Brad Pitt envoie des SMS à Jennifer Aniston sans escale: cherche un soutien émotionnel intense après le divorce d'Angelina Jolie! | Laverie sale de célébrité.,Brad Pitt Texte Jennifer Aniston Nonstop: sucht intensive emotionale Unterstützung nach der Scheidung von Angelina Jolie! | Promi Dirty Wäsche.,Brad Pitt texts Jennifer Aniston Nonstop: ¡busca un intenso apoyo emocional después de Angelina Jolie Divorce! | Celeb Dirty Laundry.,Брэд Питт текстов Дженнифер Энистон Неустановка: ищет интенсивную эмоциональную поддержку после развода Анджелины Джоли! | Знаменитость грязное прачечная.,1


In [7]:
# Rename the columns to the names the helper functions look up:
# the headline column becomes the language code 'en', and the file / is_fake
# columns become 'filename' / 'label' (read by get_config).
titles = titles.rename(columns={'headline': 'en', 'file': 'filename', 'is_fake': 'label'})
# Show the first rows with the new column names.
titles.head()

Unnamed: 0,filename,en,fr,de,es,ru,label
0,007fake.txt,Jennifer Aniston on the Exact Moment She Had It With the Pregnancy Rumors,Jennifer Aniston au moment exact où elle a eu avec les rumeurs de grossesse.,"Jennifer Aniston in dem Moment, in dem sie es mit den Schwangerschaftsgerüchten hatte.",Jennifer Aniston en el momento exacto que lo tuvo con los rumores de embarazo.,"Дженнифер Энистон в тот момент, когда у нее есть слухи о беременности.",1
1,022fake.txt,Taylor Swift 'files documents to launch streaming service',Taylor Swift «Documents de fichiers pour lancer le service de streaming».,"Taylor Swift 'Dateien dokumentiert, um den Streaming -Dienst zu starten'.",Taylor Swift 'archiva documentos para iniciar el servicio de transmisión'.,Taylor Swift 'Files Documents для запуска потоковой службы'.,1
2,009fake.txt,Miley Cyrus Wedding Rumors: Will Miley Force Liam Hemsworth To Sign A Prenup?,Miley Cyrus Rumeurs de mariage: Miley forcera-t-elle Liam Hemsworth à signer un contrat de contrat?.,"Miley Cyrus Hochzeitsgerüchte: Wird Miley Liam Hemsworth dazu zwingen, ein Prenup zu unterschreiben?.",Rumores de boda de Miley Cyrus: ¿Miley forzará a Liam Hemsworth firmar un prenupc?.,Свадебные слухи Майли Сайрус: Сможет ли Майли Форс Лиам Хемсворт подписать премьер?.,1
3,024fake.txt,Justin Bieber pokes fun at himself after stepping out with an wet mark,Justin Bieber se moque de lui-même après avoir quitté une marque humide.,"Justin Bieber macht sich lustig an sich, nachdem er mit einer nassen Marke ausgetreten ist.",Justin Bieber se burla de sí mismo después de salir con una marca húmeda.,"Джастин Бибер высмеивает себя после того, как вышел с влажной отметки.",1
4,002fake.txt,Brad Pitt Texts Jennifer Aniston Nonstop: Seeks Intense Emotional Support After Angelina Jolie Divorce! | Celeb Dirty Laundry,Brad Pitt envoie des SMS à Jennifer Aniston sans escale: cherche un soutien émotionnel intense après le divorce d'Angelina Jolie! | Laverie sale de célébrité.,Brad Pitt Texte Jennifer Aniston Nonstop: sucht intensive emotionale Unterstützung nach der Scheidung von Angelina Jolie! | Promi Dirty Wäsche.,Brad Pitt texts Jennifer Aniston Nonstop: ¡busca un intenso apoyo emocional después de Angelina Jolie Divorce! | Celeb Dirty Laundry.,Брэд Питт текстов Дженнифер Энистон Неустановка: ищет интенсивную эмоциональную поддержку после развода Анджелины Джоли! | Знаменитость грязное прачечная.,1


## Downloading images

In [15]:
# Read the SerpApi key from the SERPAPI_API_KEY environment variable so a real
# key never has to be saved inside the notebook; the placeholder is kept as a
# fallback for anyone who prefers to paste the key here.
serpapi_key = os.environ.get('SERPAPI_API_KEY', 'Your API key here')
languages = ['de', 'fr', 'en', 'es', 'ru']
num_images = 10

# Iterate over the row labels themselves: the helpers look rows up by label
# (``.at``), so this also works when the index is not 0..n-1.
for index in titles.index:
    print(f'Downloading images for {index} news...')
    download_one_news(index, titles, languages, num_images, serpapi_key, verbosity=2)

# Make new features with CLIP

## Load images from folders

In [8]:
def load_images(index, titles, languages=('de', 'es', 'ru', 'en', 'fr')):
    """Load the downloaded images of one headline, grouped by language.

    Files are expected at
    ``./images/<label>/<filename>/<lang>/<filename>_<lang>_img<i>.jpg``.

    Args:
        index: Row label of the headline in ``titles``.
        titles: DataFrame of headlines, passed through to ``get_config``.
        languages: Sequence of language codes to load; must not be empty.

    Returns:
        Tuple ``(original_images, images)`` of dicts keyed by language code.
        ``original_images[lang]`` holds the RGB PIL images and
        ``images[lang]`` the result of the module-level ``preprocess``
        callable applied to each of them. A language whose folder holds no
        readable image maps to an empty list.

    Raises:
        FileNotFoundError: If the folder of one of the languages is missing
            (raised by ``os.listdir``).
    """
    # Label and file stem do not depend on the language, so look them up once.
    config = get_config(index, languages[0], titles, 0)
    data_dir = f"./images/{config['label']}/{config['filename']}/"
    original_images = {}
    images = {}

    for lang in languages:
        original_images[lang] = []
        images[lang] = []
        data_dir_lang = f"{data_dir}{lang}/"
        # Candidate indices run from 0 to the number of directory entries;
        # indices without a matching file are skipped below.
        n_images_exist = len(os.listdir(data_dir_lang))
        for i in range(n_images_exist):
            filename = f"{data_dir_lang}{config['filename']}_{lang}_img{i}.jpg"
            try:
                # The context manager closes the file handle right away;
                # convert() returns an independent, fully loaded image.
                with Image.open(filename) as raw_image:
                    image = raw_image.convert("RGB")
            except OSError:
                # Covers a missing file (FileNotFoundError) as well as
                # truncated or non-image downloads, which Pillow reports as
                # OSError subclasses. One bad file should not discard the
                # whole headline.
                continue
            original_images[lang].append(image)
            images[lang].append(preprocess(image))

    return original_images, images

## Calculate cosine similarities between images

In [9]:
def cos_sims_for_one_news(index, titles, languages, language_pairs, model):
    """Compute mean pairwise cosine similarity between per-language image sets.

    For every language the preprocessed images are encoded with
    ``model.encode_image`` and L2-normalised. For a pair ``(a, b)`` the score
    is the mean of all dot products between the features of ``a`` and the
    features of ``b``.

    Args:
        index: Row label of the headline in ``titles``.
        titles: DataFrame of headlines, passed through to ``load_images``.
        languages: Language codes whose images are loaded.
        language_pairs: Iterable of ``(lang_a, lang_b)`` tuples to compare;
            both codes must be contained in ``languages``.
        model: Model exposing ``encode_image`` and ``parameters()``
            (presumably the CLIP model returned by ``clip.load``).

    Returns:
        dict mapping each pair to its mean similarity (a NumPy scalar). A pair
        involving a language without images gets 0.0.
    """
    _, images = load_images(index, titles, languages=languages)

    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    image_features = {}
    with torch.no_grad():
        for lang, lang_images in images.items():
            if len(lang_images) == 0:
                image_features[lang] = None  # filled with a zero vector below
                continue
            batch = torch.tensor(np.stack(lang_images)).to(device)
            features = model.encode_image(batch).float()
            image_features[lang] = features / features.norm(dim=-1, keepdim=True)

    # Languages without images get one all-zero feature vector, so every
    # similarity involving them is exactly 0. Its width is taken from the
    # features that were computed (512 only when there are none at all), so
    # models with another embedding size work as well.
    embed_dim = next(
        (feats.shape[-1] for feats in image_features.values() if feats is not None),
        512,
    )
    for lang, feats in image_features.items():
        if feats is None:
            image_features[lang] = torch.zeros([1, embed_dim], device=device)

    def sim_score(pair):
        """Mean of all cross dot products between the two languages' features."""
        first = image_features[pair[0]].cpu().numpy()
        second = image_features[pair[1]].cpu().numpy()
        return (first @ second.T).mean()

    return {pair: sim_score(pair) for pair in language_pairs}

## Initialize the model and calculate similarities

In [46]:
# #!g1.1
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell no longer crashes on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

# Each English headline is compared with its translation in the other languages.
language_pairs = [('en', 'de'), ('en', 'es'), ('en', 'fr'), ('en', 'ru')]
for pair in language_pairs:
    # Initialise with a float so the column dtype matches the similarity scores.
    titles[f'{pair[0]}-{pair[1]}'] = 0.0

In [11]:
# Resume from the similarities saved by a previous run when the file exists.
# On a first run there is no such file yet (it is written by the last cell of
# this notebook), so start from a copy of `titles`, which already carries the
# zero-initialised similarity columns.
SIMILARITIES_CSV = 'titles_with_image_similarities.csv'
if os.path.exists(SIMILARITIES_CSV):
    titles_with_image_similarities = pd.read_csv(SIMILARITIES_CSV)
else:
    titles_with_image_similarities = titles.copy()

In [34]:
#!g1.1
# Only these row ranges are processed in this run.
for index in list(range(35)) + list(range(125, 145)) + list(range(151, 170)):
    print(f'Calculating similarities for {index + 1} news...')
    try:
        similarities = cos_sims_for_one_news(index, titles, languages=languages, language_pairs=language_pairs, model=model)
    except Exception as error:
        # Skip this headline entirely. Without the `continue` the scores of
        # the previous headline (still bound to `similarities`) were written
        # into this row, and a failure on the very first row raised NameError.
        # The real error is shown because a missing folder is only one of the
        # possible causes.
        print(f'Skipping news {index + 1}: {error!r}')
        continue
    print(similarities)
    for pair, score in similarities.items():
        titles_with_image_similarities.loc[index, f'{pair[0]}-{pair[1]}'] = score


Calculating similarities for 1 news...
{('en', 'de'): 0.7408916, ('en', 'es'): 0.7318494, ('en', 'fr'): 0.6617593, ('en', 'ru'): 0.68455535}
Calculating similarities for 2 news...




{('en', 'de'): 0.45779538, ('en', 'es'): 0.0, ('en', 'fr'): 0.70839745, ('en', 'ru'): 0.7495389}
Calculating similarities for 3 news...
{('en', 'de'): 0.7692871, ('en', 'es'): 0.5929646, ('en', 'fr'): 0.7557569, ('en', 'ru'): 0.69038534}
Calculating similarities for 4 news...
{('en', 'de'): 0.0, ('en', 'es'): 0.6007725, ('en', 'fr'): 0.5921936, ('en', 'ru'): 0.6267605}
Calculating similarities for 5 news...
{('en', 'de'): 0.6701048, ('en', 'es'): 0.559618, ('en', 'fr'): 0.6555879, ('en', 'ru'): 0.6701938}
Calculating similarities for 6 news...
{('en', 'de'): 0.7621966, ('en', 'es'): 0.73825717, ('en', 'fr'): 0.69917417, ('en', 'ru'): 0.75094604}
Calculating similarities for 7 news...
{('en', 'de'): 0.70103335, ('en', 'es'): 0.69034773, ('en', 'fr'): 0.69672483, ('en', 'ru'): 0.6592448}
Calculating similarities for 8 news...
{('en', 'de'): 0.65023696, ('en', 'es'): 0.7421864, ('en', 'fr'): 0.68216604, ('en', 'ru'): 0.70168245}
Calculating similarities for 9 news...
{('en', 'de'): 0.6406

{('en', 'de'): 0.82417387, ('en', 'es'): 0.64521146, ('en', 'fr'): 0.8059664, ('en', 'ru'): 0.88558257}
Calculating similarities for 158 news...
{('en', 'de'): 0.6169108, ('en', 'es'): 0.5438195, ('en', 'fr'): 0.5620978, ('en', 'ru'): 0.49502602}
Calculating similarities for 159 news...
{('en', 'de'): 0.588392, ('en', 'es'): 0.5960218, ('en', 'fr'): 0.6580648, ('en', 'ru'): 0.0}
Calculating similarities for 160 news...
{('en', 'de'): 0.28622127, ('en', 'es'): 0.61300826, ('en', 'fr'): 0.5623956, ('en', 'ru'): 0.0}
Calculating similarities for 161 news...
{('en', 'de'): 0.27845994, ('en', 'es'): 0.72166663, ('en', 'fr'): 0.7106166, ('en', 'ru'): 0.6549938}
Calculating similarities for 162 news...
The folder doest not exist
Calculating similarities for 163 news...
{('en', 'de'): 0.62160575, ('en', 'es'): 0.6241213, ('en', 'fr'): 0.6619949, ('en', 'ru'): 0.6400223}
Calculating similarities for 164 news...
{('en', 'de'): 0.7450821, ('en', 'es'): 0.7400896, ('en', 'fr'): 0.0, ('en', 'ru'): 

In [68]:
# Keep only the rows for which at least one similarity was computed (rows that
# were never processed still hold 0 in every similarity column). The columns
# are selected by name rather than by position so that the filter does not
# silently pick up other columns if the layout of the frame changes.
similarity_columns = [f'{first}-{second}' for first, second in language_pairs]
subsample = titles_with_image_similarities[titles_with_image_similarities[similarity_columns].sum(axis=1) > 0]

## Results

In [65]:
# Average similarity per label and language pair. The similarity columns are
# selected by name rather than with a positional `columns[-4:]` slice, which
# would pick the wrong columns as soon as the frame layout changes.
similarity_columns = [f'{first}-{second}' for first, second in language_pairs]
mean_by_lang = subsample.groupby('label')[similarity_columns].mean()
mean_by_lang.round(4)

Unnamed: 0_level_0,en-de,en-es,en-fr,en-ru
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.6363,0.6478,0.6541,0.5484
1,0.5799,0.6335,0.6285,0.5767


In [66]:
# Average the per-pair means across all language pairs: one value per label.
mean_by_lang.mean(axis=1).round(4)

label
0    0.6217
1    0.6046
dtype: float64

## Save CSV files with the new features

In [70]:
# Save the filtered rows (those with at least one non-zero similarity).
subsample.to_csv('subsample.csv', index=False)

# Save the full table as well; an earlier cell reads this file back in.
titles_with_image_similarities.to_csv("./titles_with_image_similarities.csv", index = False)