# Multi-Modal Deep Learning Model for Fake News Detection

In [1]:
import os
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.metrics import classification_report
import pandas as pd
from keras.layers import Embedding


In [2]:
tweet_df = pd.read_csv('mediaeval2015/devset/tweets.txt', delimiter='\t', header=1, names=['tweetId', 'tweetText', 'userId', 'image_id','username','timestamp', 'label'])
#image_df = pd.read_csv('mediaeval2015/devset/images.txt', delimiter='\t', header=None, names=['image_id', 'image_url', 'annotation', 'event'])

In [3]:
tweet_df

Unnamed: 0,tweetId,tweetText,userId,image_id,username,timestamp,label
0,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
1,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
2,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
3,263018881839411200,My fave place in the world #nyc #hurricane #sa...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake
4,263364439582060545,42nd #time #square #NYC #subway #hurricane htt...,163674788,sandyA_fake_23,classycg,Tue Oct 30 19:39:10 +0000 2012,fake
...,...,...,...,...,...,...,...
14271,443231991593304064,@BobombDom *slaps TweetDeck with the PigFish h...,2179310905,pigFish_01,Da_Vault_Hunter,Tue Mar 11 03: 48: 36 +0000 2014,fake
14272,443086239127076865,New Species of Fish found in Brazil or just Re...,254843101,pigFish_01,DjSituation_RC,Mon Mar 10 18: 09: 26 +0000 2014,fake
14273,442978105238753280,What do we call this? #pigFISH http: \/\/t.co\...,2367553228,pigFish_01,Vivo1Vuyo,Mon Mar 10 10: 59: 45 +0000 2014,fake
14274,442753479782989824,Pigfish ? E dopo il pescecane c'è il pesce mai...,603120231,pigFish_01,CosimoTarta,Sun Mar 09 20: 07: 10 +0000 2014,fake


In [4]:
image_df = pd.read_csv('mediaeval2015/devset/images.txt', delimiter='\t', header=1, names=['image_id', 'image_url', 'label', 'event'])

In [5]:
image_df

Unnamed: 0,image_id,image_url,label,event
0,boston_fake_02,http://instagram.com/p/YN7_ThPXrU/,fake,boston
1,boston_fake_03,https://o.twimg.com/2/proxy.jpg?t=HBgeaHR0cDov...,fake,boston
2,boston_fake_04,http://media.tumblr.com/a813460e72a178d8127b50...,fake,boston
3,boston_fake_05,http://i.imgur.com/uxAh4wwh.jpg,fake,boston
4,boston_fake_06,http://www.nowtheendbegins.com/blog/wp-content...,fake,boston
...,...,...,...,...
355,livr_02,https://pbs.twimg.com/media/Bic4fA5CQAAT5KY.jpg,Livr,
356,livr_03,https://pbs.twimg.com/media/BialJuECIAATXE2.jpg,Livr,
357,livr_04,https://pbs.twimg.com/media/BiVisqpCIAAbgju.jpg,Livr,
358,passport_01,https://pbs.twimg.com/media/Bo-CY5ACcAAiRAw.jpg,Passport,


# Filtering the images that have labels 'real' or 'fake'

In [6]:
image_df = image_df[image_df['label'].isin(['fake', 'real'])]

In [7]:
image_df

Unnamed: 0,image_id,image_url,label,event
0,boston_fake_02,http://instagram.com/p/YN7_ThPXrU/,fake,boston
1,boston_fake_03,https://o.twimg.com/2/proxy.jpg?t=HBgeaHR0cDov...,fake,boston
2,boston_fake_04,http://media.tumblr.com/a813460e72a178d8127b50...,fake,boston
3,boston_fake_05,http://i.imgur.com/uxAh4wwh.jpg,fake,boston
4,boston_fake_06,http://www.nowtheendbegins.com/blog/wp-content...,fake,boston
...,...,...,...,...
333,bringback_fake_03,https://pbs.twimg.com/media/BoBFly8IQAEK881.jpg,fake,bringback
334,bringback_fake_04,https://pbs.twimg.com/media/BoBFzniIEAABAaP.jpg,fake,bringback
335,bringback_fake_05,https://pbs.twimg.com/media/BnLFu3DIUAECtam.jpg,fake,bringback
336,bringback_fake_06,https://pbs.twimg.com/media/BmjchgkIQAAUbds.jpg,fake,bringback


In [8]:
#Fetching the path of image directory to a new column image_dir

img_dir = 'mediaeval2015\devset\MediaEval2015_DevSet_Images\Medieval2015_DevSet_Images'
folders = os.listdir(img_dir)

folder_tuples = [(folder, folder.lower().replace(' ', '')) for folder in folders]
folder_tuples

for index, row in image_df.iterrows():
    event = row['event']
    for folder_tuple in folder_tuples:
        if folder_tuple[1].find(event.lower().replace(' ', '')) != -1:
            image_df.at[index, 'image_dir'] = img_dir+'\\'+folder_tuple[0]+'\\'+ row['label']+'s'
            break
            
image_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  image_df.at[index, 'image_dir'] = img_dir+'\\'+folder_tuple[0]+'\\'+ row['label']+'s'


Unnamed: 0,image_id,image_url,label,event,image_dir
0,boston_fake_02,http://instagram.com/p/YN7_ThPXrU/,fake,boston,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
1,boston_fake_03,https://o.twimg.com/2/proxy.jpg?t=HBgeaHR0cDov...,fake,boston,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
2,boston_fake_04,http://media.tumblr.com/a813460e72a178d8127b50...,fake,boston,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
3,boston_fake_05,http://i.imgur.com/uxAh4wwh.jpg,fake,boston,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
4,boston_fake_06,http://www.nowtheendbegins.com/blog/wp-content...,fake,boston,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
...,...,...,...,...,...
333,bringback_fake_03,https://pbs.twimg.com/media/BoBFly8IQAEK881.jpg,fake,bringback,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
334,bringback_fake_04,https://pbs.twimg.com/media/BoBFzniIEAABAaP.jpg,fake,bringback,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
335,bringback_fake_05,https://pbs.twimg.com/media/BnLFu3DIUAECtam.jpg,fake,bringback,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
336,bringback_fake_06,https://pbs.twimg.com/media/BmjchgkIQAAUbds.jpg,fake,bringback,mediaeval2015\devset\MediaEval2015_DevSet_Imag...


# Filtering the tweets that have images

In [9]:
tweet_ids_with_images = set(image_df['image_id'].tolist())
tweet_df = tweet_df[tweet_df['image_id'].isin(tweet_ids_with_images)]

In [10]:
tweet_df

Unnamed: 0,tweetId,tweetText,userId,image_id,username,timestamp,label
0,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
1,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
2,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
3,263018881839411200,My fave place in the world #nyc #hurricane #sa...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake
4,263364439582060545,42nd #time #square #NYC #subway #hurricane htt...,163674788,sandyA_fake_23,classycg,Tue Oct 30 19:39:10 +0000 2012,fake
...,...,...,...,...,...,...,...
14248,443653548698386432,@ZakiEzarik: MH370 found ..LOL http://t.co/MS9...,183570877,malaysia_fake_24,xaex_,Wed Mar 12 07:43:43 +0000 2014,humor
14249,443597162253807616,MH370 found ..LOL http://t.co/9rsOdub8oF,377976842,malaysia_fake_24,ZakiEzarik,Wed Mar 12 03:59:39 +0000 2014,humor
14250,444123242442129408,They found flight MH370!!!!! Lol http://t.co/R...,1593889836,malaysia_fake_24,ibrahimbathich,Thu Mar 13 14:50:07 +0000 2014,humor
14251,443320727572205569,Tak lawak pukimak cibai . \n\n“@SYAFIQBAKHARI:...,1101252169,malaysia_fake_24,xxxerul,Tue Mar 11 09:41:12 +0000 2014,humor


Merging both the daraframes on image dataframe

In [11]:
tweet_df = tweet_df[tweet_df['label'].isin(['fake', 'real'])]
merged_df = tweet_df.merge(image_df, on='image_id')
merged_df

Unnamed: 0,tweetId,tweetText,userId,image_id,username,timestamp,label_x,image_url,label_y,event,image_dir
0,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake,https://pbs.twimg.com/media/A6Y9e5sCAAAYJqS.jpg,fake,sandy,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
1,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake,https://pbs.twimg.com/media/A6Y9e5sCAAAYJqS.jpg,fake,sandy,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
2,263047501433688064,Craziest picture ever #hurricane http://t.co/K...,267388495,sandyA_fake_09,BLangley401,Mon Oct 29 22:39:46 +0000 2012,fake,https://pbs.twimg.com/media/A6Y9e5sCAAAYJqS.jpg,fake,sandy,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
3,263183410581893121,Terrifying. #NY #hurricane #sandy #statueoflib...,44445572,sandyA_fake_09,Cheve_01,Tue Oct 30 07:39:49 +0000 2012,fake,https://pbs.twimg.com/media/A6Y9e5sCAAAYJqS.jpg,fake,sandy,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
4,263170243260723200,#hurricane http://t.co/RKNYPVU8,28214541,sandyA_fake_09,DallasMatthewss,Tue Oct 30 06:47:30 +0000 2012,fake,https://pbs.twimg.com/media/A6Y9e5sCAAAYJqS.jpg,fake,sandy,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
...,...,...,...,...,...,...,...,...,...,...,...
11251,443640792658083840,#PrayForMH370 ini gambar TIPU !pray for MH370 ...,534393225,malaysia_fake_08,Nurul_sofiyya,Wed Mar 12 06:53:02 +0000 2014,fake,https://pbs.twimg.com/media/BiNMP4cCMAERNed.jpg,fake,malaysia,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
11252,442713089784295424,Not sure it real or fake ... Pray for mh370 .....,505662558,malaysia_fake_08,adeqxqis,Sun Mar 09 17:26:40 +0000 2014,fake,https://pbs.twimg.com/media/BiNMP4cCMAERNed.jpg,fake,malaysia,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
11253,448804008380362752,#PrayForMH370 ini gambar TIPU !pray for MH370 ...,710305963,malaysia_fake_08,akilaakiid1,Wed Mar 26 12:49:48 +0000 2014,fake,https://pbs.twimg.com/media/BiNMP4cCMAERNed.jpg,fake,malaysia,mediaeval2015\devset\MediaEval2015_DevSet_Imag...
11254,442487170243125248,รูปที่มั่วกันในเน็ตรูปนี้ จริงๆเป็นเที่ยวบิน U...,436603265,malaysia_fake_08,KawinKethirun,Sun Mar 09 02:28:57 +0000 2014,fake,https://pbs.twimg.com/media/BiNMP4cCMAERNed.jpg,fake,malaysia,mediaeval2015\devset\MediaEval2015_DevSet_Imag...


In [12]:
# Prepare image data
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def load_image(image_path, target_size=(224, 224)):
    # Check if file exists with valid extensions
    valid_extensions = ['.jpg', '.jpeg', '.png','.gif']
    for ext in valid_extensions:
        img_path = os.path.join(image_path + ext)
        if os.path.exists(img_path):
            break
    else:
        print(f"ERROR: Image not found: {image_path}")
        return None

    # Load and preprocess image
    img = load_img(img_path, target_size=target_size)
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img /= 255.

    return img

image_data = []
for i in range(len(merged_df)):
    img_id = merged_df.loc[i, 'image_id']
    img_dir = merged_df.loc[merged_df['image_id'] == img_id, 'image_dir'].values[0]
    img_path = img_dir + '\\' + img_id
    img = load_image(img_path, target_size=(224, 224))
    if img is None:
        print(img_path+" is none")
        merged_df = merged_df.drop(merged_df[merged_df['image_id'] == img_id].index)
    else:
        image_data.append(img)
#image_data = np.vstack(image_data)

# Prepare labels
#labels = tweet_df['label'].values

ERROR: Image not found: mediaeval2015\devset\MediaEval2015_DevSet_Images\Medieval2015_DevSet_Images\BostonMarathon\fakes\boston_fake_35
mediaeval2015\devset\MediaEval2015_DevSet_Images\Medieval2015_DevSet_Images\BostonMarathon\fakes\boston_fake_35 is none


In [13]:
image_data

[array([[[[0.27450982, 0.3647059 , 0.39607844],
          [0.2784314 , 0.36862746, 0.4       ],
          [0.26666668, 0.35686275, 0.3882353 ],
          ...,
          [0.        , 0.05098039, 0.0627451 ],
          [0.        , 0.04705882, 0.05882353],
          [0.01568628, 0.05882353, 0.07450981]],
 
         [[0.29803923, 0.3882353 , 0.41960785],
          [0.29803923, 0.3882353 , 0.41960785],
          [0.2901961 , 0.38039216, 0.4117647 ],
          ...,
          [0.01176471, 0.07058824, 0.08235294],
          [0.01568628, 0.07058824, 0.08235294],
          [0.01176471, 0.06666667, 0.07843138]],
 
         [[0.3137255 , 0.40392157, 0.43529412],
          [0.30588236, 0.39607844, 0.42745098],
          [0.30588236, 0.39607844, 0.42745098],
          ...,
          [0.01568628, 0.07450981, 0.10196079],
          [0.01960784, 0.07843138, 0.10588235],
          [0.00784314, 0.05882353, 0.09019608]],
 
         ...,
 
         [[0.67058825, 0.627451  , 0.54901963],
          [0.73725

In [14]:
print(len(image_data))
print(len(merged_df))

11255
11255


In [15]:
image_data = np.vstack(image_data)

# Building the multimodal model

In [20]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, concatenate, Dense

EMBEDDING_DIM = 100 
MAX_SEQUENCE_LENGTH = 100 
MAX_IMAGE_DIM = (224, 224) 

# Loading pre-trained GloVe embeddings
glove_path = 'glove.6B\glove.6B.100d.txt'
embeddings_index = {}
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Tokenizing and encoding tweet text
tokenizer = Tokenizer(num_words=MAX_SEQUENCE_LENGTH)
tokenizer.fit_on_texts(merged_df['tweetText']) # df is the dataframe with tweet text
sequences = tokenizer.texts_to_sequences(merged_df['tweetText'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Creating embedding matrix
num_words = min(MAX_SEQUENCE_LENGTH, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_SEQUENCE_LENGTH:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector



# Defining the model
text_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
embedded_sequences = embedding_layer(text_input)
dense_layer = Dense(128, activation='relu')
dense_output = dense_layer(embedded_sequences)

image_input = Input(shape=(*MAX_IMAGE_DIM, 3))
conv_layer = Conv2D(32, (3, 3), activation='relu')
pool_layer = MaxPooling2D((2, 2))
flatten_layer = Flatten()
conv_output = conv_layer(image_input)
pool_output = pool_layer(conv_output)
flatten_output = flatten_layer(pool_output)
dense_output = Dense(128, activation='relu')(flatten_output)

concatenated = concatenate([dense_output, flatten_output], axis=-1)
dense_layer = Dense(64, activation='relu')(concatenated)
output_layer = Dense(2, activation='softmax')(dense_layer)

model = Model(inputs=[text_input, image_input], outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert label strings to numeric values
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(merged_df['label_x'])

# Convert numeric labels to binary arrays
labels = to_categorical(labels, num_classes=2)

model.fit([data, image_data], labels, epochs=10, batch_size=32, validation_split=0.2, callbacks=[EarlyStopping(patience=5)])

Found 23288 unique tokens.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.History at 0x2fc11a0cfd0>

In [21]:
#loading the test tweet data
test_tweet_df = pd.read_csv('mediaeval2015/testset/tweets.txt', delimiter='\t', header=1, names=['tweetId', 'tweetText', 'userId', 'image_id','username','timestamp', 'label'])
test_tweet_df = test_tweet_df[test_tweet_df['label'].isin(['fake', 'real'])]
test_tweet_df

Unnamed: 0,tweetId,tweetText,userId,image_id,username,timestamp,label
0,578874632670953472,Absolutely beautiful! RT @Shyman33: Eclipse fr...,344707006,eclipse_01,JaredUcanChange,Fri Mar 20 11:04:02 +0000 2015,fake
1,578891261353984000,“@Shyman33: Eclipse from ISS.... http://t.co/C...,224839607,eclipse_01,tpjp1231,Fri Mar 20 12:10:06 +0000 2015,fake
2,578846612312748032,Eclipse from ISS.... http://t.co/En87OtvsU6,134543073,eclipse_01,Shyman33,Fri Mar 20 09:12:41 +0000 2015,fake
3,578975333841551360,@ebonfigli: Éclipse vue de l'ISS... Autre chos...,1150728872,eclipse_01,Epimethee_,Fri Mar 20 17:44:11 +0000 2015,fake
4,579274670853226496,“@ebonfigli: Éclipse vue de l'ISS... Autre cho...,470889709,eclipse_01,BusineMi,Sat Mar 21 13:33:38 +0000 2015,fake
...,...,...,...,...,...,...,...
3749,578433150071775232,Un présentateur de la ZDF confesse avoir truqu...,257551211,varoufakis_1,Cdt_Sylvestre,Thu Mar 19 05:49:44 +0000 2015,fake
3750,578433646597656576,"Oh les kleine menteurs ""@CorineBarella: Un pré...",27575883,varoufakis_1,damomarc,Thu Mar 19 05:51:42 +0000 2015,fake
3751,578486910491996160,Este es el programa de ZDF en el que confirman...,2049211,varoufakis_1,javierpascual,Thu Mar 19 09:23:21 +0000 2015,fake
3752,578505023912591360,11.34 - wir haben FAST Mittag ▶ Riesen Verwirr...,262222386,varoufakis_1,aotto1968_2,Thu Mar 19 10:35:20 +0000 2015,fake


In [22]:
#Fetching the path of image directory to a new column image_dir

img_dir = 'mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages'
folders = os.listdir(img_dir)
folders

['Garissa_Attack',
 'Nepal_earthquake',
 'Samurai_ghost',
 'Solar_eclipse',
 'Syrian_boy',
 'Varoufakis_zdf']

In [23]:

for index, row in test_tweet_df.iterrows():
    prefix = row['image_id'].split('_')[0]
    for folder in folders:
        if prefix.lower() in folder.lower():
            test_tweet_df.at[index, 'image_dir'] = img_dir+'/'+folder
            break
            
test_tweet_df
test_tweet_df['image_id'] = test_tweet_df['image_id'].str.rstrip()

test_tweet_df.head()

Unnamed: 0,tweetId,tweetText,userId,image_id,username,timestamp,label,image_dir
0,578874632670953472,Absolutely beautiful! RT @Shyman33: Eclipse fr...,344707006,eclipse_01,JaredUcanChange,Fri Mar 20 11:04:02 +0000 2015,fake,mediaeval2015/testset/MediaEval2015_TestSetIma...
1,578891261353984000,“@Shyman33: Eclipse from ISS.... http://t.co/C...,224839607,eclipse_01,tpjp1231,Fri Mar 20 12:10:06 +0000 2015,fake,mediaeval2015/testset/MediaEval2015_TestSetIma...
2,578846612312748032,Eclipse from ISS.... http://t.co/En87OtvsU6,134543073,eclipse_01,Shyman33,Fri Mar 20 09:12:41 +0000 2015,fake,mediaeval2015/testset/MediaEval2015_TestSetIma...
3,578975333841551360,@ebonfigli: Éclipse vue de l'ISS... Autre chos...,1150728872,eclipse_01,Epimethee_,Fri Mar 20 17:44:11 +0000 2015,fake,mediaeval2015/testset/MediaEval2015_TestSetIma...
4,579274670853226496,“@ebonfigli: Éclipse vue de l'ISS... Autre cho...,470889709,eclipse_01,BusineMi,Sat Mar 21 13:33:38 +0000 2015,fake,mediaeval2015/testset/MediaEval2015_TestSetIma...


Loading the test images

In [24]:
len(test_tweet_df)
#test_tweet_df.dropna(subset=['image_dir'], inplace=True)
test_tweet_df = test_tweet_df[test_tweet_df['image_dir'].notna()].reset_index(drop=True)

print(test_tweet_df)

                 tweetId                                          tweetText  \
0     578874632670953472  Absolutely beautiful! RT @Shyman33: Eclipse fr...   
1     578891261353984000  “@Shyman33: Eclipse from ISS.... http://t.co/C...   
2     578846612312748032        Eclipse from ISS.... http://t.co/En87OtvsU6   
3     578975333841551360  @ebonfigli: Éclipse vue de l'ISS... Autre chos...   
4     579274670853226496  “@ebonfigli: Éclipse vue de l'ISS... Autre cho...   
...                  ...                                                ...   
1980  578433150071775232  Un présentateur de la ZDF confesse avoir truqu...   
1981  578433646597656576  Oh les kleine menteurs "@CorineBarella: Un pré...   
1982  578486910491996160  Este es el programa de ZDF en el que confirman...   
1983  578505023912591360  11.34 - wir haben FAST Mittag ▶ Riesen Verwirr...   
1984  578305144380612609  Sorry, @yanisvaroufakis! https://t.co/BSkYrbII...   

          userId      image_id         username  \


In [25]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def load_image(image_path, target_size=(224, 224)):
    # Check if file exists with valid extensions
    valid_extensions = ['.jpg', '.jpeg', '.png','.gif']
    for ext in valid_extensions:
        img_path = os.path.join(image_path + ext)
        if os.path.exists(img_path):
            break
        else:
            #print(f"ERROR: Image not found: {image_path}")
            return None

    # Load and preprocess image
    img = load_img(img_path, target_size=target_size)
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img /= 255.

    return img


test_image_data = []
to_drop = []
for i in range(len(test_tweet_df)):
    img_id = test_tweet_df.loc[i, 'image_id']
    img_dir = test_tweet_df.loc[test_tweet_df['image_id'] == img_id, 'image_dir'].values[0]
    img_path = img_dir + "/" + img_id
    img = load_image(img_path, target_size=(224, 224))
    if img is None:
        print(img_path + " is none")
        to_drop.append(i)
    else:
        test_image_data.append(img)

if to_drop:
    test_tweet_df = test_tweet_df.drop(test_tweet_df.index[to_drop])

mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Solar_eclipse/eclipse_video_01 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Solar_eclipse/eclipse_video_01 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_Te

mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaEval2015_TestSetImages/TestSetImages/Samurai_ghost/samurai_01,samurai_02 is none
mediaeval2015/testset/MediaE

In [26]:
len(test_image_data)

1812

In [27]:
len(test_tweet_df)

1812

In [28]:
test_image_data = np.vstack(test_image_data)

# Predicting the test labels

In [29]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Convert label strings to numeric values
label_encoder = LabelEncoder()
test_tweet_df['label'] = label_encoder.fit_transform(test_tweet_df['label'])

# Tokenize and encode tweet text
tokenizer = Tokenizer(num_words=MAX_SEQUENCE_LENGTH)
tokenizer.fit_on_texts(test_tweet_df['tweetText'])
sequences = tokenizer.texts_to_sequences(test_tweet_df['tweetText'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Get the predicted labels for the test data
y_pred = model.predict([data, test_image_data])
y_pred = np.argmax(y_pred, axis=1)

# Print the classification report for real news
print("Classification Report for Real Data:")
print(classification_report(test_tweet_df['label'], y_pred, labels=[0], target_names=["real"]))

# Print the classification report for fake news
print("Classification Report for Fake Data:")
print(classification_report(test_tweet_df['label'], y_pred, labels=[1], target_names=["fake"]))


Found 4639 unique tokens.
Classification Report for Real Data:
              precision    recall  f1-score   support

        real       0.99      0.55      0.71       610

   micro avg       0.99      0.55      0.71       610
   macro avg       0.99      0.55      0.71       610
weighted avg       0.99      0.55      0.71       610

Classification Report for Fake Data:
              precision    recall  f1-score   support

        fake       0.81      1.00      0.90      1202

   micro avg       0.81      1.00      0.90      1202
   macro avg       0.81      1.00      0.90      1202
weighted avg       0.81      1.00      0.90      1202

