<a href="https://colab.research.google.com/github/marco-siino/fake_news_spreaders_detection/blob/main/NaiveBayes_ModelNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Automated Detection of Fake News Spreaders: An Evaluative Study of Transformers and SOTA Models on Multilingual Dataset. 
Naive Bayes Model, Training and Testing Notebook.
Code by M. Siino. 

From the paper: "Automated Detection of Fake News Spreaders: An Evaluative Study of Transformers and SOTA Models on Multilingual Dataset." by M.Siino et al.



## Importing modules.

In [1]:
import heapq
import os
import re
import shutil
import string
from io import open
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from google.colab import files
from keras.models import Model
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

## Downloading the dataset and extracting it into the current working directory.

In [2]:
# Url obtained starting from this: https://drive.google.com/file/d/19ZcqEv88euKB71HfAWjTGN3uCKp2qsfP/ and forcing export=download.
urlTrainingSet = "https://drive.google.com/uc?export=download&id=19ZcqEv88euKB71HfAWjTGN3uCKp2qsfP"
urlTestSet="https://drive.google.com/uc?export=download&id=1nLiYvsnqcSPsS27YrBzlFjinmXwidIYa"

training_set = tf.keras.utils.get_file("pan20-author-profiling-training-2020-02-23.zip", urlTrainingSet,
                                    extract=True, archive_format='zip',cache_dir='.',
                                    cache_subdir='')
test_set = tf.keras.utils.get_file("pan20-author-profiling-test-2020-02-23.zip", urlTestSet,
                                    extract=True, archive_format='zip',cache_dir='.',
                                    cache_subdir='')

training_set_dir = os.path.join(os.path.dirname(training_set), 'pan20-author-profiling-training-2020-02-23')
test_set_dir = os.path.join(os.path.dirname(test_set), 'pan20-author-profiling-test-2020-02-23')

print(training_set)
print(training_set_dir)

!ls -A

Downloading data from https://drive.google.com/uc?export=download&id=19ZcqEv88euKB71HfAWjTGN3uCKp2qsfP
Downloading data from https://drive.google.com/uc?export=download&id=1nLiYvsnqcSPsS27YrBzlFjinmXwidIYa
./pan20-author-profiling-training-2020-02-23.zip
./pan20-author-profiling-training-2020-02-23
.config
__MACOSX
pan20-author-profiling-test-2020-02-23
pan20-author-profiling-test-2020-02-23.zip
pan20-author-profiling-training-2020-02-23
pan20-author-profiling-training-2020-02-23.zip
sample_data


## Build the folder hierarchy required by the Keras directory-based dataset loader.

In [3]:
### Training Folders. ###

# First level directory.
if not os.path.exists('train_dir_en'):
    os.makedirs('train_dir_en')
if not os.path.exists('train_dir_es'):
    os.makedirs('train_dir_es')

# Class labels directory.
if not os.path.exists('train_dir_en/0'):
    os.makedirs('train_dir_en/0')
if not os.path.exists('train_dir_es/0'):
    os.makedirs('train_dir_es/0')
if not os.path.exists('train_dir_en/1'):
    os.makedirs('train_dir_en/1')
if not os.path.exists('train_dir_es/1'):
    os.makedirs('train_dir_es/1')

# Make Py variables.
train_dir='train_dir_'

## Test Folders. ##
# First level directory.
if not os.path.exists('test_dir_en'):
    os.makedirs('test_dir_en')
if not os.path.exists('test_dir_es'):
    os.makedirs('test_dir_es')

# Class labels directory.
if not os.path.exists('test_dir_en/0'):
    os.makedirs('test_dir_en/0')
if not os.path.exists('test_dir_es/0'):
    os.makedirs('test_dir_es/0')
if not os.path.exists('test_dir_en/1'):
    os.makedirs('test_dir_en/1')
if not os.path.exists('test_dir_es/1'):
    os.makedirs('test_dir_es/1')

# Make Py variables.
test_dir='test_dir_'

!ls -A

.config						sample_data
__MACOSX					test_dir_en
pan20-author-profiling-test-2020-02-23		test_dir_es
pan20-author-profiling-test-2020-02-23.zip	train_dir_en
pan20-author-profiling-training-2020-02-23	train_dir_es
pan20-author-profiling-training-2020-02-23.zip


## Set language and directory paths.


In [4]:
# Build, for each language, the path of its training folder and of the truth
# files that map every author id to a class label.
truth_file_test_dir = test_set_dir

# Spanish.
language = 'es'
truth_file_training_dir_es = f"{training_set_dir}/{language}/"
truth_file_training_path_es = f"{truth_file_training_dir_es}truth.txt"
truth_file_test_path_es = f"{truth_file_test_dir}/{language}.txt"

# English.
language = 'en'
truth_file_training_dir_en = f"{training_set_dir}/{language}/"
truth_file_training_path_en = f"{truth_file_training_dir_en}truth.txt"
truth_file_test_path_en = f"{truth_file_test_dir}/{language}.txt"

## Read truth.txt to organize training dataset folders.



In [5]:
# Move every training XML file into ./train_dir_<language>/<label>/ and rename
# it to <author id>.txt, using the labels listed in each language's truth.txt.
for language, truth_path, source_dir in (
    ('en', truth_file_training_path_en, truth_file_training_dir_en),
    ('es', truth_file_training_path_es, truth_file_training_dir_es),
):
    # The context manager closes the truth file even if a move fails.
    with open(truth_path, "r") as truth_file:
        for line in truth_file:
            # Skip blank lines (e.g. a trailing empty line), which carry no label.
            if not line.strip():
                continue

            # Each record has the form "<author id>:::<label>".
            x = line.split(":::")
            fNameXml = x[0]+'.xml'
            fNameTxt = x[0]+'.txt'
            # Keep only the first character of the second field, so the
            # trailing newline is not part of the label.
            label = x[1][0]

            # Files already moved by a previous run are simply skipped.
            if os.path.exists(source_dir+fNameXml):
                os.rename(source_dir+fNameXml, './train_dir_'+language+'/'+label+'/'+fNameTxt )

## Read truth.txt to organize test dataset folders.

In [6]:
# Move every test XML file into ./test_dir_<language>/<label>/ and rename it to
# <author id>.txt, using the labels listed in each language's truth file.
for language, truth_path in (
    ('en', truth_file_test_path_en),
    ('es', truth_file_test_path_es),
):
    # Folder that holds this language's test XML files.
    source_dir = truth_file_test_dir+'/'+language+'/'

    # The context manager closes the truth file even if a move fails.
    with open(truth_path, "r") as truth_file:
        for line in truth_file:
            # Skip blank lines (e.g. a trailing empty line), which carry no label.
            if not line.strip():
                continue

            # Each record has the form "<author id>:::<label>".
            x = line.split(":::")
            fNameXml = x[0]+'.xml'
            fNameTxt = x[0]+'.txt'
            # Keep only the first character of the second field, so the
            # trailing newline is not part of the label.
            label = x[1][0]

            # Files already moved by a previous run are simply skipped.
            if os.path.exists(source_dir+fNameXml):
                os.rename(source_dir+fNameXml, './test_dir_'+language+'/'+label+'/'+fNameTxt )

## Function to pre-process source text.

In [7]:
def custom_standardization(input_data):
  """Strip the XML wrapper markup from raw author documents.

  Applies a fixed sequence of regex replacements that removes the CDATA
  delimiters and the <author ...>, </author>, <documents> and </documents>
  tags. No other transformation (lower-casing, punctuation stripping, ...)
  is applied here.

  Args:
    input_data: string tensor holding the raw file contents.

  Returns:
    A string tensor with the listed markup replaced.
  """
  # (pattern, replacement) pairs, applied in order. The patterns containing
  # backslash escapes are raw strings so Python does not warn about invalid
  # escape sequences; the text handed to the regex engine is unchanged.
  replacements = (
      (r'<\!\[CDATA\[', ' '),
      (r'\]{1,}>', ' '),
      ('<author lang="es">', ' '),
      ('<author lang="en">', ' '),
      ('</author>', ' '),
      # The opening tag, its newline and up to two tabs are dropped entirely.
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ' '),
  )

  output_data = input_data
  for pattern, rewrite in replacements:
    output_data = tf.strings.regex_replace(output_data, pattern, rewrite)
  return output_data

## First model's layer: Text Vectorization.

In [8]:
# Maximum number of words allowed in our dictionary.
max_features = 76000
# After tokenization, 4060 covers all the document lengths in our dataset.
sequence_length = 4060

# Settings shared by the Spanish and English vectorization layers.
vectorization_options = dict(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# The layer names are set explicitly to the values Keras assigns automatically
# on a fresh kernel. Code that looks the layers up by name therefore keeps
# working when this cell is re-run (auto-generated names would otherwise get a
# new numeric suffix on every run).
vectorize_layer_es = TextVectorization(name='text_vectorization', **vectorization_options)
vectorize_layer_en = TextVectorization(name='text_vectorization_1', **vectorization_options)

## Building the dataset.

In [9]:
batch_size=1


def load_text_dataset(directory):
  """Load the labelled text files under `directory` as a tf.data dataset.

  The class sub-folders of `directory` provide the labels. Files are read in
  a fixed order: `shuffle` must be the boolean False, because a non-empty
  string such as 'false' is truthy and would turn shuffling on.
  """
  return tf.keras.preprocessing.text_dataset_from_directory(
      directory,
      batch_size=batch_size,
      shuffle=False)


# Build the dataset for Spanish.
language='es'

raw_train_ds_es = load_text_dataset(train_dir+language)

# Fit the vocabulary on the training texts only (labels are dropped).
train_text = raw_train_ds_es.map(lambda x, y: x)
vectorize_layer_es.adapt(train_text)

raw_test_ds_es = load_text_dataset(test_dir+language)


# Build the dataset for English.
language='en'

raw_train_ds_en = load_text_dataset(train_dir+language)

# Fit the vocabulary on the training texts only (labels are dropped).
train_text = raw_train_ds_en.map(lambda x, y: x)
vectorize_layer_en.adapt(train_text)

raw_test_ds_en = load_text_dataset(test_dir+language)


Found 300 files belonging to 2 classes.
Found 200 files belonging to 2 classes.
Found 300 files belonging to 2 classes.
Found 200 files belonging to 2 classes.


## Model definition.

In [10]:
# Word embedding dimensions.
embedding_dim = 32


def build_cnn_model(vectorize_layer):
  """Build and compile the convolutional text classifier for one language.

  Args:
    vectorize_layer: the TextVectorization layer (already adapted on the
      language's training texts) used as the first layer of the model.

  Returns:
    A compiled tf.keras.Sequential model that maps raw strings to one logit.
  """
  model = tf.keras.Sequential([
      tf.keras.Input(shape=(1,), dtype=tf.string),
      vectorize_layer,
      layers.Embedding(max_features + 1, embedding_dim),
      layers.Dropout(0.8),

      layers.Conv1D(32,32),
      layers.MaxPooling1D(),
      layers.Dropout(0.5),

      layers.Conv1D(32,16),
      layers.MaxPooling1D(),
      layers.Dropout(0.5),

      layers.GlobalAveragePooling1D(),
      layers.Dense(1)
  ])
  # The last layer outputs a logit, hence from_logits=True and a decision
  # threshold of 0.0. Each model gets its own optimizer instance so that no
  # optimizer state is shared between the two languages.
  model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.RMSprop(),
                metrics=tf.metrics.BinaryAccuracy(threshold=0.0))
  return model


# Spanish is built first so the automatically generated layer and model names
# keep the same order as before.
model_es = build_cnn_model(vectorize_layer_es)
model_en = build_cnn_model(vectorize_layer_en)
model_es.summary()
model_en.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 4060)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 4060, 32)          2432032   
                                                                 
 dropout (Dropout)           (None, 4060, 32)          0         
                                                                 
 conv1d (Conv1D)             (None, 4029, 32)          32800     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 2014, 32)         0         
 )                                                               
                                                                 
 dropout_1 (Dropout)         (None, 2014, 32)          0

In [11]:
# Train and test a Multinomial Naive Bayes classifier, first on the Spanish
# dataset and then on the English one. Each document is turned into a vector of
# token counts (bag of words) using the language's text-vectorization layer.

def count_tokens(token_ids, n_features):
  """Return how many times each token index occurs in `token_ids`.

  Args:
    token_ids: 1-D integer array of vocabulary indices for one document.
    n_features: length of the returned vector (vocabulary size).

  Returns:
    A float64 array of length `n_features`; entry i is the count of index i.
  """
  # NOTE(review): every index is counted, including index 0, which presumably
  # is the padding value of the vectorized sequence -- confirm this is intended.
  return np.bincount(token_ids, minlength=n_features).astype(np.float64)


for language_name, language_model, vectorizer, train_ds, test_ds in (
    ("Spanish", model_es, vectorize_layer_es, raw_train_ds_es, raw_test_ds_es),
    ("English", model_en, vectorize_layer_en, raw_train_ds_en, raw_test_ds_en),
):
  # Sub-model that stops at the text-vectorization layer. It is built once per
  # language (not once per document), and the layer is looked up through the
  # layer object's own name rather than a hard-coded auto-generated one.
  text_vect_layer_model = tf.keras.Model(
      inputs=language_model.input,
      outputs=language_model.get_layer(vectorizer.name).output)

  # Training set: one count vector and one label per document (batch size 1).
  training_labels=[]
  training_samples=[]
  for authorDocument, label in train_ds:
    text_vect_out = text_vect_layer_model(authorDocument)
    training_labels.append(label[0].numpy())
    training_samples.append(count_tokens(text_vect_out[0].numpy(), max_features))

  training_labels=np.array(training_labels)
  training_samples=np.array(training_samples)

  clf = MultinomialNB()
  clf.fit(training_samples, training_labels)
  print("\nAccuracy on "+language_name+" training set is: ",clf.score(training_samples,training_labels))

  # Now prepare the test set to measure the final accuracy.
  test_labels=[]
  test_samples=[]
  for authorDocument, label in test_ds:
    text_vect_out = text_vect_layer_model(authorDocument)
    test_labels.append(label[0].numpy())
    test_samples.append(count_tokens(text_vect_out[0].numpy(), max_features))

  test_labels=np.array(test_labels)
  test_samples=np.array(test_samples)

  print("Accuracy on "+language_name+" test set is: ",clf.score(test_samples,test_labels))


Accuracy on Spanish training set is:  0.94
Accuracy on Spanish test set is:  0.695

Accuracy on English training set is:  0.9833333333333333
Accuracy on English test set is:  0.695


## Statistics (no need to execute).

In [None]:
# Class log-probabilities for every sample in the test set. Kept in a variable
# because a cell only displays its last expression.
test_log_proba = clf.predict_log_proba(test_samples)

# Class log-probabilities for a single sample. The classifier was fitted on
# token-count vectors, so it must be given the count vector of the document
# (here the last test sample), not the raw sequence of token ids, whose length
# differs from the number of features the classifier expects.
clf.predict_log_proba(test_samples[-1].reshape(1,-1))

In [None]:
# Per-label token counts accumulated by the fitted classifier (despite their
# names, these hold the counts of every token, not just ten).
ten_most_frequent_ngram_label0=clf.feature_count_[0]
ten_most_frequent_ngram_label1=clf.feature_count_[1]

# Indices of the 400 most frequent tokens of each label, most frequent first.
# heapq comes from the notebook's import cell.
maximum0=heapq.nlargest(400, range(len(ten_most_frequent_ngram_label0)), ten_most_frequent_ngram_label0.take)
maximum1=heapq.nlargest(400, range(len(ten_most_frequent_ngram_label1)), ten_most_frequent_ngram_label1.take)

print(maximum0)
print(maximum1)

# Keep only the tokens that are in one label's top 400 but not in the other's.
max0_not_in_max1=set(maximum0)-set(maximum1)
max1_not_in_max0=set(maximum1)-set(maximum0)

# `clf` is the classifier fitted last, i.e. the English one, so token indices
# are translated with the English vocabulary. The vocabulary is fetched once
# instead of once per printed word.
vocabulary = vectorize_layer_en.get_vocabulary()

print("\nLe 50 parole più probabili per la label 0 sono:")
for wordNr, word in enumerate(max0_not_in_max1, start=1):
  print(wordNr," ",vocabulary[word])


print("\nLe 50 parole più probabili per la label 1 sono:")
for wordNr, word in enumerate(max1_not_in_max0, start=1):
  print(wordNr," ",vocabulary[word])

print(clf.feature_log_prob_[1])

[0, 2, 3, 4, 5, 6, 7, 9, 8, 10, 11, 14, 16, 13, 15, 12, 17, 21, 18, 19, 20, 22, 23, 25, 24, 27, 28, 26, 33, 30, 32, 46, 38, 34, 35, 41, 29, 40, 45, 48, 50, 43, 66, 31, 54, 39, 52, 42, 44, 61, 62, 56, 57, 60, 51, 59, 47, 55, 64, 69, 49, 63, 70, 82, 53, 84, 58, 74, 79, 71, 115, 67, 80, 77, 72, 92, 97, 76, 128, 95, 88, 86, 90, 102, 89, 87, 85, 107, 91, 75, 109, 110, 101, 78, 99, 73, 106, 117, 123, 166, 127, 100, 98, 116, 118, 132, 131, 129, 93, 126, 143, 146, 96, 122, 148, 121, 139, 81, 135, 140, 111, 112, 113, 114, 124, 147, 104, 141, 168, 182, 103, 177, 144, 94, 145, 154, 162, 120, 137, 160, 169, 83, 108, 155, 158, 130, 151, 199, 206, 119, 138, 150, 152, 149, 209, 246, 181, 211, 156, 171, 315, 221, 201, 192, 217, 133, 208, 65, 157, 179, 216, 185, 212, 105, 187, 231, 341, 163, 183, 210, 237, 348, 142, 174, 175, 196, 178, 203, 215, 230, 240, 262, 189, 161, 173, 205, 167, 190, 200, 219, 265, 170, 229, 180, 193, 197, 214, 198, 220, 225, 228, 232, 267, 134, 153, 223, 244, 273, 279, 323, 186,