In [None]:
# Bibliotecas além do gerenciador Anaconda
!pip install spacy
!python -m spacy download pt_core_news_sm
!pip install wordcloud
!pip install gensim
!pip install tensorflow

In [1]:
# Módulos básicos para manuseio de dados e arquivos
import numpy as np
import pandas as pd
import os
from os.path import isfile, join
import nltk
import re
import string
import unicodedata

# Módulos para visualização de dados
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
%matplotlib inline

# Módulo para processamento de linguagem
import spacy

## Carregamento de textos

In [170]:
limited_news_path = r'Software\Fake.br-Corpus' #\fake_10 or \true_10
news_path = r'Software\Fake.br-Corpus\full_texts' #\fake or \true

paths = [limited_news_path, news_path]

In [171]:
def sortDir(dir_path: str, is_meta=False) ->list:
    '''
    Ordena os arquivos dentro de dir_path e os retorna no formato de lista.
    '''
    if is_meta:
        number_separator = "-meta.txt"
    else:
        number_separator = ".txt"

    first_list = os.listdir(dir_path)
    int_list = [int(element.split(number_separator)[0]) for element in first_list]
    int_list.sort()
    final_list = [(str(element) + number_separator) for element in int_list]

    return final_list

def txtToDataframe(path, is_limited=True):
    '''
    Function for converting full texts to a single DataFrame.
    '''
    if is_limited:
        true_files = [path+"\\true_10\\"+f for f in sortDir(dir_path = path+'\\true_10') if isfile(join(path+'\\true_10', f))]
        fake_files = [path+"\\fake_10\\"+f for f in sortDir(dir_path = path+'\\fake_10') if isfile(join(path+'\\fake_10', f))]
    else:
        true_files = [path+"\\true\\"+f for f in sortDir(dir_path = path+'\\true') if isfile(join(path+'\\true', f))]
        fake_files = [path+"\\fake\\"+f for f in sortDir(dir_path = path+'\\fake') if isfile(join(path+'\\fake', f))]
    
    texts = []
    labels = []
    
    for file in true_files:
        with open(file, encoding='utf-8-sig') as f:
            texts.append(f.read())
            labels.append(0)
    for file in fake_files:
        with open(file, encoding='utf-8-sig') as f:
            texts.append(f.read())
            labels.append(1)
            
    df = pd.DataFrame(list(zip(texts,labels)),columns=['texts','labels'])
    
    # Com esta função, textos e labels foram inseridos em um DataFrame de maneira sequencial. Todas as notícias verdadeiras vêm
    # ANTES do bloco de notícias falsas.
    
    return df

def appendMetadata(path,df, is_limited=True):
    '''
    Function for appending metadata to previously generated news DataFrame.
    '''
    if is_limited:
        true_meta = [path+"\\true-meta-information-10\\"+f for f in sortDir(dir_path = path+'\\true-meta-information-10',is_meta=True) if isfile(join(path+'\\true-meta-information-10', f))]
        fake_meta = [path+"\\fake-meta-information-10\\"+f for f in sortDir(dir_path = path+'\\fake-meta-information-10',is_meta=True) if isfile(join(path+'\\fake-meta-information-10', f))]
    else:
        true_meta = [path+"\\true-meta-information\\"+f for f in sortDir(dir_path = path+'\\true-meta-information',is_meta=True) if isfile(join(path+'\\true-meta-information', f))]
        fake_meta = [path+"\\fake-meta-information\\"+f for f in sortDir(dir_path = path+'\\fake-meta-information',is_meta=True) if isfile(join(path+'\\fake-meta-information', f))]
    

    #true_meta e fake_meta são listas com todas os paths para arquivos de metadata.
    
    columns = ["author", "source", "category", "date","tokens","words_without_punctuation","types","number_of_links","uppercase_words","verbs","subjuntive_imperative","nouns","adjectives","adverbs","modal_verbs","singular_first_and_second_personal_pronouns","plural_first_personal_pronouns","pronouns","pausality","characters","avg_sentence_length","avg_word_length","percentage_of_spelling_errors","emotiveness","diversity"]
    
    true_metadata = pd.DataFrame(columns=columns)
    fake_metadata = pd.DataFrame(columns=columns)
    
    for file in true_meta:
        #print(file)
        aux = pd.read_csv(file, header=None, sep = '\n').transpose()
        aux.columns = columns
        true_metadata=true_metadata.append(aux)
        
        
    for file in fake_meta:
        #print(file)
        aux = pd.read_csv(file, header=None, sep = '\n').transpose()
        aux.columns = columns
        fake_metadata=fake_metadata.append(aux)
        
    
    metadata = pd.DataFrame(columns=columns)
    metadata = metadata.append(true_metadata,ignore_index=True)
    metadata = metadata.append(fake_metadata,ignore_index=True) 


    complete_df = pd.concat([df,metadata],axis=1)
    # Este DataFrame possui todos os textos/labels (2 colunas) e metadata (25 colunas).
    
    return complete_df

In [172]:
ai = int(input('''0 - Base com 10 notícias verdadeiras e 10 notícias falsas
1 - Base completa de notícias
'''))

path = paths[ai]

if ai == 0:
    data = txtToDataframe(path) # Dataframe contendo notícias e labels.
    complete_data = appendMetadata(path,data) # Dataframe contendo notícias, labels e metadata.
else:
    data = txtToDataframe(path,is_limited=False)
    complete_data = appendMetadata(path,data,is_limited=False)

0 - Base com 10 notícias verdadeiras e 10 notícias falsas
1 - Base completa de notícias
0


In [173]:
complete_data['labels']

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
Name: labels, dtype: int64

In [58]:
complete_data['texts'][14]

'temer resolve o problema de luislinda \x93liberdade ainda que tardia\x94a ministra dos direitos humanos luislinda valois entrou para a galeria dos \x93sem noção\x94 ao pedir para acumular o salário da função de ministra com a aposentadoria de desembargadora o que daria r  mil muito além do teto constitucional ao fazêlo alegou que sua condição \x93sem sombra de dúvida se assemelha a trabalho escravo\x94agora o jornal carioca o dia divulga que temer decidiu demitir luislinda e dar um fim ao seu problema ao que parece o presidente ainda busca uma saída honrosa para luislinda talvez outro cargo não sabemos aindasua substituta deve ser a deputada federal licenciada tia eron prb atual secretária municipal de promoção socialo que importa é que luislinda enfim conseguiu sua liberdade ainda que tardia'

## Preprocessamento de textos

In [7]:
# Carregando o pacote de língua portuguesa para o processador Spacy
nlp = spacy.load('pt_core_news_sm')

In [174]:
# Defininido funções de preprocessamento

def removePunct(text):
    '''
    Removes any punctuation included in string.punctuation.
    '''
    translator = text.maketrans({key:'' for key in string.punctuation+'“”'}) # Translates any punctuation into ''
    return text.translate(translator)

def removeNumbers(text):
    '''
    Removes any number character in text.
    '''
    return re.sub('[0-9]', '' , text) # Translates any number into ''

def removeStopWords(string):
    '''
    Removes any portuguese stopwords, using Spacy's standard package.
    '''
    doc = nlp(string)
    return ' '.join([token.text for token in doc if token.is_stop is False])

def lemmatize(string):
    '''
    Lemmatizes text word-by-word. Notice that lemmatizing is not as harsh as stemming, which makes the final text easier to read and understand in common language.
    '''
    doc = nlp(string)
    return ' '.join([token.lemma_ for token in doc])

def prep(string, useStopWords = False, lemma = False):
    '''
    Executes previously defined preprocessing in text.
    '''

    result = removeNumbers(removePunct(string)).lower()
    
    if useStopWords and lemma:
        doc = nlp(result)
        result = ' '.join([token.lemma_ for token in doc if token.is_stop is False])
    elif useStopWords:
        doc = nlp(result)
        result = ' '.join([token.text for token in doc if token.is_stop is False])
    elif lemma:
        doc = nlp(result)
        result = ' '.join([token.lemma_ for token in doc])
    return result.replace('\n',"")

In [175]:
# Realizando preprocessamento de textos presentes no Dataframe de notícias completo.

complete_data['texts'] = complete_data['texts'].apply(prep)

In [24]:
complete_data['texts'][14]

'temer resolve o problema de luislinda \x93liberdade ainda que tardia\x94a ministra dos direitos humanos luislinda valois entrou para a galeria dos \x93sem noção\x94 ao pedir para acumular o salário da função de ministra com a aposentadoria de desembargadora o que daria r  mil muito além do teto constitucional ao fazêlo alegou que sua condição \x93sem sombra de dúvida se assemelha a trabalho escravo\x94agora o jornal carioca o dia divulga que temer decidiu demitir luislinda e dar um fim ao seu problema ao que parece o presidente ainda busca uma saída honrosa para luislinda talvez outro cargo não sabemos aindasua substituta deve ser a deputada federal licenciada tia eron prb atual secretária municipal de promoção socialo que importa é que luislinda enfim conseguiu sua liberdade ainda que tardia'

In [176]:
# Assignando variáveis dependentes e independentes

y = complete_data['labels'].values # y is strings for labels; but should be fake-0/true-1
X = [d.split() for d in complete_data['texts'].tolist()] # X is a list of lists of words.

In [27]:
print(X[0])

['o', 'podemos', 'decidiu', 'expulsar', 'o', 'deputado', 'federal', 'carlos', 'gaguim', 'do', 'partido', 'após', 'a', 'polícia', 'federal', 'fazer', 'buscas', 'a', 'apreensões', 'no', 'gabinete', 'dele', 'na', 'câmara', 'com', 'isso', 'a', 'legenda', 'abre', 'espaço', 'para', 'receber', 'a', 'senadora', 'expulsa', 'pelo', 'pmdb', 'katia', 'abreu', 'por', 'meio', 'de', 'nota', 'a', 'legenda', 'informou', 'que', 'o', 'afastamento', 'do', 'parlamentar', 'já', 'era', 'algo', 'acordado', 'entre', 'os', 'filiados', 'da', 'sigla', 'ainda', 'que', 'o', 'parlamentar', 'tenha', 'comunicado', 'a', 'conclusão', 'de', 'sua', 'desfiliação', 'para', 'esta', 'semana', 'diante', 'dos', 'fatos', 'noticiados', 'hoje', 'a', 'executiva', 'nacional', 'do', 'podemos', 'solicita', 'o', 'imediato', 'cancelamento', 'de', 'sua', 'filiação', 'dos', 'quadros', 'do', 'partidoo', 'partido', 'que', 'no', 'passado', 'chegou', 'a', 'cogitar', 'lançar', 'o', 'parlamentar', 'como', 'candidato', 'ao', 'senado', 'diz', 'qu

## Tokenization (TensorFlow)

In [160]:
# Importando módulos para tokenização de textos
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [177]:
# Assignando bases de teste e treino

test_limit = 0.20
training_sentences,testing_sentences,training_labels,testing_labels = train_test_split(X,y,test_size=test_limit, random_state=42)

#training_sentences = X[0:test_limit]
#testing_sentences = X[test_limit:]

#training_labels = y[0:test_limit]
#testing_labels = y[test_limit:]

In [209]:
type(training_labels)

list

In [178]:
# Gerando dicionário de tokens (com base nos textos de treinamento)

tokenizer = Tokenizer()#oov_token='<OOV>')

tokenizer.fit_on_texts(training_sentences)

In [128]:
tokenizer.word_index

{'de': 1,
 'que': 2,
 'o': 3,
 'a': 4,
 'do': 5,
 'e': 6,
 'da': 7,
 'em': 8,
 'para': 9,
 'não': 10,
 'no': 11,
 'um': 12,
 'uma': 13,
 'na': 14,
 'foi': 15,
 'ao': 16,
 'é': 17,
 'com': 18,
 'por': 19,
 'à': 20,
 'as': 21,
 'como': 22,
 'se': 23,
 'sua': 24,
 'os': 25,
 'dos': 26,
 'ser': 27,
 'são': 28,
 'ele': 29,
 'pelo': 30,
 'seu': 31,
 'partido': 32,
 'bolsonaro': 33,
 'federal': 34,
 'das': 35,
 'vai': 36,
 'disse': 37,
 'diz': 38,
 'público': 39,
 'presidente': 40,
 'ter': 41,
 'governo': 42,
 'tem': 43,
 'paulo': 44,
 'pela': 45,
 'pt': 46,
 'também': 47,
 'mas': 48,
 'ou': 49,
 'silvio': 50,
 'prefeito': 51,
 'psdb': 52,
 'anos': 53,
 'alckmin': 54,
 'já': 55,
 'ministério': 56,
 'contra': 57,
 'estado': 58,
 'até': 59,
 'isso': 60,
 'pode': 61,
 'dia': 62,
 'sobre': 63,
 'ainda': 64,
 'dinheiro': 65,
 'após': 66,
 'muito': 67,
 'projeto': 68,
 'campanha': 69,
 'r': 70,
 'está': 71,
 'deve': 72,
 'você': 73,
 'santos': 74,
 'mais': 75,
 'candidato': 76,
 'suas': 77,
 'kátia

In [179]:
# Realizando a sequencialização das bases de treinamento e teste

training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

In [180]:
# Analisar notícias maiores do que max_length

min_length = 150
max_length = 150

#plt.hist([len(x) for x in X], bins = 750)
#plt.show()

nos = np.array([len(x) for x in training_sentences])
print("There are "+ str(len(nos[nos>=max_length])) + " news equal or longer than "+ str(max_length) + " words (will be truncated).")

nos = np.array([len(x) for x in training_sentences])
print("There are "+ str(len(nos[nos<min_length])) + " news shorter than "+ str(min_length) + " words (will be killed).")

There are 13 news equal or longer than 150 words (will be truncated).
There are 3 news shorter than 150 words (will be killed).


In [181]:
# Matar notícias menores do que min_length

flag_dict = {}
for i in range(0,len(training_sequences)):
    if len(training_sequences[i]) < min_length:
        flag_dict[i] = True
    else:
        flag_dict[i] = False
        
training_labels = [training_labels[i] for i in range(0,len(flag_dict)) if (flag_dict[i] == False)]

training_sequences = [seq for seq in training_sequences if len(seq)>=min_length]

In [182]:
for i in range(0,len(training_sequences)):    
    print(len(training_sequences[i]))

221
275
291
194
281
197
818
245
319
217
419
177
436


In [183]:
# Truncar (padding) das sequências tokenizadas

training_padded = pad_sequences(training_sequences,maxlen=max_length,padding='post',truncating='pre')
#testing_padded = pad_sequences(testing_sequences,maxlen=max_length,padding='post',truncating='pre')

In [117]:
for i in range(0,len(training_padded)):    
    print(len(training_padded[i]))

150
150
150
150
150
150
150
150
150
150
150
150
150


In [184]:
vocab = tokenizer.word_index
inv_vocab = {v: k for k, v in vocab.items()} # reconverte o token para a palavra correpondente.

training_texts = []
testing_texts = []

# Com estas listas "_texts", temos os textos novamente completos, mas agora com 'OOV's para palavras desconhecidas
# na base de testes.
for sequence in training_sequences:
    news=[]
    for token in sequence:
        news.append(inv_vocab[token])  #inv_vocab[token] é a palavra correspondente ao token.
    training_texts.append(news)


for sequence in testing_sequences:
    news=[]
    for token in sequence:
        news.append(inv_vocab[token])
    testing_texts.append(news)

In [107]:
testing_texts[0]

['o',
 '<OOV>',
 'decidiu',
 '<OOV>',
 'o',
 'deputado',
 'federal',
 'carlos',
 '<OOV>',
 'do',
 'partido',
 'após',
 'a',
 'polícia',
 'federal',
 'fazer',
 '<OOV>',
 'a',
 '<OOV>',
 'no',
 '<OOV>',
 'dele',
 'na',
 'câmara',
 'com',
 'isso',
 'a',
 'legenda',
 '<OOV>',
 'espaço',
 'para',
 'receber',
 'a',
 'senadora',
 '<OOV>',
 'pelo',
 'pmdb',
 '<OOV>',
 'abreu',
 'por',
 'meio',
 'de',
 'nota',
 'a',
 'legenda',
 'informou',
 'que',
 'o',
 '<OOV>',
 'do',
 '<OOV>',
 'já',
 'era',
 'algo',
 '<OOV>',
 'entre',
 'os',
 '<OOV>',
 'da',
 'sigla',
 'ainda',
 'que',
 'o',
 '<OOV>',
 'tenha',
 '<OOV>',
 'a',
 '<OOV>',
 'de',
 'sua',
 '<OOV>',
 'para',
 '<OOV>',
 'semana',
 'diante',
 'dos',
 '<OOV>',
 '<OOV>',
 'hoje',
 'a',
 '<OOV>',
 'nacional',
 'do',
 '<OOV>',
 '<OOV>',
 'o',
 '<OOV>',
 '<OOV>',
 'de',
 'sua',
 'filiação',
 'dos',
 '<OOV>',
 'do',
 '<OOV>',
 'partido',
 'que',
 'no',
 'passado',
 'chegou',
 'a',
 '<OOV>',
 '<OOV>',
 'o',
 '<OOV>',
 'como',
 'candidato',
 'ao',
 'sen

## Vectorization (gensim)

In [54]:
import gensim

In [185]:
DIM = 100
w2v_model = gensim.models.Word2Vec(sentences=training_sentences, vector_size=DIM, window=10, min_count=1)

In [186]:
w2v_model.wv.most_similar('bolsonaro')

[('que', 0.7961148023605347),
 ('ao', 0.782345175743103),
 ('em', 0.78111732006073),
 ('de', 0.775870680809021),
 ('o', 0.7723456621170044),
 ('não', 0.7696959376335144),
 ('a', 0.7681581377983093),
 ('com', 0.7658106684684753),
 ('do', 0.7648646831512451),
 ('da', 0.762366771697998)]

In [187]:
w2v_model.wv['bolsonaro']

array([-1.08384872e-02,  2.24295957e-03,  1.08779222e-02,  1.09694432e-03,
        1.20860373e-03, -2.32031234e-02, -4.74540470e-03,  1.38203548e-02,
       -2.57660099e-03, -1.30627202e-02,  3.52965028e-04, -1.96763892e-02,
        1.50028232e-03,  1.05501935e-02,  3.20692034e-03, -1.27249239e-02,
        7.42186233e-03, -6.39519375e-03,  4.31142002e-03, -1.53123429e-02,
        3.92288761e-03,  4.21803631e-03,  1.62784960e-02, -1.50171816e-02,
        5.38172154e-03,  2.06759153e-03, -1.24571305e-02,  4.12423583e-03,
       -1.06467372e-02,  1.27509721e-02,  1.84684712e-02,  5.60031086e-03,
        7.98126496e-03, -1.55025041e-02, -2.18587019e-03,  1.66077744e-02,
        5.84457535e-03, -1.15320152e-02, -4.37451852e-03, -1.58331171e-02,
        6.17942214e-03, -1.74000263e-02,  3.41450330e-03,  4.83150827e-03,
        1.70688871e-02,  5.87627510e-05, -1.32182920e-02, -1.47889473e-03,
        5.61891869e-03, -3.84384044e-03,  1.32964216e-02,  8.59728723e-04,
        6.30329596e-04, -

In [188]:
vocab_size = len(tokenizer.word_index)+1

def get_weight_matrix(model):
    ''' Inserts every word vector in a NumPy array, ordering them by token icon (which is determined by their appearing frequencies). '''
    weight_matrix = np.zeros((vocab_size, DIM))

    for word, token in vocab.items():
        weight_matrix[token] = model.wv[word]

    return weight_matrix

embedding_vectors = get_weight_matrix(model=w2v_model)

## Criação da rede neural

In [147]:
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [189]:
model = Sequential()
model.add(Embedding(vocab_size, output_dim=DIM, weights = [embedding_vectors], input_length=max_length, trainable=False))
model.add(LSTM(units=128))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 150, 100)          170500    
                                                                 
 lstm_1 (LSTM)               (None, 128)               117248    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 287,877
Trainable params: 117,377
Non-trainable params: 170,500
_________________________________________________________________


In [201]:
np.asarray(training_labels)

array([0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

In [200]:
len(training_padded)

13

In [204]:
testing_list

[[array([[   3],
         [   0],
         [1591],
         [   0],
         [   3],
         [ 421],
         [  34],
         [ 349],
         [   0],
         [   5],
         [  32],
         [  66],
         [   4],
         [ 274],
         [  34],
         [ 135],
         [   0],
         [   4],
         [   0],
         [  11],
         [   0],
         [ 434],
         [  14],
         [ 239],
         [  18],
         [  60],
         [   4],
         [ 406],
         [   0],
         [ 502],
         [   9],
         [ 342],
         [   4],
         [1534],
         [   0],
         [  30],
         [ 159],
         [   0],
         [ 260],
         [  19],
         [1279],
         [   1],
         [ 533],
         [   4],
         [ 406],
         [ 528],
         [   2],
         [   3],
         [   0],
         [   5],
         [   0],
         [  55],
         [ 120],
         [ 206],
         [   0],
         [ 196],
         [  25],
         [   0],
         [   7

In [213]:
type(_padded[0])

numpy.ndarray

In [214]:
model.fit(training_padded, np.asarray(training_labels), validation_split=0.3, epochs=10)

testing_list = []

for test_news in testing_sentences:
    aux_list = []
    sequences = tokenizer.texts_to_sequences(test_news)
    padded = pad_sequences(sequences)
    aux_list.append(np.asarray(padded))
    testing_list.append(aux_list)

    
y_pred = (model.predict(testing_list) >=0.5).astype(int)

accuracy_score(testing_labels, y_pred)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


ValueError: Data cardinality is ambiguous:
  x sizes: 145, 170, 185, 288
Make sure all arrays contain the same number of samples.

In [164]:
type(testing_labels)

numpy.ndarray