In [1]:
# Bibliotecas além do gerenciador Anaconda
!pip install spacy
!python -m spacy download pt_core_news_sm
!pip install wordcloud
!pip install gensim
!pip install tensorflow

Collecting pt-core-news-sm==3.2.0

2022-05-25 23:32:06.187788: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-05-25 23:32:06.188929: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.



  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.2.0/pt_core_news_sm-3.2.0-py3-none-any.whl (22.2 MB)
[+] Download and installation successful
You can now load the package via spacy.load('pt_core_news_sm')


In [6]:
# Módulos básicos para manuseio de dados e arquivos
import numpy as np
import pandas as pd
import os
from os.path import isfile, join
import nltk
import re
import string
import unicodedata

# Módulos para visualização de dados
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
%matplotlib inline

# Módulo para processamento de linguagem
import spacy

## Carregamento de textos

In [7]:
# Corpus locations, built with os.path.join so the separators match the host OS
# (the previous raw strings only worked on Windows).
# Sub-folders read from this directory: fake_10 / true_10 (10-article sample).
limited_news_path = os.path.join('Software', 'Fake.br-Corpus')
# Sub-folders read from this directory: fake / true (full corpus).
news_path = os.path.join('Software', 'Fake.br-Corpus', 'full_texts')

# Order matters: the list is indexed by the dataset option (0 = sample, 1 = full).
paths = [limited_news_path, news_path]

In [8]:
def sortDir(dir_path: str, is_meta=False) ->list:
    '''
    Lists the numbered text files found in dir_path, sorted by their number.

    Parameters:
        dir_path: directory to scan.
        is_meta: when True, look for "<n>-meta.txt" files; otherwise "<n>.txt".

    Returns:
        List of file names (not full paths) ordered by increasing <n>.

    Entries that do not follow the "<n><suffix>" pattern (sub-directories such
    as .ipynb_checkpoints, OS metadata files, ...) are skipped instead of
    aborting the whole listing with a ValueError raised by int().
    '''
    number_separator = "-meta.txt" if is_meta else ".txt"

    numbered = []
    for name in os.listdir(dir_path):
        if not name.endswith(number_separator):
            continue
        stem = name[:-len(number_separator)]
        try:
            numbered.append((int(stem), name))
        except ValueError:
            # Not a numbered file (e.g. "5-meta.txt" while listing "<n>.txt").
            continue

    # Sort numerically (so 2 comes before 10) and return the names exactly as
    # they exist on disk; rebuilding them from the int would lose zero padding.
    numbered.sort()
    return [name for _, name in numbered]

def txtToDataframe(path, is_limited=True):
    '''
    Reads every news article under path into a single DataFrame.

    Parameters:
        path: directory containing the "true"/"fake" folders (or the
            "true_10"/"fake_10" folders of the reduced sample).
        is_limited: True to read the "*_10" sample folders, False to read the
            "true" and "fake" folders.

    Returns:
        DataFrame with two columns: 'texts' (article text) and 'labels'
        (0 = true news, 1 = fake news). Rows are sequential: all true articles
        come BEFORE the block of fake articles, each group in the order
        returned by sortDir.
    '''
    suffix = "_10" if is_limited else ""

    texts = []
    labels = []

    # True news first (label 0), then fake news (label 1).
    for folder, label in (("true" + suffix, 0), ("fake" + suffix, 1)):
        # join() keeps the paths valid on any OS (no hard-coded "\\").
        folder_path = join(path, folder)
        for file_name in sortDir(dir_path=folder_path):
            file_path = join(folder_path, file_name)
            if not isfile(file_path):
                continue
            # utf-8-sig drops a leading byte-order mark when one is present.
            with open(file_path, encoding='utf-8-sig') as f:
                texts.append(f.read())
            labels.append(label)

    return pd.DataFrame(list(zip(texts, labels)), columns=['texts', 'labels'])

def appendMetadata(path,df, is_limited=True):
    '''
    Appends the per-article metadata to a news DataFrame built by txtToDataframe.

    Each "<n>-meta.txt" file is expected to hold one value per line, in the
    order of the `columns` list below. Files are read in the same order used
    for the texts (true articles first, then fake ones, each sorted by number),
    so metadata row i lines up with row i of df.

    Parameters:
        path: directory containing the "true-meta-information" and
            "fake-meta-information" folders (or their "-10" sample variants).
        df: DataFrame with the texts and labels.
        is_limited: True to read the "-10" sample folders.

    Returns:
        DataFrame with the columns of df followed by the 25 metadata columns.
        Metadata values are kept as the raw strings found in the files (no
        numeric or missing-value conversion is applied).

    Raises:
        ValueError: if a metadata file does not hold exactly one non-blank
            line per metadata column.
    '''
    suffix = "-10" if is_limited else ""

    columns = [
        "author", "source", "category", "date","tokens",
        "words_without_punctuation","types","number_of_links","uppercase_words",
        "verbs","subjuntive_imperative","nouns","adjectives","adverbs",
        "modal_verbs","singular_first_and_second_personal_pronouns",
        "plural_first_personal_pronouns","pronouns","pausality","characters",
        "avg_sentence_length","avg_word_length","percentage_of_spelling_errors",
        "emotiveness","diversity",
    ]

    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed from pandas, and growing a frame inside a
    # loop copies it on every iteration.
    rows = []
    for folder in ("true-meta-information" + suffix, "fake-meta-information" + suffix):
        folder_path = join(path, folder)
        for file_name in sortDir(dir_path=folder_path, is_meta=True):
            file_path = join(folder_path, file_name)
            if not isfile(file_path):
                continue
            # The file is read line by line directly; recent pandas versions
            # reject read_csv(sep='\n'), which was used here before.
            with open(file_path, encoding='utf-8-sig') as f:
                values = [line for line in f.read().splitlines() if line.strip()]
            if len(values) != len(columns):
                raise ValueError(
                    f"{file_path}: expected {len(columns)} metadata lines, found {len(values)}"
                )
            rows.append(values)

    metadata = pd.DataFrame(rows, columns=columns)

    # Side-by-side concatenation: texts/labels (2 columns) + metadata (25 columns).
    complete_df = pd.concat([df, metadata], axis=1)

    return complete_df

In [9]:
# Interactive choice of the dataset; the answer is used as an index into `paths`.
ai = int(input('''0 - Base com 10 notícias verdadeiras e 10 notícias falsas
1 - Base completa de notícias
'''))

# Reject anything other than 0 or 1: a negative number would otherwise index
# `paths` from the end and combine a directory with the wrong folder layout.
if ai not in (0, 1):
    raise ValueError(f"Invalid option {ai}: expected 0 or 1.")

path = paths[ai]
is_limited = (ai == 0)

data = txtToDataframe(path, is_limited=is_limited) # DataFrame with news texts and labels.
complete_data = appendMetadata(path, data, is_limited=is_limited) # DataFrame with news texts, labels and metadata.

0 - Base com 10 notícias verdadeiras e 10 notícias falsas
1 - Base completa de notícias
1


TypeError: 'module' object is not callable

In [None]:
complete_data['texts'][14]

## Pré-processamento de textos

In [None]:
# Load the Portuguese language package for the spaCy processor.
# `nlp` is the pipeline used by the stop-word and lemmatization helpers.
nlp = spacy.load('pt_core_news_sm')

In [None]:
# Definindo funções de pré-processamento

def removePunct(text):
    '''
    Strips punctuation from text.

    Every character of string.punctuation, plus the curly double quotes “ and ”,
    is deleted; all other characters are left untouched.
    '''
    unwanted = string.punctuation + '“”'
    # A translation table whose third argument lists the characters to delete.
    deletion_table = str.maketrans('', '', unwanted)
    return text.translate(deletion_table)

def removeNumbers(text):
    '''
    Deletes the digits 0-9 from text, leaving every other character as is.
    '''
    digit_pattern = re.compile('[0-9]')
    return digit_pattern.sub('', text)

def removeStopWords(string):
    '''
    Drops stop words from the given text.

    The text is processed by the module-level spaCy pipeline `nlp`; tokens
    flagged as stop words (token.is_stop) are discarded and the remaining token
    texts are joined back together with single spaces.
    '''
    kept_words = []
    for token in nlp(string):
        if token.is_stop is False:
            kept_words.append(token.text)
    return ' '.join(kept_words)

def lemmatize(string):
    '''
    Replaces every token of the text by its lemma.

    The text is processed by the module-level spaCy pipeline `nlp` and the
    lemmas (token.lemma_) are joined with single spaces. Lemmatizing is gentler
    than stemming, so the result stays readable as ordinary language.
    '''
    lemmas = (token.lemma_ for token in nlp(string))
    return ' '.join(lemmas)

def prep(string, useStopWords = False, lemma = False):
    '''
    Normalizes a text: strips punctuation and digits, lower-cases it and,
    optionally, removes stop words and/or lemmatizes it.

    Parameters:
        string: text to clean.
        useStopWords: when True, stop words are REMOVED from the text.
        lemma: when True, each token is replaced by its lemma.

    Returns:
        The cleaned text with line breaks turned into spaces.
    '''
    result = removeNumbers(removePunct(string)).lower()

    # The spaCy pipeline is only run when at least one option needs it.
    if useStopWords or lemma:
        tokens = nlp(result)
        if useStopWords:
            tokens = [token for token in tokens if token.is_stop is False]
        result = ' '.join(token.lemma_ if lemma else token.text for token in tokens)

    # Line breaks become spaces. Deleting them outright would glue the last
    # word of a line to the first word of the next one ("fim\ninício" would
    # turn into the single bogus word "fiminício").
    result = result.replace('\n', ' ')

    return result

In [None]:
# Pre-process the texts of the complete news DataFrame. prep is called with its
# default options, i.e. without stop-word removal and without lemmatization.
# Note: the 'texts' column is overwritten in place with the cleaned version.

complete_data['texts'] = complete_data['texts'].apply(prep)

In [None]:
complete_data['texts'][14]

In [None]:
# Assign the dependent (y) and independent (X) variables

y = complete_data['labels'].values # integer labels as assigned by txtToDataframe: 0 = true news, 1 = fake news.
X = [d.split() for d in complete_data['texts'].tolist()] # X is a list of lists of words.

In [None]:
print(X[14])

## Tokenization (TensorFlow)

In [None]:
# Importando módulos para tokenização de textos
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
# Split the data into training and test sets

test_limit = 0.10  # fraction of the samples held out for testing
training_sentences,testing_sentences,training_labels,testing_labels = train_test_split(X,y,test_size=test_limit, random_state=42)

# Earlier manual split, kept for reference only (not executed). It would need an
# integer position rather than the 0.10 fraction currently stored in test_limit.
#training_sentences = X[0:test_limit]
#testing_sentences = X[test_limit:]

#training_labels = y[0:test_limit]
#testing_labels = y[test_limit:]

In [None]:
# Inspect the container type of the training split. This cell used to reference
# `training_sequences`, which is only created further down (after tokenization)
# and therefore raised a NameError on a fresh kernel.
type(training_sentences)

In [None]:
# Build the token dictionary (based on the training texts only)

tokenizer = Tokenizer()  # the oov_token='<OOV>' argument is disabled here
# NOTE(review): without an OOV token, test words missing from the training
# vocabulary presumably get dropped when sequencing — confirm this is intended.

tokenizer.fit_on_texts(training_sentences)

In [None]:
tokenizer.word_index

In [None]:
# Convert the training and test sets into sequences of token indices,
# using the dictionary fitted on the training texts.

training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

In [None]:
# Count how many training articles fall on each side of the length limits

min_length = 250  # articles with fewer words are discarded later on
max_length = 250  # length every sequence is padded / truncated to

#plt.hist([len(x) for x in X], bins = 750)
#plt.show()

# Number of words of every training article (computed once, used for both counts).
nos = np.array([len(sentence) for sentence in training_sentences])

print("There are "+ str(np.count_nonzero(nos>=max_length)) + " news equal or longer than "+ str(max_length) + " words (will be truncated).")
print("There are "+ str(np.count_nonzero(nos<min_length)) + " news shorter than "+ str(min_length) + " words (will be killed).")

In [None]:
# Discard the training samples shorter than min_length tokens

# flag_dict[i] is True when training sample i is too short and must be dropped.
flag_dict = {idx: len(seq) < min_length for idx, seq in enumerate(training_sequences)}

# Labels are filtered first, while the positions still match the unfiltered sequences.
training_labels = [training_labels[idx] for idx, too_short in flag_dict.items() if not too_short]

training_sequences = [seq for seq in training_sequences if not len(seq) < min_length]

In [None]:
for i in range(0,len(training_sequences)):    
    print(len(training_sequences[i]))

In [None]:
# Pad / truncate the tokenized training sequences to max_length tokens
# (padding='post' and truncating='pre', as passed below).

training_padded = pad_sequences(training_sequences,maxlen=max_length,padding='post',truncating='pre')
# Test-set padding is disabled here; it is done right before prediction instead.
#testing_padded = pad_sequences(testing_sequences,maxlen=max_length,padding='post',truncating='pre')

In [None]:
for i in range(0,len(training_padded)):    
    print(len(training_padded[i]))

In [None]:
vocab = tokenizer.word_index
inv_vocab = {index: word for word, index in vocab.items()} # maps a token index back to its word.

# The "_texts" lists hold every sequence converted back to a list of words.
# NOTE(review): an earlier note mentioned 'OOV' placeholders for unknown words
# of the test set, but the tokenizer is created without an oov_token — confirm.
training_texts = [[inv_vocab[token] for token in sequence] for sequence in training_sequences]
testing_texts = [[inv_vocab[token] for token in sequence] for sequence in testing_sequences]

In [None]:
testing_texts[0]

## Vectorization (gensim)

In [None]:
import gensim

In [None]:
DIM = 100  # dimensionality of the word vectors (also the embedding width of the network)
# Train Word2Vec on the tokenized training articles.
# NOTE(review): min_count=1 presumably keeps every training word in the model so
# that each tokenizer word has a vector to look up — confirm.
w2v_model = gensim.models.Word2Vec(sentences=training_sentences, vector_size=DIM, window=10, min_count=1)

In [None]:
w2v_model.wv.most_similar('bolsonaro')

In [None]:
w2v_model.wv['bolsonaro']

In [None]:
vocab_size = len(tokenizer.word_index)+1

def get_weight_matrix(model):
    '''
    Builds the embedding matrix for the tokenizer vocabulary.

    Row i receives model.wv[word] for the word whose token index is i in the
    module-level `vocab` mapping. The matrix has shape (vocab_size, DIM); rows
    whose index is not used by `vocab` stay filled with zeros.
    '''
    matrix = np.zeros((vocab_size, DIM))

    for word in vocab:
        token_index = vocab[word]
        matrix[token_index] = model.wv[word]

    return matrix

embedding_vectors = get_weight_matrix(model=w2v_model)

## Criação da rede neural

In [None]:
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Binary classifier: frozen Word2Vec embeddings -> LSTM -> sigmoid output.
model = Sequential()
# Embedding layer initialised with the Word2Vec matrix; trainable=False keeps
# those weights fixed during training.
model.add(Embedding(input_dim=vocab_size, output_dim=DIM, weights = [embedding_vectors], input_length=max_length, trainable=False))
model.add(LSTM(units=128))
# Single sigmoid unit, paired with the binary cross-entropy loss below.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

model.summary()

In [None]:
# Train for 10 epochs, holding out 30% of the training data for validation.
model.fit(training_padded, np.transpose(np.asarray(training_labels)), validation_split=0.3, epochs=10)

In [None]:
# Sequence the test sentences and pad them with the same settings used for the
# training data. The freshly computed `sequences` is what gets padded, so this
# cell no longer depends on a global produced by an earlier cell.
sequences = tokenizer.texts_to_sequences(testing_sentences)
padded = pad_sequences(sequences,maxlen=max_length,padding='post',truncating='pre')

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class predictions.
y_pred = (model.predict(padded) >=0.5).astype(int)

# Accuracy of the predictions against the held-out test labels.
accuracy_score(testing_labels, y_pred)