<a href="https://colab.research.google.com/github/matt-fidelis-95/BERT-Fake-News/blob/master/artigo_mestrado_portugues_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Combina o conjunto de dados e a modelagem


In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score
from sklearn.ensemble import RandomForestClassifier
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

# NLTK resources needed further down: the tokenizer models used by
# word_tokenize, the RSLP stemmer rules, and the stop-word lists.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the tokenizer from 'punkt_tab'; older releases keep using 'punkt'.
for nltk_resource in ('punkt', 'punkt_tab', 'rslp', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive so that the working directory selected
# in the next cell is reachable. This prompts for an authorization code.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Directory on the mounted Drive from which the relative file names used below
# (e.g. 'brCorpus.csv') are resolved.
PROJECT_DIR = '/content/drive/My Drive/script_python'
os.chdir(PROJECT_DIR)

In [None]:
# le os conjuntos de dados
from tqdm import tqdm

from sklearn.model_selection import train_test_split

def fnc(path):
    """Read a semicolon-separated corpus file into three parallel lists.

    Every line is split on ``;``. Only lines that yield exactly three fields
    are kept; any other line (fewer separators, or extra ``;`` characters
    inside a field) is skipped silently.

    Parameters
    ----------
    path : str
        Path of the file to read. It is decoded as ISO-8859-1.

    Returns
    -------
    tuple[list, list, list]
        ``(h, b, l)`` where ``h`` holds the first field of each kept line,
        ``b`` the second field, and ``l`` an integer label: ``0`` when the
        third field (with surrounding whitespace removed) equals ``'-1'``,
        ``1`` otherwise.
    """
    first_fields = []
    second_fields = []
    labels = []
    # Stream the file line by line instead of materialising it with readlines().
    with open(path, encoding="ISO-8859-1") as fdata:
        for line in fdata:
            fields = line.split(';')
            if len(fields) != 3:
                continue
            first, second, raw_label = fields
            # The last field still carries the line break, hence the strip().
            labels.append(0 if raw_label.strip() == '-1' else 1)
            first_fields.append(first)
            second_fields.append(second)

    return first_fields, second_fields, labels

# Parse the corpus file into parallel lists of first fields, texts and labels.
headlines_aux, bodies_aux, labels_aux = fnc('brCorpus.csv')

# One (id, text, label) record per parsed line, then a frame with named columns.
list_of_tuples = [record for record in zip(headlines_aux, bodies_aux, labels_aux)]
df_all_columns = ['id', 'text', 'label']
df_all = pd.DataFrame(data=list_of_tuples, columns=df_all_columns)

In [None]:
# Step a: remove rows whose text is missing.
# The filter is applied to the frame itself: dropna(inplace=True) on the
# df_all['text'] selection never removes rows from df_all.
df_all = df_all.dropna(subset=['text']).reset_index(drop=True)

# Step c: tokenization - every document becomes a list of word tokens.
df_all['text'] = [word_tokenize(entry, language='portuguese') for entry in df_all['text']]

# Step d: remove stop words and non-alphabetic tokens, then stem.
# The stop-word collection and the stemmer do not change between tokens, so
# they are built once here instead of once per token / per document.
portuguese_stopwords = frozenset(stopwords.words('portuguese'))
rslp_stemmer = RSLPStemmer()


def _stem_content_words(tokens):
    """Return the stems of the tokens that are alphabetic and not stop words.

    The stop-word test is an exact (case-sensitive) membership test on the
    token as produced by the tokenizer.
    NOTE(review): capitalised stop words therefore pass the filter - confirm
    whether that is intended.
    """
    return [
        rslp_stemmer.stem(token)
        for token in tokens
        if token not in portuguese_stopwords and token.isalpha()
    ]


# The processed tokens of each document are stored in 'text_final' as the
# string form of the list (str(list)), which is what the vectorizer receives.
df_all['text_final'] = [str(_stem_content_words(tokens)) for tokens in df_all['text']]

In [None]:

# Quick sanity check of the prepared frame: dimensions, then column summary.
print(df_all.shape)
df_all.info()

(7197, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7197 entries, 0 to 7196
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7197 non-null   object
 1   text        7197 non-null   object
 2   label       7197 non-null   int64 
 3   text_final  7197 non-null   object
dtypes: int64(1), object(3)
memory usage: 225.0+ KB


# Modeling

In [None]:
# Model input is the preprocessed text column; the target is the label column.
X_body_text, y = df_all['text_final'], df_all['label']

In [None]:
# TF-IDF over unigrams and bigrams; terms are kept only when their document
# frequency lies between min_df and max_df (both given as proportions).
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.85,
    min_df=0.01,
)

In [None]:
# Fit the vectorizer on X_body_text and turn the same documents into a
# TF-IDF matrix.
# NOTE(review): X_body_text holds every document, including the ones that are
# held out for testing in the split below.
X_body_tfidf = tfidf.fit_transform(X_body_text)

In [None]:
# Split the raw documents first, then fit TF-IDF on the training part only.
# Fitting the vectorizer before splitting lets the test documents influence
# the vocabulary, the IDF weights and the min_df/max_df pruning, which leaks
# test information into training and inflates the reported scores.
X_body_text_train, X_body_text_test, y_body_train, y_body_test = train_test_split(
    X_body_text, y, test_size=0.3, random_state=1234
)
X_body_tfidf_train = tfidf.fit_transform(X_body_text_train)
# The test documents are only transformed with what was learned from training.
X_body_tfidf_test = tfidf.transform(X_body_text_test)

# Random Forest
##### Usa apenas o corpo das notícias

In [None]:
# Random forest with 100 trees trained on up to 3 parallel jobs.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs of the notebook produce the same model and the same metrics.
rcf_body = RandomForestClassifier(n_estimators=100, n_jobs=3, random_state=1234)

In [None]:
# Train on the training matrix, then label the held-out documents.
fitted_forest = rcf_body.fit(X_body_tfidf_train, y_body_train)
y_rc_body_pred = fitted_forest.predict(X_body_tfidf_test)

In [None]:
# Print the metrics as percentages with two decimal places.
# '{:.2f}' is used instead of '{:.4}': without a presentation type the
# precision counts significant digits, so the two scores could be printed
# with different numbers of decimals.
print("Random Forest F1 and Accuracy Scores : \n")
print("F1 score {:.2f}%".format(f1_score(y_body_test, y_rc_body_pred, average='macro') * 100))
print("Accuracy score {:.2f}%".format(accuracy_score(y_body_test, y_rc_body_pred) * 100))

Random Forest F1 and Accuracy Scores : 

F1 score 86.8%
Accuracy score 86.81%


In [None]:
# Class labels shown as the column headers and row headers of the printed
# confusion matrix.
LABELS = [0, 1]
# Labels for which fnc_score_cm grants its extra 0.25 credit when both values
# of a pair belong to this list.
# NOTE(review): contains 2 although LABELS only lists 0 and 1 - confirm.
RELATED = [0, 1, 2]

def print_confusion_matrix(cm):
    """Print a confusion matrix as a bordered text table plus its accuracy.

    Parameters
    ----------
    cm : sequence of sequences of numbers
        Square matrix of counts; row ``i`` is labelled with ``LABELS[i]`` and
        the columns are headed by the entries of ``LABELS``.

    The accuracy line is the sum of the diagonal divided by the sum of all
    cells, shown as a percentage with three decimals. Nothing is returned.
    """
    row_template = "|{:^11}|{:^11}|{:^11}|"
    header = row_template.format('', *LABELS)
    rule = "-" * len(header)
    table = ['CONFUSION MATRIX:', rule, header, rule]

    correct = 0
    seen = 0
    for idx, counts in enumerate(cm):
        # Diagonal cell of this row is where both indices agree.
        correct += counts[idx]
        seen += sum(counts)
        table.append(row_template.format(LABELS[idx], *counts))
        table.append(rule)

    accuracy_pct = (correct / seen) * 100
    table.append("ACCURACY: {:.3f}".format(accuracy_pct) + "%")
    print('\n'.join(table))

def fnc_score_cm(predicted_labels, target):
    """Accumulate a weighted score and a 2x2 confusion matrix.

    Parameters
    ----------
    predicted_labels : iterable of int
        Predicted class of each sample; used as the ROW index of the matrix.
    target : iterable of int
        Reference class of each sample; used as the COLUMN index.

    Returns
    -------
    tuple[float, list[list[int]]]
        ``(score, cm)``. For every pair the score grows by 0.25 when the two
        labels are equal (plus 0.50 if that label is not 3), and by another
        0.25 when both labels are in ``RELATED``. ``cm[p][t]`` counts the
        pairs with predicted label ``p`` and reference label ``t``.

    NOTE(review): for labels 0 and 1 the ``!= 3`` and ``RELATED`` checks
    always hold - confirm the weighting is intended for this binary task.
    """
    total_score = 0.0
    matrix = [[0, 0],
              [0, 0]]
    for pred, gold in zip(predicted_labels, target):
        if pred == gold:
            total_score += 0.25
            if pred != 3:
                total_score += 0.50
        if pred in RELATED and gold in RELATED:
            total_score += 0.25

        matrix[pred][gold] += 1
    return total_score, matrix

# Score the held-out predictions and show the resulting confusion matrix.
fnc_score, cm_test = fnc_score_cm(
    predicted_labels=y_rc_body_pred,
    target=y_body_test,
)
print_confusion_matrix(cm_test)

CONFUSION MATRIX:
-------------------------------------
|           |     0     |     1     |
-------------------------------------
|     0     |    955    |    148    |
-------------------------------------
|     1     |    137    |    920    |
-------------------------------------
ACCURACY: 86.806%
