1. Data Collection and Exploration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the WELFake dataset (the path is relative to the notebook's directory).
news = pd.read_csv('../dataset/WELFake_Dataset.csv')

# Show the first rows of the dataset (DataFrame.head() returns five by default).
print("News Sample:")
print(news.head())

# Character count of each article body; rows whose text is missing get NaN.
news['text_length'] = news['text'].str.len()

# Histogram of article length vs. frequency. Missing lengths are dropped
# explicitly so that NaN never reaches the binning step.
plt.hist(news['text_length'].dropna(), bins=100, alpha=0.5, label='News')
plt.legend(loc='upper right')
plt.xlabel('Article Length')
plt.ylabel('Frequency')
plt.title('Distribution of Article Lengths')
plt.show()

CLEAN THE DATASET

In [None]:
# DataFrame.info() reports how many non-null values each column holds.
news.info()
# Observed on this dataset: "title" has 71576 non-null values and "text" has
# 72095, both fewer than the 72134 rows in total.
# Both fields are needed, so rows missing either one are dropped. The result is
# reassigned instead of using inplace=True, which keeps the step an explicit
# "new frame from old frame" transformation.
news = news.dropna(subset=["title", "text"])

print("Após apagar:")
news.info()
# This leaves 71537 complete records.


In [None]:
from collections import Counter
import nltk

# Download the NLTK resources used below: the English stop-word list and the
# Punkt sentence-tokenizer data behind nltk.word_tokenize.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package, so both are fetched to keep word_tokenize working
# regardless of the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

def get_most_common_words(texts, num_words=10):
    """Return the most frequent alphabetic, non-stop-word tokens in ``texts``.

    Each text is lowercased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphabetic, or that appear in NLTK's English
    stop-word list, are ignored.

    Args:
        texts: Iterable of document strings. Entries that are not ``str``
            (for example ``None`` or a float NaN from a missing cell) are
            skipped instead of raising ``AttributeError``.
        num_words: How many of the top entries to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_counts = Counter()
    for text in texts:
        if not isinstance(text, str):
            continue
        # Count one document at a time so the corpus-wide token list is never
        # held in memory; first-seen order (used to break ties) is unchanged.
        word_counts.update(
            token
            for token in nltk.word_tokenize(text.lower())
            if token.isalpha() and token not in stop_words
        )
    return word_counts.most_common(num_words)

# Ten most frequent non-stop-word tokens across all article bodies.
words = get_most_common_words(news['text'], num_words=10)

print('News:', words)

2. Text Preprocessing

STEP BY STEP:
    1. Lowercasing the text
    2. Removing punctuation and digits
    3. Removing stop words
    4. Stemming or lemmatizing the text


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# WordNet corpus, the lexical database that WordNetLemmatizer looks words up in.
nltk.download('wordnet')

# Shared preprocessing resources, created once at cell scope.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Translation table that deletes ASCII punctuation and digits. It is built once
# here rather than on every call, because preprocess_text runs once per article.
_PUNCT_DIGIT_TABLE = str.maketrans('', '', string.punctuation + string.digits)


def preprocess_text(text):
    """Normalize one article into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, delete ASCII punctuation and digits, tokenize
    with ``word_tokenize``, drop tokens found in the module-level
    ``stop_words`` set, and stem the remaining tokens with the module-level
    ``stemmer``. No lemmatization is performed.

    Args:
        text: The raw article text. Anything that is not a ``str`` (for
            example ``None`` or a float NaN from a missing cell) is treated
            as an empty document.

    Returns:
        The processed tokens joined by single spaces; ``''`` when the input
        is not a string or no tokens survive.
    """
    if not isinstance(text, str):
        return ''

    cleaned = text.lower().translate(_PUNCT_DIGIT_TABLE)
    tokens = word_tokenize(cleaned)

    # NOTE(review): punctuation is removed before the stop-word filter, so a
    # contraction such as "don't" has become "dont" by this point and will
    # presumably no longer match its stop-word entry - confirm whether that
    # is acceptable for the model.
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    return ' '.join(stems)

In [None]:
# Replace every article body with its normalized form. This overwrites the
# 'text' column, so running the cell a second time would preprocess text that
# has already been processed.
news['text'] = news['text'].map(preprocess_text)

3. Model Training

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sp
import numpy as np

# Target vector: the dataset's 'label' column as an array.
y = news['label'].values

# Bag-of-words features: a sparse matrix of raw token counts. The vocabulary is
# learned from every article here, i.e. before the train/test split.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(news['text'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with the solver capped at 200 iterations and a
# fixed seed, fitted on the training split.
clf = LogisticRegression(max_iter=200, random_state=42)
clf.fit(X_train, y_train)


4. Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict labels for the held-out split.
y_pred = clf.predict(X_test)

# Score the predictions with each metric, in the same order the names are bound.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report one metric per line.
for metric_name, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(metric_name, metric_value)