# 1. Carregar o dataset e explorar a estrutura do dataframe

In [None]:
import pandas as pd

# Load the dataset, shuffle the rows, rebuild a clean 0..n-1 index and drop the
# unused "Unnamed: 0" column.
# The shuffle is seeded: the reduced dataframes built later are plain slices
# from the top of this dataframe, so an unseeded shuffle would put different
# rows in them on every run. 42 matches the seed used by the models and by
# the train/test split elsewhere in this notebook.
dataframe = (
    pd.read_csv('../dataset/WELFake_Dataset.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop(columns="Unnamed: 0")
)

# Show the first 5 rows of the dataframe.
print("Primeiras 5 linhas do dataframe:")
print(dataframe.head())

# Show the number of rows and columns.
print(f"\nQuantidade de linhas x colunas: {dataframe.shape}")

# 2. Pré-processamento: remover nulos, criar amostras menores e pré-processar os textos

In [None]:
# Report column dtypes and non-null counts before cleaning.
print("Informações sobre os tipos de dados e quantidades de nulos:")
dataframe.info()

# Discard every row that is missing its "title" or its "text"; the cleaned
# result replaces the previous dataframe under the same name.
required_columns = ["title", "text"]
dataframe = dataframe.dropna(subset=required_columns)

# Same report again, now without the incomplete rows.
print("\nApós apagar os nulos:")
dataframe.info()

# verificar distribuição entre notícias reais e falsas

In [None]:
# Class balance after the rows with nulls were removed.
print("\nContagem de labels após remover nulos  (1 = real, 0 = fake):")
label_counts = dataframe['label'].value_counts()
print(label_counts)

# criar dataframes com tamanho reduzido

In [None]:
# Fixed-size subsets taken from the top of the dataframe. Each one is an
# independent copy, so adding columns to a subset does not touch `dataframe`.
# Note that df_0k holds 100 rows.
df_0k = dataframe.head(100).copy()
df_1k = dataframe.head(1000).copy()
df_10k = dataframe.head(10000).copy()
df_30k = dataframe.head(30000).copy()
df_50k = dataframe.head(50000).copy()

In [None]:
# Pick which reduced dataframe feeds the classical models and which one feeds
# the quantum model.
df_classic, df_quantum = df_1k.copy(), df_0k.copy()

# Print the label distribution of each chosen dataframe.
for header, frame in (
    ("\nContagem de fake news no df clássico (1 = real, 0 = fake):", df_classic),
    ("\nContagem de fake news no df quântico (1 = real, 0 = fake):", df_quantum),
):
    print(header)
    print(frame['label'].value_counts())

# pré processamento do texto

Passo a passo:
1. Converter o texto para minúsculas
2. Remover pontuação e dígitos
3. Remover palavras irrelevantes, como artigos e pronomes (stop words)
4. Aplicar stemming (remove o sufixo das palavras; por exemplo, "mudando" e "mudaria" viram "mud") ou lemmatization (mais sofisticada: reduz a palavra à sua forma base; por exemplo, "mudaria" e "mudado" viram "mudar")

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import nltk

# NLTK resources needed by the text pre-processing step.
nltk.download("punkt")      # tokenizer models used by word_tokenize
nltk.download("punkt_tab")  # tokenizer data that newer NLTK releases load instead of "punkt"
nltk.download("stopwords")  # stop-word lists
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer

# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a raw text into a space-separated string of lemmas.

    Steps, in order: lowercase the text, strip ASCII punctuation and digits,
    tokenize, discard English stop words, lemmatize each remaining token and
    join the result with single spaces.

    Args:
        text: The raw text (a ``str``).

    Returns:
        The cleaned text as a single string.
    """
    # Characters deleted outright before tokenizing.
    unwanted_chars = string.punctuation + string.digits
    cleaned = text.lower().translate(str.maketrans('', '', unwanted_chars))

    # Filter stop words and lemmatize the survivors in a single pass.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]

    return ' '.join(lemmas)

In [None]:
# Add a "text_clean" column with the pre-processed text to both working dataframes.
for frame in (df_classic, df_quantum):
    frame['text_clean'] = frame['text'].apply(preprocess_text)

# 3. Treinamento dos modelos

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer_classic = CountVectorizer()  # Cria o "transformador" BOW para os algoritmos clássicos
# X_classic = vectorizer_classic.fit_transform(df['text_clean'])  # Aplica BOW nos textos para os algoritmos clássicos
# vectorizer_quantum = CountVectorizer(max_features = 12)  # Cria o "transformador" BOW para os algoritmos quânticos
# X_quantum = vectorizer_quantum.fit_transform(df['text_clean'])  # Aplica BOW nos textos para os algoritmos quânticos

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

# Target variable (y) for each family of models.
y_classic = df_classic['label'].values
y_quantum = df_quantum['label'].values

# Each dataset gets its own TF-IDF vectorizer. Refitting vectorizer_classic on
# the quantum texts would replace the vocabulary it learned from the classic
# texts, leaving it inconsistent with the columns of X_classic.
vectorizer_classic = TfidfVectorizer()
X_classic = vectorizer_classic.fit_transform(df_classic['text_clean'])

vectorizer_quantum = TfidfVectorizer()
X_quantum = vectorizer_quantum.fit_transform(df_quantum['text_clean'])

# Keep at most 12 TF-IDF features for the quantum model, ranked by the
# coefficients of a logistic regression fitted on the quantum data.
# NOTE(review): despite the name, this is not a Lasso selector; the
# LogisticRegression is built with its default penalty (L2 per the
# scikit-learn docs). Confirm whether an L1 penalty was intended.
lasso_selector = SelectFromModel(
    LogisticRegression(random_state=42),
    max_features=12
)

# NOTE(review): the selector and the scaler are fitted on every quantum row,
# before the train/test split, so test rows influence feature selection and
# scaling. Consider fitting both on the training portion only.
X_quantum = lasso_selector.fit_transform(X_quantum, y_quantum)

# with_mean=False scales without centering, so a sparse matrix stays sparse.
scaler = StandardScaler(with_mean=False)
X_quantum_scaled = scaler.fit_transform(X_quantum)

In [None]:
from sklearn.model_selection import train_test_split

# Same hold-out configuration for both datasets: 20% of the rows for testing
# and a fixed seed so the split is repeatable.
split_options = {"test_size": 0.2, "random_state": 42}

(
    X_train_classic,
    X_test_classic,
    y_train_classic,
    y_test_classic,
) = train_test_split(X_classic, y_classic, **split_options)

(
    X_train_quantum,
    X_test_quantum,
    y_train_quantum,
    y_test_quantum,
) = train_test_split(X_quantum_scaled, y_quantum, **split_options)

## 3.1 Treinamento dos modelos clássicos

### 3.1.1 Treinamento com Regressão Logística

In [None]:
def train_logistic_regression(X_train, y_train):
    """Fit a logistic-regression classifier and print how long training took.

    Args:
        X_train: Training feature matrix, as accepted by
            ``LogisticRegression.fit``.
        y_train: Training labels, one per row of ``X_train``.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    from sklearn.linear_model import LogisticRegression
    import time

    model = LogisticRegression(random_state=42, max_iter=200)

    # Fit on the data passed in by the caller rather than on notebook globals,
    # so the function works for any training set.
    start = time.time()
    model.fit(X_train, y_train)
    elapsed = time.time() - start

    print(f"Training time: {round(elapsed)} seconds")
    return model

### 3.1.2 Treinamento com Floresta Aleatória (Random Forest)

In [None]:
def train_random_forest(X_train, y_train):
    """Fit a random-forest classifier and print how long training took.

    Args:
        X_train: Training feature matrix, as accepted by
            ``RandomForestClassifier.fit``.
        y_train: Training labels, one per row of ``X_train``.

    Returns:
        The fitted ``RandomForestClassifier`` model.
    """
    from sklearn.ensemble import RandomForestClassifier
    import time

    # Seeded like the other models in this notebook so results are repeatable.
    model = RandomForestClassifier(random_state=42)

    # Fit on the data passed in by the caller rather than on notebook globals,
    # so the function works for any training set.
    start = time.time()
    model.fit(X_train, y_train)
    elapsed = time.time() - start

    print(f"Training time: {round(elapsed)} seconds")
    return model

### 3.1.3 Treinamento com Support Vector Classifier 

In [None]:
def train_svm(X_train, y_train):
    """Fit a linear-kernel support vector classifier and print the training time.

    Args:
        X_train: Training feature matrix, as accepted by ``SVC.fit``.
        y_train: Training labels, one per row of ``X_train``.

    Returns:
        The fitted ``SVC`` model (built with ``probability=True``).
    """
    from sklearn.svm import SVC
    import time

    # random_state is set for consistency with the other models; per the
    # scikit-learn docs it only matters for the probability estimates.
    model = SVC(kernel='linear', probability=True, random_state=42)

    # Fit on the data passed in by the caller rather than on notebook globals,
    # so the function works for any training set.
    start = time.time()
    model.fit(X_train, y_train)
    elapsed = time.time() - start

    print(f"Training time: {round(elapsed)} seconds")
    return model

## 3.2 Treinamento dos modelos quânticos

### 3.2.1 Treinamento com Variational Quantum Classifier

In [None]:
def train_vqc(X_train, y_train, num_features):
    """Fit a Variational Quantum Classifier and print how long training took.

    The circuit uses ``num_features`` as both the feature-map dimension and
    the ansatz qubit count.

    Args:
        X_train: Training feature matrix passed to ``VQC.fit``.
        y_train: Training labels passed to ``VQC.fit``.
        num_features: Feature dimension of the ZZFeatureMap and number of
            qubits of the RealAmplitudes ansatz.

    Returns:
        The fitted ``VQC`` model.
    """
    import time

    from qiskit.circuit.library import RealAmplitudes, ZZFeatureMap
    from qiskit.primitives import StatevectorSampler as Sampler
    from qiskit_machine_learning.algorithms.classifiers import VQC
    from qiskit_machine_learning.optimizers import COBYLA

    # Data-encoding circuit (2 repetitions), trainable circuit (3 repetitions)
    # and a COBYLA optimizer capped at 100 iterations, all passed straight in.
    classifier = VQC(
        sampler=Sampler(),
        feature_map=ZZFeatureMap(feature_dimension=num_features, reps=2),
        ansatz=RealAmplitudes(num_qubits=num_features, reps=3),
        optimizer=COBYLA(maxiter=100),
    )

    began_at = time.time()
    classifier.fit(X_train, y_train)
    duration = time.time() - began_at

    print(f"VQC - Training time: {round(duration)} seconds")
    return classifier

# 4. Avaliação dos modelos

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# The predictions below need fitted models, and nothing earlier in this
# notebook calls the training helpers, so the models are trained here.
lcr = train_logistic_regression(X_train_classic, y_train_classic)
rfc = train_random_forest(X_train_classic, y_train_classic)
svm = train_svm(X_train_classic, y_train_classic)

# Densify the quantum feature matrices in case they are scipy sparse matrices
# and the quantum classifier does not accept sparse input.
X_train_vqc = X_train_quantum.toarray() if hasattr(X_train_quantum, "toarray") else X_train_quantum
X_test_vqc = X_test_quantum.toarray() if hasattr(X_test_quantum, "toarray") else X_test_quantum

# train_vqc uses num_features as the qubit count, so pass the number of
# feature columns actually present.
vqc = train_vqc(X_train_vqc, y_train_quantum, num_features=X_train_vqc.shape[1])

# Predictions on the held-out test sets.
y_pred_lcr = lcr.predict(X_test_classic)
y_pred_rfc = rfc.predict(X_test_classic)
y_pred_svm = svm.predict(X_test_classic)
y_pred_vqc = vqc.predict(X_test_vqc)

def evaluate_model(y_test, y_pred):
    """Print accuracy, precision, recall and F1 score for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_test``.
    """
    # Compute every metric up front so nothing is printed if one of them fails.
    report = [
        ('Accuracy:', accuracy_score(y_test, y_pred)),
        ('Precision:', precision_score(y_test, y_pred)),
        ('Recall:', recall_score(y_test, y_pred)),
    ]
    f1_value = f1_score(y_test, y_pred)

    for caption, value in report:
        print(caption, value)
    # The trailing "\n" argument leaves a blank line after each model's report.
    print('F1 Score:', f1_value, "\n")

def _report(title, y_true, y_pred):
    """Print a model's title followed by its evaluation metrics."""
    print(title)
    evaluate_model(y_true, y_pred)


# One report per model, classical models first and the quantum model last.
_report("Regressão Logística", y_test_classic, y_pred_lcr)
_report("Floresta Aleatória", y_test_classic, y_pred_rfc)
_report("Support Vector Classifier ", y_test_classic, y_pred_svm)
_report("Variational Quantum Classifier", y_test_quantum, y_pred_vqc)