# UTILS

In [None]:
import pandas as pd
import re
import string
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score, precision_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

In [None]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

## BD1

### Pre-processing

In [None]:
df_fake = pd.read_csv("./ISOT/Fake.csv")
df_true = pd.read_csv("./ISOT/True.csv")

df_fake["class"] = 0
df_true["class"] = 1

df_merge = pd.concat([df_fake, df_true], axis=0)
df_bd1 = df_merge.drop(["title", "subject", "date"], axis=1)
df_bd1 = df_bd1.sample(frac=1)
df_bd1.reset_index(inplace=True)
df_bd1.drop(["index"], axis=1, inplace=True)


In [None]:
df_bd1

## BD2

### Pre-processing

In [None]:
df_bd2 = pd.read_csv("./BD2/data.csv")

In [None]:
df_bd2 = df_bd2[['Body', 'Label']]

In [None]:
df_bd2 = df_bd2[df_bd2['Body'].apply(lambda x: isinstance(x, str))].reset_index(drop=True)

In [None]:
df_bd2.columns = ['text', 'class']

In [None]:
df_bd2

## BD3

### Pre-processing

In [None]:
df_bd3 = pd.read_csv("./BD3/fake-news/train.csv")

In [None]:
df_bd3 = df_bd3[['text', 'label']]

In [None]:
df_bd3 = df_bd3[df_bd3['text'].apply(lambda x: isinstance(x, str))].reset_index(drop=True)

In [None]:
df_bd3.columns = ['text', 'class']

In [None]:
df_bd3

## Preprocessing all data

In [21]:
df = pd.concat([df_bd1, df_bd2, df_bd3], ignore_index=True)

In [22]:
df

Unnamed: 0,text,class
0,21st Century Wire says NSA and police state ch...,0
1,Is there ever a time the Left isn t portrayed ...,0
2,"BRIDGEWATER, N.J. (Reuters) - U.S. President D...",1
3,WASHINGTON (Reuters) - American intelligence a...,1
4,Are we the mutation nation now? Is this person...,0
...,...,...
69642,Rapper T. I. unloaded on black celebrities who...,0
69643,When the Green Bay Packers lost to the Washing...,0
69644,The Macy’s of today grew from the union of sev...,0
69645,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [23]:
df["text"] = df["text"].apply(wordopt)

print(df)

                                                    text  class
0       century wire says nsa and police state cheerl...      0
1      is there ever a time the left isn t portrayed ...      0
2      bridgewater  n j   reuters    u s  president d...      1
3      washington  reuters    american intelligence a...      1
4      are we the mutation nation now  is this person...      0
...                                                  ...    ...
69642  rapper t  i  unloaded on black celebrities who...      0
69643  when the green bay packers lost to the washing...      0
69644  the macy s of today grew from the union of sev...      0
69645  nato  russia to hold parallel exercises in bal...      1
69646    david swanson is an author  activist  journa...      1

[69647 rows x 2 columns]


In [24]:
X = df["text"]
y = df["class"]

vectorizer = TfidfVectorizer()
X_vect = vectorizer.fit_transform(X)

X_vect.shape

(69647, 222607)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)  

(55717, 222607)
(13930, 222607)


## Base Learners

### Decision Tree

In [26]:
param_grid_dt = {
    'max_depth': [None, 10],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 2]
}

In [27]:
dt = DecisionTreeClassifier()

In [28]:
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, n_jobs=-1, verbose=2)

In [29]:
grid_search_dt.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [30]:
with open('grid_search_dt.pkl', 'wb') as file:
    pickle.dump(grid_search_dt, file)

In [31]:
print("Melhor score de validação:", grid_search_dt.best_score_)

Melhor score de validação: 0.9004791717977587
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time= 2.6min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10; total time= 2.2min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time= 2.1min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10; total time= 2.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time= 2.7min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10; total time= 2.4min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10; total time= 2.1min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10; total time= 2.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time= 2.5min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10; total time= 2.4min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time= 2.3min
[CV] END max_

### Logistic Regression

In [None]:
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [100, 200]
}

In [None]:
lr = LogisticRegression()

In [None]:
grid_search_lr = GridSearchCV(estimator=lr, param_grid=param_grid_lr, cv=5, n_jobs=-1, verbose=2)

In [None]:
grid_search_lr.fit(X_train, y_train)

In [None]:
with open('grid_search_lr.pkl', 'wb') as file:
    pickle.dump(grid_search_lr, file)

In [None]:
print("Melhor score de validação:", grid_search_lr.best_score_)

### Naive Bayes

In [None]:
param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0, 1.5],
    'fit_prior': [True, False]     
}

In [None]:
nb = MultinomialNB()

In [None]:
grid_search_nb = GridSearchCV(estimator=nb, param_grid=param_grid_nb, cv=5, n_jobs=-1, verbose=2)

In [None]:
grid_search_nb.fit(X_train, y_train)

In [None]:
with open('grid_search_nb.pkl', 'wb') as model_file:
    pickle.dump(grid_search_nb, model_file)

In [None]:
print("Melhor score de validação:", grid_search_nb.best_score_)

### SVM

In [None]:
param_grid_linear_svm = {
    'C': [0.1, 1, 10],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000, 2000, 5000]  # Opcional: ajuste conforme necessário
}

In [None]:
linear_svm = LinearSVC(random_state=42)

In [None]:
grid_search_linear_svm = GridSearchCV(estimator=linear_svm, param_grid=param_grid_linear_svm, cv=5, n_jobs=-1, verbose=2)

In [None]:
grid_search_linear_svm.fit(X_train, y_train)

In [None]:
with open('grid_search_linear_svm.pkl', 'wb') as model_file:
    pickle.dump(grid_search_linear_svm, model_file)

In [None]:
print("Melhor score de validação:", grid_search_linear_svm.best_score_)

## Bagging

### Random Forest

In [None]:
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 10]
}


In [None]:
rf = RandomForestClassifier(random_state=42)

In [None]:
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)

In [None]:
grid_search_rf.fit(X_train, y_train)

In [None]:
with open('grid_search_rf.pkl', 'wb') as file:
    pickle.dump(grid_search_rf, file)

In [None]:
print("Melhor score de validação:", grid_search_rf.best_score_)

## Boosting

### AdaBoost

In [None]:
param_grid_ada = {
    'n_estimators': [50, 100, 200], 
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'algorithm': ['SAMME', 'SAMME.R'], 
}

In [None]:
ada = AdaBoostClassifier(random_state=42)

In [None]:
grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=5, n_jobs=-1, verbose=2)

In [None]:
grid_search_ada.fit(X_train, y_train)

In [None]:
with open('grid_search_ada.pkl', 'wb') as f:
    pickle.dump(grid_search_ada, f)

In [None]:
print("Melhor score de validação:", grid_search_ada.best_score_)

### XGBoost

In [None]:
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.3],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'gamma': [0, 0.1]
}


In [None]:
xgb = XGBClassifier(random_state=42)

In [None]:
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, n_jobs=-1, verbose=2)

In [None]:
grid_search_xgb.fit(X_train, y_train)

In [None]:
with open('grid_search_xgb.pkl', 'wb') as f:
    pickle.dump(grid_search_xgb, f)

In [None]:
print("Melhor score de validação:", grid_search_xgb.best_score_)

## Stacking

In [None]:
param_grid_stacking = {
    'final_estimator__C': [0.01, 0.1, 1.0, 10.0],  
    'final_estimator__solver': ['liblinear', 'saga'],  
    'passthrough': [False, True]  
}

In [None]:
with open('./CV_RESULTS_ISOT/grid_search_dt.pkl', 'rb') as f:
    grid_search_dt = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_nb.pkl', 'rb') as f:
    grid_search_nb = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_lr.pkl', 'rb') as f:
    grid_search_lr = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_linear_svm.pkl', 'rb') as f:
    grid_search_linear_svm = pickle.load(f)

In [None]:
base_estimators = [
    ('lr', grid_search_lr.best_estimator_),
    ('nb', grid_search_nb.best_estimator_),
    ('dt', grid_search_dt.best_estimator_),
    ('svm', grid_search_linear_svm.best_estimator_)
]

In [None]:
meta_estimator = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
stacking = StackingClassifier(estimators=base_estimators, final_estimator=meta_estimator)

In [None]:
grid_search_stacking = GridSearchCV(estimator=stacking, param_grid=param_grid_stacking, cv=5, verbose=1, n_jobs=-1)

In [None]:
grid_search_stacking.fit(X_train, y_train)

In [None]:
with open('grid_search_stacking.pkl', 'wb') as f:
    pickle.dump(grid_search_stacking, f)

In [None]:
print("Melhor score de validação:", grid_search_stacking.best_score_)

# RESULTS

In [None]:
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

with open('./CV_RESULTS_ISOT/grid_search_dt.pkl', 'rb') as f:
    grid_search_dt = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_nb.pkl', 'rb') as f:
    grid_search_nb = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_lr.pkl', 'rb') as f:
    grid_search_lr = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_linear_svm.pkl', 'rb') as f:
    grid_search_linear_svm = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_rf.pkl', 'rb') as f:
    grid_search_rf = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_ada.pkl', 'rb') as f:
    grid_search_ada = pickle.load(f)

#with open('grid_search_xgb.pkl', 'rb') as f:
#    grid_search_xgb = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_stacking.pkl', 'rb') as f:
    grid_search_stacking = pickle.load(f)

#Extract the best models from the grid search
best_dt = grid_search_dt.best_estimator_
best_nb = grid_search_nb.best_estimator_
best_lr = grid_search_lr.best_estimator_
best_linear_svm = grid_search_linear_svm.best_estimator_
best_rf = grid_search_rf.best_estimator_
best_ada = grid_search_ada.best_estimator_
#best_xgb = grid_search_xgb.best_estimator_
best_stacking = grid_search_stacking.best_estimator_


# Define a function to compute and print metrics
def print_metrics(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    
    print(f"Metrics for {model_name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.9f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.9f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.9f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.9f}")
    print("-" * 40)

# Assuming you have the test data X_test, y_test
# Replace these with your actual test data
# X_test, y_test = ...

# Print metrics for each model
print_metrics(best_dt, X_test, y_test, 'Decision Tree')
print_metrics(best_nb, X_test, y_test, 'Naive Bayes')
print_metrics(best_lr, X_test, y_test, 'Logistic Regression')
print_metrics(best_linear_svm, X_test, y_test, 'Linear SVM')
print_metrics(best_rf, X_test, y_test, 'Random Forest')
print_metrics(best_ada, X_test, y_test, 'AdaBoost')
#print_metrics(best_xgb, X_test, y_test, 'XGBoost')
print_metrics(best_stacking, X_test, y_test, 'Stacking')