# UTILS

In [1]:
import pandas as pd
import re
import string
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score, precision_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

In [2]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

## ISOT

### Pre-processing

In [3]:
df_fake = pd.read_csv("./ISOT/Fake.csv")
df_true = pd.read_csv("./ISOT/True.csv")

df_fake["class"] = 0
df_true["class"] = 1

df_merge = pd.concat([df_fake, df_true], axis=0)
df = df_merge.drop(["title", "subject", "date"], axis=1)
df = df.sample(frac=1)
df.reset_index(inplace=True)
df.drop(["index"], axis=1, inplace=True)

print(df)

                                                    text  class
0      ISLAMABAD (Reuters) - The Khyber Pass border b...      1
1      BEIJING (Reuters) - A top Chinese coal miner, ...      1
2      Robert Reich, the former Secretary of Labor un...      0
3       (Please note: profanity in sixth paragraph) B...      1
4      TOKYO (Reuters) - Tokyo Governor Yuriko Koike,...      1
...                                                  ...    ...
44893  (Reuters) - U.S. President-elect Donald Trump ...      1
44894  KINSHASA (Reuters) - Democratic Republic of Co...      1
44895  SEOUL/PYONGYANG (Reuters) - U.S. Vice Presiden...      1
44896  While at a rally in Wilmington, North Carolina...      0
44897  WASHINGTON (Reuters) - The United States sees ...      1

[44898 rows x 2 columns]


In [4]:
df["text"] = df["text"].apply(wordopt)

print(df)

                                                    text  class
0      islamabad  reuters    the khyber pass border b...      1
1      beijing  reuters    a top chinese coal miner  ...      1
2      robert reich  the former secretary of labor un...      0
3        please note  profanity in sixth paragraph  b...      1
4      tokyo  reuters    tokyo governor yuriko koike ...      1
...                                                  ...    ...
44893   reuters    u s  president elect donald trump ...      1
44894  kinshasa  reuters    democratic republic of co...      1
44895  seoul pyongyang  reuters    u s  vice presiden...      1
44896  while at a rally in wilmington  north carolina...      0
44897  washington  reuters    the united states sees ...      1

[44898 rows x 2 columns]


In [5]:
X = df["text"]
y = df["class"]

vectorizer = TfidfVectorizer()
X_vect = vectorizer.fit_transform(X)

X_vect.shape

(44898, 105809)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)  

(35918, 105809)
(8980, 105809)


## Base Learners

### Decision Tree

In [14]:
param_grid_dt = {
    'max_depth': [None, 10],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 2]
}

In [15]:
dt = DecisionTreeClassifier()

In [16]:
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, n_jobs=-1, verbose=2)

In [17]:
grid_search_dt.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [19]:
with open('grid_search_dt.pkl', 'wb') as file:
    pickle.dump(grid_search_dt, file)

In [20]:
print("Melhor score de validação:", grid_search_dt.best_score_)

Melhor score de validação: 0.9957124468283347


### Logistic Regression

In [21]:
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [100, 200]
}

In [22]:
lr = LogisticRegression()

In [23]:
grid_search_lr = GridSearchCV(estimator=lr, param_grid=param_grid_lr, cv=5, n_jobs=-1, verbose=2)

In [24]:
grid_search_lr.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [25]:
with open('grid_search_lr.pkl', 'wb') as file:
    pickle.dump(grid_search_lr, file)

In [26]:
print("Melhor score de validação:", grid_search_lr.best_score_)

Melhor score de validação: 0.9919538895195196


### Naive Bayes

In [31]:
param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0, 1.5],
    'fit_prior': [True, False]     
}

In [32]:
nb = MultinomialNB()

In [33]:
grid_search_nb = GridSearchCV(estimator=nb, param_grid=param_grid_nb, cv=5, n_jobs=-1, verbose=2)

In [34]:
grid_search_nb.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [35]:
with open('grid_search_nb.pkl', 'wb') as model_file:
    pickle.dump(grid_search_nb, model_file)

In [36]:
print("Melhor score de validação:", grid_search_nb.best_score_)

Melhor score de validação: 0.941422137985413


### SVM

In [11]:
param_grid_linear_svm = {
    'C': [0.1, 1, 10],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000, 2000, 5000]  # Opcional: ajuste conforme necessário
}

In [12]:
linear_svm = LinearSVC(random_state=42)

In [13]:
grid_search_linear_svm = GridSearchCV(estimator=linear_svm, param_grid=param_grid_linear_svm, cv=5, n_jobs=-1, verbose=2)

In [14]:
grid_search_linear_svm.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [15]:
with open('grid_search_linear_svm.pkl', 'wb') as model_file:
    pickle.dump(grid_search_linear_svm, model_file)

In [16]:
print("Melhor score de validação:", grid_search_linear_svm.best_score_)

Melhor score de validação: 0.9944039564462864


## Bagging

### Random Forest

In [10]:
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 10]
}


In [11]:
rf = RandomForestClassifier(random_state=42)

In [12]:
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)

In [13]:
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [14]:
with open('grid_search_rf.pkl', 'wb') as file:
    pickle.dump(grid_search_rf, file)

In [15]:
print("Melhor score de validação:", grid_search_rf.best_score_)

Melhor score de validação: 0.9883901244493696


## Boosting

### AdaBoost

In [16]:
param_grid_ada = {
    'n_estimators': [50, 100, 200], 
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'algorithm': ['SAMME', 'SAMME.R'], 
}

In [17]:
ada = AdaBoostClassifier(random_state=42)

In [18]:
grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=5, n_jobs=-1, verbose=2)

In [None]:
grid_search_ada.fit(X_train, y_train)

In [20]:
with open('grid_search_ada.pkl', 'wb') as f:
    pickle.dump(grid_search_ada, f)

In [21]:
print("Melhor score de validação:", grid_search_ada.best_score_)

Melhor score de validação: 0.9972158418463293


### XGBoost

In [None]:
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.3],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'gamma': [0, 0.1]
}


In [None]:
xgb = XGBClassifier(random_state=42)

In [None]:
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, n_jobs=-1, verbose=2)

In [None]:
grid_search_xgb.fit(X_train, y_train)

In [None]:
with open('grid_search_xgb.pkl', 'wb') as f:
    pickle.dump(grid_search_xgb, f)

In [None]:
print("Melhor score de validação:", grid_search_xgb.best_score_)

## Stacking

In [17]:
param_grid_stacking = {
    'final_estimator__C': [0.01, 0.1, 1.0, 10.0],  
    'final_estimator__solver': ['liblinear', 'saga'],  
    'passthrough': [False, True]  
}

In [19]:
with open('./CV_RESULTS_ISOT/grid_search_dt.pkl', 'rb') as f:
    grid_search_dt = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_nb.pkl', 'rb') as f:
    grid_search_nb = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_lr.pkl', 'rb') as f:
    grid_search_lr = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_linear_svm.pkl', 'rb') as f:
    grid_search_linear_svm = pickle.load(f)

In [21]:
base_estimators = [
    ('lr', grid_search_lr.best_estimator_),
    ('nb', grid_search_nb.best_estimator_),
    ('dt', grid_search_dt.best_estimator_),
    ('svm', grid_search_linear_svm.best_estimator_)
]

In [22]:
meta_estimator = LogisticRegression(max_iter=1000, random_state=42)

In [23]:
stacking = StackingClassifier(estimators=base_estimators, final_estimator=meta_estimator)

In [24]:
grid_search_stacking = GridSearchCV(estimator=stacking, param_grid=param_grid_stacking, cv=5, verbose=1, n_jobs=-1)

In [25]:
grid_search_stacking.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits




[CV] END ............C=0.1, class_weight=None, max_iter=1000; total time=   0.8s
[CV] END ............C=0.1, class_weight=None, max_iter=2000; total time=   0.8s
[CV] END ............C=0.1, class_weight=None, max_iter=5000; total time=   0.8s
[CV] END ........C=0.1, class_weight=balanced, max_iter=1000; total time=   0.9s
[CV] END ........C=0.1, class_weight=balanced, max_iter=2000; total time=   0.9s
[CV] END ..............C=1, class_weight=None, max_iter=1000; total time=   1.2s
[CV] END ..............C=1, class_weight=None, max_iter=2000; total time=   1.1s
[CV] END ..............C=1, class_weight=None, max_iter=5000; total time=   1.3s
[CV] END ..........C=1, class_weight=balanced, max_iter=1000; total time=   1.3s
[CV] END ..........C=1, class_weight=balanced, max_iter=2000; total time=   1.4s
[CV] END .............C=10, class_weight=None, max_iter=1000; total time=   3.5s
[CV] END .............C=10, class_weight=None, max_iter=2000; total time=   3.6s
[CV] END .............C=10, 

In [26]:
with open('grid_search_stacking.pkl', 'wb') as f:
    pickle.dump(grid_search_stacking, f)

In [27]:
print("Melhor score de validação:", grid_search_stacking.best_score_)

Melhor score de validação: 0.9976056317393797
[CV] END ............C=0.1, class_weight=None, max_iter=2000; total time=   0.8s
[CV] END ............C=0.1, class_weight=None, max_iter=5000; total time=   0.9s
[CV] END ........C=0.1, class_weight=balanced, max_iter=1000; total time=   0.9s
[CV] END ........C=0.1, class_weight=balanced, max_iter=2000; total time=   0.9s
[CV] END ........C=0.1, class_weight=balanced, max_iter=5000; total time=   0.9s
[CV] END ..............C=1, class_weight=None, max_iter=1000; total time=   1.2s
[CV] END ..............C=1, class_weight=None, max_iter=2000; total time=   1.2s
[CV] END ..........C=1, class_weight=balanced, max_iter=1000; total time=   1.2s
[CV] END ..........C=1, class_weight=balanced, max_iter=2000; total time=   1.2s
[CV] END ..........C=1, class_weight=balanced, max_iter=5000; total time=   1.3s
[CV] END .............C=10, class_weight=None, max_iter=1000; total time=   4.9s
[CV] END .............C=10, class_weight=None, max_iter=2000; t

# RESULTS

In [29]:
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

with open('./CV_RESULTS_ISOT/grid_search_dt.pkl', 'rb') as f:
    grid_search_dt = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_nb.pkl', 'rb') as f:
    grid_search_nb = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_lr.pkl', 'rb') as f:
    grid_search_lr = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_linear_svm.pkl', 'rb') as f:
    grid_search_linear_svm = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_rf.pkl', 'rb') as f:
    grid_search_rf = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_ada.pkl', 'rb') as f:
    grid_search_ada = pickle.load(f)

#with open('grid_search_xgb.pkl', 'rb') as f:
#    grid_search_xgb = pickle.load(f)

with open('./CV_RESULTS_ISOT/grid_search_stacking.pkl', 'rb') as f:
    grid_search_stacking = pickle.load(f)

#Extract the best models from the grid search
best_dt = grid_search_dt.best_estimator_
best_nb = grid_search_nb.best_estimator_
best_lr = grid_search_lr.best_estimator_
best_linear_svm = grid_search_linear_svm.best_estimator_
best_rf = grid_search_rf.best_estimator_
best_ada = grid_search_ada.best_estimator_
#best_xgb = grid_search_xgb.best_estimator_
best_stacking = grid_search_stacking.best_estimator_


# Define a function to compute and print metrics
def print_metrics(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    
    print(f"Metrics for {model_name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.9f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.9f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.9f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.9f}")
    print("-" * 40)

# Assuming you have the test data X_test, y_test
# Replace these with your actual test data
# X_test, y_test = ...

# Print metrics for each model
print_metrics(best_dt, X_test, y_test, 'Decision Tree')
print_metrics(best_nb, X_test, y_test, 'Naive Bayes')
print_metrics(best_lr, X_test, y_test, 'Logistic Regression')
print_metrics(best_linear_svm, X_test, y_test, 'Linear SVM')
print_metrics(best_rf, X_test, y_test, 'Random Forest')
print_metrics(best_ada, X_test, y_test, 'AdaBoost')
#print_metrics(best_xgb, X_test, y_test, 'XGBoost')
print_metrics(best_stacking, X_test, y_test, 'Stacking')

Metrics for Decision Tree:
Accuracy: 0.999109131
Precision: 0.999109214
Recall: 0.999109131
F1-Score: 0.999109123
----------------------------------------
Metrics for Naive Bayes:
Accuracy: 0.952672606
Precision: 0.952673389
Recall: 0.952672606
F1-Score: 0.952667778
----------------------------------------
Metrics for Logistic Regression:
Accuracy: 0.997216036
Precision: 0.997216035
Recall: 0.997216036
F1-Score: 0.997216023
----------------------------------------
Metrics for Linear SVM:
Accuracy: 0.995211581
Precision: 0.995211562
Recall: 0.995211581
F1-Score: 0.995211559
----------------------------------------
Metrics for Random Forest:
Accuracy: 0.997995546
Precision: 0.997995608
Recall: 0.997995546
F1-Score: 0.997995527
----------------------------------------
Metrics for AdaBoost:
Accuracy: 0.999443207
Precision: 0.999443802
Recall: 0.999443207
F1-Score: 0.999443194
----------------------------------------
Metrics for Stacking:
Accuracy: 0.997438753
Precision: 0.997439488
Recall:

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

frases = [
    "gato gato" ,
    "cachorro",
    "ave"
]

vectorizer = TfidfVectorizer(norm=None)

tfidf_matrix = vectorizer.fit_transform(frases)

palavras = vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=palavras)

print(df_tfidf)

        ave  cachorro      gato
0  0.000000  0.000000  3.386294
1  0.000000  1.693147  0.000000
2  1.693147  0.000000  0.000000


In [None]:
TF * IDF

1 * 

TF -> FREQUENCIA QUE A PALAVRA PARECE NO SETENCÇA
IDF -> LOG (NUMERO DE DOCUMENTOS / A QUANTIDADE DE DOCUMENTOS QUE TEM O TERMO)