# UTILS

In [13]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting nvidia-nccl-cu12
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import re
import string
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score, precision_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

## LIAR

### Pre-processing

In [3]:
column_names = [
    'id', 'label', 'statement', 'subjects', 'speaker', 'job_title', 'state_info', 
    'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 
    'mostly_true_counts', 'pants_on_fire_counts', 'context'
]

In [4]:
df_test = pd.read_csv('./LIAR/test.tsv', sep='\t', names=column_names)
df_train = pd.read_csv('./LIAR/train.tsv', sep='\t', names=column_names)

In [5]:
df = pd.concat([df_train[['label', 'statement']], df_test[['label', 'statement']]], ignore_index=True)
df

Unnamed: 0,label,statement
0,false,Says the Annies List political group supports ...
1,half-true,When did the decline of coal start? It started...
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo..."
3,false,Health care reform legislation is likely to ma...
4,half-true,The economic turnaround started at the end of ...
...,...,...
11502,half-true,Says his budget provides the highest state fun...
11503,barely-true,Ive been here almost every day.
11504,barely-true,"In the early 1980s, Sen. Edward Kennedy secret..."
11505,barely-true,Says an EPA permit languished under Strickland...


In [6]:
class_mapping = {
    'true': 1,
    'mostly-true': 1,
    'half-true': 1,
    'barely-true': 0,
    'false': 0,
    'pants-fire': 0
}

In [7]:
df['class'] = df['label'].map(class_mapping)
df

Unnamed: 0,label,statement,class
0,false,Says the Annies List political group supports ...,0
1,half-true,When did the decline of coal start? It started...,1
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",1
3,false,Health care reform legislation is likely to ma...,0
4,half-true,The economic turnaround started at the end of ...,1
...,...,...,...
11502,half-true,Says his budget provides the highest state fun...,1
11503,barely-true,Ive been here almost every day.,0
11504,barely-true,"In the early 1980s, Sen. Edward Kennedy secret...",0
11505,barely-true,Says an EPA permit languished under Strickland...,0


In [8]:
df["statement"] = df["statement"].apply(wordopt)
df

Unnamed: 0,label,statement,class
0,false,says the annies list political group supports ...,0
1,half-true,when did the decline of coal start it started...,1
2,mostly-true,hillary clinton agrees with john mccain by vo...,1
3,false,health care reform legislation is likely to ma...,0
4,half-true,the economic turnaround started at the end of ...,1
...,...,...,...
11502,half-true,says his budget provides the highest state fun...,1
11503,barely-true,ive been here almost every day,0
11504,barely-true,in the early sen edward kennedy secretly of...,0
11505,barely-true,says an epa permit languished under strickland...,0


In [9]:
X = df["statement"]
y = df["class"]

In [10]:
vectorizer = TfidfVectorizer()
xv = vectorizer.fit_transform(X)
xv.shape  

(11507, 12201)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(xv, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)  

(9205, 12201)
(2302, 12201)


## Base Learners

### Decision Tree

In [14]:
param_grid_dt = {
    'max_depth': [None, 10],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 2]
}

In [15]:
dt = DecisionTreeClassifier()

In [16]:
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, n_jobs=-1, verbose=2)

In [17]:
grid_search_dt.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [18]:
with open('grid_search_dt.pkl', 'wb') as file:
    pickle.dump(grid_search_dt, file)

In [19]:
print("Melhor score de validação:", grid_search_dt.best_score_)

Melhor score de validação: 0.5609994568169473


### Logistic Regression

In [20]:
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [100, 200]
}

In [21]:
lr = LogisticRegression()

In [22]:
grid_search_lr = GridSearchCV(estimator=lr, param_grid=param_grid_lr, cv=5, n_jobs=-1, verbose=2)

In [23]:
grid_search_lr.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [24]:
with open('grid_search_lr.pkl', 'wb') as file:
    pickle.dump(grid_search_lr, file)

In [25]:
print("Melhor score de validação:", grid_search_lr.best_score_)

Melhor score de validação: 0.6085822922324823


### Naive Bayes

In [26]:
param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0, 1.5],
    'fit_prior': [True, False]     
}

In [27]:
nb = MultinomialNB()

In [28]:
grid_search_nb = GridSearchCV(estimator=nb, param_grid=param_grid_nb, cv=5, n_jobs=-1, verbose=2)

In [29]:
grid_search_nb.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [30]:
with open('grid_search_nb.pkl', 'wb') as model_file:
    pickle.dump(grid_search_nb, model_file)

In [31]:
print("Melhor score de validação:", grid_search_nb.best_score_)

Melhor score de validação: 0.6045627376425855


### SVM

In [34]:
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

In [35]:
svm = SVC(random_state=42)

In [36]:
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=5, n_jobs=-1, verbose=2)

In [None]:
grid_search_svm.fit(X_train, y_train)

In [None]:
with open('grid_search_svm.pkl', 'wb') as model_file:
    pickle.dump(grid_search_svm, model_file)

In [None]:
print("Melhor score de validação:", grid_search_svm.best_score_)

## Bagging

### Random Forest

In [32]:
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 10]
}


In [33]:
rf = RandomForestClassifier(random_state=42)

In [34]:
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)

In [35]:
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [36]:
with open('grid_search_rf.pkl', 'wb') as file:
    pickle.dump(grid_search_rf, file)

In [37]:
print("Melhor score de validação:", grid_search_rf.best_score_)

Melhor score de validação: 0.6154263986963607


## Boosting

### AdaBoost

In [38]:
param_grid_ada = {
    'n_estimators': [50, 100, 200], 
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'algorithm': ['SAMME', 'SAMME.R'], 
}

In [39]:
ada = AdaBoostClassifier(random_state=42)

In [40]:
grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=5, n_jobs=-1, verbose=2)

In [41]:
grid_search_ada.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




In [42]:
with open('grid_search_ada.pkl', 'wb') as f:
    pickle.dump(grid_search_ada, f)

In [43]:
print("Melhor score de validação:", grid_search_ada.best_score_)

Melhor score de validação: 0.5902227050516025


### XGBoost

In [44]:
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.3],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'gamma': [0, 0.1]
}


In [45]:
xgb = XGBClassifier(random_state=42)

In [46]:
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, n_jobs=-1, verbose=2)

In [47]:
grid_search_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [48]:
with open('grid_search_xgb.pkl', 'wb') as f:
    pickle.dump(grid_search_xgb, f)

In [49]:
print("Melhor score de validação:", grid_search_xgb.best_score_)

Melhor score de validação: 0.6018468223791418


## Stacking

In [None]:
param_grid_stacking = {
    'final_estimator__C': [0.01, 0.1, 1.0, 10.0],  
    'final_estimator__solver': ['liblinear', 'saga'],  
    'passthrough': [False, True]  
}

In [None]:
base_estimators = [
    ('lr', grid_search_lr.best_estimator__),
    ('nb', grid_search_nb.best_estimator__,
    ('dt', grid_search_dt.best_estimator__)
    ('svm', grid_search_svm.best_estimator__)
]

In [None]:
meta_estimator = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
stacking = StackingClassifier(estimators=base_estimators, final_estimator=meta_estimator)

In [None]:
grid_search_stacking = GridSearchCV(estimator=stacking, param_grid=param_grid_stacking, cv=5, verbose=1, n_jobs=-1)

In [None]:
grid_search_stacking.fit(X_train, y_train)

In [None]:
with open('grid_search_stacking.pkl', 'wb') as f:
    pickle.dump(grid_search_stacking, f)

In [None]:
print("Melhor score de validação:", grid_search_stacking.best_score_)

# RESULTS

In [12]:
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the saved grid search objects
with open('CV_RESULTS_LIAR/grid_search_dt.pkl', 'rb') as f:
    grid_search_dt = pickle.load(f)

with open('CV_RESULTS_LIAR/grid_search_nb.pkl', 'rb') as f:
    grid_search_nb = pickle.load(f)

with open('CV_RESULTS_LIAR/grid_search_lr.pkl', 'rb') as f:
    grid_search_lr = pickle.load(f)

with open('CV_RESULTS_LIAR/grid_search_rf.pkl', 'rb') as f:
    grid_search_rf = pickle.load(f)

with open('CV_RESULTS_LIAR/grid_search_ada.pkl', 'rb') as f:
    grid_search_ada = pickle.load(f)

with open('CV_RESULTS_LIAR/grid_search_xgb.pkl', 'rb') as f:
    grid_search_xgb = pickle.load(f)

# Extract the best models from the grid search
best_dt = grid_search_dt.best_estimator_
best_nb = grid_search_nb.best_estimator_
best_lr = grid_search_lr.best_estimator_
best_rf = grid_search_rf.best_estimator_
best_ada = grid_search_ada.best_estimator_
best_xgb = grid_search_xgb.best_estimator_


# Define a function to compute and print metrics
def print_metrics(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    
    print(f"Metrics for {model_name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.9f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.9f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.9f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.9f}")
    print("-" * 40)

# Assuming you have the test data X_test, y_test
# Replace these with your actual test data
# X_test, y_test = ...

# Print metrics for each model
print_metrics(best_dt, X_test, y_test, 'Decision Tree')
print_metrics(best_nb, X_test, y_test, 'Naive Bayes')
print_metrics(best_lr, X_test, y_test, 'Logistic Regression')
print_metrics(best_rf, X_test, y_test, 'Random Forest')
print_metrics(best_ada, X_test, y_test, 'AdaBoost')
print_metrics(best_xgb, X_test, y_test, 'XGBoost')

Metrics for Decision Tree:
Accuracy: 0.571242398
Precision: 0.567537565
Recall: 0.571242398
F1-Score: 0.568940043
----------------------------------------
Metrics for Naive Bayes:
Accuracy: 0.617289314
Precision: 0.608062077
Recall: 0.617289314
F1-Score: 0.598852080
----------------------------------------
Metrics for Logistic Regression:
Accuracy: 0.622502172
Precision: 0.616077883
Recall: 0.622502172
F1-Score: 0.616125927
----------------------------------------
Metrics for Random Forest:
Accuracy: 0.622067767
Precision: 0.614004788
Recall: 0.622067767
F1-Score: 0.610822890
----------------------------------------
Metrics for AdaBoost:
Accuracy: 0.605125977
Precision: 0.596069578
Recall: 0.605125977
F1-Score: 0.594814663
----------------------------------------
Metrics for XGBoost:
Accuracy: 0.619895743
Precision: 0.613813918
Recall: 0.619895743
F1-Score: 0.614301762
----------------------------------------
