In [1]:
import os
import pickle
import sqlite3
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Connect to the SQLite database that holds the cleaned text and extracted features
path = os.path.join('..', 'Database', 'news.db')
# sqlite3.connect() silently creates an empty database when the file is missing,
# which would only surface later as a confusing "no such table" error, so fail here instead.
if not os.path.isfile(path):
    raise FileNotFoundError(f"SQLite database not found at {os.path.abspath(path)}")
conn = sqlite3.connect(path)
cursor = conn.cursor()

In [3]:
# Load the pickled TF-IDF feature matrix stored in the `features` table
cursor.execute("SELECT data FROM features WHERE type = 'tfidf'")
row = cursor.fetchone()
# fetchone() returns None when the query matches nothing; report that clearly
# instead of failing with "'NoneType' object is not subscriptable".
if row is None:
    raise ValueError("No row with type = 'tfidf' found in the features table")
# Despite its name, this blob is passed straight to pickle with no decompression step.
X_tfidf_compressed = row[0]
# NOTE(review): pickle.loads can execute arbitrary code contained in the payload;
# only run this cell against a database you trust.
X_tfidf = pickle.loads(X_tfidf_compressed)
print("Features loaded successfully!")

Features loaded successfully!


In [4]:
# Load the labels and encode them as integers (fake -> 0, real -> 1)
# NOTE(review): the query has no ORDER BY; the rows of X_tfidf are assumed to be in
# the same order as the rows returned here — confirm against the feature-extraction step.
df = pd.read_sql("SELECT id, label FROM cleanedText", conn)
label_codes = {'fake': 0, 'real': 1}
encoded_labels = df['label'].map(label_codes)
# Any value other than 'real'/'fake' maps to NaN; stop here rather than letting
# missing labels reach the train/test split and the model fit.
n_unknown = int(encoded_labels.isna().sum())
if n_unknown:
    raise ValueError(f"{n_unknown} rows in cleanedText have a label other than 'real' or 'fake'")
df['label'] = encoded_labels.astype('int64')

In [5]:
# Target vector for the classifier, taken from the encoded label column (0: Fake, 1: Real)
y = df['label'].to_numpy()

In [6]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=1311,
)
print("Data split into training and testing sets!")
print("Train size: ", X_train.shape, "Test size: ", X_test.shape)

Data split into training and testing sets!
Train size:  (35918, 5000) Test size:  (8980, 5000)


In [7]:
# Regularization strengths to search: 10 log-spaced values of C from 1e-4 to 1e4
C_values = np.logspace(-4, 4, 10)

# Every (C, penalty, solver) combination is evaluated: 10 * 2 * 2 = 40 candidates
param_grid = {
    "C": C_values,
    "penalty": ["l1", "l2"],  # Test both L1 and L2 regularization
    "solver": ["liblinear", "saga"]  # Both solvers accept L1 and L2 penalties
}

# Grid search scored by accuracy with 10-fold cross-validation
grid_search = GridSearchCV(
    # max_iter is raised so the solvers have room to converge.
    # random_state is fixed because liblinear and saga shuffle the data internally;
    # without a seed the scores and selected parameters can vary from run to run.
    LogisticRegression(max_iter=5000, random_state=1311),
    param_grid,
    cv=10,  # 10-Fold Cross Validation
    scoring="accuracy",
    n_jobs=-1,  # Use all CPU cores for faster execution
    verbose=10  # Display detailed per-fit progress
)

In [8]:
# Run the full cross-validated search over the grid using only the training split
grid_search.fit(X=X_train, y=y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits
[CV 1/10; 2/40] START C=0.0001, penalty=l1, solver=saga.........................
[CV 1/10; 2/40] END C=0.0001, penalty=l1, solver=saga;, score=0.523 total time=   0.3s
[CV 7/10; 2/40] START C=0.0001, penalty=l1, solver=saga.........................
[CV 7/10; 2/40] END C=0.0001, penalty=l1, solver=saga;, score=0.477 total time=   0.2s
[CV 1/10; 4/40] START C=0.0001, penalty=l2, solver=saga.........................
[CV 1/10; 4/40] END C=0.0001, penalty=l2, solver=saga;, score=0.523 total time=   2.1s
[CV 5/10; 6/40] START C=0.000774263682681127, penalty=l1, solver=saga...........
[CV 5/10; 6/40] END C=0.000774263682681127, penalty=l1, solver=saga;, score=0.523 total time=   0.4s
[CV 9/10; 6/40] START C=0.000774263682681127, penalty=l1, solver=saga...........
[CV 9/10; 6/40] END C=0.000774263682681127, penalty=l1, solver=saga;, score=0.523 total time=   0.7s
[CV 1/10; 8/40] START C=0.000774263682681127, penalty=l2, solver=saga

In [9]:
# Report the winning hyperparameter combination and its mean cross-validated accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_accuracy)

Best Parameters: {'C': 21.54434690031882, 'penalty': 'l1', 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.9951834548013192
