In [1]:
import os
import pickle
import sqlite3
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
# Connect to the SQLite database used by the rest of the notebook.
# `path` is relative, so it is resolved against the kernel's working directory.
path = os.path.join('..', 'Database', 'news.db')

# sqlite3.connect() silently creates a new, empty database file when the
# target does not exist; that would only surface later as a confusing
# "no such table" error. Fail early with a clear message instead.
if not os.path.isfile(path):
    raise FileNotFoundError(f"Database file not found: {os.path.abspath(path)}")

conn = sqlite3.connect(path)
cursor = conn.cursor()

In [3]:
# Load the pickled TF-IDF feature matrix stored in the `features` table.
cursor.execute("SELECT data FROM features WHERE type = 'tfidf'")
tfidf_row = cursor.fetchone()

# fetchone() returns None when the query matches no row; indexing it directly
# would raise an opaque "'NoneType' object is not subscriptable" error.
if tfidf_row is None:
    raise LookupError("No row with type = 'tfidf' found in the 'features' table")

X_tfidf_compressed = tfidf_row[0]

# NOTE(review): pickle.loads can execute arbitrary code while deserialising.
# Only run this against a database file that comes from a trusted source.
X_tfidf = pickle.loads(X_tfidf_compressed)
print("Features loaded successfully!")

Features loaded successfully!


In [4]:
# Load the id and label of every row in the cleanedText table.
# NOTE(review): the query has no ORDER BY, so the row order is whatever SQLite
# returns; the labels are later paired positionally with the rows of X_tfidf.
# Presumably the features were built in the same order — TODO confirm.
df = pd.read_sql("SELECT id, label FROM cleanedText", conn)

# Encode the string labels as integers: 'real' -> 1, 'fake' -> 0.
label_codes = {'real': 1, 'fake': 0}
encoded_labels = df['label'].map(label_codes)

# Values missing from the mapping (including NULLs) become NaN. Letting them
# through would put NaN into the target vector, so stop here and report them.
unknown_mask = encoded_labels.isna()
if unknown_mask.any():
    unexpected = df.loc[unknown_mask, 'label'].unique().tolist()
    raise ValueError(f"Unexpected label values in cleanedText: {unexpected}")

df['label'] = encoded_labels.astype('int64')

In [5]:
# Target vector taken from the encoded label column (0: Fake, 1: Real).
# It is paired row-by-row with the X_tfidf feature matrix when the data is split.
y = df['label'].to_numpy()

In [6]:
# Hold out 20% of the samples for testing and train on the remaining 80%.
# The fixed random_state keeps the split identical between runs.
split_parts = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=4893,
)
X_train, X_test, y_train, y_test = split_parts

print("Data split into training and testing sets!")
print("Train size: ", X_train.shape, "Test size: ", X_test.shape)

Data split into training and testing sets!
Train size:  (35918, 5000) Test size:  (8980, 5000)


In [7]:
# Train the Logistic Regression model with an L1 penalty.
# NOTE(review): C=21.5443 presumably comes from an earlier hyper-parameter
# search — confirm and record where it was obtained.
lr_model = LogisticRegression(
    C=21.5443,
    penalty='l1',
    solver='liblinear',
    # The liblinear solver shuffles the data internally; seeding it makes the
    # fitted coefficients (and the scores reported below) reproducible.
    random_state=4893,
)
lr_model.fit(X_train, y_train)

In [8]:
# Measure accuracy on both the training split and the held-out test split.
train_predictions = lr_model.predict(X_train)
test_predictions = lr_model.predict(X_test)

train_accuracy_lr = accuracy_score(y_train, train_predictions)
test_accuracy_lr = accuracy_score(y_test, test_predictions)

print(f"Logistic REgression: Train Accuracy = {train_accuracy_lr}, Test Accuracy = {test_accuracy_lr}")

Logistic REgression: Train Accuracy = 0.9999443176123393, Test Accuracy = 0.99543429844098


In [9]:
# 5-fold cross-validation of the logistic regression model, run on the
# training split only so the test split stays untouched.
cv_scores = cross_val_score(estimator=lr_model, X=X_train, y=y_train, cv=5)

# Summarise the per-fold accuracies as mean ± standard deviation.
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print(f"Logistic Regression Cross-Validation Accuracy: {cv_mean} ± {cv_std}")

Logistic Regression Cross-Validation Accuracy: 0.9952948017885586 ± 0.0007122599612922106


In [11]:
# Persist the trained model to disk with pickle.
# NOTE: this reuses the name `path`, which earlier held the database location.
path = os.path.join('..', 'Models', 'lrModel.model')

# Create the target directory if it is missing; open(..., 'wb') would
# otherwise raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(path), exist_ok=True)

with open(path, 'wb') as f:
    pickle.dump(lr_model, f)