In [21]:
import os
import pickle
import sqlite3
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 1 - Logistic Regression
from sklearn.linear_model import LogisticRegression

# 2 - Support Vector Machine
from sklearn.svm import SVC

# 3 - Random forest
from sklearn.ensemble import RandomForestClassifier 

# 4 - Naive Bayes
from sklearn.naive_bayes import MultinomialNB


In [2]:
# Connect to the SQLite database that stores the cleaned text and the features.
path = os.path.join('..', 'Database', 'news.db')

# sqlite3.connect() creates a new, empty database file when the path does not
# exist, which would only surface later as a confusing "no such table" error.
# Fail early with the resolved location instead.
if not os.path.isfile(path):
    raise FileNotFoundError(
        "SQLite database not found at {!r}".format(os.path.abspath(path))
    )

conn = sqlite3.connect(path)
cursor = conn.cursor()

In [3]:
# Load the pickled TF-IDF feature matrix stored in the `features` table.
cursor.execute("SELECT data FROM features WHERE type = 'tfidf'")
tfidf_row = cursor.fetchone()

# fetchone() returns None when the query matches nothing; report that clearly
# rather than failing on the subscript below.
if tfidf_row is None:
    raise LookupError("No row with type = 'tfidf' found in the features table")

X_tfidf_compressed = tfidf_row[0]

# SECURITY NOTE: pickle.loads can execute arbitrary code while deserializing.
# Only run this against a database file you produced yourself or fully trust.
X_tfidf = pickle.loads(X_tfidf_compressed)
print("Features loaded successfully!")

Features loaded successfully!


In [4]:
# Load the labels from the cleanedText table and encode them as integers
# (1 = 'real', 0 = 'fake').
# NOTE(review): the query has no ORDER BY, so row order relies on SQLite's
# default scan order; confirm it matches the order used to build X_tfidf.
df = pd.read_sql("SELECT id, label FROM cleanedText", conn)

label_codes = df['label'].map({'real': 1, 'fake': 0})

# Any label other than 'real'/'fake' maps to NaN. Stop here with the offending
# values instead of letting missing targets flow into model training.
unmapped = label_codes.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected label values in cleanedText: {}".format(
            sorted(df.loc[unmapped, 'label'].astype(str).unique())
        )
    )

df['label'] = label_codes.astype(int)

In [5]:
# Target vector for the classifiers, taken from the encoded label column
# (0: Fake, 1: Real). X_tfidf, loaded earlier, is the matching feature matrix.
y = df['label'].to_numpy()

In [6]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# partition identical across runs.
split_parts = train_test_split(X_tfidf, y, test_size=0.2, random_state=1311)
X_train, X_test, y_train, y_test = split_parts

print("Data split into training and testing sets!")
print("Train size: ", X_train.shape, "Test size: ", X_test.shape)

Data split into training and testing sets!
Train size:  (35918, 5000) Test size:  (8980, 5000)


In [7]:
# Train the first model: Logistic Regression with scikit-learn's default settings
lr_model = LogisticRegression()
lr_model.fit(X=X_train, y=y_train)

In [8]:
# Predict labels for the held-out test samples with the Logistic Regression model
y_pred = lr_model.predict(X=X_test)

In [9]:
# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Logistic Regression Accuracy:", accuracy)

Logistic Regression Accuracy: 0.9880846325167038


In [11]:
# Train the second model: Support Vector Classifier with default settings
svm_model = SVC()
svm_model.fit(X=X_train, y=y_train)

In [12]:
# Predict labels for the held-out test samples with the SVM model
y_pred = svm_model.predict(X=X_test)

In [13]:
# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("✅ SVM Accuracy:", accuracy)

✅ SVM Accuracy: 0.9951002227171493


In [16]:
# Train the third model: Random Forest with 100 trees, using all CPU cores.
# Random forests draw bootstrap samples and feature subsets at random, so an
# unseeded model gives a slightly different accuracy on every run. Reuse the
# seed from the train/test split to make the reported score reproducible.
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1311)
rf_model.fit(X_train, y_train)

In [19]:
# Predict labels for the held-out test samples with the Random Forest model
y_pred = rf_model.predict(X=X_test)

In [20]:
# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Random Forest Accuracy: ", accuracy)

Random Forest Accuracy:  0.9983296213808464


In [None]:
# NOTE(review): this cell has no execution count and performs the same
# train / predict / evaluate steps as the three Naive Bayes cells that follow
# (only the printed label differs); consider keeping just one copy.

# Fit a multinomial Naive Bayes classifier on the training split
nb_model = MultinomialNB().fit(X_train, y_train)

# Label the held-out test samples
y_pred = nb_model.predict(X=X_test)

# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("✅ Naïve Bayes Accuracy:", accuracy)

In [23]:
# Train the fourth model: multinomial Naive Bayes with default settings
nb_model = MultinomialNB()
nb_model.fit(X=X_train, y=y_train)

In [24]:
# Predict labels for the held-out test samples with the Naive Bayes model
y_pred = nb_model.predict(X=X_test)

In [25]:
# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Naive Bayes Accuracy: ", accuracy)

Naive Bayes Accuracy:  0.9361915367483297
