In [1]:
import os
import json
import pickle
import sqlite3
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the cleaned articles from the project's SQLite database into `df`.
path = os.path.join('..', 'Database', 'news.db')

# sqlite3.connect() silently creates an empty database when the file is
# missing, which would surface later as a confusing "no such table" error.
# Fail early with a clear message instead.
if not os.path.exists(path):
    raise FileNotFoundError(f"SQLite database not found: {path}")

conn = sqlite3.connect(path)
try:
    df = pd.read_sql("SELECT * FROM cleanedText", conn)
finally:
    # Release the file handle even when the query raises.
    conn.close()

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,cleaned_text,label
0,1,washington reuters head conservative republica...,real
1,2,washington reuters transgender people allowed ...,real
2,3,washington reuters special counsel investigati...,real
3,4,washington reuters trump campaign adviser geor...,real
4,5,seattlewashington reuters president donald tru...,real


In [4]:
# Bag of Words: count term occurrences per document, keeping only the
# 5000 most frequent terms of the corpus as the vocabulary.
vectorizer = CountVectorizer(max_features=5000)
X_bow = vectorizer.fit_transform(df["cleaned_text"])  # scipy sparse matrix

# Wrap the matrix in a DataFrame for inspection. Build it from the sparse
# matrix directly: calling .toarray() would materialize the full
# (n_documents x 5000) dense array just to look at a handful of rows.
# Values and shape match the dense version; columns use pandas sparse dtypes.
X_bow_df = pd.DataFrame.sparse.from_spmatrix(
    X_bow, columns=vectorizer.get_feature_names_out()
)

print("Bag of Words Representation:")
X_bow_df.head()

Bag of Words Representation:


Unnamed: 0,abandon,abandoned,abbas,abc,abdullah,abe,abedin,ability,able,aboard,...,youre,youth,youtube,ypg,zealand,zero,zika,zimbabwe,zone,zuma
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# TF-IDF: same 5000-term vocabulary cap as the Bag of Words cell, but each
# count is re-weighted by inverse document frequency.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df["cleaned_text"])  # scipy sparse matrix

# Wrap the matrix in a DataFrame for inspection. Build it from the sparse
# matrix directly: calling .toarray() would materialize the full
# (n_documents x 5000) dense float array just to look at a handful of rows.
# Values and shape match the dense version; columns use pandas sparse dtypes.
X_tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf, columns=tfidf_vectorizer.get_feature_names_out()
)

print("TF-IDF Representation:")
X_tfidf_df.head()

TF-IDF Representation:


Unnamed: 0,abandon,abandoned,abbas,abc,abdullah,abe,abedin,ability,able,aboard,...,youre,youth,youtube,ypg,zealand,zero,zika,zimbabwe,zone,zuma
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Serialize the sparse BoW and TF-IDF matrices to bytes so each fits in a
# single BLOB column. NOTE: pickle only serializes -- it does not compress --
# the "_compressed" names are kept for compatibility with the other cells.
X_bow_compressed = pickle.dumps(X_bow)
X_tfidf_compressed = pickle.dumps(X_tfidf)

# Store one row per feature type in the local feature database.
conn = sqlite3.connect("fake_news.db")
try:
    # `with conn` wraps the statements in one transaction: commit on
    # success, rollback if anything raises. It does NOT close the
    # connection, hence the surrounding try/finally.
    with conn:
        cursor = conn.cursor()

        cursor.execute("""
        CREATE TABLE IF NOT EXISTS features (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            type TEXT,  -- 'bow' or 'tfidf'
            data BLOB   -- pickled scipy sparse matrix
        )
        """)

        for feature_type, payload in (
            ("bow", X_bow_compressed),
            ("tfidf", X_tfidf_compressed),
        ):
            # Replace any previous copy so re-running this cell does not pile
            # up duplicate rows (which would leave readers that take the first
            # matching row with stale features).
            cursor.execute("DELETE FROM features WHERE type = ?", (feature_type,))
            cursor.execute(
                "INSERT INTO features (type, data) VALUES (?, ?)",
                (feature_type, payload),
            )
finally:
    conn.close()

print("Features compressed and saved to the database!")

Features compressed and saved to the database!


In [7]:
# Reload the pickled TF-IDF matrix from the feature database.
conn = sqlite3.connect("fake_news.db")
try:
    cursor = conn.cursor()
    # Take the most recently inserted row: without an ORDER BY, fetchone()
    # may return an older copy if the table holds several 'tfidf' rows.
    cursor.execute(
        "SELECT data FROM features WHERE type = ? ORDER BY id DESC LIMIT 1",
        ("tfidf",),
    )
    row = cursor.fetchone()
finally:
    # Close the handle whether or not the query succeeded.
    conn.close()

if row is None:
    raise LookupError("No 'tfidf' row found in the features table of fake_news.db")
X_tfidf_compressed = row[0]

# Deserialize. pickle.loads can execute arbitrary code, so this is only
# acceptable for a database file that this project wrote itself.
X_tfidf = pickle.loads(X_tfidf_compressed)

print("TF-IDF features loaded successfully!")

TF-IDF features loaded successfully!


In [8]:
# Reload the pickled Bag of Words matrix from the feature database.
conn = sqlite3.connect("fake_news.db")
try:
    cursor = conn.cursor()
    # Take the most recently inserted row: without an ORDER BY, fetchone()
    # may return an older copy if the table holds several 'bow' rows.
    cursor.execute(
        "SELECT data FROM features WHERE type = ? ORDER BY id DESC LIMIT 1",
        ("bow",),
    )
    row = cursor.fetchone()
finally:
    # Close the handle whether or not the query succeeded.
    conn.close()

if row is None:
    raise LookupError("No 'bow' row found in the features table of fake_news.db")
X_bow_compressed = row[0]

# Deserialize. pickle.loads can execute arbitrary code, so this is only
# acceptable for a database file that this project wrote itself.
X_bow = pickle.loads(X_bow_compressed)

print("bowF features loaded successfully!")

bowF features loaded successfully!
