In [None]:
from logging import warning

import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Read the raw news CSV into a DataFrame.
data = pd.read_csv("data/latest_news.csv")

# Report (rows, columns) of the raw frame, then preview its first rows
# (head() defaults to 5 rows).
n_rows, n_cols = data.shape
print("Initial shape:", (n_rows, n_cols))
data.head()

In [None]:
# Keep only the columns used downstream and drop every row that is missing
# any of them.
#
# This is done as one chained, non-inplace expression on purpose: calling
# dropna(inplace=True) on a column subset mutates an object pandas may treat
# as derived from the original frame, which can trigger SettingWithCopyWarning
# here and on later column assignments. The chained form always yields a
# fresh, independent DataFrame.
data = data[['title', 'text', 'label']].dropna()

# Report the shape after cleaning, then preview the result.
print("After cleaning:", data.shape)
data.head()

In [None]:
# Encode the string labels as integers: FAKE -> 0, REAL -> 1.
label_map = {'FAKE': 0, 'REAL': 1}

label_values = data['label']

# Skip the mapping when every label is already one of the encoded values, so
# re-running this cell does not turn the whole column into NaN.
if not label_values.isin(list(label_map.values())).all():
    encoded_labels = label_values.map(label_map)

    # Series.map yields NaN for any value that is not a key of label_map.
    # Missing labels were dropped earlier, so a NaN here means an unexpected
    # label string; fail now with a clear message instead of letting NaN
    # reach the train/test split and the classifier.
    unknown_labels = label_values[encoded_labels.isna()].unique()
    if len(unknown_labels) > 0:
        raise ValueError(
            f"Unexpected label values {list(unknown_labels)}; "
            f"expected one of {list(label_map)}"
        )

    data['label'] = encoded_labels.astype("int64")

print("Label distribution:")
print(data['label'].value_counts())

In [None]:
# Build one text field per article: title, a single space, then the body.
# Both parts are cast to str first so the concatenation is always string-based.
title_as_str = data['title'].astype(str)
body_as_str = data['text'].astype(str)
data['content'] = title_as_str + " " + body_as_str

# Model input (combined text) and target (encoded label).
X, y = data['content'], data['label']

In [None]:
# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on y so both parts keep the label ratio.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Number of samples in each part.
print("Train size:", len(X_train))
print("Test size:", len(X_test))

In [None]:
# TF-IDF over unigrams and bigrams with English stop words removed.
# min_df=3 / max_df=0.9 are the document-frequency cut-offs for vocabulary terms.
tfidf_options = {
    "stop_words": 'english',
    "ngram_range": (1, 2),
    "min_df": 3,
    "max_df": 0.9,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vocabulary on the training text only; the test text is transformed
# with that same fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Number of columns in the training matrix == number of TF-IDF features.
_, n_features = X_train_tfidf.shape
print("TF-IDF feature size:", n_features)


In [None]:
# Fit a logistic-regression classifier (liblinear solver, up to 1000
# iterations) on the TF-IDF training matrix. fit() returns the estimator
# itself, so construction and training are chained.
model = LogisticRegression(solver="liblinear", max_iter=1000).fit(
    X_train_tfidf, y_train
)

print("VeritasAI model trained")

In [None]:
# Predict labels for the held-out test matrix.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy plus the per-class precision/recall/F1 report.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("\nAccuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)

In [None]:
# Pickle the three artifacts needed for inference, each to its own file:
# the classifier, the fitted vectorizer, and the label encoding.
artifacts_to_save = (
    ("veritasai_model.pkl", model),
    ("veritasai_vectorizer.pkl", vectorizer),
    ("label_map.pkl", label_map),
)

for artifact_path, artifact in artifacts_to_save:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("Model, vectorizer, and label map saved")

In [None]:
# Reload the saved artifacts and run one end-to-end prediction as a smoke test.
#
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load artifacts written by this notebook or another trusted source.
with open("veritasai_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

with open("veritasai_vectorizer.pkl", "rb") as f:
    loaded_vectorizer = pickle.load(f)

with open("label_map.pkl", "rb") as f:
    label_map = pickle.load(f)

# Integer class -> label string, for printing the prediction.
inverse_label_map = {v: k for k, v in label_map.items()}

# realistic test sample
sample_title = "Mexico president asks South Korea for more BTS concerts"
sample_text = (
    "The scramble for tickets to K-pop band BTS' comeback tour, which comes after a four-year hiatus, has seen Mexico's president appealing to her South Korean counterpart to add more shows in her country. 'I wrote a letter to the [president] of Korea... I still haven't received the answer, but let's hope it's positive,' Mexico's president Claudia Sheinbaum said on Monday. BTS will hold three shows in Mexico City in May, as part of its 79-date world tour after a four-year hiatus. Tickets were wiped out in less than 40 minutes, local media reported. Some fans have also accused Ticketmaster and resale platforms of dynamic pricing, prompting an investigation."
)

# Same "title + space + text" layout that the training content column uses.
sample_content = sample_title + " " + sample_text

sample_tfidf = loaded_vectorizer.transform([sample_content])

# nnz = how many vocabulary terms from training occur in the sample.
print("Non-zero features:", sample_tfidf.nnz)

prediction = loaded_model.predict(sample_tfidf)
probabilities = loaded_model.predict_proba(sample_tfidf)

# predict_proba columns follow loaded_model.classes_, so look up each class's
# column explicitly instead of assuming FAKE is column 0 and REAL is column 1.
class_order = list(loaded_model.classes_)
fake_conf = probabilities[0][class_order.index(label_map['FAKE'])]
real_conf = probabilities[0][class_order.index(label_map['REAL'])]

print("Prediction:", inverse_label_map[prediction[0]])
print("Confidence (FAKE, REAL):", np.array([fake_conf, real_conf]))

# Three-way verdict: confident REAL above 0.7, confident FAKE below 0.3,
# otherwise undecided.
if real_conf > 0.7:
    verdict = "Likely REAL"
elif real_conf < 0.3:
    verdict = "Likely FAKE"
else:
    verdict = "UNCERTAIN"

print("Verdict:", verdict)

# Few matching vocabulary terms suggests the text differs from the training data.
# The message is stored under its own name so it does not rebind `warning`,
# which the imports cell brings in from `logging`.
if sample_tfidf.nnz < 40:
    domain_warning = "Text may be outside training domain"
    print(domain_warning)


In [None]:
# Snapshot the current DataFrame to a pickle file in the working directory.
with open("latest_news.pkl", mode="wb") as dataset_file:
    pickle.dump(data, dataset_file)

print("Latest news dataset saved as PKL")

In [None]:
# Read the pickled DataFrame back from disk and confirm it loads.
with open("latest_news.pkl", mode="rb") as dataset_file:
    latest_data = pickle.load(dataset_file)

# Shape of the reloaded frame, followed by a preview of its first rows.
print(latest_data.shape)
latest_data.head()

In [None]:
# Score the model on its own training data and print it next to the test
# score (y_pred holds the test predictions from the evaluation cell).
train_pred = model.predict(X_train_tfidf)

train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)


In [None]:
# Drop rows whose combined content string is identical to an earlier row's
# (the first occurrence is kept), and report how many rows were removed.
before = len(data)
data = data.drop_duplicates(subset=["content"], keep="first")
after = len(data)

print(f"Removed {before - after} duplicate rows")


In [None]:
# Shuffle all rows with a fixed seed (frac=1 samples every row without
# replacement), then renumber the index from 0 and discard the old one.
shuffled_rows = data.sample(frac=1, random_state=42)
data = shuffled_rows.reset_index(drop=True)


In [None]:
# Phrases to delete from the article text before retraining.
leakage_words = [
    "reuters",
    "associated press",
    "fact check",
    "fake news",
    "snopes",
    "politifact",
]

# One alternation regex covering every phrase.
pattern = "|".join(leakage_words)

# Case-insensitive removal: each match is replaced by the empty string.
scrubbed_content = data["content"].str.replace(
    pattern, "", case=False, regex=True
)
data["content"] = scrubbed_content


In [None]:
from sklearn.model_selection import train_test_split

# Re-derive features and target from the current frame, then redo the same
# seeded, stratified 80/20 split.
X, y = data["content"], data["label"]

resplit = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = resplit


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Second TF-IDF configuration: unigrams + bigrams, English stop words removed,
# with document-frequency cut-offs min_df=5 and max_df=0.85.
tfidf_settings = dict(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.85,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit the vocabulary on the training text only, then apply it to the test text.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with stronger regularization (C=0.3) and
# class_weight="balanced".
model = LogisticRegression(C=0.3, class_weight="balanced", max_iter=1000)

# Kept as the cell's last expression so the notebook still displays the
# fitted estimator.
model.fit(X_train_tfidf, y_train)


In [None]:
from sklearn.metrics import accuracy_score

# Predict on both splits with the retrained model and print the two
# accuracies one after the other.
train_pred, test_pred = (
    model.predict(features) for features in (X_train_tfidf, X_test_tfidf)
)

train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)
