In [2]:
from logging import warning

import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [21]:
# Read the raw news dataset from the CSV on disk
data = pd.read_csv("data/news.csv")

# Report the frame's dimensions, then preview its first five rows
# (a bare last expression is what the notebook renders as cell output)
print("Initial shape:", data.shape)
data.head(5)

Initial shape: (6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [22]:
# Keep only the columns used for modelling and drop rows with missing values.
#
# Written as one chained expression instead of `dropna(inplace=True)`:
# the column selection produces a frame derived from the loaded one, and
# mutating such a frame in place is the pattern that can raise pandas'
# SettingWithCopyWarning. The explicit .copy() makes `data` an independent
# frame, so later column assignments (label encoding, 'content') are
# unambiguous writes to this frame only.
data = (
    data[['title', 'text', 'label']]
    .dropna()
    .copy()
)

print("After cleaning:", data.shape)
data.head()

After cleaning: (6335, 3)


Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [23]:
# Encode the string labels as integers: FAKE -> 0, REAL -> 1
label_map = {'FAKE': 0, 'REAL': 1}

# Series.map turns every value that is not a key of label_map into NaN
# without raising. That happens for unexpected label spellings, and also when
# this cell is re-run after 'label' has already been converted to 0/1.
# Check before overwriting the column so the problem surfaces here rather
# than as NaN targets further down.
encoded_labels = data['label'].map(label_map)
unmapped_mask = encoded_labels.isna()
if unmapped_mask.any():
    unexpected = sorted(data.loc[unmapped_mask, 'label'].astype(str).unique())
    raise ValueError(
        f"Cannot encode labels; values not in label_map: {unexpected}. "
        "If 'label' already holds 0/1, this cell was run twice - "
        "re-run the notebook from the data-loading cell."
    )
data['label'] = encoded_labels

print("Label distribution:")
print(data['label'].value_counts())

Label distribution:
label
1    3171
0    3164
Name: count, dtype: int64


In [24]:
# Build one text field per article: the title, a single space, then the body.
# Both columns are cast to str first so the concatenation is purely textual.
title_str = data['title'].astype(str)
body_str = data['text'].astype(str)
data['content'] = title_str.str.cat(body_str, sep=" ")

# Model input (combined text) and target (label column)
X, y = data['content'], data['label']

In [25]:
# Hold out 20% of the samples for testing. The split is stratified on y and
# uses a fixed random_state so repeated runs produce the same partition.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 5068
Test size: 1267


In [26]:
# TF-IDF configuration: English stop-word list, unigrams and bigrams,
# and document-frequency bounds passed as min_df=3 / max_df=0.9.
tfidf_params = {
    "stop_words": 'english',
    "ngram_range": (1, 2),
    "min_df": 3,
    "max_df": 0.9,
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer so nothing is learned from the held-out split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

_, n_tfidf_features = X_train_tfidf.shape
print("TF-IDF feature size:", n_tfidf_features)


TF-IDF feature size: 125110


In [27]:
# Fit a logistic-regression classifier (liblinear solver, up to 1000
# iterations) on the TF-IDF training matrix. fit() returns the estimator
# itself, so construction and training are chained.
model = LogisticRegression(solver="liblinear", max_iter=1000).fit(
    X_train_tfidf,
    y_train,
)

print("VeritasAI model trained")

VeritasAI model trained


In [28]:
# Score the trained model on the held-out test matrix
y_pred = model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("\nAccuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)


Accuracy: 0.9273875295974744

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.95      0.93       633
           1       0.95      0.90      0.93       634

    accuracy                           0.93      1267
   macro avg       0.93      0.93      0.93      1267
weighted avg       0.93      0.93      0.93      1267



In [29]:
# Persist the trained artifacts, one pickle file per object.
# Written in this order: model, vectorizer, label map.
artifacts = {
    "veritasai_model.pkl": model,
    "veritasai_vectorizer.pkl": vectorizer,
    "label_map.pkl": label_map,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("Model, vectorizer, and label map saved")

Model, vectorizer, and label map saved


In [3]:
# Load the saved model, vectorizer and label map, then classify one sample.
#
# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file.
# The three paths below are the artifacts written by this notebook's save
# cell; never point them at pickle files from an untrusted source.
with open("veritasai_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

with open("veritasai_vectorizer.pkl", "rb") as f:
    loaded_vectorizer = pickle.load(f)

with open("label_map.pkl", "rb") as f:
    label_map = pickle.load(f)

# Integer class id -> label name, for turning predictions back into text
inverse_label_map = {v: k for k, v in label_map.items()}

# Decision thresholds on the REAL-class probability, and the minimum number of
# non-zero TF-IDF features below which the sample is flagged as out-of-domain
REAL_THRESHOLD = 0.7
FAKE_THRESHOLD = 0.3
MIN_NONZERO_FEATURES = 40

# realistic test sample
sample_title = "Mexico president asks South Korea for more BTS concerts"
sample_text = (
    "The scramble for tickets to K-pop band BTS' comeback tour, which comes after a four-year hiatus, has seen Mexico's president appealing to her South Korean counterpart to add more shows in her country. 'I wrote a letter to the [president] of Korea... I still haven't received the answer, but let's hope it's positive,' Mexico's president Claudia Sheinbaum said on Monday. BTS will hold three shows in Mexico City in May, as part of its 79-date world tour after a four-year hiatus. Tickets were wiped out in less than 40 minutes, local media reported. Some fans have also accused Ticketmaster and resale platforms of dynamic pricing, prompting an investigation."
)

# Same "title + space + text" layout that the training cell used for 'content'
sample_content = sample_title + " " + sample_text

sample_tfidf = loaded_vectorizer.transform([sample_content])

print("Non-zero features:", sample_tfidf.nnz)

prediction = loaded_model.predict(sample_tfidf)
probabilities = loaded_model.predict_proba(sample_tfidf)

# predict_proba columns follow loaded_model.classes_, so look the FAKE/REAL
# columns up through label_map instead of assuming REAL is column 1.
class_order = list(loaded_model.classes_)
fake_index = class_order.index(label_map['FAKE'])
real_index = class_order.index(label_map['REAL'])

print("Prediction:", inverse_label_map[prediction[0]])
print("Confidence (FAKE, REAL):", probabilities[0][[fake_index, real_index]])

real_conf = probabilities[0][real_index]

if real_conf > REAL_THRESHOLD:
    verdict = "Likely REAL"
elif real_conf < FAKE_THRESHOLD:
    verdict = "Likely FAKE"
else:
    verdict = "UNCERTAIN"

print("Verdict:", verdict)

if sample_tfidf.nnz < MIN_NONZERO_FEATURES:
    # Named `domain_warning` rather than `warning` so the message does not
    # shadow the `warning` function imported from logging at the top.
    domain_warning = "Text may be outside training domain"
    print(domain_warning)


Non-zero features: 61
Prediction: FAKE
Confidence (FAKE, REAL): [0.6183679 0.3816321]
Verdict: UNCERTAIN
