In [1]:
import pandas as pd

# Load ../data/processed_misinfo.csv and discard rows whose `clean_text` is
# missing or whitespace-only, so the vectorizer used later never receives
# NaN or empty documents.
df = (
    pd.read_csv("../data/processed_misinfo.csv")
    .dropna(subset=["clean_text"])
    .loc[lambda frame: frame["clean_text"].str.strip() != ""]
    .reset_index(drop=True)  # contiguous 0..n-1 index after filtering
)

# Keep the preview as the cell's last expression: Jupyter only renders the
# final expression of a cell, so a `head()` call placed earlier is evaluated
# and silently discarded.
df.head()



In [2]:
from sklearn.model_selection import train_test_split

# Model input is the cleaned text column; the target is the class label.
X, y = df["clean_text"], df["label"]

# Hold out 20% of the rows for evaluation. Stratifying on `y` keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

from sklearn.feature_extraction.text import TfidfVectorizer

# Sanity check: every document handed to the vectorizer must be present.
# (A bare `df.isnull().sum()` in the middle of a cell is evaluated and then
# discarded, so it neither displays nor enforces anything.)
assert not X_train.isna().any() and not X_test.isna().any(), (
    "clean_text contains missing values; drop them before vectorizing"
)

# Unigram + bigram TF-IDF features, capped at 5000 vocabulary entries.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

# Fit the vocabulary/IDF weights on the training split only, then reuse them
# unchanged for the test split so no test-set statistics leak into the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [3]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the TF-IDF training features.
# `fit` returns the estimator itself, so construction and training chain.
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test features.
lr_preds = lr.predict(X_test_tfidf)


In [4]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Train a passive-aggressive classifier on the same TF-IDF features.
# The estimator shuffles the training data between epochs, so without a seed
# every run yields a slightly different model and different metrics; pin
# `random_state` to the same seed used for the train/test split.
pac = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
pac.fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test features.
pac_preds = pac.predict(X_test_tfidf)




In [5]:
from sklearn.metrics import accuracy_score, classification_report

# Report accuracy followed by the per-class precision/recall/F1 table for each
# model, in the order they were trained. The second heading carries a leading
# newline so the two reports are visually separated.
for heading, predictions in (
    ("Logistic Regression Accuracy:", lr_preds),
    ("\nPassive Aggressive Accuracy:", pac_preds),
):
    print(heading, accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))


Logistic Regression Accuracy: 0.9852503558028205
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3491
           1       0.98      0.99      0.99      4238

    accuracy                           0.99      7729
   macro avg       0.99      0.98      0.99      7729
weighted avg       0.99      0.99      0.99      7729


Passive Aggressive Accuracy: 0.9924957950575753
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3491
           1       0.99      1.00      0.99      4238

    accuracy                           0.99      7729
   macro avg       0.99      0.99      0.99      7729
weighted avg       0.99      0.99      0.99      7729



In [6]:
import pickle
from pathlib import Path

# Directory that receives the serialized artifacts. Create it up front so a
# fresh checkout without `../models` does not fail with FileNotFoundError.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted vectorizer together with both classifiers: the models
# were trained on features produced by this vectorizer, so they belong together.
# NOTE: only unpickle these files from a trusted source; loading a pickle can
# execute arbitrary code.
artifacts = {
    "tfidf_vectorizer.pkl": tfidf,
    "logistic_model.pkl": lr,
    "pac_model.pkl": pac,
}
for filename, artifact in artifacts.items():
    with open(models_dir / filename, "wb") as f:
        pickle.dump(artifact, f)
