In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import numpy as np

# Load dataset
df = pd.read_csv("WELFake_Dataset.csv", encoding="latin1")

# Drop rows with missing title, text, or label
df = df.dropna(subset=['title', 'text', 'label'])

# Binary labels: 0 (real), 1 (fake)
df = df[df['label'].isin([0, 1])]
df['label'] = df['label'].astype(int)

# Combine title + text
df['content'] = df['title'].astype(str) + " " + df['text'].astype(str)

X = df['content']
y = df['label']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic Regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

# Save models
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(clf, "logistic_model.joblib")

# Example: predict probability and word scores for a single text
def predict_example(text):
    vec = vectorizer.transform([text])
    prob = clf.predict_proba(vec)[0]
    pred = int(clf.predict(vec)[0])
    
    # Word-level scores
    feature_names = vectorizer.get_feature_names_out()
    word_scores = {}
    words = text.lower().split()
    for w in words:
        if w in feature_names:
            idx = np.where(feature_names == w)[0][0]
            word_scores[w] = prob[1] * vec[0, idx]  # simplistic contribution score
    return {"label": pred, "probability": prob[1], "word_scores": word_scores}

print(predict_example("Breaking news: something happened here"))


{'label': 1, 'probability': np.float64(0.998892035148406), 'word_scores': {'breaking': np.float64(0.6867727632543422), 'happened': np.float64(0.6179266833130268)}}
