In [15]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import warnings

warnings.filterwarnings("ignore")
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# ===========================================
# Step 1: Load Dataset
# ===========================================
# Both paths are relative to the current working directory.
true_df = pd.read_csv("True.csv")
fake_df = pd.read_csv("Fake.csv")

# The class label comes from which file a row was read from.
true_df['label'] = 0  # Real
fake_df['label'] = 1  # Fake

# Combine datasets, keeping only the two columns used downstream
df = pd.concat([true_df[['text', 'label']], fake_df[['text', 'label']]], ignore_index=True)
# Shuffle with a fixed seed: an unseeded shuffle here would reorder the rows
# differently on every run, so even a seeded train/test split afterwards
# would select different articles each time.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# ===========================================
# Step 2: Text Preprocessing
# ===========================================
# Built once at import time rather than on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')
# Lazily populated cache of the default English stop-word set.
_DEFAULT_STOP_WORDS = None


def _default_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    global _DEFAULT_STOP_WORDS
    if _DEFAULT_STOP_WORDS is None:
        _DEFAULT_STOP_WORDS = frozenset(stopwords.words('english'))
    return _DEFAULT_STOP_WORDS


def preprocess(text, stop_words=None):
    """Normalize raw text into a space-separated string of kept tokens.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation,
    split on whitespace, drop stop words, re-join with single spaces.

    Args:
        text: The text to clean. ``None`` and float NaN are treated as
            missing and produce an empty string; any other non-string value
            is converted with ``str()`` first.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, the NLTK English stop-word list is used (loaded once
            and cached across calls).

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    if not isinstance(text, str):
        # ``x != x`` is true only for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    if stop_words is None:
        stop_words = _default_stop_words()

    text = text.lower()
    text = _DIGITS_RE.sub('', text)  # Remove numbers
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    # NOTE(review): punctuation is removed before stop-word filtering, so a
    # stop word that itself contains an apostrophe can no longer match its
    # stripped form in the text -- confirm whether that is intended.
    return " ".join(w for w in text.split() if w not in stop_words)

# Clean every article once; the cleaned column is what the model is fed.
df['clean_text'] = df['text'].apply(preprocess)

# ===========================================
# Step 3: Vectorization
# ===========================================
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The TF-IDF vocabulary is fitted on the training split only and then
# reused as-is to transform the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ===========================================
# Step 4: Model Training
# ===========================================
# Fit a logistic-regression classifier (library defaults) on the TF-IDF
# training matrix; ``fit`` returns the estimator itself.
model = LogisticRegression().fit(X_train_vec, y_train)

# ===========================================
# Step 5: Evaluation
# ===========================================
# Score the held-out split and print overall accuracy plus the per-class
# report. The label text previously contained mojibake (UTF-8 emoji decoded
# with a single-byte codec); the intended characters are restored here.
y_pred = model.predict(X_test_vec)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n🧾 Classification Report:\n", classification_report(y_test, y_pred))

# ===========================================
# Step 6: Prediction Function
# ===========================================
def predict_news(text):
    """Classify a raw news string as fake or real.

    The text is cleaned with ``preprocess`` and encoded with the fitted
    ``vectorizer`` before ``model`` predicts its label.

    Returns:
        "FAKE NEWS" when the predicted label is 1, otherwise "REAL NEWS".
    """
    features = vectorizer.transform([preprocess(text)])
    label = model.predict(features)[0]
    if label == 1:
        return "FAKE NEWS"
    return "REAL NEWS"

# ===========================================
# Step 7: Example Prediction
# ===========================================
# Run one sample headline through the classifier. The print labels
# previously contained mojibake; the intended emoji are restored.
example_text = "NASA confirms water found on the surface of the moon."
print("\n📣 Test Example:\n", example_text)
print("🔎 Prediction:", predict_news(example_text))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'True.csv'